In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.models import Sequential , Model
from keras.layers import Conv2D , MaxPool2D , Dense , Dropout , Flatten
from PIL  import Image

In [2]:
from tensorflow.keras.utils import to_categorical
from keras.layers import LSTM , SimpleRNN , Embedding

In [None]:
## importing Libraries

In [41]:
reviews = pd.read_csv(r"F:\Python\dataset\amazonreviews.tsv" , sep = '\t')  ## file reading 

In [42]:
reviews.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [43]:
reviews.shape

(10000, 2)

In [None]:
## so here we are replacing negative and positive with 0 and 1 

In [44]:
reviews.label = reviews.label.replace({"pos" : 1 , "neg" : 0})

In [45]:
reviews.label.value_counts()

0    5097
1    4903
Name: label, dtype: int64

In [46]:
reviews.isnull().sum()

label     0
review    0
dtype: int64

In [47]:
## data cleaning done 

In [48]:
reviews_x = reviews.iloc[:: , 1]
reviews_y = reviews.iloc[:: , 0]

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
x_train , x_test , y_train , y_test = train_test_split(reviews_x , reviews_y , test_size=.2)

In [51]:
print(x_train.shape)
print(y_train.shape)
print("-----------")
print(x_test.shape)
print(y_test.shape)

(8000,)
(8000,)
-----------
(2000,)
(2000,)


In [52]:
## y into categorical form 

In [53]:
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [54]:
max_num_words = 10000  ## so here we are replacing negative and positive with 0 and 1 
seq_len = 50 
embedding_size = 100

In [None]:
## this max num words will work like sparse matrix so when we have words in that sentence then it will write their 1 else 0
## so we will give 10000 limit so till 10000 the columns of one hot encoding will add ( and adding 1 and 0 to respective column)

## seq_len = 50 will allow us to have atleast 50 words in our one sentence ( 50 words in one sentence)
## and we gave this input to padding so we will get a matrix of 50*50
## so when at some point when we dont have any value at that particular position so padding add their 0 
## and make this sentence size 50 

## embedding_size = 100 
## this embedding size will make a vector of that particular word and we gave their 100 so it will make or add 100 values with 
## respect to their combination of other words 
## example of embeddings = like gender word is related to boy and girl or queen or king size so for king and queen it will give 
## the good values because of high relation of words but when we comapre this word to mango and apple so it will not give the 
## large relational vallues it will give the low values beacuse mango and apple not related with gender 
## embedding vector representation [0.1 , 1 , 0.005 , -1 , 0.7 , -0.2 , .........................(till 100)] 

## max_num_words = 10000  ( working on 10000 words with one hot encoding)( working like sparse matrix)( assigning index to word)

## seq_len = 50 ( in one sentence allowing atleast 50 words )


## embedding_size = 100 ( creating vector with 100 values , and the values of words relation to each other ( gender and fruits))
## corelation values added in the vector 


In [55]:
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

In [56]:
tokenizer = Tokenizer(num_words=max_num_words)

In [57]:
tokenizer.fit_on_texts(reviews.review)   ## making 10,000 tokens from the datset

In [58]:
x_train = tokenizer.texts_to_sequences(x_train)   ## will convert text to sequence of ids ,converting words in tokens 

In [59]:
x_train = pad_sequences(x_train ,maxlen= seq_len)   ## creating 1 sentence with 50 words 

In [None]:
## in this we are giving 2 inputs in () because we want to apply padding on x_train data and with sequence of 50 words 

In [60]:
x_test = tokenizer.texts_to_sequences(x_test)  ## will convert text to sequence of ids ,converting words in tokens 

In [61]:
x_test = pad_sequences(x_test , maxlen= seq_len)  ## creating sentence with 50 words 

In [None]:
## in this we are giving 2 inputs in () because we want to apply padding on x_test data and with sequence of 50 words 

In [None]:
## with this two syntax we will convert our text in the numeric format so we will able to build the algorithm 

In [62]:
model = Sequential() ## intialising the model 

model.add(Embedding( input_dim = max_num_words , input_length = seq_len , output_dim = embedding_size  ))
## giving input as max num words and lenghth of 50 words which is stored in seq_len and output_dim as embedding size vector

model.add(LSTM(5))
## adding layer of long short term memory 
## LSTM(5) ## this 5 is the number of neuron in our single LSTM layer 

model.add(Dense(2 , activation = 'softmax'))
## output layer
## there are 2 output classes ham and spam which is denoted by 0 and 1 so in output layer layer we are giving 2

In [63]:
model.compile(optimizer= "adam" , loss = "categorical_crossentropy" , metrics=["accuracy"])

In [76]:
model.fit(x_train , y_train , epochs= 5 , batch_size = 32, validation_split= .2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x26d0b380130>

In [None]:
## model fitting done and its time fopr prediction 

In [77]:
pred = model.predict(x_test)

In [78]:
pred

array([[9.5249641e-01, 4.7503594e-02],
       [9.9319780e-01, 6.8021882e-03],
       [6.3709100e-04, 9.9936289e-01],
       ...,
       [2.0623421e-03, 9.9793768e-01],
       [2.7346537e-03, 9.9726534e-01],
       [9.9925381e-01, 7.4622099e-04]], dtype=float32)

In [79]:
pred_classes = np.argmax(pred , 1)

In [80]:
pred_classes

array([0, 0, 1, ..., 1, 1, 0], dtype=int64)

In [81]:
from sklearn.metrics import confusion_matrix , accuracy_score

In [83]:
tab1 = confusion_matrix(y_test , pred_classes)
tab1

array([[759, 247],
       [204, 790]], dtype=int64)

In [84]:
accuracy_score(y_test , pred_classes)*100

77.45