In [1]:

import pandas as pd    
import numpy as np     
import nltk
from nltk.corpus import stopwords   
from sklearn.model_selection import train_test_split      
from tensorflow import keras 
from keras.preprocessing.text import Tokenizer  
from keras.utils import pad_sequences   
from keras.models import Sequential     
from keras.layers import Embedding, LSTM, Dense 
import re

In [2]:
data = pd.read_csv("IMDB Dataset.csv")
data.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
english_stops = set(stopwords.words('english'))


In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
df = pd.read_csv("IMDB Dataset.csv")
x_data = df['review']
y_data = df['sentiment']

x_data = x_data.replace({'<.*?': ''}, regex=True)
x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     
x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
x_data = x_data.apply(lambda review: [w.lower() for w in review]) 
y_data = y_data.replace('positive', 1)

y_data = y_data.replace('negative', 0)

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)



Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, br, br, the...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


In [6]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

In [7]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))
a = get_max_length()
print(a)

134


In [8]:
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)


Encoded X Train
 [[ 309 2250   22 ...    0    0    0]
 [   2  658 1844 ...    0    0    0]
 [   2  889    2 ...    0    0    0]
 ...
 [ 205   24 4206 ...    0    0    0]
 [   2  102   29 ...    7   85  781]
 [   9  228    5 ...    0    0    0]] 

Encoded X Test
 [[  40 1899  300 ...    0    0    0]
 [ 623  337 1635 ...    0    0    0]
 [3225  124    3 ...    0    0    0]
 ...
 [  34    4   15 ...    0    0    0]
 [  79  167    7 ...    0    0    0]
 [   3  611   14 ...    0    0    0]] 

Maximum review length:  134


In [9]:
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 134, 32)           2934752   
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 2,959,649
Trainable params: 2,959,649
Non-trainable params: 0
_________________________________________________________________
None


In [10]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2211decb7f0>

In [11]:
model.save('model.h5')
