In [1]:
import pandas as pd
from tqdm import tqdm

from PreProcessing import TextCleaner
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from tensorflow.python.keras.layers import Embedding, Bidirectional, LSTM, Flatten
from tensorflow.python.keras.layers import Convolution1D, GlobalMaxPool1D, Dense, Dropout
from tensorflow.python.keras.models import Model, Sequential

from tensorflow.python.keras.callbacks import ModelCheckpoint

from keras import backend as K

import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
df = pd.read_csv('/content/drive/My Drive/IMDB Reviews Sentiment Analysis/IMDB-Reviews.csv', encoding='utf-8')

df = df.dropna(axis=0) # remove reviews which contains NaN value
df = df.drop_duplicates() # remove repeated rows from training data. now df has just unique reviews.

reviews = list(df['review']) # make reviews exclusive

# clean each review based on TextCleaner
cleaner = TextCleaner()
for indx, text in enumerate(tqdm(reviews)):
    reviews[indx] = cleaner.clean_text(text)

df['review']    = reviews # replace clean reviews with column of reviews in dataframe
df['sentiment'] = df['sentiment'].map({'negative': 0, 'positive': 1}) # encode posetive & negative

100%|██████████| 49582/49582 [01:27<00:00, 566.69it/s]


In [3]:
print('Maximum number of features that a sentence has: ' + str(df.review.map(len).max()))
print('the mean of the length of sentences is: ' + str(df.review.apply(lambda x: len(x.split(" "))).mean()))

Maximum number of features that a sentence has: 8761
the mean of the length of sentences is: 122.07668105360817


In [0]:
num_maxFeatures = 6500  # less than 8761
num_maxLen = 130 # more than 122.07
embed_size = 123 # nearly to 122.07

tokenizer = Tokenizer(num_words=num_maxFeatures)
tokenizer.fit_on_texts(df['review'])
training_data = tokenizer.texts_to_sequences(df['review'])

X_train = pad_sequences(training_data, maxlen=num_maxLen)
Y_train = df['sentiment']

In [0]:
# define our Gussian Error Linear Unit Activation Function
def gelu(x):
    return 1 + K.tanh(0.798086 * (x + K.pow(x, 3)))

In [0]:
def get_model():
    model = Sequential()

    model.add(Embedding(input_dim=num_maxFeatures, output_dim=embed_size))

    model.add(Convolution1D(128, kernel_size=5, padding='valid', activation=gelu, strides=1))
    model.add(Dropout(rate=0.5))

    model.add(Bidirectional(LSTM(32, return_sequences=True)))
    
    model.add(GlobalMaxPool1D())

    model.add(Dense(20, activation=gelu))
    model.add(Dropout(rate=0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [0]:
model = get_model()

In [8]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 123)         799500    
_________________________________________________________________
conv1d (Conv1D)              (None, None, 128)         78848     
_________________________________________________________________
dropout (Dropout)            (None, None, 128)         0         
_________________________________________________________________
bidirectional (Bidirectional (None, None, 64)          41216     
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 20)                1300      
_________________________________________________________________
dropout_1 (Dropout)          (None, 20)                0

In [0]:
filepath = 'model.hdf5'
mcp = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

In [10]:
history = model.fit(X_train, Y_train, batch_size=100, epochs=6, validation_split=0.2, callbacks=[mcp])

Epoch 1/6
Epoch 00001: val_accuracy improved from -inf to 0.83120, saving model to model.hdf5
Epoch 2/6
Epoch 00002: val_accuracy improved from 0.83120 to 0.86337, saving model to model.hdf5
Epoch 3/6
Epoch 00003: val_accuracy improved from 0.86337 to 0.87658, saving model to model.hdf5
Epoch 4/6
Epoch 00004: val_accuracy improved from 0.87658 to 0.87930, saving model to model.hdf5
Epoch 5/6
Epoch 00005: val_accuracy did not improve from 0.87930
Epoch 6/6
Epoch 00006: val_accuracy improved from 0.87930 to 0.88061, saving model to model.hdf5
