Import Libraries

In [29]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import time

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import SGDClassifier

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Flatten, Dropout,Bidirectional
from keras.layers.embeddings import Embedding
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re
from keras.preprocessing.text import Tokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

Loading data from pickle file

In [3]:
#getting the data 
# importing pickled data 
sup_df=pd.read_pickle('Pickledfiles/cl_fs_imdb_df.pkl') 
#sup_df=sup_df[['feature_selected','label']] # cleaned and feature selected reviews 
#sup_df.shape 



In [4]:
sup_df.head()

Unnamed: 0,review,label
0,bromwel high cartoon comedi ran time program s...,pos
1,homeless or houseless georg carlin state issu ...,pos
2,brilliant over act lesley ann warren best dram...,pos
3,easili underr film inn brook cannon sure flaw ...,pos
4,typic mel brook film much le slapstick movi ac...,pos


In [5]:
sup_df.shape

(50000, 2)

In [6]:
#getting the training data 
# importing pickled data 
train_df=pd.read_pickle('Pickledfiles/cl_train_df.pkl') 
 


In [7]:
train_df.shape

(25000, 2)

In [8]:
#getting the test data 
# importing pickled data 
test_df=pd.read_pickle('Pickledfiles/cl_test_df.pkl') 
 

In [9]:
test_df.shape

(25000, 2)

Function to split the data

In [10]:
#since we are using cleaned data, only splitting is required.

def review_split(text):
    import string
    return text.split() 

Preparing data

In [11]:
# preparing data
x = sup_df.review
# encode the target strings
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

y = le.fit_transform(sup_df.label)

In [12]:
# preparing training data
x_train = train_df.review
# encode the target strings
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

y_train = le.fit_transform(train_df.label)

In [13]:
# preparing testing data
x_test = test_df.review
# encode the target strings
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

y_test = le.fit_transform(test_df.label)

In [14]:
# Total number of unique words
print("Number of unique words in consolidated dataset: ")
print(len(np.unique(np.hstack(x))))
print("Number of unique words in Training dataset: ")
print(len(np.unique(np.hstack(x_train))))
print("Number of unique words in Testing dataset: ")
print(len(np.unique(np.hstack(x_test))))

Number of unique words in consolidated dataset: 
49576
Number of unique words in Training dataset: 
24898
Number of unique words in Testing dataset: 
24795


In [15]:
x_train

0        high cartoon comedi ran time program school li...
1        or georg state issu year never plan help stree...
2        brilliant over act ann best dramat ladi ever s...
3        easili underr film brook sure flaw give realis...
4        typic brook film much le slapstick movi actual...
                               ...                        
24995    toward end movi felt technic felt like watch p...
24996    kind movi enemi content watch time bloodi true...
24997    saw last night film festiv one huge disappoint...
24998    film pick pound turn rather good rd centuri fi...
24999    one dumbest film ive ever seen rip near ever t...
Name: review, Length: 25000, dtype: object

Assign Train and Test Variables

In [16]:
msg_train = x_train
msg_test = x_test
label_train = y_train
label_test = y_test

print(len(msg_train), len(msg_test), len(msg_train) + len(msg_test))

25000 25000 50000


In [17]:
y_train

array([1, 1, 1, ..., 0, 0, 0])

Function to get maximum length

In [18]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

Encode Review

In [19]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[ 136  787  113 ...    0    0    0]
 [ 493  642  557 ... 2317 2303  109]
 [ 430  377   33 ...    0    0    0]
 ...
 [ 147  144  236 ...  138  165 2195]
 [   2  549 2524 ...    0    0    0]
 [   3 2991    2 ...    0    0    0]] 

Encoded X Test
 [[ 425  147    1 ...    0    0    0]
 [  47  101   71 ...  529  206 1377]
 [1638 1377  454 ...    0    0    0]
 ...
 [ 426  448  571 ... 1668   10  412]
 [ 658   36 1026 ...    0    0    0]
 [ 584  355  325 ...   97 1038 1033]] 

Maximum review length:  106


Architetcure of LSTM Model

In [52]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64
model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length=max_length))
model.add((LSTM(32, return_sequences = True)))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
print(model.summary())

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 106, 32)           96032     
_________________________________________________________________
lstm_8 (LSTM)                (None, 106, 32)           8320      
_________________________________________________________________
dropout_8 (Dropout)          (None, 106, 32)           0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 3392)              0         
_________________________________________________________________
dense_16 (Dense)             (None, 250)               848250    
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 251       
Total params: 952,853
Trainable params: 952,853
Non-trainable params: 0
________________________________________________

Use chekcpoint to save the epoch details

In [54]:
checkpoint = ModelCheckpoint(
    'models/LSTM.h50',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [55]:
train_history = model.fit(x_train, y_train, batch_size = 128, epochs = 50, callbacks=[checkpoint])

Epoch 1/50

Epoch 00001: accuracy improved from -inf to 0.81576, saving model to models\LSTM.h50




INFO:tensorflow:Assets written to: models\LSTM.h50\assets


INFO:tensorflow:Assets written to: models\LSTM.h50\assets


Epoch 2/50

Epoch 00002: accuracy improved from 0.81576 to 0.88676, saving model to models\LSTM.h50




INFO:tensorflow:Assets written to: models\LSTM.h50\assets


INFO:tensorflow:Assets written to: models\LSTM.h50\assets


Epoch 3/50

Epoch 00003: accuracy improved from 0.88676 to 0.90672, saving model to models\LSTM.h50




INFO:tensorflow:Assets written to: models\LSTM.h50\assets


INFO:tensorflow:Assets written to: models\LSTM.h50\assets


Epoch 4/50

Epoch 00004: accuracy improved from 0.90672 to 0.93008, saving model to models\LSTM.h50




INFO:tensorflow:Assets written to: models\LSTM.h50\assets


INFO:tensorflow:Assets written to: models\LSTM.h50\assets


Epoch 5/50

Epoch 00005: accuracy improved from 0.93008 to 0.95168, saving model to models\LSTM.h50




INFO:tensorflow:Assets written to: models\LSTM.h50\assets


INFO:tensorflow:Assets written to: models\LSTM.h50\assets


Epoch 6/50

Epoch 00006: accuracy improved from 0.95168 to 0.97100, saving model to models\LSTM.h50




INFO:tensorflow:Assets written to: models\LSTM.h50\assets


INFO:tensorflow:Assets written to: models\LSTM.h50\assets


Epoch 7/50

Epoch 00007: accuracy improved from 0.97100 to 0.98280, saving model to models\LSTM.h50




INFO:tensorflow:Assets written to: models\LSTM.h50\assets


INFO:tensorflow:Assets written to: models\LSTM.h50\assets


Epoch 8/50

Epoch 00008: accuracy improved from 0.98280 to 0.98840, saving model to models\LSTM.h50




INFO:tensorflow:Assets written to: models\LSTM.h50\assets


INFO:tensorflow:Assets written to: models\LSTM.h50\assets


Epoch 9/50

Epoch 00009: accuracy improved from 0.98840 to 0.98884, saving model to models\LSTM.h50




INFO:tensorflow:Assets written to: models\LSTM.h50\assets


INFO:tensorflow:Assets written to: models\LSTM.h50\assets


Epoch 10/50

Epoch 00010: accuracy improved from 0.98884 to 0.98964, saving model to models\LSTM.h50




INFO:tensorflow:Assets written to: models\LSTM.h50\assets


INFO:tensorflow:Assets written to: models\LSTM.h50\assets


Epoch 11/50

Epoch 00011: accuracy improved from 0.98964 to 0.99400, saving model to models\LSTM.h50




INFO:tensorflow:Assets written to: models\LSTM.h50\assets


INFO:tensorflow:Assets written to: models\LSTM.h50\assets


Epoch 12/50

Epoch 00012: accuracy improved from 0.99400 to 0.99592, saving model to models\LSTM.h50




INFO:tensorflow:Assets written to: models\LSTM.h50\assets


INFO:tensorflow:Assets written to: models\LSTM.h50\assets


Epoch 13/50

Epoch 00013: accuracy improved from 0.99592 to 0.99608, saving model to models\LSTM.h50




INFO:tensorflow:Assets written to: models\LSTM.h50\assets


INFO:tensorflow:Assets written to: models\LSTM.h50\assets


Epoch 14/50

Epoch 00014: accuracy did not improve from 0.99608
Epoch 15/50

Epoch 00015: accuracy did not improve from 0.99608
Epoch 16/50

Epoch 00016: accuracy did not improve from 0.99608
Epoch 17/50

Epoch 00017: accuracy did not improve from 0.99608
Epoch 18/50

Epoch 00018: accuracy improved from 0.99608 to 0.99704, saving model to models\LSTM.h50




INFO:tensorflow:Assets written to: models\LSTM.h50\assets


INFO:tensorflow:Assets written to: models\LSTM.h50\assets


Epoch 19/50

Epoch 00019: accuracy did not improve from 0.99704
Epoch 20/50

Epoch 00020: accuracy did not improve from 0.99704
Epoch 21/50

Epoch 00021: accuracy improved from 0.99704 to 0.99728, saving model to models\LSTM.h50




INFO:tensorflow:Assets written to: models\LSTM.h50\assets


INFO:tensorflow:Assets written to: models\LSTM.h50\assets


Epoch 22/50

Epoch 00022: accuracy improved from 0.99728 to 0.99800, saving model to models\LSTM.h50




INFO:tensorflow:Assets written to: models\LSTM.h50\assets


INFO:tensorflow:Assets written to: models\LSTM.h50\assets


Epoch 23/50

Epoch 00023: accuracy did not improve from 0.99800
Epoch 24/50

Epoch 00024: accuracy did not improve from 0.99800
Epoch 25/50

Epoch 00025: accuracy did not improve from 0.99800
Epoch 26/50

Epoch 00026: accuracy did not improve from 0.99800
Epoch 27/50

Epoch 00027: accuracy did not improve from 0.99800
Epoch 28/50

Epoch 00028: accuracy did not improve from 0.99800
Epoch 29/50

Epoch 00029: accuracy did not improve from 0.99800
Epoch 30/50

Epoch 00030: accuracy did not improve from 0.99800
Epoch 31/50

Epoch 00031: accuracy improved from 0.99800 to 0.99892, saving model to models\LSTM.h50




INFO:tensorflow:Assets written to: models\LSTM.h50\assets


INFO:tensorflow:Assets written to: models\LSTM.h50\assets


Epoch 32/50

Epoch 00032: accuracy did not improve from 0.99892
Epoch 33/50

Epoch 00033: accuracy did not improve from 0.99892
Epoch 34/50

Epoch 00034: accuracy did not improve from 0.99892
Epoch 35/50

Epoch 00035: accuracy did not improve from 0.99892
Epoch 36/50

Epoch 00036: accuracy did not improve from 0.99892
Epoch 37/50

Epoch 00037: accuracy did not improve from 0.99892
Epoch 38/50

Epoch 00038: accuracy did not improve from 0.99892
Epoch 39/50

Epoch 00039: accuracy did not improve from 0.99892
Epoch 40/50

Epoch 00040: accuracy improved from 0.99892 to 0.99932, saving model to models\LSTM.h50




INFO:tensorflow:Assets written to: models\LSTM.h50\assets


INFO:tensorflow:Assets written to: models\LSTM.h50\assets


Epoch 41/50

Epoch 00041: accuracy did not improve from 0.99932
Epoch 42/50

Epoch 00042: accuracy did not improve from 0.99932
Epoch 43/50

Epoch 00043: accuracy did not improve from 0.99932
Epoch 44/50

Epoch 00044: accuracy did not improve from 0.99932
Epoch 45/50

Epoch 00045: accuracy did not improve from 0.99932
Epoch 46/50

Epoch 00046: accuracy did not improve from 0.99932
Epoch 47/50

Epoch 00047: accuracy did not improve from 0.99932
Epoch 48/50

Epoch 00048: accuracy did not improve from 0.99932
Epoch 49/50

Epoch 00049: accuracy did not improve from 0.99932
Epoch 50/50

Epoch 00050: accuracy did not improve from 0.99932


In [56]:
scores = model.evaluate(x_test, y_test, verbose=0)
print(f'Accuracy: {scores[1]*100:.2f}%')

Accuracy: 81.64%


In [None]:
## End of Part 3 ##