Sentiment Analysis of IMDB Dataset using Deep Learning methods.

Dataset link: https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [None]:
# pandas for data processing, numpy for linear algebra, nltk for natural language processing
import numpy as np
import pandas as pd
import nltk
import tensorflow as tf
import keras
# from sklearn import preprocessing

import string
from string import punctuation
import re
# importing lemmatizer stopwords and punctuations

from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

# importing other keras and sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding,Conv1D,LSTM,GRU,BatchNormalization,Flatten,Dense

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Getting the dataset from Google Drive.

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the dataset CSV is
# read from a path under this mount point in the next cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# NOTE(review): pandas is already imported in the first cell; this re-import is
# redundant but harmless.
import pandas as pd
# Location of the IMDB reviews CSV on the mounted Google Drive.
path = "/content/drive/MyDrive/dataset/IMDB_Dataset.csv"
# Load the reviews into a DataFrame (the columns used below are 'review' and 'sentiment').
df = pd.read_csv(path)

In [None]:
# Drop rows whose (review, sentiment) pair is repeated, keeping the first
# occurrence. Duplicate entries were verified to be present in the raw data.
df = df.drop_duplicates(subset=['review', 'sentiment'], keep='first')
# Summary of the de-duplicated frame (row count, unique values, most frequent value).
df.describe(include='all')

Unnamed: 0,review,sentiment
count,49582,49582
unique,49582,2
top,"Okay, so it starts very unimaginatively with a...",positive
freq,1,24884


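Function for lower-casing the text, replacing line-break tags, expanding contractions, and removing URLs, digits, and stray special characters / emoji-like symbols. Punctuation is removed in the tokenization step that follows.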

In [None]:
count = 0  # progress counter; clean_doc increments it once per processed review

# Contraction -> expansion table. Keys use a straight apostrophe; the compiled
# pattern below also accepts the typographic apostrophe (’).
_CONTRACTIONS = {
  "isn't": "is not",
  "wasn't": "was not",
  "weren't": "were not",
  "hasn't": "has not",
  "haven't": "have not",
  "couldn't": "could not",
  "wouldn't": "would not",
  "shouldn't": "should not",
  "won't": "will not",
  "don't": "do not",
  "he's": "he is",
  "she's": "she is",
  "it's": "it is",
  "that's": "that is",
  "there's": "there is",
  "what's": "what is",
  "let's": "let us",
  "i'm": "i am",
  "i'd": "i would",
  "you'd": "you would",
  "you've": "you have",
  "we've": "we have",
  "they've": "they have",
  "we're": "we are",
  "they're": "they are",
  "you'll": "you will",
  "who'll": "who will",
}

# One alternation anchored on word boundaries, so that possessives such as
# "hobbit's" are not mistaken for the contraction "it's".
_CONTRACTION_RE = re.compile(
  r"\b(?:" + "|".join(key.replace("'", "['’]") for key in _CONTRACTIONS) + r")\b"
)

# Literal junk sequences seen in the reviews, applied in order. The input is
# already lower-cased when these run, so the patterns are lower-case as well.
_JUNK_REPLACEMENTS = (
  ('\x91the', 'the'),
  ('\x97', ''),
  ('\x84the', 'the'),
  ('\uf0b7', ''),
  ('¡¨', ''),
  ('\x95', ''),
  ('\x8ei\x9eek', ''),
  ('\xad', ''),
  ('\x84bubble', 'bubble'),
)


def _expand_contraction(match):
  """Return the expansion for a matched contraction (either apostrophe style)."""
  return _CONTRACTIONS[match.group(0).replace("’", "'")]


def sub_lower(text):
  """Normalise a raw review string before tokenisation.

  Steps, in order: lower-case the text, replace HTML line-break tags with a
  space, expand common English contractions, strip known junk characters,
  remove URLs and remove digits.

  Args:
    text: the raw review as a str.

  Returns:
    The normalised, lower-cased string. Punctuation is left in place.
  """
  text = text.lower()
  # <br>, <br/>, <br /> -> single space so adjacent words do not get glued together
  text = re.sub(r"<br\s*/?>", " ", text)
  text = _CONTRACTION_RE.sub(_expand_contraction, text)

  for junk, replacement in _JUNK_REPLACEMENTS:
    text = text.replace(junk, replacement)

  text = re.sub(r'http\S+', '', text)  # URLs
  text = re.sub(r'[0-9]', '', text)    # digits

  return text

Tokenization, removal of stopwords.

In [None]:
count = 0  # number of reviews processed so far by clean_doc

# Built once for the whole notebook instead of on every clean_doc call:
# stopwords.words() re-reads the NLTK corpus each time it is invoked.
_PUNCT_TABLE = str.maketrans('', '', punctuation)
_STOP_WORDS = frozenset(stopwords.words('english'))


def clean_doc(text):
  """Clean one review and return it as a space-joined string of kept tokens.

  The text is normalised with sub_lower, split on whitespace, and stripped of
  ASCII punctuation. A token is kept only if it is purely alphabetic, longer
  than one character, and not an English stop word.

  Side effect: increments the module-level ``count`` once per call.

  Args:
    text: the raw review as a str.

  Returns:
    The cleaned review as a single str (possibly empty).
  """
  doc = sub_lower(text)
  global count
  count += 1
  # split on whitespace, then drop punctuation characters inside each token
  stripped = (token.translate(_PUNCT_TABLE) for token in doc.split())
  kept = [
    token for token in stripped
    if token.isalpha() and token not in _STOP_WORDS and len(token) > 1
  ]
  return " ".join(kept)

In [None]:
# Replace every raw review with its cleaned, stop-word-free version.
reviews_raw = df["review"]
df["review"] = reviews_raw.apply(clean_doc)

In [None]:
# Preview the first rows after cleaning the 'review' column.
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching oz episode ho...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically family little boy jake thinks zombie...,negative
4,petter matteis love time money visually stunni...,positive


One-hot encoding the output labels.

In [None]:
# Map the string sentiment labels to integer codes; in this run 'negative'
# became 0 and 'positive' became 1 (see the preview below).
le=LabelEncoder()
# Overwrites the 'sentiment' column in place with the integer codes.
df['sentiment']= le.fit_transform(df['sentiment'])
# One-hot encode the integer codes into a two-column float array for the 2-unit softmax output.
labels=to_categorical(df['sentiment'],num_classes=2)

In [None]:
# Inspect the one-hot label array (one row per review, one column per class).
labels

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [None]:
# Preview the frame again: 'sentiment' now holds integer codes instead of strings.
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching oz episode ho...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically family little boy jake thinks zombie...,0
4,petter matteis love time money visually stunni...,1


Tokenizing the corpus. About 47k words appear more than twice, but empirically a vocabulary limited to 10k words performs better.

In [None]:
# Vocabulary size passed to the Tokenizer and to the Embedding layer.
num_words = 10000 # number of words that occur more than 2 times is around 47k
# Words outside the retained vocabulary are mapped to the '<OOV>' token.
tokenizer=Tokenizer(num_words=num_words,oov_token='<OOV>')
# Build the word -> index mapping from the cleaned reviews.
tokenizer.fit_on_texts(df['review'])
# word_index holds every distinct word seen, so total_vocab can exceed num_words
# (it is printed in the next cell).
word_index=tokenizer.word_index
total_vocab=len(word_index)

In [None]:
# Number of distinct words found in the corpus (size of tokenizer.word_index).
print(total_vocab)
# print(word_index)  # uncomment to dump the full word -> index mapping (very large)

161952


Converting each review to a sequence of vocabulary indices and padding the sequences to a fixed length, followed by the train/test split.

In [None]:
# Fixed sequence length fed to the network.
max_len = 400 # 75% of reviews are fully covered by a length of 281
# Dimension of each word-embedding vector (second argument of the Embedding layer).
embeddings=256
# Turn each cleaned review into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(df['review'])
# Pad or truncate every sequence to exactly max_len entries (default pad_sequences settings).
sequences_padded=pad_sequences(sequences,maxlen=max_len)

In [None]:
# Hold out 20% of the padded sequences (and their one-hot labels) for testing;
# random_state fixes the shuffle so the split is reproducible.
X_train,X_test,y_train,y_test=train_test_split(sequences_padded,labels,test_size=0.20,random_state=42)

Defining the model. Including dropouts and regularizations.

In [None]:
# Network: embedding -> 1-D convolution -> bidirectional LSTM -> LSTM -> 2-way
# softmax, with dropout after each recurrent stage.
model = keras.Sequential([
    Embedding(num_words, embeddings, input_length=max_len),
    Conv1D(256, 10, activation='relu'),
    keras.layers.Bidirectional(LSTM(128, return_sequences=True)),
    keras.layers.Dropout(0.1),
    LSTM(64),
    keras.layers.Dropout(0.4),
    Dense(2, activation='softmax'),
])

In [None]:
# Archived experiment (disabled): same stack, but with L1 kernel and L2 activity
# regularisation on the bidirectional LSTM and no Dropout(0.1) after it.
# model= keras.Sequential()
# model.add(Embedding(num_words,embeddings,input_length=max_len))
# model.add(Conv1D(256,10,activation='relu'))
# model.add(keras.layers.Bidirectional(LSTM(128,return_sequences=True,kernel_regularizer=tf.keras.regularizers.l1(0.01),activity_regularizer=tf.keras.regularizers.l2(0.01))))
# model.add(LSTM(64))
# model.add(keras.layers.Dropout(0.4))
# model.add(Dense(2,activation='softmax'))
# model.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 400, 128)          1280000   
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 391, 256)          327936    
_________________________________________________________________
bidirectional_18 (Bidirectio (None, 391, 256)          394240    
_________________________________________________________________
dropout_18 (Dropout)         (None, 391, 256)          0         
_________________________________________________________________
lstm_28 (LSTM)               (None, 64)                82176     
_________________________________________________________________
dropout_19 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 2)               

In [None]:
# Archived experiment (disabled): layer stack without the Dropout(0.1) after the
# bidirectional LSTM.
# model.add(Embedding(num_words,embeddings,input_length=max_len))
# model.add(Conv1D(256,10,activation='relu'))
# model.add(keras.layers.Bidirectional(LSTM(128,return_sequences=True)))
# model.add(LSTM(64))
# model.add(keras.layers.Dropout(0.4))
# model.add(Dense(2,activation='softmax'))

In [None]:
# The network ends in a 2-unit softmax and the targets are one-hot vectors, so
# categorical cross-entropy is the matching loss. For two one-hot classes it is
# mathematically the same value as the element-wise binary cross-entropy used
# previously (up to numerical clipping), so earlier results remain comparable.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy']
             )
# Adamax gave a slight accuracy improvement over Adam, at the cost of longer training time.

In [None]:
# Train for 4 epochs with mini-batches of 128; `history` keeps the per-epoch
# loss/accuracy returned by fit(). No validation data is passed here.
history=model.fit(X_train,y_train,epochs=4, batch_size=128, verbose=1)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
# Score the trained model on the held-out test split. With metrics=['accuracy']
# evaluate() returns [loss, accuracy], so scores[1] is the test accuracy.
# verbose=0 runs silently; the earlier value of 10 is not a documented verbosity level.
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 87.49%


In [None]:
# Persist the trained model (architecture + weights) to an HDF5 file in the working directory.
model.save('imdb_model1.h5')

5 epochs
85.1
3 epochs - saved as imdb_model1
88.7
adding removing bi-gru and bi-lstm layers
87 ish

---

10k tokens
87.51
adding regularizer
88.10
48k
87.00

---

adamax
87.6
adam
87.4

---
adding more layers
87.48
