In [23]:
#Load the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import tensorflow as tf

In [24]:
#importing the training data
imdb_data=pd.read_csv('IMDB Dataset.csv')
print(imdb_data.shape)
imdb_data.head(10)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [25]:
#Summary of the dataset
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [26]:
#sentiment count
imdb_data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [27]:
imdb_data = imdb_data[imdb_data.sentiment != 'unsup']
imdb_data['sentiment'] = imdb_data['sentiment'].map({'positive': 1, 'negative': 0})
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [28]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    
    return text



#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(denoise_text)

In [29]:
# function for text cleaning
 
def clean(text):
    # remove backslash-apostrophe 
    text = re.sub("\'", "", text) 
    # remove everything except alphabets 
    text = re.sub("[^a-zA-Z0-9]"," ",text) 
    # remove whitespaces 
    text = ' '.join(text.split()) 
    # convert text to lowercase 
    text = text.lower() 
    
    return text

In [30]:
#Stemming the text
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(simple_stemmer)

In [31]:
imdb_data['review']

0        one of the other review ha mention that after ...
1        A wonder littl production. the film techniqu i...
2        I thought thi wa a wonder way to spend time on...
3        basic there' a famili where a littl boy (jake)...
4        petter mattei' "love in the time of money" is ...
5        probabl my all-tim favorit movie, a stori of s...
6        I sure would like to see a resurrect of a up d...
7        thi show wa an amazing, fresh & innov idea in ...
8        encourag by the posit comment about thi film o...
9        If you like origin gut wrench laughter you wil...
10       phil the alien is one of those quirki film whe...
11       I saw thi movi when I wa about 12 when it came...
12       So im not a big fan of boll' work but then aga...
13       the cast play shakespeare.shakespear lost.i ap...
14       thi a fantast movi of three prison who becom f...
15       kind of drawn in by the erot scenes, onli to r...
16       some film just simpli should not be remade. th.

In [32]:
#Tokenization of text
tokenizer=ToktokTokenizer()
#Setting English stopwords
nltk.download('stopwords')
stop=nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shuaich\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
print(stop)

#removing the stopwords
# function to remove stopwords
def removestopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop]
    return ' '.join(no_stopword_text)

#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(removestopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [34]:
imdb_data.review.apply(lambda x: len(x.split(" "))).mean()

137.19528

In [37]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D
from keras import initializers, regularizers, constraints, optimizers, layers

max_features = 6000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(imdb_data['review'])
list_tokenized_train = tokenizer.texts_to_sequences(imdb_data['review'])

maxlen = 600
X = pad_sequences(list_tokenized_train, maxlen=maxlen)
y = imdb_data['sentiment']

embed_size = 128
model = Sequential()
model.add(Embedding(max_features, embed_size))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(Dense(32, activation="relu"))
model.add(Dropout(0.05))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

#batch_size = 100
#epochs = 3
history=model.fit(X,y, batch_size=100, epochs=5, validation_split=0.2)

Train on 40000 samples, validate on 10000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
Train on 40000 samples, validate on 10000 samples
Epoch 1/5
40000/40000 [==============================] - 313s 8ms/step - loss: 0.3619 - accuracy: 0.8351 - val_loss: 0.2648 - val_accuracy: 0.8915
Epoch 2/5
40000/40000 [==============================] - 338s 8ms/step - loss: 0.2218 - accuracy: 0.9136 - val_loss: 0.2617 - val_accuracy: 0.8954
Epoch 3/5
40000/40000 [==============================] - 356s 9ms/step - loss: 0.1745 - accuracy: 0.9355 - val_loss: 0.2713 - val_accuracy: 0.8954
Epoch 4/5
40000/40000 [==============================] - 350s 9ms/step - loss: 0.1286 - accuracy: 0.9552 - val_loss: 0.3189 - val_accuracy: 0.8857
Epoch 5/5
40000/40000 [==============================] - 338s 8ms/step - loss: 0.0926 - accuracy: 0.9703 - val_loss: 0.3464 - val_accuracy: 0.8883