In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence #unique id

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Dropout, Embedding,LSTM
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df=pd.read_excel("moviereviews.xlsx")

In [3]:
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [4]:
df["label"].value_counts()

neg    1000
pos    1000
Name: label, dtype: int64

In [5]:
df["label"].replace({"neg":0, "pos":1}, inplace=True)

In [6]:
df.head()

Unnamed: 0,label,review
0,0,how do films like mouse hunt get into theatres...
1,0,some talented actresses are blessed with a dem...
2,1,this has been an extraordinary year for austra...
3,1,according to hollywood movies made in last few...
4,0,my first press screening of 1998 and already i...


In [7]:
df.isnull().sum()

label      0
review    62
dtype: int64

In [8]:
df.dropna(inplace=True)

In [9]:
df.review

0       how do films like mouse hunt get into theatres...
1       some talented actresses are blessed with a dem...
2       this has been an extraordinary year for austra...
3       according to hollywood movies made in last few...
4       my first press screening of 1998 and already i...
                              ...                        
1995    i like movies with albert brooks , and i reall...
1996    it might surprise some to know that joel and e...
1997    the verdict : spine-chilling drama from horror...
1998    i want to correct what i wrote in a former ret...
1999    a couple of months ago , when i first download...
Name: review, Length: 1938, dtype: object

In [10]:
def cleantext(text):
  tokens = word_tokenize(text.lower())
  ftoken = [t for t in tokens if(t.isalpha())]
  stop = stopwords.words("english")
  ctoken = [t for t in ftoken if(t not in stop)]
  lemma = WordNetLemmatizer()
  ltoken = [lemma.lemmatize(t) for t in ctoken]
  return " ".join(ltoken)

In [11]:
df["clean_review"]=df["review"].apply(cleantext)

In [12]:
x = df["clean_review"]
y = df["label"]

In [13]:
df.head()

Unnamed: 0,label,review,clean_review
0,0,how do films like mouse hunt get into theatres...,film like mouse hunt get theatre law something...
1,0,some talented actresses are blessed with a dem...,talented actress blessed demonstrated wide act...
2,1,this has been an extraordinary year for austra...,extraordinary year australian film shine scoop...
3,1,according to hollywood movies made in last few...,according hollywood movie made last decade lif...
4,0,my first press screening of 1998 and already i...,first press screening already gotten prime can...


In [14]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=1)

In [15]:
sentlen = []

for sent in df["clean_review"]:
  sentlen.append(len(word_tokenize(sent)))

df["SentLen"] = sentlen 
df.head()

Unnamed: 0,label,review,clean_review,SentLen
0,0,how do films like mouse hunt get into theatres...,film like mouse hunt get theatre law something...,205
1,0,some talented actresses are blessed with a dem...,talented actress blessed demonstrated wide act...,317
2,1,this has been an extraordinary year for austra...,extraordinary year australian film shine scoop...,327
3,1,according to hollywood movies made in last few...,according hollywood movie made last decade lif...,504
4,0,my first press screening of 1998 and already i...,first press screening already gotten prime can...,380


In [16]:
max(sentlen)

1337

In [17]:
np.quantile(sentlen, 0.95)

612.0

In [18]:
# 95% of the review in doc has len equals to 612

In [19]:
max_len = np.quantile(sentlen, 0.95)

In [20]:
tok = Tokenizer(char_level=False, split=" ")
#char_level	if True, every character will be treated as a token.

tok.fit_on_texts(xtrain)
# tok.index_word

In [21]:
vocab_len = len(tok.index_word)
vocab_len

28725

In [22]:
seqtrain = tok.texts_to_sequences(xtrain) #step1
#seqtrain

In [23]:
seqmattrain = sequence.pad_sequences(seqtrain, maxlen= int(max_len)) #step2
seqmattrain

array([[   0,    0,    0, ...,    2,  120,  578],
       [   0,    0,    0, ...,  524, 2256,    2],
       [   0,    0,    0, ...,  340,  840,  568],
       ...,
       [   0,    0,    0, ...,    2,  358,   50],
       [   0,    0,    0, ...,   75,  339,    2],
       [   0,    0,    0, ...,  462,  288, 5994]])

In [24]:
seqtest = tok.texts_to_sequences(xtest)
seqmattest = sequence.pad_sequences(seqtest, maxlen=int(max_len))

In [25]:
vocab_len

28725

In [26]:
rnn = Sequential()

rnn.add(Embedding(vocab_len+1,700, input_length=int(max_len), mask_zero=True))
rnn.add(LSTM(units=32, activation="tanh"))
rnn.add(Dense(units=32, activation="relu"))
rnn.add(Dropout(0.2))

rnn.add(Dense(units=1, activation="sigmoid"))

rnn.compile(optimizer="adam", loss="binary_crossentropy")

rnn.fit(seqmattrain, ytrain, batch_size=500, epochs=18)

ypred = rnn.predict(seqmattest)

ypred = ypred>0.5


Epoch 1/18
Epoch 2/18
Epoch 3/18
Epoch 4/18
Epoch 5/18
Epoch 6/18
Epoch 7/18
Epoch 8/18
Epoch 9/18
Epoch 10/18
Epoch 11/18
Epoch 12/18
Epoch 13/18
Epoch 14/18
Epoch 15/18
Epoch 16/18
Epoch 17/18
Epoch 18/18


In [27]:
from sklearn.metrics import classification_report

In [28]:
print(classification_report(ypred,ytest))

              precision    recall  f1-score   support

       False       0.80      0.75      0.77       315
        True       0.72      0.78      0.75       267

    accuracy                           0.76       582
   macro avg       0.76      0.76      0.76       582
weighted avg       0.77      0.76      0.76       582

