In [None]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# from wordcloud import WordCloud

import nltk
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence #unique id
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding

warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the review spreadsheet into a DataFrame; the path is relative to the notebook's working directory.
df=pd.read_excel("movie.xlsx")

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [None]:
# Check how many reviews fall into each sentiment class.
df["label"].value_counts()

neg    1000
pos    1000
Name: label, dtype: int64

In [None]:
# Encode the sentiment labels as integers: "neg" -> 0, "pos" -> 1.
# The result is assigned back to the column instead of calling replace(..., inplace=True)
# on the `df["label"]` selection: that chained in-place call operates on an intermediate
# object and does not reliably update `df` in newer pandas (copy-on-write).
df["label"] = df["label"].replace({"neg": 0, "pos": 1})

In [None]:
# Confirm that the label column now holds 0/1 instead of "neg"/"pos".
df.head()

Unnamed: 0,label,review
0,0,how do films like mouse hunt get into theatres...
1,0,some talented actresses are blessed with a dem...
2,1,this has been an extraordinary year for austra...
3,1,according to hollywood movies made in last few...
4,0,my first press screening of 1998 and already i...


In [None]:
# Count missing values per column.
df.isnull().sum()

label      0
review    62
dtype: int64

In [None]:
# Discard every row that has a missing value in any column.
# Rebinding `df` (rather than inplace=True) keeps the cell a plain expression of
# "new frame from old frame" and yields the same rows.
df = df.dropna()

In [None]:
def cleantext(text):
  """Normalise one review into a space-separated string of lemmas.

  Steps, in order:
    1. lower-case the text and split it with NLTK's ``word_tokenize``;
    2. keep only purely alphabetic tokens (drops punctuation and numbers);
    3. drop English stop words;
    4. lemmatise each remaining token with ``WordNetLemmatizer``.

  Parameters
  ----------
  text : str
      Raw review text. Must be a string (``.lower()`` is called on it).

  Returns
  -------
  str
      The surviving lemmas joined by single spaces; empty string if none survive.
  """
  tokens = word_tokenize(text.lower())
  # A set gives constant-time membership tests; the list returned by
  # stopwords.words() would be scanned linearly for every token.
  stop = set(stopwords.words("english"))
  lemma = WordNetLemmatizer()
  return " ".join(
    lemma.lemmatize(t) for t in tokens if t.isalpha() and t not in stop
  )

In [None]:
# Run the cleaning function over every review and store the result in a new column.
df["clean_review"] = df["review"].map(cleantext)

In [None]:
# Model input is the cleaned review text; the target is the 0/1 sentiment label.
x, y = df["clean_review"], df["label"]

In [None]:
# Compare the raw and cleaned review text side by side.
df.head()

Unnamed: 0,label,review,clean_review
0,0,how do films like mouse hunt get into theatres...,film like mouse hunt get theatre law something...
1,0,some talented actresses are blessed with a dem...,talented actress blessed demonstrated wide act...
2,1,this has been an extraordinary year for austra...,extraordinary year australian film shine scoop...
3,1,according to hollywood movies made in last few...,according hollywood movie made last decade lif...
4,0,my first press screening of 1998 and already i...,first press screening already gotten prime can...


In [None]:
# Hold out 30% of the reviews for evaluation; the fixed random_state makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Number of tokens in each cleaned review, kept both as a list and as a DataFrame column.
sentlen = [len(word_tokenize(review)) for review in df["clean_review"]]

df["SentLen"] = sentlen
df.head()

Unnamed: 0,label,review,clean_review,SentLen
0,0,how do films like mouse hunt get into theatres...,film like mouse hunt get theatre law something...,205
1,0,some talented actresses are blessed with a dem...,talented actress blessed demonstrated wide act...,317
2,1,this has been an extraordinary year for austra...,extraordinary year australian film shine scoop...,327
3,1,according to hollywood movies made in last few...,according hollywood movie made last decade lif...,504
4,0,my first press screening of 1998 and already i...,first press screening already gotten prime can...,380


In [None]:
# Token count of the longest cleaned review.
max(sentlen)

1337

In [None]:
# 0.95 quantile of the review lengths: 95% of reviews are no longer than this.
np.quantile(sentlen, 0.95)

612.0

In [None]:
# 95% of the reviews contain at most 612 tokens (the 0.95 quantile of the review lengths)

In [None]:
# Sequence length used for padding: the 0.95 quantile of the review lengths.
# np.quantile returns a float; cast once here so max_len is directly usable as a
# length (existing int(max_len) calls further down keep working unchanged).
max_len = int(np.quantile(sentlen, 0.95))

In [None]:
# Word-level tokenizer: char_level=False keeps whole words as tokens
# (char_level=True would treat every character as a token); words are split on spaces.
tok = Tokenizer(char_level=False, split=" ")

# Build the vocabulary from the training reviews only.
tok.fit_on_texts(xtrain)

# Preview the first few id -> word entries instead of printing the whole vocabulary.
dict(list(tok.index_word.items())[:10])

{1: 'film',
 2: 'movie',
 3: 'one',
 4: 'character',
 5: 'like',
 6: 'time',
 7: 'get',
 8: 'scene',
 9: 'even',
 10: 'make',
 11: 'good',
 12: 'would',
 13: 'story',
 14: 'much',
 15: 'also',
 16: 'two',
 17: 'way',
 18: 'see',
 19: 'life',
 20: 'first',
 21: 'go',
 22: 'thing',
 23: 'well',
 24: 'could',
 25: 'take',
 26: 'year',
 27: 'really',
 28: 'little',
 29: 'come',
 30: 'people',
 31: 'know',
 32: 'plot',
 33: 'work',
 34: 'never',
 35: 'bad',
 36: 'man',
 37: 'performance',
 38: 'best',
 39: 'many',
 40: 'end',
 41: 'new',
 42: 'look',
 43: 'director',
 44: 'love',
 45: 'play',
 46: 'actor',
 47: 'u',
 48: 'role',
 49: 'show',
 50: 'action',
 51: 'great',
 52: 'find',
 53: 'another',
 54: 'give',
 55: 'back',
 56: 'audience',
 57: 'star',
 58: 'still',
 59: 'seems',
 60: 'made',
 61: 'say',
 62: 'something',
 63: 'want',
 64: 'however',
 65: 'think',
 66: 'world',
 67: 'comedy',
 68: 'better',
 69: 'though',
 70: 'part',
 71: 'day',
 72: 'enough',
 73: 'big',
 74: 'around',
 

In [None]:
# Vocabulary size: the number of distinct words the tokenizer learned from the training text.
vocab_len = len(tok.word_index)
vocab_len

28725

In [None]:
# Step 1: turn each training review into a list of integer word ids.
seqtrain = tok.texts_to_sequences(xtrain)

# Preview the first 20 ids of the first two reviews rather than dumping every sequence.
[seq[:20] for seq in seqtrain[:2]]

[[71,
  4482,
  178,
  1121,
  427,
  2431,
  533,
  1050,
  688,
  2432,
  4,
  2253,
  2254,
  1950,
  40,
  19,
  172,
  9171,
  1904,
  17,
  76,
  14,
  2133,
  520,
  101,
  12,
  25,
  1292,
  1122,
  109,
  2255,
  1754,
  751,
  6,
  167,
  396,
  994,
  106,
  184,
  1121,
  427,
  71,
  4482,
  201,
  17,
  334,
  109,
  7512,
  892,
  100,
  1088,
  1069,
  6439,
  205,
  3093,
  1440,
  1279,
  13,
  14,
  173,
  4,
  1606,
  6440,
  68,
  71,
  4482,
  208,
  1261,
  995,
  205,
  3093,
  4,
  14,
  68,
  216,
  120,
  150,
  128,
  1671,
  335,
  17,
  7,
  18,
  286,
  1642,
  246,
  14376,
  1158,
  6923,
  9172,
  4731,
  3730,
  20,
  48,
  31,
  2,
  11,
  1643,
  7513,
  16,
  93,
  9,
  1905,
  54,
  366,
  329,
  815,
  664,
  17,
  1388,
  11957,
  19,
  258,
  4,
  12,
  816,
  34,
  18471,
  11958,
  8250,
  131,
  1123,
  91,
  2433,
  306,
  4,
  3896,
  13,
  815,
  143,
  3449,
  707,
  11957,
  250,
  4,
  27,
  4249,
  641,
  84,
  152,
  418,
  250,
  1

In [None]:
# Step 2: bring every training sequence to the same length (int(max_len) ids) so they
# form one 2-D matrix. Shorter sequences are filled with 0; the displayed result shows
# the zeros at the front of each row.
# NOTE(review): longer sequences are presumably cut from the front as well (library default) - confirm.
seqmattrain = sequence.pad_sequences(seqtrain, maxlen= int(max_len)) #step2
seqmattrain

array([[   0,    0,    0, ...,    2,  120,  578],
       [   0,    0,    0, ...,  524, 2256,    2],
       [   0,    0,    0, ...,  340,  840,  568],
       ...,
       [   0,    0,    0, ...,    2,  358,   50],
       [   0,    0,    0, ...,   75,  339,    2],
       [   0,    0,    0, ...,  462,  288, 5994]], dtype=int32)

In [None]:
# Encode the test reviews with the tokenizer that was fitted on the training reviews,
# then pad them to the same length as the training matrix.
# NOTE(review): no oov_token was set on the tokenizer, so words unseen during fitting
# are presumably dropped from the test sequences - confirm.
seqtest = tok.texts_to_sequences(xtest)
seqmattest = sequence.pad_sequences(seqtest, maxlen=int(max_len))

In [None]:
# Re-display the vocabulary size; it determines the Embedding layer's input dimension below.
vocab_len

28725

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# repeat from run to run.
tf.keras.utils.set_random_seed(1)

rnn = Sequential()

# Word ids start at 1 and id 0 is the padding value, hence vocab_len + 1 embedding rows.
# mask_zero=True makes the LSTM skip the padded positions.
rnn.add(Embedding(vocab_len+1, 700, input_length=int(max_len), mask_zero=True))
rnn.add(LSTM(units=32, activation="tanh"))
rnn.add(Dense(units=32, activation="relu"))
rnn.add(Dropout(0.2))

# Single sigmoid unit: output is the predicted probability of the positive class (label 1).
rnn.add(Dense(units=1, activation="sigmoid"))

# Track accuracy alongside the loss so the training log is informative.
rnn.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Stop once the validation loss has not improved for 3 epochs and roll back to the
# best weights, instead of always training for the full 25 epochs.
early_stop = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

rnn.fit(
    seqmattrain,
    np.asarray(ytrain),  # plain array so the validation split slices features and labels alike
    batch_size=50,
    epochs=25,
    validation_split=0.1,  # last 10% of the training matrix is held out for monitoring
    callbacks=[early_stop],
)

# Predicted positive-class probabilities for the held-out test reviews, shape (n_test, 1).
ypred = rnn.predict(seqmattest)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [None]:
# Convert predicted probabilities to hard 0/1 labels with a single 0.5 threshold
# (probability > 0.5 -> 1, otherwise 0) and flatten to the 1-D shape the metric
# functions expect. Re-running this cell on already thresholded labels leaves them unchanged.
ypred = (ypred > 0.5).astype(int).ravel()

In [None]:
# Per-class precision, recall and F1 on the held-out test set.
# classification_report is imported in the notebook's import cell.
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.83      0.68      0.75       293
           1       0.73      0.85      0.79       289

    accuracy                           0.77       582
   macro avg       0.78      0.77      0.77       582
weighted avg       0.78      0.77      0.77       582

