In [3]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('moviereviews.tsv',sep= '\t')

In [5]:
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [6]:
df.drop(['label'],axis=1,inplace=True)

In [7]:
df.head()

Unnamed: 0,review
0,how do films like mouse hunt get into theatres...
1,some talented actresses are blessed with a dem...
2,this has been an extraordinary year for austra...
3,according to hollywood movies made in last few...
4,my first press screening of 1998 and already i...


In [8]:
df.isnull().sum()

review    16
dtype: int64

In [9]:
df.dropna(inplace=True)

In [10]:
import nltk
nltk.download("vader_lexicon")
from typing import Sequence
from nltk.tokenize import word_tokenize
nltk.download("punkt")

from nltk.corpus import stopwords
nltk.download("stopwords")

from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download("wordnet")
nltk.download('omw-1.4')

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout,Dense, SimpleRNN, LSTM, Embedding

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [11]:
def cleantext(text):
  token = [t for t in word_tokenize(text.lower())]
  ftoken = [t for t in token if(t.isalpha())]
  lemma = WordNetLemmatizer()
  ftoken = [lemma.lemmatize(t) for t in ftoken]
  return " ".join(ftoken)

In [12]:
df['review'] = df['review'].apply(cleantext)
df['review']

0       how do film like mouse hunt get into theatre i...
1       some talented actress are blessed with a demon...
2       this ha been an extraordinary year for austral...
3       according to hollywood movie made in last few ...
4       my first press screening of and already i gott...
                              ...                        
1083    is hand down one of the worst movie a person c...
1084    the premise of the new horror film final desti...
1085    i saw simon birch in a basically sold out thea...
1086    sometimes the mile seems sooooo long indeed an...
1087                                                     
Name: review, Length: 1072, dtype: object

In [13]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [14]:
sid = SentimentIntensityAnalyzer()

In [15]:
def getscore(sent):
    sid = SentimentIntensityAnalyzer()
    d = sid.polarity_scores(sent)
    v = d["compound"]
    
    if(v>0):
        return 1
    else:
        return 0

In [16]:
df['Label'] = df["review"].apply(getscore)

In [17]:
df.head()

Unnamed: 0,review,Label
0,how do film like mouse hunt get into theatre i...,0
1,some talented actress are blessed with a demon...,1
2,this ha been an extraordinary year for austral...,1
3,according to hollywood movie made in last few ...,1
4,my first press screening of and already i gott...,1


In [18]:
df["Label"].value_counts()

1    805
0    267
Name: Label, dtype: int64

In [19]:
x = df["review"]
y = df["Label"]

In [20]:
from sklearn.model_selection import train_test_split
xtrain,xtest, ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=1)

In [21]:
sent_lens = []
for sent in df["review"]:
  sent_lens.append(len(word_tokenize(sent)))
df["sent_lens"] = sent_lens
df.head()

Unnamed: 0,review,Label,sent_lens
0,how do film like mouse hunt get into theatre i...,0,370
1,some talented actress are blessed with a demon...,1,573
2,this ha been an extraordinary year for austral...,1,612
3,according to hollywood movie made in last few ...,1,832
4,my first press screening of and already i gott...,1,715


In [22]:
np.quantile(sent_lens,0.95)

1178.35

In [23]:
max_len = int(np.quantile(sent_lens,0.95))

In [24]:
tok = Tokenizer(char_level=False, split=" ")
tok.fit_on_texts(xtrain)

In [25]:
tok.index_word

{1: 'the',
 2: 'a',
 3: 'and',
 4: 'of',
 5: 'to',
 6: 'is',
 7: 'in',
 8: 'it',
 9: 'that',
 10: 'film',
 11: 'with',
 12: 'for',
 13: 'his',
 14: 'this',
 15: 'but',
 16: 'i',
 17: 'he',
 18: 'are',
 19: 'on',
 20: 'movie',
 21: 'be',
 22: 'by',
 23: 'an',
 24: 'not',
 25: 'one',
 26: 'who',
 27: 'you',
 28: 'at',
 29: 'have',
 30: 'from',
 31: 'wa',
 32: 'ha',
 33: 'they',
 34: 'her',
 35: 'all',
 36: 'character',
 37: 'there',
 38: 'like',
 39: 'so',
 40: 'about',
 41: 'more',
 42: 'out',
 43: 'which',
 44: 'what',
 45: 'when',
 46: 'their',
 47: 'up',
 48: 'she',
 49: 'do',
 50: 'or',
 51: 'some',
 52: 'just',
 53: 'time',
 54: 'get',
 55: 'we',
 56: 'doe',
 57: 'if',
 58: 'into',
 59: 'scene',
 60: 'make',
 61: 'him',
 62: 'story',
 63: 'even',
 64: 'can',
 65: 'only',
 66: 'than',
 67: 'no',
 68: 'would',
 69: 'good',
 70: 'will',
 71: 'most',
 72: 'much',
 73: 'been',
 74: 'also',
 75: 'very',
 76: 'them',
 77: 'two',
 78: 'see',
 79: 'way',
 80: 'other',
 81: 'life',
 82: 'go'

In [26]:
vocab_len = len(tok.index_word)

In [27]:
vocab_len

23717

In [28]:
sequnces_train = tok.texts_to_sequences(xtrain)

sequnces_train

Output hidden; open in https://colab.research.google.com to view.

In [29]:
sequences_matrix_train = sequence.pad_sequences(sequnces_train,maxlen=max_len)

In [30]:
rnn = Sequential()

rnn.add(Embedding(vocab_len+1,700,input_length=max_len,mask_zero=True))
rnn.add(SimpleRNN(32,activation='tanh'))
rnn.add(Dense(32,activation='relu'))
rnn.add(Dropout(0.2))

rnn.add(Dense(1,activation="sigmoid"))

In [31]:
rnn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1178, 700)         16602600  
                                                                 
 simple_rnn (SimpleRNN)      (None, 32)                23456     
                                                                 
 dense (Dense)               (None, 32)                1056      
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 16,627,145
Trainable params: 16,627,145
Non-trainable params: 0
_________________________________________________________________


In [32]:
rnn.compile(loss="binary_crossentropy",optimizer='adam')
rnn.fit(sequences_matrix_train,ytrain,batch_size=20,epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7faef28f4ad0>

In [33]:
sequences_matrix_test = sequence.pad_sequences(tok.texts_to_sequences(xtest),maxlen=max_len)

In [34]:
ypred = rnn.predict(sequences_matrix_test)

In [35]:
ypred=ypred>0.5

In [36]:
from sklearn.metrics import classification_report
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.27      0.54      0.36        50
           1       0.80      0.55      0.65       165

    accuracy                           0.55       215
   macro avg       0.53      0.55      0.50       215
weighted avg       0.67      0.55      0.58       215

