In [6]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from numpy import array
import tensorflow
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout,Activation,Embedding
from tensorflow.keras.layers import Flatten,GlobalMaxPooling1D,Convolution1D,LSTM
from sklearn.model_selection import train_test_split

In [7]:
df=pd.read_csv('/content/IMDB_Dataset.csv')
df.shape

(50000, 2)

In [8]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [9]:
df.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [11]:
tg=re.compile(r'[^>]+>')
def remove_tags(text):
    return tg.sub('',text)

In [12]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
def preprocess_text(sen):
    sentence=sen.lower()

    #removing html tags
    sentence=remove_tags(sentence)

    #remove punctuations and numbers
    sentence=re.sub('[^a-zA-Z]',' ',sentence)

    #single character removal
    sentence=re.sub(r"\s+[a-zA-Z]\s+"," ",sentence)

    #remove multiple spaces
    sentence=re.sub(r'\s+',' ',sentence)

    #Removing stopwords
    pattern=re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
    sentence=pattern.sub('',sentence)

    return sentence

In [16]:
X=[]
sentences=list(df['review'])
for sen in sentences:
    X.append(preprocess_text(sen))

In [19]:
X[1]

'realism really comes home little things fantasy guard rather use traditional dream techniques remains solid disappears plays knowledge senses particularly scenes concerning orton halliwell sets particularly flat halliwell murals decorating every surface terribly well done '

In [20]:
y=df['sentiment']
y=np.array(list(map(lambda x:1 if x=='positive' else 0,y)))

In [21]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

**Preparing Embedding Layer**

In [23]:
word_tokenizer=Tokenizer()
word_tokenizer.fit_on_texts(X_train) #training the tokinizer
X_train=word_tokenizer.texts_to_sequences(X_train)# converting sentences to numeric sequences
X_test=word_tokenizer.texts_to_sequences(X_test)

In [24]:
vocab_length=len(word_tokenizer.word_index)+1
vocab_length

61769

In [25]:
#padding all reviews to fix length 100
maxlen=100
X_train=pad_sequences(X_train,padding='post',maxlen=maxlen)
X_test=pad_sequences(X_test,padding='post',maxlen=maxlen)

In [33]:
from numpy import asarray
from numpy import zeros
embeddings_dictionary = dict()
glove_file = open('/content/drive/MyDrive/Copy of a2_glove.6B.100d.txt', encoding="utf8")
for line in glove_file:
    records = line.split()
    word= records[0]
    vector_dimensions = asarray (records [1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()

In [35]:
#creating an embedding matrix having 100 columns
#containing 100 dimentional GloVe word embeddings for all words in our corpus
embedding_matrix = zeros((vocab_length, 100))
for word,index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [36]:
embedding_matrix.shape

(61769, 100)

**Model Training**

In [37]:
#Simple NN
smodel=Sequential()
embedding_layer=Embedding(vocab_length,100,weights=[embedding_matrix],input_length=maxlen,trainable=False)
smodel.add(embedding_layer)
smodel.add(Flatten())
smodel.add(Dense(1,activation='sigmoid'))



In [39]:
smodel.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
print(smodel.summary())

None


In [40]:
smodel_history=smodel.fit(X_train,y_train,batch_size=128,epochs=6,verbose=1,validation_split=0.2)

Epoch 1/6
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.6228 - loss: 0.6397 - val_accuracy: 0.7086 - val_loss: 0.5617
Epoch 2/6
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.7421 - loss: 0.5144 - val_accuracy: 0.7101 - val_loss: 0.5540
Epoch 3/6
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.7636 - loss: 0.4778 - val_accuracy: 0.7051 - val_loss: 0.5623
Epoch 4/6
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.7753 - loss: 0.4585 - val_accuracy: 0.7107 - val_loss: 0.5621
Epoch 5/6
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.7843 - loss: 0.4428 - val_accuracy: 0.7084 - val_loss: 0.5703
Epoch 6/6
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.7905 - loss: 0.4295 - val_accuracy: 0.7070 - val_loss: 0.5779


In [42]:
score=smodel.evaluate(X_test,y_test,verbose=1)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7060 - loss: 0.5953


In [43]:
print('Test score:',score[0])
print('Test accuracy:',score[1])

Test score: 0.5988056659698486
Test accuracy: 0.7049000263214111


**LSTM Training**

In [44]:
model=Sequential()
embedding_layer=Embedding(vocab_length,100,weights=[embedding_matrix],input_length=maxlen,trainable=False)
model.add(embedding_layer)
model.add(LSTM(128))
model.add(Dense(1,activation='sigmoid'))



In [45]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
print(model.summary())

None


In [46]:
#model_training
model_history=model.fit(X_train,y_train,batch_size=128,epochs=6,verbose=1,validation_split=0.2)

Epoch 1/6
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 306ms/step - accuracy: 0.6265 - loss: 0.6288 - val_accuracy: 0.7013 - val_loss: 0.5702
Epoch 2/6
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 321ms/step - accuracy: 0.7351 - loss: 0.5320 - val_accuracy: 0.6911 - val_loss: 0.5488
Epoch 3/6
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 320ms/step - accuracy: 0.7447 - loss: 0.5073 - val_accuracy: 0.7625 - val_loss: 0.4874
Epoch 4/6
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 306ms/step - accuracy: 0.7694 - loss: 0.4725 - val_accuracy: 0.7563 - val_loss: 0.4716
Epoch 5/6
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 322ms/step - accuracy: 0.7822 - loss: 0.4456 - val_accuracy: 0.7822 - val_loss: 0.4429
Epoch 6/6
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 321ms/step - accuracy: 0.7977 - loss: 0.4215 - val_accuracy: 0.7886 - val_loss: 0.4395


In [47]:
score=model.evaluate(X_test,y_test,verbose=1)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 53ms/step - accuracy: 0.7894 - loss: 0.4350


In [48]:
print('Test score:',score[0])
print('Test accuracy:',score[1])

Test score: 0.4369640648365021
Test accuracy: 0.7854999899864197


**Making prediction now**

In [49]:
rev=pd.read_csv('/content/IMDb_Unseen_Reviews.csv')
rev.head()

Unnamed: 0.1,Unnamed: 0,Movie,Review Text,IMDb Rating
0,0,Ex Machina,Intelligent Movie.\nThis movie is obviously al...,9
1,1,Ex Machina,Extraordinary and thought-provoking.\n'Ex mach...,10
2,2,Ex Machina,"Poor story, only reasonable otherwise.\nIf I h...",3
3,3,Ex Machina,Had Great Potential.\nThis movie is one of the...,1
4,4,Eternals,Amazing visuals and philosophical concepts!\n\...,10


In [50]:
unseen_rev= rev['Review Text']
unseen_pro=[]
for review in unseen_rev:
    review=preprocess_text(review)
    unseen_pro.append(review)

In [51]:
unseen_tok=word_tokenizer.texts_to_sequences(unseen_pro)
unseen_pad=pad_sequences(unseen_tok,padding='post',maxlen=maxlen)

In [52]:
unseen_sentiments=model.predict(unseen_pad)
unseen_sentiments

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 230ms/step


array([[0.613156  ],
       [0.94997394],
       [0.43471304],
       [0.22685051],
       [0.8103935 ],
       [0.03498825]], dtype=float32)

In [57]:
rev['Predicted Sentiments']=np.round(unseen_sentiments*10,1)
rev.sample(4)

Unnamed: 0.1,Unnamed: 0,Movie,Review Text,IMDb Rating,Predicted Sentiments
5,5,Eternals,Worst MCU film ever\n\nFollowing the events of...,3,0.3
4,4,Eternals,Amazing visuals and philosophical concepts!\n\...,10,8.1
1,1,Ex Machina,Extraordinary and thought-provoking.\n'Ex mach...,10,9.5
0,0,Ex Machina,Intelligent Movie.\nThis movie is obviously al...,9,6.1
