### Importing Libraries

In [2]:
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
import string
from nltk.stem import WordNetLemmatizer, PorterStemmer

### Downloading nltk Packages

In [24]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Swarnadwip_Sarkar\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Swarnadwip_Sarkar\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Swarnadwip_Sarkar\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

### Loading Dataset

In [5]:
data = pd.read_csv("IMDB Dataset.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
df = data.dropna()
df.shape

(50000, 2)

In [7]:
data.shape

(50000, 2)

In [8]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


### Data Preprocessing

In [12]:
punctuation = string.punctuation
stopwords = nltk.corpus.stopwords.words('english')
lm = WordNetLemmatizer()

In [13]:
text = df['review'][1]
text

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [14]:
BeautifulSoup(text,"html.parser").get_text()

'A wonderful little production. The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well done.'

In [29]:
def clean_text(text):
    # Removing all html tags
    only_text = BeautifulSoup(text,"html.parser").get_text()
    # Lowercasing and removing punctuation
    text = "".join([char.lower() for char in only_text if char not in punctuation])
    tokens = word_tokenize(text)
    # First removing stopwords and then applying lemmatization
    final_text = " ".join([lm.lemmatize(word) for word in tokens if word not in stopwords])
    return final_text

In [30]:
clean_text(df['review'][1])

'wonderful little production filming technique unassuming oldtimebbc fashion give comforting sometimes discomforting sense realism entire piece actor extremely well chosen michael sheen got polari voice pat truly see seamless editing guided reference williams diary entry well worth watching terrificly written performed piece masterful production one great master comedy life realism really come home little thing fantasy guard rather use traditional dream technique remains solid disappears play knowledge sens particularly scene concerning orton halliwell set particularly flat halliwells mural decorating every surface terribly well done'

In [31]:
X = df['review'].apply(lambda x: clean_text(x))
X

0        one reviewer mentioned watching 1 oz episode y...
1        wonderful little production filming technique ...
2        thought wonderful way spend time hot summer we...
3        basically there family little boy jake think t...
4        petter matteis love time money visually stunni...
                               ...                        
49995    thought movie right good job wasnt creative or...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    catholic taught parochial elementary school nu...
49998    im going disagree previous comment side maltin...
49999    one expects star trek movie high art fan expec...
Name: review, Length: 50000, dtype: object

In [32]:
Y = df['sentiment'].apply(lambda x: 1 if x=='positive' else 0)
Y

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64

### Split Data in Training and Test Set

In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.25)

In [35]:
X_train.shape, Y_train.shape

((37500,), (37500,))

In [36]:
X_test.shape, Y_test.shape

((12500,), (12500,))

In [37]:
X_train

20975    say truth went see movie nicolas cage favorite...
33031    mindless movie piece crap boring like full hou...
4191     obviously film great influence buddy genre act...
27069    saddest thing tribute almost singer including ...
20238    overrated shortlived series measly two season ...
                               ...                        
29083    lose friend alienate people came 2008 bombed u...
10       phil alien one quirky film humour based around...
19459    occasionally talk show good topical debate spa...
5798     wow even discus movie without tear coming eye ...
37397    kim basinger mickey rourke star controversial ...
Name: review, Length: 37500, dtype: object

In [38]:
Y_train

20975    1
33031    0
4191     1
27069    0
20238    0
        ..
29083    1
10       0
19459    0
5798     1
37397    0
Name: sentiment, Length: 37500, dtype: int64

In [41]:
# Average Number of Tokens
sum = 0
for i in X.to_list():
    sum += len(i.split(" "))
avg = sum / len(X.to_list())
avg

119.84288

### Text Vectorization

In [39]:
from tensorflow.keras.layers import TextVectorization

In [42]:
vocab_size = 20000
max_len = 200
vectorize_layer = TextVectorization(
    max_tokens = vocab_size,
    output_mode = 'int',
    output_sequence_length = max_len
)

In [43]:
# Build Vocabulary
vectorize_layer.adapt(X_train)

In [44]:
vocab = np.array(vectorize_layer.get_vocabulary())
vocab[:50]

array(['', '[UNK]', 'movie', 'film', 'one', 'like', 'time', 'good',
       'character', 'get', 'even', 'story', 'would', 'see', 'make',
       'really', 'scene', 'much', 'well', 'people', 'great', 'bad',
       'also', 'show', 'first', 'dont', 'way', 'thing', 'made', 'could',
       'think', 'life', 'go', 'know', 'watch', 'love', 'many', 'seen',
       'actor', 'two', 'plot', 'never', 'say', 'look', 'end', 'little',
       'acting', 'best', 'year', 'ever'], dtype='<U18')

In [45]:
vocab.shape

(20000,)

In [46]:
vectorize_training_set = vectorize_layer(X_train)

In [47]:
X_train.shape

(37500,)

In [48]:
vectorize_training_set.shape

TensorShape([37500, 200])

In [49]:
vectorize_training_set

<tf.Tensor: shape=(37500, 200), dtype=int64, numpy=
array([[  42,  718,  313, ...,    0,    0,    0],
       [2553,    2,  233, ...,    0,    0,    0],
       [ 429,    3,   20, ...,    0,    0,    0],
       ...,
       [1665,  443,   23, ...,    6,   51,  853],
       [1352,   10, 3738, ...,    0,    0,    0],
       [2625, 6981, 2247, ...,    0,    0,    0]])>

In [50]:
vocab

array(['', '[UNK]', 'movie', ..., 'mccormick', 'mccann', 'mcbride'],
      dtype='<U18')

In [51]:
X_train.to_list()[2]

'obviously film great influence buddy genre action genre well george lucas fan flick much star war series seems homage gunga din character grant mclaglen fairbanks play precursor han solo luke skywalker chewbacca even sam jaffes gunga din morphed c3po r2d2 like jar jar binkstoday film viewed non pc speech eduardo ciannelli guru leader indian opposition british raj could echoed sentiment many today young boy great film three strong male lead hint romance time young boy deemed kissing girl saturday matinee film mush like today skin greeted delight late lament lost innocencehopefully film forgotten channel surfing stop tcm catch film action adventure cast thousand instead cgi actor'

In [52]:
vectorize_training_set[2,:]

<tf.Tensor: shape=(200,), dtype=int64, numpy=
array([  429,     3,    20,  1850,  1239,   405,   106,   405,    18,
         591,  3298,   109,   300,    17,   111,   188,   120,    97,
        2528,  9801,  7650,     8,  2161,  6733,  8286,    71, 10406,
        4550,  3929,  2502, 11986,     1,    10,  1085,     1,  9801,
        7650, 15784,     1,     1,     5,  7354,  7354,     1,     3,
        2107,  2831,  5734,  1604, 18513,     1,  8910,  1401,   913,
        7905,   588,  7174,    29, 16489,  4227,    36,   397,    98,
         204,    20,     3,   197,   467,   708,   216,  1940,   737,
           6,    98,   204,  7121,  4836,    84,  2160,     1,     3,
       15273,     5,   397,  2174, 13393,  2474,   441, 11772,   322,
           1,     3,  1412,   879,  4913,   427,  5531,   960,     3,
         106,   821,    89,  1583,   206,  1399,    38,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,   

In [53]:
np.where(vocab=='film')

(array([3]),)

In [54]:
len(vocab)

20000

### Building Embedding Layer

In [55]:
from tensorflow.keras.layers import Embedding

embed = Embedding(
    input_dim = len(vocab),
    output_dim = 100,
)

In [57]:
embed(vectorize_training_set[2,:])

<tf.Tensor: shape=(200, 100), dtype=float32, numpy=
array([[-0.00443739, -0.03519762,  0.04777107, ...,  0.0401245 ,
         0.0066235 ,  0.0440085 ],
       [-0.04792074,  0.00306188, -0.01716427, ...,  0.04804263,
        -0.04642345,  0.00703468],
       [-0.04668491, -0.00609167, -0.03071828, ..., -0.03132644,
         0.02055762,  0.03676274],
       ...,
       [-0.01740215, -0.02308092,  0.02307146, ...,  0.03784061,
        -0.04945008, -0.00153155],
       [-0.01740215, -0.02308092,  0.02307146, ...,  0.03784061,
        -0.04945008, -0.00153155],
       [-0.01740215, -0.02308092,  0.02307146, ...,  0.03784061,
        -0.04945008, -0.00153155]], dtype=float32)>

### Building LSTM Model

In [58]:
vectorize_layer

<TextVectorization name=text_vectorization, built=True>

In [59]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input
from tensorflow.keras.layers import LSTM,Dense,Dropout
model = Sequential()

model.add(Input(shape=(1,), dtype = tf.string))

model.add(vectorize_layer)
model.add(Embedding(input_dim = vocab_size, output_dim = 100))
model.add(LSTM(100))
model.add(Dense(64,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1,activation='sigmoid'))

In [60]:
model.summary()

In [62]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),loss='binary_crossentropy',metrics=['accuracy'])

### Model Training

In [68]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)

X_train_vec = vectorize_layer(X_train)
X_test_vec = vectorize_layer(X_test)
history = model.fit(X_train_vec,Y_train,validation_data=(X_test_vec,Y_test),batch_size=32,epochs=10,
                    callbacks = [early_stop])

Epoch 1/10


ValueError: Exception encountered when calling TextVectorization.call().

[1mWhen using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: inputs.shape=(None, 200) with rank=2[0m

Arguments received by TextVectorization.call():
  • inputs=tf.Tensor(shape=(None, 200), dtype=string)