In [1]:
import numpy as np
import pandas as pd


In [2]:
df = pd.read_csv("/content/IMDB_Dataset.csv")

In [3]:
df.shape

(50000, 2)

In [4]:
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
df.isnull().values.any()

False

In [7]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [8]:
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [12]:
df["review"][3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

'basically theres family little boy jake thinks theres zombie closet parents fighting timethis movie slower soap opera suddenly jake decides rambo kill zombieok youre going film decide thriller drama drama movie watchable parents divorcing arguing like real life jake closet totally ruins film expected boogeyman similar movie instead watched drama meaningless thriller spots playing parents descent dialogs shots jake ignore'

In [None]:
# html tags
# punct removal
# number removal
# stopwords
#

In [16]:
from gensim.parsing.preprocessing import remove_stopwords

In [11]:
import re

In [13]:
html_pattern = re.compile(r"<[^>]+>")



In [15]:
remove_non_alpha = re.compile(r"[^a-zA-Z ]")

In [17]:
def preprocess_text(sent):

  '''
  rerurn cleaned text
    parameters :
      sent (str) : movie review
    Return :
      sentence (str) : cleaned movie review

  '''

  #lower casing
  sentence = sent.lower()

  #removing html tags
  sentence = html_pattern.sub("", sentence)

  #removing non-alpha characters
  sentence = remove_non_alpha.sub("", sentence)

  #removing stopwords
  sentence = remove_stopwords(sentence)

  return sentence

In [18]:
df['review'] = df['review'].apply(preprocess_text)

In [20]:
df["review"][3]

'basically theres family little boy jake thinks theres zombie closet parents fighting timethis movie slower soap opera suddenly jake decides rambo kill zombieok youre going film decide thriller drama drama movie watchable parents divorcing arguing like real life jake closet totally ruins film expected boogeyman similar movie instead watched drama meaningless thriller spots playing parents descent dialogs shots jake ignore'

In [21]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [22]:
tokenizer = Tokenizer(num_words=500, split=' ')
tokenizer.fit_on_texts(df['review'].values)

X = tokenizer.texts_to_sequences(df['review'].values)

X = pad_sequences(X)

In [24]:
# X[3]

In [25]:
y = df['sentiment']

In [28]:
y = np.array([1 if label=='positive' else 0 for label in y])

In [30]:
# y

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=65)

In [38]:
X.shape

(50000, 359)

In [42]:
X[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

In [35]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

In [None]:
#input dimension : size of vocab (500)
# output dimension :             (120)

In [39]:
model = Sequential()

model.add(Embedding(500, 120 ,input_length=359))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))

In [None]:
parameters_of_lstm = 4 * (o + n + 1)* n

o ==> output dim
n ==> no. of lstm units

In [40]:
# 4 network

# input_size * hidden_layer_length

4*(120 + 64 + 1)*64

47360

In [41]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 359, 120)          60000     
                                                                 
 lstm (LSTM)                 (None, 64)                47360     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 107425 (419.63 KB)
Trainable params: 107425 (419.63 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [43]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics= ['accuracy'])

In [45]:
model.fit(x_train, y_train, batch_size=256, epochs=1)



<keras.src.callbacks.History at 0x7a29ec1b8d60>