In [1]:
import pandas as pd
import numpy as np
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices raised by
# the libraries used below — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Show the kernel's working directory; the relative file names used in this
# notebook ("IMDB Dataset.csv", "model.h5", "tokenizer.pkl") resolve against it.
# NOTE(review): relies on IPython automagic (equivalent to `%pwd`), and the
# rendered output exposes a local absolute path — clear it before sharing.
pwd

'C:\\Users\\Tejaswi\\Downloads'

In [3]:
# Read the IMDB reviews CSV (path is relative to the working directory) and
# preview the first five rows.
data = pd.read_csv(filepath_or_buffer="IMDB Dataset.csv")
data.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(50000, 2)

In [5]:
# Confirm that the loaded object is a pandas DataFrame.
type(data)

pandas.core.frame.DataFrame

In [6]:
# Preview the last rows of the dataset.
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [7]:
# Class-balance check: number of reviews per sentiment label.
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [8]:
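# Options for encoding the categorical `sentiment` target:
#   - one-hot encoding
#   - label encoding (used below: positive -> 1, negative -> 0)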

In [9]:
# Label-encode the target: positive -> 1, negative -> 0.
#
# The result is assigned rather than produced with `inplace=True`, and the
# dtype is cast explicitly: the silent object -> int downcast that `replace`
# used to perform is deprecated in recent pandas releases, and the training
# step needs a numeric label column. The cast also fails loudly if any label
# other than "positive"/"negative" is present.
# Re-running this cell is harmless: already-encoded 0/1 values are untouched.
data["sentiment"] = (
    data["sentiment"]
    .replace({"positive": 1, "negative": 0})
    .astype("int64")
)

In [10]:
# LSTM (Long Short-Term Memory) is a type of recurrent neural network (RNN),
# an architecture suited to sequential data such as text.

In [11]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences





In [12]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split
# reproducible across runs.
train_data, test_data = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)


In [13]:
# Size of the training split as (rows, columns).
train_data.shape

(40000, 2)

In [14]:
# Size of the test split as (rows, columns).
test_data.shape

(10000, 2)

In [15]:
# Build the word index from the training reviews only, so no test-set text
# influences the vocabulary. `num_words=5000` limits the ids later emitted by
# `texts_to_sequences` to the most frequent words.
# NOTE(review): the raw reviews contain `<br />` markup (visible in the
# data preview); presumably the tokenizer turns it into a "br" token —
# consider stripping HTML before fitting.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts=train_data["review"])

In [16]:
# Encode every review as a sequence of word ids and force each sequence to
# exactly 200 ids. Both splits go through the identical pipeline; the training
# split is processed first, then the test split.
X_train, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(split["review"]), maxlen=200)
    for split in (train_data, test_data)
)

In [17]:
# Inspect the encoded training matrix (one row of word ids per review).
X_train

array([[1935,    1, 1200, ...,  205,  351, 3856],
       [   3, 1651,  595, ...,   89,  103,    9],
       [   0,    0,    0, ...,    2,  710,   62],
       ...,
       [   0,    0,    0, ..., 1641,    2,  603],
       [   0,    0,    0, ...,  245,  103,  125],
       [   0,    0,    0, ...,   70,   73, 2062]])

In [18]:
# Inspect the encoded test matrix (one row of word ids per review).
X_test

array([[   0,    0,    0, ...,  995,  719,  155],
       [  12,  162,   59, ...,  380,    7,    7],
       [   0,    0,    0, ...,   50, 1088,   96],
       ...,
       [   0,    0,    0, ...,  125,  200, 3241],
       [   0,    0,    0, ..., 1066,    1, 2305],
       [   0,    0,    0, ...,    1,  332,   27]])

In [19]:
# Targets: the encoded sentiment column of each split.
Y_train, Y_test = train_data["sentiment"], test_data["sentiment"]

In [20]:
# Inspect the training labels (indexed by the original row number).
Y_train

39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64

In [25]:
# LSTM model building
#
# Architecture, in order:
#   1. Embedding: maps each of the 5000 word ids to a learned 128-d vector,
#      for input sequences of length 200.
#   2. LSTM: a single 128-unit layer with 20% input and recurrent dropout.
#   3. Dense: one sigmoid unit, i.e. a score in (0, 1) where the label
#      "positive" was encoded as 1.
# NOTE(review): `input_length` is reported as deprecated in newer Keras
# releases — confirm against the installed version before upgrading.
model = Sequential(
    [
        Embedding(input_dim=5000, output_dim=128, input_length=200),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation="sigmoid"),
    ]
)

In [26]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 128)          640000    
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 771713 (2.94 MB)
Trainable params: 771713 (2.94 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [27]:
# Configure training: binary cross-entropy pairs with the single sigmoid
# output, Adam is the optimizer, and accuracy is tracked as the metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)





In [28]:
# Train on the encoded reviews, holding out 20% of the training rows for
# validation. `epochs` is not passed, so the Keras default applies.
# NOTE(review): with 40000 training rows and batch_size=4 this is 8000
# optimizer steps per epoch — slow; a larger batch is worth considering.
model.fit(
    x=X_train,
    y=Y_train,
    validation_split=0.2,
    batch_size=4,
)





<keras.src.callbacks.History at 0x1fe01d4b6d0>

In [29]:
# Persist the trained model to the working directory.
# NOTE(review): the ".h5" suffix presumably selects Keras' legacy HDF5 format;
# newer Keras versions recommend ".keras" — confirm before changing, since
# anything that loads the model expects this file name.
model.save("model.h5")

In [30]:
# Persist the fitted tokenizer next to the model so the same word -> id
# mapping can be reused at inference time.
# NOTE(review): joblib files are pickle-based — only load them from trusted
# sources. The import would normally live with the other imports at the top.
import joblib
joblib.dump(tokenizer,"tokenizer.pkl")

['tokenizer.pkl']

In [31]:
# Score the trained model on the held-out test split. `evaluate` yields the
# loss followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(x=X_test, y=Y_test)



In [32]:
# Test-set loss returned by model.evaluate.
print(loss)

0.25417089462280273


In [33]:
# Test-set accuracy returned by model.evaluate.
print(accuracy)

0.8934000134468079


In [40]:
# Building predictive system

def predictive_system(review, threshold=0.5):
    """Classify a single review as "positive" or "negative".

    The review is encoded with the notebook-level ``tokenizer``, padded or
    truncated to the 200-id length used for the training data, and scored by
    the notebook-level ``model``.

    Args:
        review: Raw review text (a single string).
        threshold: Score the prediction must strictly exceed to be labelled
            "positive". Defaults to 0.5, the value that was previously
            hard-coded, so existing calls behave exactly as before.

    Returns:
        "positive" if the model's score is greater than ``threshold``,
        otherwise "negative".
    """
    # texts_to_sequences expects a list of texts, hence the one-element list.
    sequences = tokenizer.texts_to_sequences([review])
    # maxlen must match the sequence length the model was trained on (200).
    padded_sequence = pad_sequences(sequences, maxlen=200)
    prediction = model.predict(padded_sequence)
    # One input row and one output unit: the score sits at [0][0].
    sentiment = "positive" if prediction[0][0] > threshold else "negative"
    return sentiment

In [41]:
# Smoke test: clearly positive wording.
predictive_system("This movie was fantastic and amazing")



'positive'

In [42]:
# Smoke test: clearly negative wording.
predictive_system("this movie is very boring")



'negative'

In [44]:
# Idiomatic criticism ("half baked") with no explicitly negative word.
predictive_system("this movie is half baked")



'negative'

In [45]:
# Mixed review: praise for the story, criticism of the screenplay.
predictive_system("this movie story is good but screenplay is bad")



'negative'