In [45]:
import pickle

import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [5]:
# Location of the raw reviews file, relative to the notebook's working directory.
DATASET_PATH = 'dataset.csv'

# Load the reviews into a DataFrame and display its (rows, columns) shape.
df = pd.read_csv(DATASET_PATH)
df.shape

(50000, 2)

In [6]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
# Count missing values per column.
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [8]:
# Summarise column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [9]:
# Count the reviews under each sentiment label to check class balance.
df.sentiment.value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [10]:
# Encode the string labels as integers (positive -> 1, negative -> 0).
# The guard makes this cell safe to re-run: applying .map() a second time to
# the already-encoded 0/1 column would find no matching keys and turn every
# label into NaN.
label_to_id = {'positive': 1, 'negative': 0}
if not df.sentiment.isin(list(label_to_id.values())).all():
    df.sentiment = df.sentiment.map(label_to_id)

In [11]:
# Re-count the labels to confirm the sentiment column now holds the encoded values.
df.sentiment.value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

In [12]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [13]:
# Number of samples in the training split.
len(X_train)

40000

In [14]:
# Number of labels in the test split.
len(y_test)

10000

In [15]:
# Fit the vocabulary on the training reviews only, so no test text influences it.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Encode both splits with the fitted tokenizer and pad/truncate each review
# to 200 token ids. The tuple of raw text splits is built before either name
# is rebound, so each split is encoded from its original text.
X_train, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=200)
    for texts in (X_train, X_test)
)

In [16]:
# Inspect the first encoded training sample (an array of token ids).
X_train[0]

array([1935,    1, 1200,    4, 2946, 3749, 1828,    2,  147,  144,    3,
        228,    4,    3,  207,  323,    2,  144, 1083,   16,   88,    4,
        132, 2871,   18,   10,  153,   99,    4,    1, 4020,  302,   11,
         17, 1001,   35,    1,  496,  492, 2619,  249,   71,   77,  107,
        107,  698,   60,   86, 1047, 1363,    5,  229,  132,   23, 4360,
         31,  138,  209, 1154,   14, 4501,   31,    3, 2386,    2,    8,
         11,    6,    3,  445,   14,  624,    4,    1,  718, 2959,    1,
       1278,    2,   71, 3616,    1,  166, 1507,    1, 1245,    5, 1629,
          1,  879, 1268,    5,    1,  310,  140, 2894,    2,  410,  633,
          7,    7,    1,  269,    6, 3553, 1000,    5,   26,   39,   14,
       1381,  217,   65,    2,   46,    6,   30,  219,   27,  193, 1484,
          8, 1101,   18,   10, 4905,   84,    1,  226,   66,  356,   68,
         54,   27,    5, 3600,   15,   44,   21,  192,    5,    3,  889,
       3511, 1758,   22,   25,    5,  158,  196,  1

In [17]:
# Binary sentiment classifier: token ids -> 128-d embeddings -> LSTM -> sigmoid.
# The input shape is declared with an explicit Input layer as the first entry;
# passing `input_shape` / `input_length` to Embedding is deprecated in Keras 3.
# The sequence length (200) and vocabulary size (5000) match the values used
# when tokenizing and padding the reviews.
model = Sequential([
    Input(shape=(200,)),
    Embedding(input_dim=5000, output_dim=128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

  super().__init__(**kwargs)


In [18]:
# Display the model's layers and parameter counts.
model.summary()

In [19]:
# Configure training: Adam optimizer, binary cross-entropy loss for the single
# sigmoid output, and accuracy reported alongside the loss.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [20]:
# Train for 5 epochs in mini-batches of 64, setting aside 20% of the training
# data for per-epoch validation.
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 167ms/step - accuracy: 0.7004 - loss: 0.5591 - val_accuracy: 0.8221 - val_loss: 0.4175
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 178ms/step - accuracy: 0.8419 - loss: 0.3797 - val_accuracy: 0.8366 - val_loss: 0.3853
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 187ms/step - accuracy: 0.8671 - loss: 0.3226 - val_accuracy: 0.8676 - val_loss: 0.3192
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 183ms/step - accuracy: 0.8555 - loss: 0.3364 - val_accuracy: 0.8622 - val_loss: 0.3387
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 180ms/step - accuracy: 0.8847 - loss: 0.2854 - val_accuracy: 0.8686 - val_loss: 0.3204


<keras.src.callbacks.history.History at 0x348d96150>

In [21]:
# Score the trained model on the held-out test split; with the loss and metric
# configured at compile time this yields [loss, accuracy].
model.evaluate(X_test,y_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 51ms/step - accuracy: 0.8709 - loss: 0.3084


[0.3079059422016144, 0.875]

In [22]:
def predict_sentiment(review, maxlen=200, threshold=0.5):
    """Classify a single review as 'positive' or 'negative'.

    The text is encoded with the notebook-level ``tokenizer``, padded with
    ``pad_sequences`` and scored by the notebook-level ``model``.

    Args:
        review: Raw review text to classify.
        maxlen: Sequence length passed to ``pad_sequences``. Defaults to 200,
            the length used when preparing the training data; it should match
            the input length the model expects.
        threshold: Score the model output must exceed for the review to be
            labelled 'positive'. Defaults to 0.5.

    Returns:
        'positive' if the predicted score is strictly greater than
        ``threshold``, otherwise 'negative'.
    """
    sequence = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=maxlen)
    # A single review was passed in, so row 0 of the prediction holds its
    # score; column 0 is the first output value of that row.
    score = model.predict(padded_sequence)[0][0]
    return 'positive' if score > threshold else 'negative'

In [23]:
# Smoke-test the helper on a short, clearly positive review.
predict_sentiment("This movie was good")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step


'positive'

In [24]:
# Second smoke test with a slightly longer positive review.
predict_sentiment("This movie was very good i loved it.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step


'positive'

In [42]:
# Persist the trained model to disk as a .keras file in the working directory.
model.save("trained_model.keras")

In [47]:
# Persist the fitted tokenizer next to the model: the same word index is
# needed to encode text before the saved model can score it.
with open("tokenizer.pickle", mode="wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, pickle.HIGHEST_PROTOCOL)