<a href="https://colab.research.google.com/github/Princekumar9027/NLP-LAB/blob/main/movie_sentiment_analysis_and_review_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Load the review CSV from the Colab working directory.
# With on_bad_lines='skip' (python engine) malformed rows are dropped silently,
# so compare data.shape with the expected row count after loading.
data = pd.read_csv(
    "/content/IDMB.csv",
    on_bad_lines='skip',
    engine='python',
)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Number of rows and columns that were actually loaded.
data.shape

(1612, 2)

In [None]:
# Confirm the loaded object is a pandas DataFrame.
type(data)

In [None]:
# Show the last five rows of the loaded frame.
data.tail()

Unnamed: 0,review,sentiment
1607,"The movie follows the events of the novel ""Cel...",positive
1608,If it weren't for the editing out of curse wor...,negative
1609,I have seen The Running Man several times as I...,positive
1610,"A stuttering plot, uninteresting characters an...",negative
1611,"Yes, this movie is a real thief. It stole some...",negative


In [None]:
# Check the class balance between positive and negative reviews.
data["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,820
negative,792


In [None]:
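# LSTM (Long Short-Term Memory) is a type of recurrent neural network (RNN).
# RNNs process sequential input, which makes them a natural fit for text data.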

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
train_data, test_data = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Size of the training split.
train_data.shape

(1289, 2)

In [None]:
# Size of the held-out test split.
test_data.shape

(323, 2)

In [None]:
# Build the word index from the training reviews only, so the test split
# does not influence the vocabulary. num_words caps the ids used when encoding.
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(train_data["review"])

In [None]:
# Encode each split's reviews as word-id sequences and pad/truncate them to 200 ids,
# processing the training split first and the test split second.
X_train, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(split["review"]), maxlen=200)
    for split in (train_data, test_data)
)

In [None]:
# Inspect the padded training matrix (NumPy abbreviates the display).
X_train

array([[   0,    0,    0, ...,   66, 2721,   40],
       [ 219,   88,    4, ...,    6,    8, 2724],
       [   0,    0,    0, ..., 3702,   70,  829],
       ...,
       [   0,    0,    0, ...,   98,  107, 3690],
       [   0,    0,    0, ...,    6,    3,  160],
       [   0,    0,    0, ...,   16,   20, 1338]], dtype=int32)

In [None]:
# Inspect the padded test matrix (NumPy abbreviates the display).
X_test

array([[4762,    2,  849, ...,   85,  358, 3148],
       [   0,    0,    0, ...,   87,    5,  127],
       [   0,    0,    0, ...,   52,  149, 1320],
       ...,
       [   0,    0,    0, ...,  538,    4,  118],
       [   6,    6, 1065, ...,    3,  365,    4],
       [   0,    0,    0, ...,   58,  332,   22]], dtype=int32)

In [None]:
# Encode the sentiment labels numerically (negative -> 0, positive -> 1)
# for the training split and then the test split.
Y_train, Y_test = (
    split["sentiment"].map({'negative': 0, 'positive': 1})
    for split in (train_data, test_data)
)

In [None]:
# Inspect the encoded training labels.
Y_train

Unnamed: 0,sentiment
1324,1
1080,1
963,1
907,1
985,1
...,...
1130,0
1294,1
860,1
1459,1


In [None]:
# Inspect the encoded test labels.
Y_test

Unnamed: 0,sentiment
29,1
259,1
383,0
425,1
239,0
...,...
999,0
1557,1
513,0
422,0


In [None]:
# LSTM MODEL BUILDING

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and the Keras backend so weight initialisation, dropout
# and batch shuffling are repeatable on a fresh "Restart & Run All".
set_random_seed(42)

# Binary sentiment classifier: word ids -> 128-d embeddings -> LSTM -> sigmoid.
model = Sequential()
# Declare the sequence length with an explicit Input layer; Embedding's
# `input_length` argument is deprecated in Keras 3 and only raised a warning
# that the notebook's global warnings filter was hiding.
model.add(Input(shape=(200,)))
# mask_zero=True marks id 0 (the fill value used by pad_sequences) as padding,
# so the LSTM skips padded timesteps instead of treating them as real words.
# Without it, short reviews are almost entirely padding when they reach the LSTM.
model.add(Embedding(input_dim=5000, output_dim=128, mask_zero=True))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# Single sigmoid unit: output is the estimated probability of the positive class.
model.add(Dense(1, activation="sigmoid"))

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Binary cross-entropy fits the two-class target; accuracy is tracked as the reported metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Train for 5 epochs in batches of 64, reserving 20% of the training rows for validation.
model.fit(
    X_train,
    Y_train,
    validation_split=0.2,
    batch_size=64,
    epochs=5,
)

Epoch 1/5
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 818ms/step - accuracy: 0.5369 - loss: 0.6920 - val_accuracy: 0.5659 - val_loss: 0.6869
Epoch 2/5
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 775ms/step - accuracy: 0.7320 - loss: 0.6655 - val_accuracy: 0.7287 - val_loss: 0.6054
Epoch 3/5
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 765ms/step - accuracy: 0.8169 - loss: 0.5003 - val_accuracy: 0.7868 - val_loss: 0.4905
Epoch 4/5
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 769ms/step - accuracy: 0.8776 - loss: 0.3591 - val_accuracy: 0.7519 - val_loss: 0.5166
Epoch 5/5
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 768ms/step - accuracy: 0.9248 - loss: 0.2298 - val_accuracy: 0.7752 - val_loss: 0.4949


<keras.src.callbacks.history.History at 0x7ab190df9d60>

In [None]:
# Write the trained model to model.h5 in the current working directory.
# NOTE(review): the .h5 suffix selects the legacy HDF5 format; Keras 3 recommends
# the native ".keras" format — switch only if every loader of this file is updated too.
model.save("model.h5")



In [None]:
import joblib
# Persist the fitted tokenizer, presumably so the same word-to-id mapping can be
# reused wherever the saved model is loaded.
# NOTE(review): joblib files are pickle-based — only load tokenizer.pkl from a trusted source.
joblib.dump(tokenizer, "tokenizer.pkl")

['tokenizer.pkl']

In [None]:
# Score the model on the held-out test split; returns the loss followed by the compiled metric.
loss, accuracy = model.evaluate(X_test, Y_test)

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 77ms/step - accuracy: 0.7652 - loss: 0.5559


In [None]:
# Test-set loss returned by model.evaluate.
print(loss)

0.5993157029151917


In [None]:
# Test-set accuracy returned by model.evaluate.
print(accuracy)

0.7430340647697449


In [None]:
# Building Predictive System

In [None]:
def predictive_system(review):
  """Classify a single review as "positive" or "negative".

  The review is encoded with the notebook-level `tokenizer`, padded to 200 ids
  and scored by the notebook-level `model`.

  Args:
    review: Raw review text.

  Returns:
    "positive" when the model's output is strictly greater than 0.5,
    otherwise "negative".
  """
  encoded = pad_sequences(tokenizer.texts_to_sequences([review]), maxlen=200)
  # predict returns one row per input; the single sigmoid unit is at [0][0].
  positive_score = model.predict(encoded)[0][0]
  if positive_score > 0.5:
    return "positive"
  return "negative"

In [None]:
# Sanity check with clearly positive wording; "positive" is the expected label.
predictive_system("This movie was fantastic and amazing")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 606ms/step


'negative'

In [None]:
# Sanity check with positive wording; "positive" is the expected label.
# Spelling matters here: words missing from the tokenizer vocabulary are
# dropped during encoding, so a misspelled cue word contributes nothing.
predictive_system("A thrilling adventure with stunning visuals")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step


'negative'

In [None]:
# Sanity check with positive wording; "positive" is the expected label.
predictive_system("A visual masterpiece")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step


'negative'

In [None]:
# Sanity check with negative wording; "negative" is the expected label.
predictive_system("Overall long and slow")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step


'negative'