**<h1>SENTIMENT ANALYSIS MODEL USING LSTM**

In [38]:
import pandas as pd
import numpy as np
from google.colab import drive
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout,GRU
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [2]:
drive.mount('/content/drive')
movie_data=pd.read_csv('/content/drive/MyDrive/BigDataAnalystic/PROJECT/IMDB_Dataset.csv')

Mounted at /content/drive


In [3]:
movie_data.head()

Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Review,Sentiment
0,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,One of the other reviewers has mentioned that ...,positive
1,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,A wonderful little production. <br /><br />The...,positive
2,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,I thought this was a wonderful way to spend ti...,positive
3,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,Basically there's a family where a little boy ...,negative
4,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,"Petter Mattei's ""Love in the Time of Money"" is...",positive


**DATA PREPROCESSING**

In [4]:
# Checking Total Null values
movie_data.isnull().sum()

Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Review             2
Sentiment          2
dtype: int64

In [5]:
# Dropping NULL vlaues
movie_data1=movie_data.dropna(how='any')
movie_data1.isnull().sum()

Series_Title     0
Released_Year    0
Certificate      0
Runtime          0
Genre            0
IMDB_Rating      0
Overview         0
Meta_score       0
Director         0
Star1            0
Star2            0
Star3            0
Star4            0
No_of_Votes      0
Review           0
Sentiment        0
dtype: int64

In [6]:
#Checking duplicate
movie_data.duplicated().any()

False

**FEATURE EXRECTION**

In [26]:
X = movie_data['Review'].astype(str)
Y = movie_data['Sentiment']

In [27]:
print("X : ",X)
print("Y : ",Y)

X :  0      One of the other reviewers has mentioned that ...
1      A wonderful little production. <br /><br />The...
2      I thought this was a wonderful way to spend ti...
3      Basically there's a family where a little boy ...
4      Petter Mattei's "Love in the Time of Money" is...
                             ...                        
995    Nothing is sacred. Just ask Ernie Fosselius. T...
996    I hated it. I hate self-aware pretentious inan...
997    I usually try to be professional and construct...
998                                                  nan
999                                                  nan
Name: Review, Length: 1000, dtype: object
Y :  0      positive
1      positive
2      positive
3      negative
4      positive
         ...   
995    positive
996    negative
997    negative
998         NaN
999         NaN
Name: Sentiment, Length: 1000, dtype: object


In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.20, random_state = 42)

In [29]:
# Tokenizing the text data
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Padding sequences to the same length
max_len = max(len(x) for x in X_train)
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

# Encoding labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [30]:
#LSTM MODEL
model1 = Sequential()
model1.add(Embedding(max_features, 128, input_length=max_len))
model1.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model1.add(Dense(1, activation='sigmoid'))
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model1.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 1104, 128)         256000    
                                                                 
 lstm_3 (LSTM)               (None, 128)               131584    
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 387713 (1.48 MB)
Trainable params: 387713 (1.48 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [32]:
model1.fit(X_train, y_train, epochs=5, batch_size=128, verbose=1, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x796a2c1d61d0>

In [33]:
y_pred_prob = model1.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int).flatten()



In [36]:
# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.69


In [39]:
# GRU MODEL
model2 = Sequential()
model2.add(Embedding(max_features, 128, input_length=max_len))
model2.add(GRU(128, dropout=0.2, recurrent_dropout=0.2))
model2.add(Dense(1, activation='sigmoid'))
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model2.summary())

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 1104, 128)         256000    
                                                                 
 gru (GRU)                   (None, 128)               99072     
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                                 
Total params: 355201 (1.35 MB)
Trainable params: 355201 (1.35 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [40]:
model2.fit(X_train, y_train, epochs=5, batch_size=128, verbose=1, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x796a291d8f70>

In [41]:
y_pred_prob = model2.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int).flatten()



In [42]:
# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.605


In [43]:
model1.save('/content/drive/MyDrive/BigDataAnalystic/PROJECT/LSTM_Model.h5')
print("LSTM Model saved.")
model2.save('/content/drive/MyDrive/BigDataAnalystic/PROJECT/GRU_Model.h5')
print("GRU Model saved.")

LSTM Model saved.
GRU Model saved.


  saving_api.save_model(
