# LSTM binary text classifier on the cleaned movie dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the cleaned and processed dataset
file_path = 'cleaned_movie_data.csv'
cleaned_data = pd.read_csv(file_path)

# Fail fast if the file does not have the shape the rest of the notebook relies on:
#  - the split cell reads the 'processed_text' and 'label' columns;
#  - the tokenizer needs string input (an empty string written to CSV is read back as NaN);
#  - the sigmoid / binary_crossentropy model needs 0/1 targets.
missing_columns = {'processed_text', 'label'} - set(cleaned_data.columns)
assert not missing_columns, f"{file_path} is missing columns: {sorted(missing_columns)}"
assert cleaned_data['processed_text'].notna().all(), "processed_text contains missing values"
assert cleaned_data['label'].isin([0, 1]).all(), "label must contain only 0/1 values"

In [3]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable.
text_column = cleaned_data['processed_text']
label_column = cleaned_data['label']

X_train, X_test, y_train, y_test = train_test_split(
    text_column,
    label_column,
    test_size=0.2,
    random_state=42,
)


In [4]:
# Tokenize and pad sequences
max_features = 5000  # passed to the Tokenizer as num_words
max_len = 200        # length every sequence is padded/truncated to

# Fit the vocabulary on the training texts only, so the test split stays unseen.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

# Encode both splits as integer sequences with the fitted tokenizer ...
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

# ... then bring every sequence to exactly max_len entries.
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (X_train_seq, X_test_seq)
)

In [5]:
# Build the LSTM model

# Seed the Python, NumPy and TensorFlow random generators before any weights are
# created, so initialisation and dropout are repeatable between runs on the same
# software/hardware stack (GPU kernels may still introduce small differences).
tf.keras.utils.set_random_seed(42)

model = Sequential()
# Declare the input shape with an explicit Input layer; the Embedding layer's
# `input_length` argument is deprecated in Keras 3 and only triggers a warning there.
model.add(tf.keras.Input(shape=(max_len,), dtype='int32'))
# Map each token id (< max_features) to a 128-dimensional vector.
model.add(Embedding(max_features, 128))
# Drop whole embedding channels rather than individual values.
model.add(SpatialDropout1D(0.2))
# Single LSTM layer with dropout on both the inputs and the recurrent state.
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# One sigmoid unit: the output is thresholded into a 0/1 label at evaluation time.
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])




In [6]:
# Train the model for a fixed number of epochs, with 10% of the training
# data set aside by Keras for per-epoch validation metrics.
batch_size = 64
epochs = 5

history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=2,  # one summary line per epoch
    validation_split=0.1,
)


Epoch 1/5
450/450 - 99s - 221ms/step - accuracy: 0.8177 - loss: 0.4093 - val_accuracy: 0.8572 - val_loss: 0.3315
Epoch 2/5
450/450 - 92s - 206ms/step - accuracy: 0.8794 - loss: 0.3005 - val_accuracy: 0.8647 - val_loss: 0.3207
Epoch 3/5
450/450 - 90s - 201ms/step - accuracy: 0.8973 - loss: 0.2638 - val_accuracy: 0.8553 - val_loss: 0.3490
Epoch 4/5
450/450 - 91s - 203ms/step - accuracy: 0.9037 - loss: 0.2455 - val_accuracy: 0.8659 - val_loss: 0.3291
Epoch 5/5
450/450 - 89s - 198ms/step - accuracy: 0.9193 - loss: 0.2094 - val_accuracy: 0.8675 - val_loss: 0.3552


In [7]:
# Evaluate the model: score the padded test sequences, then turn the sigmoid
# outputs into 0/1 predictions using a 0.5 cut-off.
y_pred_prob = model.predict(X_test_pad)
y_pred = np.greater(y_pred_prob, 0.5).astype(int)


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 45ms/step


In [8]:
# Overall accuracy first, then the per-class precision / recall / F1 table.
print("LSTM Model Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

LSTM Model Accuracy: 0.86375
              precision    recall  f1-score   support

           0       0.85      0.88      0.86      3966
           1       0.88      0.85      0.86      4034

    accuracy                           0.86      8000
   macro avg       0.86      0.86      0.86      8000
weighted avg       0.86      0.86      0.86      8000



In [9]:
# Persist the trained model to disk; the .h5 extension selects the HDF5 format.
model.save(filepath='lstm_model.h5')



In [11]:
import pickle

# NOTE: pickle files execute code when loaded - only unpickle these artifacts
# from a trusted source.

# Save the model using pickle
with open('lstm_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save the fitted tokenizer as well. The model only understands the integer ids
# assigned by this exact tokenizer, so without it the saved model cannot score
# new text in a fresh session. New text must also be padded to `max_len`.
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)