<a href="https://colab.research.google.com/github/Sinha-Abhinav-13/imdb_aiml/blob/lstm/imdb_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Location of the IMDB reviews CSV that this notebook reads.
csv_path = "/content/IMDB_Dataset.csv"

# Row numbers handed to `pd.read_csv(skiprows=...)`, i.e. excluded while parsing.
# NOTE(review): presumably these rows are malformed in this copy of the CSV — confirm.
rows_to_skip = [
    2418, 5505, 7089, 8684, 11039, 14210, 15011, 17401, 18200, 19792,
    20560, 21344, 22151, 22913, 23711, 24515, 26079, 26908, 27704, 28507,
    29335, 30097, 30877, 31642, 32426, 34067, 34832, 36434, 37224, 38010,
    39625, 42025, 42780, 44315, 45096, 46680, 47479, 49083,
]

data = pd.read_csv(csv_path, quotechar='"', skiprows=rows_to_skip)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Per-column count of missing values (`isna` and `isnull` are the same pandas method).
data.isna().sum()

Unnamed: 0,0
review,0
sentiment,0


In [4]:
# Number of rows that exactly repeat an earlier row; the first occurrence is not counted.
data.duplicated(keep="first").sum()

417

In [5]:
import tensorflow as tf


def preprocess_text(t):
    """Clean review text with TensorFlow string ops.

    Replaces `<br>` / `<br />` tags with a space, applies a second
    regex substitution (see the note below) and lower-cases the result.

    Args:
        t: A string, or a tensor / array-like of strings accepted by the
            `tf.strings` ops.

    Returns:
        A string tensor holding the cleaned text.
    """
    # Raw strings keep `\s` as a regex escape rather than an (invalid) Python
    # string escape; the pattern handed to TensorFlow is unchanged.
    t = tf.strings.regex_replace(t, r"<br\s*/?>", " ")
    # NOTE(review): this only matches '<' followed by one character that is
    # neither a letter nor whitespace. If the intent was to strip all
    # non-alphabetic characters or leftover HTML tags, the pattern needs
    # revisiting — confirm before changing.
    t = tf.strings.regex_replace(t, r"<[^a-zA-Z\s]", " ")
    t = tf.strings.lower(t)
    return t


# Clean the whole column in one batched call instead of dispatching one eager
# TensorFlow op per row through `DataFrame.apply`.
cleaned_reviews = preprocess_text(
    tf.constant(data["review"].tolist(), dtype=tf.string)
).numpy()
data["review"] = [review.decode('utf-8') for review in cleaned_reviews]

print(data.head())

                                              review sentiment
0  one of the other reviewers has mentioned that ...  positive
1  a wonderful little production.   the filming t...  positive
2  i thought this was a wonderful way to spend ti...  positive
3  basically there's a family where a little boy ...  negative
4  petter mattei's "love in the time of money" is...  positive


In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Every review is padded / truncated to this many token ids.
max_length = 256

# Fit the word index on the cleaned reviews, then encode each review as ids.
tokenizer = Tokenizer(num_words=10000)
review_texts = data["review"]
tokenizer.fit_on_texts(review_texts)
sequences = tokenizer.texts_to_sequences(review_texts)

# Pad and truncate at the end of each sequence so all rows share `max_length`.
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

# Binary target: 1 for 'positive', 0 for every other sentiment value.
labels = (data['sentiment'] == 'positive').astype('int64').to_numpy()

print(padded_sequences[0])

[  26    4    1   78 2098   44 1068   11   99  146   38  306 3181  397
  473   25 3188   32   22  202   13   10    6  620   47  595   15   67
    1   85  147   11 3235   67   41 3181   12   91 5390    2  134    4
  568   59  267    7  202   35    1  660  138 1739   67   10    6   20
    3  118   14    1 7881 2336   37   10  118 2590   53 5900   15 5503
    5 1477  375   37  568   91    6 3798    7    1  359  357    4    1
  660    8    6  432 3181   13   11    6    1  356    5    1 6805 2532
 1063    8 2711 1420   19  537   31 4660 2463    4    1 1207  116   28
    1 7013   24 2966    2  390   33    6   20  298   19    1 4929 7352
  537    6  343    5  105 8155 5045 7882 2451    2   50   33  326 9097
 7353    2 8686   22  109  224  242    9   58  130    1  279 1325    4
    1  118    6  692    5    1  191   11    8  268  116   78  275  588
 3025  833  179 1320 4153   14 2520 1241  833 1441  833  886 3181  148
  953  182    1   85  397    9  121  209 3235   67   13   33 1635    8
   12 

In [7]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of all samples as the test set.
train_val_padded, test_padded, train_val_labels, test_labels = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)

# Step 2: split 20% of the remaining samples off as the validation set.
train_padded, val_padded, train_labels, val_labels = train_test_split(
    train_val_padded,
    train_val_labels,
    test_size=0.2,
    random_state=42,
)

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Build the LSTM model: token ids -> 64-d embeddings -> two stacked LSTM
# layers (each followed by dropout) -> a single sigmoid unit.
#
# `input_length` is deliberately not passed to Embedding: the argument is
# deprecated in Keras 3, and the sequence length is already fixed by the
# `model.build(...)` call below.
#
# NOTE(review): the inputs are post-padded with zeros and the Embedding does
# not mask them, so the LSTMs also step over the padding. Consider
# `mask_zero=True` (or pre-padding) — confirm the effect on training first.
model = Sequential([
    Embedding(input_dim=10000, output_dim=64),  # 10000 matches Tokenizer(num_words=10000)
    LSTM(64, return_sequences=True),  # emit the full sequence for the next LSTM
    Dropout(0.5),
    LSTM(64),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fix the input shape (batch, max_length) so the summary can report parameter counts.
model.build(input_shape=(None, max_length))
# Summary of the model
model.summary()




In [9]:
# Train the model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop after 3 epochs without a val_loss improvement and roll back to the best weights.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Save the model to 'best_model.keras' only when val_loss improves.
model_checkpoint = ModelCheckpoint(
    filepath='best_model.keras',
    monitor='val_loss',
    save_best_only=True,
)

history = model.fit(
    train_padded,
    train_labels,
    validation_data=(val_padded, val_labels),
    epochs=10,
    batch_size=64,
    callbacks=[early_stop, model_checkpoint],
)

Epoch 1/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 453ms/step - accuracy: 0.5070 - loss: 0.6933 - val_accuracy: 0.5315 - val_loss: 0.6964
Epoch 2/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m268s[0m 466ms/step - accuracy: 0.5394 - loss: 0.6916 - val_accuracy: 0.5665 - val_loss: 0.6604
Epoch 3/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 464ms/step - accuracy: 0.6178 - loss: 0.6238 - val_accuracy: 0.7841 - val_loss: 0.5157
Epoch 4/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m234s[0m 468ms/step - accuracy: 0.7906 - loss: 0.4912 - val_accuracy: 0.8174 - val_loss: 0.4631
Epoch 5/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 457ms/step - accuracy: 0.8384 - loss: 0.4173 - val_accuracy: 0.5857 - val_loss: 0.8879
Epoch 6/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m236s[0m 472ms/step - accuracy: 0.8359 - loss: 0.4260 - val_accuracy: 0.7459 - val_loss: 0.5465
Epoc

In [10]:
# Evaluate the model on the held-out test split; `evaluate` returns the loss
# followed by the compiled metrics (accuracy here).
test_results = model.evaluate(test_padded, test_labels)
loss, accuracy = test_results
print(f'Test Accuracy: {accuracy:.2f}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 87ms/step - accuracy: 0.8475 - loss: 0.3966
Test Accuracy: 0.86


In [11]:
# Make predictions
new_reviews = ["The movie was fantastic!", "bad, i hated it, it was the worst"]

# Apply the same cleaning as the training reviews before tokenising.
new_reviews_clean = [preprocess_text(review).numpy().decode('utf-8') for review in new_reviews]
new_sequences = tokenizer.texts_to_sequences(new_reviews_clean)
new_padded = pad_sequences(new_sequences, maxlen=max_length, padding='post', truncating='post')

# One row per review; each row holds a single sigmoid probability.
predictions = model.predict(new_padded)

# Convert probabilities to labels.
# Stored under its own name so the global `labels` array (the training targets
# consumed by the train/test split cell) is not overwritten.
predicted_labels = ['Positive' if pred[0] >= 0.5 else 'Negative' for pred in predictions]
for review, pred, label in zip(new_reviews, predictions, predicted_labels):
    print(f"Review: '{review}'\nPredicted Sentiment: {label} (Probability: {pred[0]:.2f})\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 383ms/step
Review: 'The movie was fantastic!'
Predicted Sentiment: Positive (Probability: 0.78)

Review: 'bad, i hated it, it was the worst'
Predicted Sentiment: Negative (Probability: 0.08)



In [12]:
from sklearn.metrics import classification_report

# Sigmoid probabilities for every test review.
y_pred = model.predict(test_padded)

# Threshold at 0.5 to obtain hard 0/1 class predictions.
y_pred_labels = (y_pred >= 0.5).astype(int)

print(classification_report(test_labels, y_pred_labels))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 86ms/step
              precision    recall  f1-score   support

           0       0.87      0.83      0.85      5020
           1       0.84      0.88      0.86      4973

    accuracy                           0.86      9993
   macro avg       0.86      0.86      0.86      9993
weighted avg       0.86      0.86      0.86      9993

