In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [72]:
df = pd.read_csv('/content/IMDB Dataset.csv', engine='python')

In [73]:
df.shape


(50000, 2)

In [74]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [75]:
df.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [76]:
df.duplicated().sum()

np.int64(418)

In [77]:
df.drop_duplicates(inplace=True)

In [78]:
df.duplicated().sum()

np.int64(0)

In [79]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [80]:
df['sentiment'][0]

'positive'

In [81]:
df['review'] = df['review'].str.lower()


In [82]:
df['review'][0]

"one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right, as this is exactly what happened with me.<br /><br />the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. trust me, this is not a show for the faint hearted or timid. this show pulls no punches with regards to drugs, sex or violence. its is hardcore, in the classic use of the word.<br /><br />it is called oz as that is the nickname given to the oswald maximum security state penitentary. it focuses mainly on emerald city, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. em city is home to many..aryans, muslims, gangstas, latinos, christians, italians, irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />i would say the main appeal of the show is due to the fa

In [83]:
avg_length_words = df['review'].apply(lambda x: len(x.split())).mean()
print(f"Average review length (words): {avg_length_words:.2f}")


Average review length (words): 231.35


In [84]:
import re
def clean_review(review):
   review = re.sub(r'<[^>]+>', '', review)
   review = re.sub(r'[^a-zA-Z0-9\s]', '', review)
   return review
df['review'] = df['review'].apply(clean_review)

In [85]:
df['review'][0]

'one of the other reviewers has mentioned that after watching just 1 oz episode youll be hooked they are right as this is exactly what happened with methe first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the wordit is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda em city is home to manyaryans muslims gangstas latinos christians italians irish and moreso scuffles death stares dodgy dealings and shady agreements are never far awayi would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare forget pretty pictur

In [86]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [87]:
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')

In [88]:
tokenizer.fit_on_texts(df['review'])

In [89]:
sequences = tokenizer.texts_to_sequences(df['review'])

In [90]:
padded_sequences = pad_sequences(sequences, maxlen=200, padding='post')


In [91]:
print(padded_sequences[:5])

[[   1    9 2604 1370   20    1  532   33 4658 2512    5    2 1192  112
    31    2 7031   25 2959    1    3  408    1   38    1    7   21  318
    20    2 5027 3686  532    7  341    6    1 8258    1    1 5090 7696
  2439    3    1    1  329 9134 7401    1    3 8498    1   23  109  225
     1   57  125    2  270 1302    5    2  118    7  663    6    2  186
    12    9  261  112   77  255  547 2963  820  178 1257 4266   16 2464
  1091  820 1401  820    1  147  973  181    2   88  398   11  120  201
  3216   69   14   38 1567    9   13 2194   11  395  125   11   13 1543
    16    9   18   14   11  275   50   11 1450    4 1242   16 3335    3
   183    1    6    2  318 2071    5 2075  586   21   40  586   18 7767
  6968 4880    1   26 2932   44   16    4    1 6908    1  488   20  606
     3   75  239   15    9   73 9787  746  806 6908  106  656   78 1192
     1  663    6   63  552    5  928 1969   39 1192  553  145 3335   22
   194  410 3700   15   47    7 3285    1   45   22   68   75   

In [92]:
# If sentiment is 'positive' and 'negative', convert them
df['sentiment'] = df['sentiment'].map({'positive':1, 'negative':0})


In [93]:
df['sentiment'][0]

np.int64(1)

In [94]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['sentiment'], test_size=0.2, random_state=42)

MODEL BUILDING

In [95]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN,LSTM, Dense, Dropout

In [96]:
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=64, input_length=230))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # 1 output (positive or negative)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [97]:
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available: 0


In [98]:
history2 = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/10
[1m1240/1240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 139ms/step - accuracy: 0.5480 - loss: 0.6779 - val_accuracy: 0.6564 - val_loss: 0.6080
Epoch 2/10
[1m1240/1240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 136ms/step - accuracy: 0.7666 - loss: 0.5086 - val_accuracy: 0.7118 - val_loss: 0.5603
Epoch 3/10
[1m1240/1240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 133ms/step - accuracy: 0.8273 - loss: 0.4194 - val_accuracy: 0.8389 - val_loss: 0.3912
Epoch 4/10
[1m1240/1240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 132ms/step - accuracy: 0.8817 - loss: 0.3065 - val_accuracy: 0.8357 - val_loss: 0.3799
Epoch 5/10
[1m1240/1240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 135ms/step - accuracy: 0.8967 - loss: 0.2686 - val_accuracy: 0.8680 - val_loss: 0.3254
Epoch 6/10
[1m1240/1240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 134ms/step - accuracy: 0.9233 - loss: 0.2075 - val_accuracy: 0.8739 - val_loss:

In [99]:
python....

SyntaxError: invalid syntax (<ipython-input-99-653b857ba5ef>, line 1)

In [103]:
for epoch in range(len(history.history['accuracy'])):
    print(f"Epoch {epoch+1}:")
    print(f"  Training Accuracy: {history.history['accuracy'][epoch]:.4f}")
    print(f"  Training Loss: {history.history['loss'][epoch]:.4f}")
    print(f"  Validation Accuracy: {history.history['val_accuracy'][epoch]:.4f}")
    print(f"  Validation Loss: {history.history['val_loss'][epoch]:.4f}")
    print("-----")


Epoch 1:
  Training Accuracy: 0.5903
  Training Loss: 0.6517
  Validation Accuracy: 0.6564
  Validation Loss: 0.6080
-----
Epoch 2:
  Training Accuracy: 0.7665
  Training Loss: 0.5055
  Validation Accuracy: 0.7118
  Validation Loss: 0.5603
-----
Epoch 3:
  Training Accuracy: 0.8435
  Training Loss: 0.3887
  Validation Accuracy: 0.8389
  Validation Loss: 0.3912
-----
Epoch 4:
  Training Accuracy: 0.8799
  Training Loss: 0.3104
  Validation Accuracy: 0.8357
  Validation Loss: 0.3799
-----
Epoch 5:
  Training Accuracy: 0.9017
  Training Loss: 0.2616
  Validation Accuracy: 0.8680
  Validation Loss: 0.3254
-----
Epoch 6:
  Training Accuracy: 0.9231
  Training Loss: 0.2088
  Validation Accuracy: 0.8739
  Validation Loss: 0.3194
-----
Epoch 7:
  Training Accuracy: 0.9404
  Training Loss: 0.1698
  Validation Accuracy: 0.8762
  Validation Loss: 0.3350
-----
Epoch 8:
  Training Accuracy: 0.9565
  Training Loss: 0.1343
  Validation Accuracy: 0.8770
  Validation Loss: 0.3798
-----
Epoch 9:
  Train

In [100]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")


[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - accuracy: 0.8749 - loss: 0.4291
Test Loss: 0.44615665078163147
Test Accuracy: 0.8733487725257874


In [104]:
model.save('model_lstm2.h5')



In [105]:
model.save('model_lstm2.keras')

In [106]:
from tensorflow.keras.models import load_model


In [109]:
my_lstm2 = load_model('model_lstm2.keras')

  saveable.load_own_variables(weights_store.get(inner_path))


In [111]:
# Example review
new_review = new_review = ["This movie was a huge disappointment. The plot was confusing and dragged on without any real purpose. "]


# Preprocess and tokenize the review
new_review_cleaned = clean_review(new_review[0])  # Use your custom cleaning function
new_review_sequence = tokenizer.texts_to_sequences([new_review_cleaned])
new_review_padded = pad_sequences(new_review_sequence, padding='post', maxlen=230)

# Make a prediction
prediction = my_lstm2.predict(new_review_padded)

# Print prediction (0 = Negative, 1 = Positive)
print("Raw Prediction:", prediction)

print("Prediction: Positive" if prediction > 0.5 else "Prediction: Negative")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
Raw Prediction: [[0.00201818]]
Prediction: Negative
