In [None]:
! pip install kaggle



In [None]:
import os
import json
from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
kaggle_dictionary= json.load(open("kaggle.json"))

In [None]:
# setup kaggle credentials as environment variables
os.environ["KAGGLE_USERNAME"] = kaggle_dictionary["username"]
os.environ["KAGGLE_KEY"] = kaggle_dictionary["key"]

In [None]:
kaggle_dictionary.keys()

dict_keys(['username', 'key'])

In [None]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
  0% 0.00/25.7M [00:00<?, ?B/s]
100% 25.7M/25.7M [00:00<00:00, 704MB/s]


In [None]:
# unzip the dataset file
with ZipFile("imdb-dataset-of-50k-movie-reviews.zip", "r") as zip_ref:
  zip_ref.extractall()

In [None]:
data = pd.read_csv("/content/IMDB Dataset.csv")

In [None]:
data.shape

(50000, 2)

In [None]:
import re
from nltk.stem import PorterStemmer

ps = PorterStemmer()

def preprocess(text):
    # lowercase
    text = text.lower()
    # remove special characters, keep only alphabets and spaces
    text = re.sub(r'[^a-z\s]', '', text)
    # stemming word by word
    text = " ".join(ps.stem(word) for word in text.split())
    return text

# apply on reviews
data['review'] = data['review'].apply(preprocess)


In [None]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other review ha mention that after ...,positive
1,a wonder littl product br br the film techniqu...,positive
2,i thought thi wa a wonder way to spend time on...,positive
3,basic there a famili where a littl boy jake th...,negative
4,petter mattei love in the time of money is a v...,positive


In [None]:
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [None]:
# replacing positive with 1 and negative with 0
data.replace({"sentiment":{"positive":1,"negative":0}},inplace=True)

  data.replace({"sentiment":{"positive":1,"negative":0}},inplace=True)


In [None]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other review ha mention that after ...,1
1,a wonder littl product br br the film techniqu...,1
2,i thought thi wa a wonder way to spend time on...,1
3,basic there a famili where a littl boy jake th...,0
4,petter mattei love in the time of money is a v...,1


In [None]:
train_data,test_data= train_test_split(data,test_size=0.2,random_state=42)

In [None]:
train_data.shape

(40000, 2)

In [None]:
test_data.shape

(10000, 2)

In [None]:
import re

def clean_text(s):
    s = s.lower()
    s = re.sub(r"[^a-z\s]", "", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

cleaned_reviews = data['review'].apply(clean_text)

# Tokenizer
tr = Tokenizer()
tr.fit_on_texts(cleaned_reviews)
print("Total unique words:", len(tr.word_index))

# Find the review with max words
lengths = cleaned_reviews.apply(lambda r: len(r.split()))
idx = lengths.idxmax()   # index of longest review
print("Max number of words in a review:", lengths[idx])
print("\nLongest review:\n", data['review'][idx])   # original version
print("\nCleaned longest review:\n", cleaned_reviews[idx])  # cleaned version


Total unique words: 137631
Max number of words in a review: 2460

Longest review:
 match tag team tabl match bubba ray and spike dudley vs eddi guerrero and chri benoit bubba ray and spike dudley start thing off with a tag team tabl match against eddi guerrero and chri benoit accord to the rule of the match both oppon have to go through tabl in order to get the win benoit and guerrero heat up earli on by take turn hammer first spike and then bubba ray a german suplex by benoit to bubba took the wind out of the dudley brother spike tri to help hi brother but the refere restrain him while benoit and guerrero gang up on him in the corner with benoit stomp away on bubba guerrero set up a tabl outsid spike dash into the ring and somersault over the top rope onto guerrero on the outsid after recov and take care of spike guerrero slip a tabl into the ring and help the wolverin set it up the tandem then set up for a doubl superplex from the middl rope which would have put bubba through the tab

In [None]:
# tokenize the data
tokenizer = Tokenizer(num_words=5000) # most frequent 5000 words
tokenizer.fit_on_texts(train_data['review'])
x_train= pad_sequences(tokenizer.texts_to_sequences(train_data['review']),maxlen=200)
x_test= pad_sequences(tokenizer.texts_to_sequences(test_data['review']),maxlen=200)

In [None]:
y_train= train_data['sentiment']
y_test = test_data['sentiment']

In [None]:
x_train

array([[1088,  191,    6, ...,  235,  125, 3340],
       [   1,  309,    6, ...,   97,   57,    7],
       [   0,    0,    0, ...,    2,  300,   62],
       ...,
       [   0,    0,    0, ...,  357,    2,  677],
       [   0,    0,    0, ...,   57,  106,  870],
       [   0,    0,    0, ...,   69,   81, 1887]], dtype=int32)

In [None]:
x_test

array([[   0,    0,    0, ...,  208,    7,  552],
       [  64,   21,  522, ...,   46,    9,   12],
       [   0,    0,    0, ...,   54,  998,   13],
       ...,
       [   0,    0,    0, ...,  106,  209, 1114],
       [   0,    0,    0, ..., 1054,    1, 1317],
       [   0,    0,    0, ...,    1,  323,   28]], dtype=int32)

In [None]:
y_train

Unnamed: 0,sentiment
39087,0
30893,0
45278,1
16398,0
13653,0
...,...
11284,1
44732,1
38158,0
860,1


In [None]:
y_test

Unnamed: 0,sentiment
33553,1
9427,1
199,0
12447,1
39489,0
...,...
28567,0
25079,1
18707,1
15200,0


# LSTM

In [None]:
model = Sequential()
model.add(Embedding(input_dim=5000,output_dim=128,input_length=200))# there are 5000 unique words
model.add(LSTM(128,dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1,activation='sigmoid'))



In [None]:
# each review is represented by numbers like [23,48,7,...]
# each word in the review is again vectored into 128 numbers like 23=[-86,51,4,7,..]
# finally [[(128 nums)],[(128 nums)],...] is formed
# each word i.e each small vector is passed as input to LSTM
# finnaly an output layer is added that gives one output

In [None]:
model.summary()

In [None]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
model.fit(x_train,y_train,epochs=5,batch_size=32,validation_split=0.2)

Epoch 1/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m455s[0m 451ms/step - accuracy: 0.7025 - loss: 0.5623 - val_accuracy: 0.8396 - val_loss: 0.3846
Epoch 2/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m440s[0m 440ms/step - accuracy: 0.8445 - loss: 0.3739 - val_accuracy: 0.8626 - val_loss: 0.3336
Epoch 3/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m450s[0m 448ms/step - accuracy: 0.8781 - loss: 0.3075 - val_accuracy: 0.8677 - val_loss: 0.3285
Epoch 4/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m443s[0m 443ms/step - accuracy: 0.8912 - loss: 0.2743 - val_accuracy: 0.8704 - val_loss: 0.3183
Epoch 5/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m496s[0m 437ms/step - accuracy: 0.9130 - loss: 0.2246 - val_accuracy: 0.8626 - val_loss: 0.3690


<keras.src.callbacks.history.History at 0x7e8c47faa840>

In [None]:
loss, accuracy = model.evaluate(x_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 76ms/step - accuracy: 0.8651 - loss: 0.3552
Test Loss: 0.35390883684158325
Test Accuracy: 0.8676000237464905


In [None]:
def predict_sentiment(review):
  # tokenize and pad the review
  sequence = tokenizer.texts_to_sequences([review])
  padded_sequence = pad_sequences(sequence, maxlen=200)
  prediction = model.predict(padded_sequence)
  sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
  return sentiment

In [None]:
# example usage
new_review = "This movie was fantastic. I loved it."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 656ms/step
The sentiment of the review is: negative


In [None]:
# example usage
new_review = "This movie was not that good"
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step
The sentiment of the review is: negative


In [None]:
# example usage
new_review = "this movie's hero is not goodlooking and the heroine is too good to be with hero but the story is not progressing at all but my friends liked the movie.I also thought it was good."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
The sentiment of the review is: positive


In [None]:
import joblib
import pickle

In [None]:
pickle.dump(model, open("model.pkl", "wb"))

In [None]:
pickle.dump(tokenizer, open("tokenizer.pkl", "wb"))