In [None]:
!pip install kaggle



In [None]:
import os
import json

from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Load the Kaggle API credentials from the local kaggle.json file. The context
# manager closes the file handle as soon as parsing finishes instead of
# leaving it open until garbage collection.
with open("kaggle.json") as credentials_file:
    kaggle_dictionary = json.load(credentials_file)

In [None]:
# Expose the credentials as environment variables for the `kaggle` command
# that is run later in the notebook.
for env_name, json_field in (("KAGGLE_USERNAME", "username"), ("KAGGLE_KEY", "key")):
    os.environ[env_name] = kaggle_dictionary[json_field]

In [None]:
# Download the IMDB 50k movie-review dataset archive with the Kaggle CLI.
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
100% 25.7M/25.7M [00:02<00:00, 19.8MB/s]
100% 25.7M/25.7M [00:02<00:00, 11.2MB/s]


In [None]:
# List the working directory to confirm the archive was downloaded.
!ls

imdb-dataset-of-50k-movie-reviews.zip  kaggle.json  sample_data


In [None]:
# Unpack every member of the downloaded archive into the current working
# directory (no target path is given to extractall).
with ZipFile("imdb-dataset-of-50k-movie-reviews.zip", mode="r") as zip_ref:
    zip_ref.extractall()

In [None]:
# List the working directory again to confirm the CSV was extracted.
!ls

'IMDB Dataset.csv'   imdb-dataset-of-50k-movie-reviews.zip   kaggle.json   sample_data


In [None]:
# Read the extracted reviews. The archive is unpacked into the current working
# directory, so a relative path is used; the hard-coded "/content/" prefix only
# resolves when the notebook happens to run with /content as its working
# directory (e.g. on Colab).
df = pd.read_csv("IMDB Dataset.csv")

In [None]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Count how many reviews carry each sentiment label.
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [None]:
# Encode the labels as integers (positive -> 1, negative -> 0).
# The explicit cast guarantees an integer dtype: pandas has deprecated the
# silent object -> int downcasting that `replace` used to perform, and without
# it the column would stay object-typed, which Keras cannot convert to a
# tensor. Column assignment replaces `inplace=True`; re-running the cell is
# still a no-op because already-encoded values are left untouched.
df["sentiment"] = (
    df["sentiment"]
    .replace({"positive": 1, "negative": 0})
    .astype("int64")
)

In [None]:
# Preview again to confirm the sentiment column now holds numeric labels.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [None]:
# Hold out 33% of the reviews for testing; the fixed random_state keeps the
# split reproducible across runs.
train_df, test_df = train_test_split(
    df,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Report the shape of each split, one per line.
print(train_df.shape, test_df.shape, sep="\n")

(33500, 2)
(16500, 2)


In [None]:
# Fit the vocabulary (capped via num_words=5000) on the training reviews only,
# then encode both splits with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_df["review"])


def _encode_reviews(texts):
    """Turn review texts into integer sequences of fixed length 200."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=200)


X_train = _encode_reviews(train_df["review"])
X_test = _encode_reviews(test_df["review"])

In [None]:
# Inspect the padded integer sequences produced for the training split.
print(X_train)

[[2641   41  136 ...   13   27 1181]
 [   0    0    0 ...   75  210   78]
 [   0    0    0 ...    7  705  155]
 ...
 [   0    0    0 ... 1635    2  606]
 [   0    0    0 ...  246  103  125]
 [   0    0    0 ...   69   71 2080]]


In [None]:
# Target labels for each split, taken from the encoded "sentiment" column.
y_train, y_test = train_df["sentiment"], test_df["sentiment"]

In [None]:
# Sentiment classifier: embedding -> single LSTM layer -> one sigmoid unit.
# input_dim and input_length mirror the tokenizer's num_words (5000) and the
# padded sequence length (200) used during preprocessing.
model = Sequential(
    [
        Embedding(input_dim=5000, output_dim=128, input_length=200),
        LSTM(128, dropout=0.3, recurrent_dropout=0.3),
        Dense(1, activation="sigmoid"),
    ]
)




In [None]:
# Show the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 128)          640000    
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 771713 (2.94 MB)
Trainable params: 771713 (2.94 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Configure training for binary classification: Adam optimizer, binary
# cross-entropy loss, accuracy as the reported metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Train for 5 epochs in batches of 64, reserving 20% of the training data for
# validation. The returned History object is kept so the per-epoch metrics
# remain available (via `history.history`) instead of being discarded and
# echoed as an opaque object repr.
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7bcf4e48a4d0>

In [None]:
# Score the trained model on the held-out test split and report both metrics.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}", f"Test Accuracy: {accuracy}", sep="\n")

Test Loss: 0.38061201572418213
Test Accuracy: 0.8732120990753174


In [None]:
def predictive(review):
    """Classify a single review text as "positive" or "negative".

    The text is encoded with the notebook-level ``tokenizer``, padded to a
    length of 200 and scored by the notebook-level ``model``.

    Args:
        review: Raw review text.

    Returns:
        "positive" when the model's score is strictly greater than 0.5,
        otherwise "negative".
    """
    encoded = tokenizer.texts_to_sequences([review])
    model_input = pad_sequences(encoded, maxlen=200)
    score = model.predict(model_input)[0][0]
    if score > 0.5:
        return "positive"
    return "negative"

In [None]:
# Try the classifier on an example review that mixes negative and positive wording.
new_review = "This movie was very bad.But still I like it."
sentiment = predictive(new_review)
print(f"The sentiment of the review is: {sentiment}")

The sentiment of the review is: positive
