In [96]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset/IMDB_Dataset.csv



# **Importing Libraries**


In [97]:
import pandas as pd
import numpy as np
from warnings import filterwarnings
filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import sys
import keras

# **Reading the dataset**

In [98]:
path= "/kaggle/input/imdb-dataset/IMDB_Dataset.csv"
data = pd.read_csv(path)
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


# **Checking the shape of the data**

In [99]:
data.shape

(50000, 2)

# **Checking the first few data and the last few data**

In [100]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [101]:
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


# **Checking if the data is balanced**

In [102]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

# **Checking if the data has null values**

In [103]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

# **Preprocessing the data**

In [104]:
# removing special characters with the help of regular expression and beautiful soup
import re
from bs4 import BeautifulSoup

def clean_review(text):
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

# Apply the cleaning function to the 'review' column
data['review'] = data['review'].apply(clean_review)
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production The filming tech...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically theres a family where a little boy J...,negative
4,Petter Matteis Love in the Time of Money is a ...,positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,Bad plot bad dialogue bad acting idiotic direc...,negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,Im going to have to disagree with the previous...,negative


# **Converting the categorical column(sentiment) into numbers**

In [125]:
data.replace({'sentiment':{'positive':1,'negative':0}}, inplace = True)

data.to_csv('preprocessed_data.csv', index=False)
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production The filming tech...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically theres a family where a little boy J...,0
4,Petter Matteis Love in the Time of Money is a ...,1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,Bad plot bad dialogue bad acting idiotic direc...,0
49997,I am a Catholic taught in parochial elementary...,0
49998,Im going to have to disagree with the previous...,0


# **Splitting the data into train and test data**

In [106]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# **Checking the data split**

In [107]:
print(train_data.shape)
print(test_data.shape)

(40000, 2)
(10000, 2)


# **Tokenizing the words and padding them**

In [108]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]),maxlen=200)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]),maxlen=200)

# **Creating the Y column for train and test**

In [109]:
y_train= train_data['sentiment']
y_test= test_data['sentiment']

# **Building the model for LSTM**

In [110]:
import keras
model = Sequential()
model.add(keras.Input(shape=(200,)))
model.add(Embedding(input_dim= 5000, output_dim = 128, input_length = 200))
model.add(LSTM(128, dropout= 0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation= 'sigmoid'))

# **Printing model summary**

In [111]:
print(model.summary())

None


# **Compiling the model**

In [112]:
model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])

# **Training the model**

In [113]:
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split= 0.2)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 220ms/step - accuracy: 0.7184 - loss: 0.5327 - val_accuracy: 0.8360 - val_loss: 0.3849
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 219ms/step - accuracy: 0.8484 - loss: 0.3620 - val_accuracy: 0.8416 - val_loss: 0.3659
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 221ms/step - accuracy: 0.8701 - loss: 0.3156 - val_accuracy: 0.8593 - val_loss: 0.3316
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 223ms/step - accuracy: 0.8933 - loss: 0.2676 - val_accuracy: 0.8749 - val_loss: 0.3176
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 225ms/step - accuracy: 0.9140 - loss: 0.2250 - val_accuracy: 0.8575 - val_loss: 0.3479


<keras.src.callbacks.history.History at 0x78eceeac6a10>

# **Evaluating the model**

In [114]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss:{loss}")
print(f"Test Accuracy:{accuracy}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 77ms/step - accuracy: 0.8636 - loss: 0.3412
Test Loss:0.3383336067199707
Test Accuracy:0.8651999831199646


# **Checking the model**

In [115]:
def predict_sentiment(review):
  #tokeninzing and padding the review before sending it to the model
  sequence= pad_sequences(tokenizer.texts_to_sequences([review]),maxlen=200)
  prediction = model.predict(sequence)
  sentiment= "positive" if prediction[0][0] > 0.5 else "negative"
  return sentiment

In [116]:
example_review = "The movie was not that good "

sentiment = predict_sentiment(example_review)

print(f"The sentiment of this review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 244ms/step
The sentiment of this review is: negative


In [118]:
example_review1 = "The movie was great "

sentiment1 = predict_sentiment(example_review1)

print(f"The sentiment of this review is: {sentiment1}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
The sentiment of this review is: positive


# **Building model for GRU**

In [119]:
modelG = Sequential()
modelG.add(keras.Input(shape=(200,)))
modelG.add(Embedding(input_dim= 5000, output_dim = 128, input_length = 200))
modelG.add(GRU(128, dropout= 0.2, recurrent_dropout=0.2))
modelG.add(Dense(1, activation= 'sigmoid'))
print(modelG.summary())

None


# **Compiling the GRU model**

In [120]:
modelG.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])

# **Training the GRU model**

In [121]:
modelG.fit(X_train, y_train, epochs=5, batch_size=64, validation_split= 0.2)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 243ms/step - accuracy: 0.6416 - loss: 5848924160.0000 - val_accuracy: 0.6884 - val_loss: 0.5752
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 244ms/step - accuracy: 0.7480 - loss: 23.9974 - val_accuracy: 0.6909 - val_loss: 0.5728
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 241ms/step - accuracy: 0.7632 - loss: 40517572.0000 - val_accuracy: 0.6600 - val_loss: 0.5994
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 242ms/step - accuracy: 0.7339 - loss: 0.5223 - val_accuracy: 0.6699 - val_loss: 0.5973
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 241ms/step - accuracy: 0.7523 - loss: 0.5003 - val_accuracy: 0.6743 - val_loss: 0.5983


<keras.src.callbacks.history.History at 0x78eceff03be0>

# **Evaluating the GRU model**

In [122]:
lossG, accuracyG = modelG.evaluate(X_test, y_test)
print(f"Test Loss:{lossG}")
print(f"Test Accuracy:{accuracyG}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 81ms/step - accuracy: 0.6777 - loss: 0.5967
Test Loss:0.5974130034446716
Test Accuracy:0.6747000217437744


# **Checking the GRU model**

In [123]:
def predict_sentimentG(review):
  #tokeninzing and padding the review before sending it to the model
  sequence= pad_sequences(tokenizer.texts_to_sequences([review]),maxlen=200)
  predictionG = modelG.predict(sequence)
  sentimentG= "positive" if predictionG[0][0] > 0.5 else "negative"
  return sentimentG

In [124]:
example_review = "This movie is good"
sentimentG = predict_sentimentG(example_review)
print(f"The sentiment of this review is:{sentimentG}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 262ms/step
The sentiment of this review is:positive
