In [1]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [2]:
!kaggle datasets download lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other


In [3]:
!ls

imdb-dataset-of-50k-movie-reviews.zip  kaggle.json  sample_data


In [4]:
!pip install tensorflow



In [5]:
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [6]:
import os
import json

from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.callbacks import EarlyStopping
import keras_tuner as kt
import numpy as np

In [7]:
with ZipFile("/content/imdb-dataset-of-50k-movie-reviews.zip", 'r') as zip_ref:
    zip_ref.extractall()

In [8]:
data = pd.read_csv("/content/IMDB Dataset.csv")

In [9]:
data.shape

(50000, 2)

In [10]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [11]:
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [12]:
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [13]:
data.replace({"sentiment":{'positive':1,'negative':0}},inplace = True)

  data.replace({"sentiment":{'positive':1,'negative':0}},inplace = True)


In [14]:
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,25000
0,25000


In [15]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [16]:
train_data,test_data = train_test_split(data,test_size = 0.2,random_state = 2)

In [17]:
print(train_data.shape)
print(test_data.shape)

(40000, 2)
(10000, 2)


In [18]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Initialize Stemmer
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Tokenize words
    words = word_tokenize(text)

    # Apply stemming first
    stemmed_words = [stemmer.stem(word) for word in words]

    # Remove stopwords after stemming
    processed_words = [word for word in stemmed_words if word not in stop_words]

    # Join words back to string
    return " ".join(processed_words)

# Apply preprocessing to reviews
train_data['cleaned_review'] = train_data['review'].apply(preprocess_text)
test_data['cleaned_review'] = test_data['review'].apply(preprocess_text)

# Tokenization
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['cleaned_review'])
x_train = pad_sequences(tokenizer.texts_to_sequences(train_data['cleaned_review']), maxlen=200)
x_test = pad_sequences(tokenizer.texts_to_sequences(test_data['cleaned_review']), maxlen=200)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [19]:
print(x_train)


[[   0    0    0 ...    3  400   32]
 [   0    0    0 ...    8   53 2537]
 [2854   67 2015 ...  100  810 1471]
 ...
 [  65   55  555 ...    3  638    7]
 [   0    0    0 ...    1    5 4722]
 [   0    0    0 ...   12  161 2504]]


In [20]:
print(x_test)

[[   0    0    0 ...  451  186 1152]
 [ 122    5   10 ...    1 3189 2248]
 [   0    0    0 ...   16  391  201]
 ...
 [   0    0    0 ... 2145   20   66]
 [   0    0    0 ...   19  665    5]
 [   0    0    0 ...   18    1    5]]


In [21]:
y_train = train_data["sentiment"]
y_test = test_data["sentiment"]

In [22]:
print(y_train)

5478     0
22132    0
33533    1
42605    1
48740    1
        ..
44566    0
30280    0
6637     1
35343    0
23720    0
Name: sentiment, Length: 40000, dtype: int64


In [23]:
print(y_test)

23656    0
27442    0
40162    1
8459     1
8051     1
        ..
44231    0
18034    0
33856    0
15906    1
40899    1
Name: sentiment, Length: 10000, dtype: int64


In [24]:
print(x_train.shape, x_test.shape)

(40000, 200) (10000, 200)


In [25]:
def build_model(hp):
  model = Sequential()
  model.add(Embedding(input_dim=5000, output_dim=200, input_length=200))
  model.add(LSTM(units=hp.Int('lstm_units', min_value=32, max_value=128, step=32)))
  counter = 0
  for i in range(hp.Int('num_layers',min_value = 1, max_value = 5)):
    if counter == 0:

      model.add(Dense(hp.Int('units'+str(i),min_value = 8,max_value = 128,step = 8),activation = hp.Choice('activation'+str(i),values = ['relu','tanh','sigmoid']),input_dim = 200))
      model.add(Dropout(hp.Choice('dropout'+str(i),values = [0.1,0.2,0.3,0.4,0.5])))
    else:

      model.add(Dense(hp.Int('units'+str(i),min_value = 8,max_value = 128,step = 8),activation = hp.Choice('activation'+str(i),values = ['relu','tanh','sigmoid'])))
      model.add(Dropout(hp.Choice('dropout'+str(i),values = [0.1,0.2,0.3,0.4,0.5])))
      counter+=1
  model.add(Dense(1, activation='sigmoid'))
  optimizer = hp.Choice('optimizer',values = ['adam','rmsprop'])
  model.compile(optimizer = optimizer,loss='binary_crossentropy',metrics=['accuracy'])
  return model

In [26]:
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=5,
    directory ='mydir',
    project_name = 'Sentiment_Analysis')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [27]:
tuner.search(x_train,y_train,epochs = 20,validation_data = (x_test,y_test))

Trial 5 Complete [00h 05m 40s]
val_accuracy: 0.890999972820282

Best val_accuracy So Far: 0.892300009727478
Total elapsed time: 00h 32m 11s


In [28]:
tuner.get_best_hyperparameters()[0].values

{'lstm_units': 32,
 'num_layers': 1,
 'units0': 88,
 'activation0': 'relu',
 'dropout0': 0.4,
 'optimizer': 'rmsprop',
 'units1': 120,
 'activation1': 'sigmoid',
 'dropout1': 0.5,
 'units2': 80,
 'activation2': 'tanh',
 'dropout2': 0.5,
 'units3': 112,
 'activation3': 'tanh',
 'dropout3': 0.1}

In [29]:
model = tuner.get_best_models(num_models=1)[0]

  saveable.load_own_variables(weights_store.get(inner_path))


In [30]:
model.summary()

In [31]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    epochs=100,  # Original high epoch count
    batch_size=32,
    callbacks=[early_stopping],  # Adding Early Stopping
    verbose=1
)

Epoch 1/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 12ms/step - accuracy: 0.9579 - loss: 0.1218 - val_accuracy: 0.8857 - val_loss: 0.3545
Epoch 2/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 11ms/step - accuracy: 0.9676 - loss: 0.0944 - val_accuracy: 0.8808 - val_loss: 0.3876
Epoch 3/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 12ms/step - accuracy: 0.9749 - loss: 0.0741 - val_accuracy: 0.8817 - val_loss: 0.4223
Epoch 4/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - accuracy: 0.9813 - loss: 0.0591 - val_accuracy: 0.8823 - val_loss: 0.5619
Epoch 5/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - accuracy: 0.9874 - loss: 0.0451 - val_accuracy: 0.8769 - val_loss: 0.5620
Epoch 6/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 11ms/step - accuracy: 0.9895 - loss: 0.0366 - val_accuracy: 0.8776 - val_loss: 0.701

<keras.src.callbacks.history.History at 0x7bf48e9d26d0>

In [33]:
loss, accuracy = model.evaluate(x_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8857 - loss: 0.3589
Test Loss: 0.35446277260780334
Test Accuracy: 0.885699987411499


In [48]:
def predict_sentiment(review):
    # Apply preprocessing
    cleaned_review = preprocess_text(review)

    # Convert text to sequence
    sequence = tokenizer.texts_to_sequences([cleaned_review])

    # Pad the sequence
    padded_sequence = pad_sequences(sequence, maxlen=200)

    # Predict sentiment
    prediction = model.predict(padded_sequence)[0][0]

    # Determine sentiment label
    sentiment = 'positive' if prediction > 0.5 else 'negative'

    # Return both sentiment and confidence score
    return sentiment, prediction


In [49]:
new_review = "This movie was fantastic. I loved it."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
The sentiment of the review is: ('positive', np.float32(0.9543887))


In [50]:
new_review = "This movie was not that good"
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
The sentiment of the review is: ('negative', np.float32(0.4372286))


In [51]:
new_review = "This movie was ok but not that good."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
The sentiment of the review is: ('negative', np.float32(0.45782337))


In [52]:
new_review = "This movie is fantastic"
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
The sentiment of the review is: ('positive', np.float32(0.8762567))
