In [None]:
# Download the IMDB 50k movie-review dataset archive with the Kaggle CLI.
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 89% 23.0M/25.7M [00:00<00:00, 74.0MB/s]
100% 25.7M/25.7M [00:00<00:00, 67.9MB/s]


In [None]:
!unzip imdb-dataset-of-50k-movie-reviews.zip

Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [None]:
import pandas as pd
# Load the raw dataset: one row per review, with a `review` text column and a
# `sentiment` label column.
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Encode the string labels as integers (positive -> 1, negative -> 0).
# Guarded so that re-running the cell is a no-op: pushing the already-encoded
# 0/1 column through the mapping a second time would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [None]:
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Resources needed by the preprocessing cell: the stop-word list, WordNet for
# the lemmatizer, and the Punkt model behind `nltk.word_tokenize`.
# Recent NLTK releases load Punkt from the 'punkt_tab' resource while older
# ones use 'punkt'; fetch both so tokenisation works on either version.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Single WordNet lemmatizer instance shared by every call to preprocess_text.
lemmatizer=WordNetLemmatizer()
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
  """Return NLTK's English stop words as a frozenset, built once and cached."""
  return frozenset(stopwords.words('english'))


def preprocess_text(text):
  """Normalise one raw review into a space-separated string of lemmas.

  Steps: lowercase, strip HTML tags, drop every character that is not an
  ASCII letter or whitespace, tokenise with NLTK, remove English stop words
  and lemmatise what is left.

  Args:
    text: the raw review text (str).

  Returns:
    The cleaned review as a single string; empty if nothing survives.
  """
  text = text.lower()

  # Replace HTML tags with a space (not the empty string) so the words on
  # either side of e.g. "<br />" are not glued into one token.
  text = re.sub(r'<.*?>', ' ', text)

  # Remove non-alphabetical characters (digits, punctuation, symbols).
  text = re.sub(r'[^a-zA-Z\s]', '', text)

  # The stop-word set is fetched once; rebuilding it for every token (as a
  # per-word `set(stopwords.words(...))` does) dominates the runtime.
  stop_words = _english_stop_words()
  words = [
      lemmatizer.lemmatize(word)
      for word in nltk.word_tokenize(text)
      if word not in stop_words
  ]

  return ' '.join(words)

# Clean every review once and keep the result alongside the raw text.
df["cleaning_review"]=df['review'].apply(preprocess_text)

In [None]:
import re
def handle_negations(text):
    """Collapse the stand-alone words "not", "no" and "never" into "not".

    Matching is case-sensitive and restricted to whole words, so e.g.
    "nothing" or "know" are left untouched.
    """
    negation_words = re.compile(r"\bnot\b|\bno\b|\bnever\b")
    return negation_words.sub('not', text)
# Normalise negation words in the already-cleaned text.
# NOTE(review): this runs after stop-word removal in preprocess_text; NLTK's
# English stop-word list appears to contain "not" and "no", in which case most
# negations are gone before this step sees them -- confirm the intended order.
df["cleaning_review"] = df["cleaning_review"].apply(handle_negations)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer as KerasTokenizer
from keras.preprocessing.sequence import pad_sequences

# The class is imported under its own alias. Rebinding the name `Tokenizer`
# from the class to the fitted instance made this cell fail on a second run
# with "'Tokenizer' object is not callable".
# num_words=5000 matches the input size of the Embedding layer defined later.
tokenizer = KerasTokenizer(num_words=5000)
tokenizer.fit_on_texts(df['cleaning_review'])

# Later cells refer to the fitted instance as `Tokenizer`; keep that binding.
Tokenizer = tokenizer

# NOTE(review): the vocabulary is fitted on `cleaning_review`, but the
# sequences are built from the raw `review` column, so the cleaning steps only
# influence which words are in the vocabulary. This is left as-is on purpose:
# the inference cells also encode raw text with this tokenizer, so switching
# only this line to `cleaning_review` would make training and inference
# disagree. Decide on one representation and apply it in both places.
sequences = tokenizer.texts_to_sequences(df['review'])

# Pad / truncate every review to exactly 100 token ids.
x = pad_sequences(sequences, maxlen=100)

In [None]:
# Sanity check: one row per review, 100 token ids per row.
x.shape

(50000, 100)

In [None]:
# Target vector: the encoded sentiment label for every review.
y=df['sentiment']
y.shape

(50000,)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for testing; random_state fixes the shuffle so
# the split is reproducible.
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((40000, 100), (10000, 100), (40000,), (10000,))

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dropout

# Two stacked LSTMs over learned 100-dimensional word embeddings, ending in a
# single sigmoid unit that outputs the probability of the positive class.
# `input_length` is no longer passed to Embedding: it is deprecated in Keras 3
# and redundant here because the Input layer already fixes the sequence
# length at 100.
model = Sequential([
    Input(shape=(100,)),              # 100 token ids per review
    Embedding(5000, 100),             # 5000-word vocabulary -> 100-d vectors
    LSTM(128, return_sequences=True), # emit every timestep for the next LSTM
    Dropout(0.4),
    LSTM(64),                         # keep only the final state
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])



In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
from tensorflow.keras.optimizers import Adam
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# tracked for reporting only.
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss="binary_crossentropy",
    metrics=['accuracy']
)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once val_loss has not improved for 3 consecutive epochs and roll the
# model back to the weights from its best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# validation_split=0.2 holds back part of X_train for validation (the log shows
# 1000 steps of 32 samples, i.e. 32,000 of the 40,000 rows used for fitting).
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping])


Epoch 1/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 12ms/step - accuracy: 0.7069 - loss: 0.5429 - val_accuracy: 0.8544 - val_loss: 0.3408
Epoch 2/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.8720 - loss: 0.3117 - val_accuracy: 0.8584 - val_loss: 0.3426
Epoch 3/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 11ms/step - accuracy: 0.8842 - loss: 0.2850 - val_accuracy: 0.8595 - val_loss: 0.3243
Epoch 4/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 12ms/step - accuracy: 0.8892 - loss: 0.2693 - val_accuracy: 0.8615 - val_loss: 0.3243
Epoch 5/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - accuracy: 0.8938 - loss: 0.2653 - val_accuracy: 0.8584 - val_loss: 0.3314
Epoch 6/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - accuracy: 0.8940 - loss: 0.2621 - val_accuracy: 0.8533 - val_loss: 0.3420


In [None]:
# Score the trained model on the held-out test split.
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8648 - loss: 0.3163
Test Loss: 0.3181909918785095
Test Accuracy: 0.864799976348877


In [None]:
from sklearn.metrics import classification_report
# Turn the sigmoid probabilities into hard 0/1 labels at the 0.5 cut-off, then
# print per-class precision / recall / F1 for the held-out test set.
y_pred = (model.predict(X_test) > 0.5).astype(int)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.86      0.86      4961
           1       0.86      0.87      0.87      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [None]:
import numpy as np
predection = model.predict(X_test)
# The model ends in a single sigmoid unit, so `predection` has exactly one
# column holding P(positive). np.argmax(..., axis=1) over a single column is
# always 0, which would label every review negative; threshold at 0.5 instead
# (the same rule the classification-report cell uses) and flatten to 1-D.
y_pred = (predection > 0.5).astype(int).ravel()
# Show the first few predicted labels (the original displayed the full target
# series `y` here rather than the predictions).
y_pred[:10]

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


Unnamed: 0,sentiment
0,1
1,1
2,1
3,0
4,1
...,...
49995,1
49996,0
49997,0
49998,0


In [None]:
# Persist the trained network in the native Keras format.
model.save('my_model.keras')

In [None]:
from google.colab import files
# Colab-only: trigger a browser download of the saved model file.
files.download('my_model.keras')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import tensorflow as tf
# Reload the model from disk; this rebinds `model` to the deserialised copy.
model = tf.keras.models.load_model('my_model.keras')

In [None]:
import pickle
# Persist the fitted tokenizer so inference code can rebuild exactly the same
# word -> index mapping the model was trained with.
tokenizer = Tokenizer
with open('tokenizer.pkl', 'wb') as f:
    f.write(pickle.dumps(tokenizer))


In [None]:
def preprocess_given_text(text, tokenizer):
    """Encode a single text as a padded batch of one sequence.

    Args:
        text: the text to encode.
        tokenizer: object exposing `texts_to_sequences`.

    Returns:
        Whatever `pad_sequences` returns for the one-element batch, padded or
        truncated to length 100.
    """
    # The tokenizer works on batches, hence the one-element list.
    encoded_batch = tokenizer.texts_to_sequences([text])
    return pad_sequences(encoded_batch, maxlen=100)

In [None]:
def predict_sentiment(text):
    """Classify one review and describe the result as a sentence.

    Encodes `text` with the notebook-level fitted tokenizer, runs the model and
    reads the single output as the probability of the positive class.

    Returns:
        "Positive with confidence P%" when the positive probability is above
        50%, otherwise "Negative with confidence (100 - P)%".
    """
    model_input = preprocess_given_text(text, Tokenizer)
    # First (only) sample, first (only) output unit, scaled to a percentage.
    positive_confidence = model.predict(model_input)[0][0] * 100

    if positive_confidence > 50:
        return f"Positive with confidence {positive_confidence:.2f}%"
    return f"Negative with confidence {100 - positive_confidence:.2f}%"


In [None]:
# Smoke test: a clearly negative review should come back as "Negative".
predict_sentiment("I couldn't even finish watching it. It was that bad—poor acting, predictable plot, and awful dialogue.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 159ms/step


'Negative with confidence 99.04%'

In [None]:
!pip install gradio



In [None]:
import gradio as gr
import pickle
import matplotlib.pyplot as plt
# Reload the two artifacts the demo needs: the trained model and the fitted
# tokenizer.
model = tf.keras.models.load_model("my_model.keras")

# NOTE(review): pickle.load can execute arbitrary code contained in the file.
# Only load a tokenizer.pkl that was written by this notebook.
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)
def classify_sentiment(text):
    """Classify a review for the Gradio demo and render a confidence pie chart.

    Args:
        text: the review entered by the user.

    Returns:
        A 4-tuple matching the interface outputs:
        (label, positive confidence, negative confidence, chart path), where
        label is "Positive" / "Negative" when the respective confidence
        exceeds 0.7 and "Neutral" otherwise, both confidences are strings
        formatted to two decimals, and the chart path points to the PNG
        written by this call.
    """
    sequences = tokenizer.texts_to_sequences([text])
    padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=100)
    prediction = model.predict(padded_sequences)

    # Plain Python floats keep the arithmetic and formatting below independent
    # of the array scalar type returned by the model.
    positive_confidence = float(prediction[0][0])
    negative_confidence = 1.0 - positive_confidence

    # Anything that is not a confident call either way is "Neutral". (The
    # original had an extra branch returning the misspelled label "Natural"
    # for the 0.4-0.6 band, which was meant to be this same outcome.)
    if positive_confidence > 0.7:
        sentiment = "Positive"
    elif negative_confidence > 0.7:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"

    chart_path = "sentiment_distribution.png"
    fig, ax = plt.subplots(figsize=(6, 6))
    try:
        ax.pie(
            [positive_confidence, negative_confidence],
            labels=["Positive", "Negative"],
            autopct='%1.1f%%',
            startangle=90,
        )
        ax.set_title("Sentiment Confidence Distribution")
        ax.axis('equal')
        fig.savefig(chart_path)
    finally:
        # Always release the figure, even if drawing or saving fails, so
        # repeated requests do not accumulate open figures.
        plt.close(fig)

    return sentiment, f"{positive_confidence:.2f}", f"{negative_confidence:.2f}", chart_path

# Wire classify_sentiment into a simple web UI: one text input, and four
# outputs in the same order as the function's return tuple.
iface = gr.Interface(
    fn=classify_sentiment,
    inputs=gr.Textbox(label="Enter a movie review"),
    outputs=[
        gr.Textbox(label="Sentiment Analysis Result"),
        gr.Textbox(label="Positive Confidence"),
        gr.Textbox(label="Negative Confidence"),
        gr.Image(label="Sentiment Confidence Distribution")
    ],
    title="Movie Review Sentiment Analysis",
    description="Enter a movie review to get a prediction with sentiment and confidence."
)


# debug=True keeps the cell running so errors and logs stay visible (see the
# launch log below); interrupt the cell to stop the server.
iface.launch(debug=True)


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://841016d81de6af3892.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 187ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
