<a href="https://colab.research.google.com/github/Sakshisharma87/Sentiment-Analysis/blob/main/sentiment_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nltk
!pip install scikit-learn



In [None]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

In [None]:
# Colab-only step: upload the dataset into the runtime. The recorded output shows
# "IMDB Dataset.csv" being saved, which is the file pd.read_csv loads further down.
# `uploaded` keeps whatever files.upload() returns; it is not read again in this notebook.
from google.colab import files
uploaded = files.upload()

Saving IMDB Dataset.csv to IMDB Dataset.csv


In [None]:
# Download the NLTK stop-word corpus and keep the English list as a set so that
# clean_text can do fast membership checks against it.
# NOTE(review): if this list contains negations such as "not" / "no", clean_text
# strips them from every review — confirm that is acceptable for sentiment analysis.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text, stopword_set=None):
    """Normalize a raw review into a lowercase, stop-word-free token string.

    Processing steps, in order: HTML-like tags are replaced by a space, every
    character that is not an ASCII letter is replaced by a space, the text is
    lowercased and split on whitespace, stop words are dropped, and the
    remaining tokens are re-joined with single spaces.

    Args:
        text: Raw review string.
        stopword_set: Optional collection of lowercase words to remove.
            Defaults to the module-level ``stop_words`` set.

    Returns:
        The cleaned tokens joined by single spaces (empty string if nothing
        survives the filtering).
    """
    if stopword_set is None:
        stopword_set = stop_words
    # Tags become a space rather than '': otherwise "great<br />movie" would be
    # fused into the single bogus token "greatmovie".
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    tokens = text.lower().split()
    return " ".join(word for word in tokens if word not in stopword_set)

df = pd.read_csv("IMDB Dataset.csv")

# Encode sentiment as 1 (positive) / 0 (negative). Series.map turns any value
# missing from the mapping into NaN, so fail loudly here instead of silently
# feeding NaN labels to the model.
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})
unmapped = df.loc[df['label'].isna(), 'sentiment'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected sentiment values: {unmapped.tolist()!r}")

# Pre-process every review once; the tokenizer is fitted on this column.
df['clean_review'] = df['review'].apply(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Turn the cleaned reviews into fixed-length integer sequences.
# max_words is passed to the tokenizer as num_words and reused as the embedding
# input size; max_len is the padded sequence length.
max_words, max_len = 10000, 200

# Binary targets taken straight from the label column.
y = df['label'].values

# Fit the vocabulary on the cleaned text, with "<OOV>" as the out-of-vocabulary token.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df['clean_review'])

# Encode each review as word indices, then pad/truncate to max_len.
X = tokenizer.texts_to_sequences(df['clean_review'])
X_pad = pad_sequences(X, maxlen=max_len)

In [None]:
# Fix the random seeds so weight initialisation, dropout masks and batch
# shuffling start from the same state on every run. (This alone may not make
# GPU runs bit-identical.)
set_random_seed(42)

# Embedding -> bidirectional LSTM -> single sigmoid unit for binary sentiment.
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=64, input_length=max_len))
model.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# The recorded run shows val_loss bottoming out at epoch 3 and rising afterwards.
# Stop once validation loss has not improved for 2 epochs and roll the weights
# back to the best epoch instead of keeping the last (over-fitted) ones.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# validation_split=0.1 keeps the trailing 10% of X_pad / y out of training, so
# the model is fitted on 90% of the data, not on all of it.
history = model.fit(
    X_pad,
    y,
    batch_size=128,
    epochs=5,
    validation_split=0.1,
    callbacks=[early_stopping],
)

Epoch 1/5
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m291s[0m 801ms/step - accuracy: 0.7270 - loss: 0.5208 - val_accuracy: 0.8626 - val_loss: 0.3378
Epoch 2/5
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m308s[0m 764ms/step - accuracy: 0.8853 - loss: 0.2958 - val_accuracy: 0.8716 - val_loss: 0.3065
Epoch 3/5
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m319s[0m 757ms/step - accuracy: 0.9065 - loss: 0.2479 - val_accuracy: 0.8826 - val_loss: 0.2895
Epoch 4/5
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m265s[0m 754ms/step - accuracy: 0.9202 - loss: 0.2096 - val_accuracy: 0.8772 - val_loss: 0.3097
Epoch 5/5
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 747ms/step - accuracy: 0.9301 - loss: 0.1887 - val_accuracy: 0.8726 - val_loss: 0.3215


<keras.src.callbacks.history.History at 0x7a7406dd3710>

In [None]:
import pickle

# Output locations for the two artifacts needed at inference time.
model_path = "sentiment_model_full_50k.h5"
tokenizer_path = "tokenizer_50k.pickle"

# Persist the trained network.
model.save(model_path)

# Persist the fitted tokenizer so the same word index can be reused later.
with open(tokenizer_path, "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

print("✅ Model and tokenizer saved!")



✅ Model and tokenizer saved!


In [None]:
def predict_sentiment(text, threshold=0.5):
    """Print the predicted sentiment for a single raw review.

    The text is run through the same pipeline used to build the training
    data (clean_text -> tokenizer -> padding to max_len) and scored by the
    module-level ``model``.

    Args:
        text: Raw review string.
        threshold: Scores greater than or equal to this value are labelled
            positive. Defaults to 0.5.

    Returns:
        None. The review, its score and the label are printed.
    """
    clean = clean_text(text)
    seq = tokenizer.texts_to_sequences([clean])
    pad_seq = pad_sequences(seq, maxlen=max_len)
    # verbose=0 keeps the per-call Keras progress bar out of the cell output.
    prob = model.predict(pad_seq, verbose=0)[0][0]
    label = "Positive 😊" if prob >= threshold else "Negative 😞"
    print(f"\n💬 Review: {text}\n🎯 Score: {prob:.4f}\n🧾 Sentiment: {label}")

# Example
predict_sentiment("The movie was amazing and inspiring!")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step

💬 Review: The movie was amazing and inspiring!
🎯 Score: 0.8592
🧾 Sentiment: Positive 😊


In [None]:
# Sanity check with a strongly negative review.
predict_sentiment("What a waste of time. Completely boring.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 188ms/step

💬 Review: What a waste of time. Completely boring.
🎯 Score: 0.0112
🧾 Sentiment: Negative 😞


In [None]:
# Score every padded review in one batched pass.
y_pred_prob = model.predict(X_pad, batch_size=256)

# Collapse the model output to a 1-D vector and cut at 0.5 to get 0/1 labels.
y_pred = (y_pred_prob.flatten() >= 0.5).astype(int)

[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 294ms/step


In [None]:
# Evaluate only on the rows that model.fit held out through validation_split=0.1,
# i.e. the trailing 10% of X_pad / y. Scoring all rows would mostly measure how
# well the model fits the data it was trained on, which overstates accuracy.
val_fraction = 0.1  # keep in sync with validation_split passed to model.fit
val_start = int(len(y_pred) * (1 - val_fraction))

y_true = df['label'].values[val_start:]
y_pred_val = y_pred[val_start:]
print(f"\nEvaluating on {len(y_true)} held-out samples (rows {val_start} onward)")

# Accuracy
acc = accuracy_score(y_true, y_pred_val)
print(f"\n✅ Accuracy: {acc * 100:.2f}%")

# Classification report
print("\n📊 Classification Report:")
print(classification_report(y_true, y_pred_val, target_names=["Negative", "Positive"]))

# Confusion matrix (rows: true class, columns: predicted class)
print("\n🧮 Confusion Matrix:")
print(confusion_matrix(y_true, y_pred_val))



✅ Accuracy: 94.28%

📊 Classification Report:
              precision    recall  f1-score   support

    Negative       0.96      0.93      0.94     25000
    Positive       0.93      0.96      0.94     25000

    accuracy                           0.94     50000
   macro avg       0.94      0.94      0.94     50000
weighted avg       0.94      0.94      0.94     50000


🧮 Confusion Matrix:
[[23205  1795]
 [ 1066 23934]]


In [None]:
# Mixed / lukewarm reviews, scored one after another in the listed order.
ambiguous_reviews = [
    "It was okay. Not great, not terrible. Just average.",
    "Some parts were good, but overall it didn’t leave much of an impression.",
    "Had potential, but it just didn’t work for me.",
    "The visuals were nice, but the plot dragged.",
    "Mediocre movie. You won’t miss anything if you skip it.",
]

for review in ambiguous_reviews:
    predict_sentiment(review)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step

💬 Review: It was okay. Not great, not terrible. Just average.
🎯 Score: 0.1684
🧾 Sentiment: Negative 😞
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step

💬 Review: Some parts were good, but overall it didn’t leave much of an impression.
🎯 Score: 0.4716
🧾 Sentiment: Negative 😞
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step

💬 Review: Had potential, but it just didn’t work for me.
🎯 Score: 0.3449
🧾 Sentiment: Negative 😞
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step

💬 Review: The visuals were nice, but the plot dragged.
🎯 Score: 0.3535
🧾 Sentiment: Negative 😞
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step

💬 Review: Mediocre movie. You won’t miss anything if you skip it.
🎯 Score: 0.0166
🧾 Sentiment: Negative 😞


In [None]:
from tensorflow.keras.layers import Bidirectional, LSTM

# Summarise the trained model: embedding sizes, sequence length, recurrent
# layer details and the compile-time settings.
print("\n⚙️ Model Configuration:")

embedding_layer = model.layers[0]
print(f"- Embedding input dim     : {embedding_layer.input_dim}")
print(f"- Embedding output dim    : {embedding_layer.output_dim}")
print(f"- Sequence length         : {model.input_shape[1]}")

# The second layer is either a plain LSTM or an LSTM wrapped in Bidirectional;
# for the wrapped case the unit count is read from the forward layer.
recurrent_layer = model.layers[1]
if isinstance(recurrent_layer, Bidirectional):
    lstm_layer = recurrent_layer.forward_layer
    print(f"- LSTM units              : {lstm_layer.units}")
    print("- Bidirectional LSTM?     : True")
elif isinstance(recurrent_layer, LSTM):
    print(f"- LSTM units              : {recurrent_layer.units}")
    print("- Bidirectional LSTM?     : False")
else:
    print("- LSTM layer not found.")

output_layer = model.layers[-1]
print(f"- Activation (output)     : {output_layer.activation.__name__}")
print(f"- Loss function           : {model.loss}")
print(f"- Optimizer               : {type(model.optimizer).__name__}")
print(f"- Metrics                 : {model.metrics_names}")


⚙️ Model Configuration:
- Embedding input dim     : 10000
- Embedding output dim    : 64
- Sequence length         : 200
- LSTM units              : 64
- Bidirectional LSTM?     : True
- Activation (output)     : sigmoid
- Loss function           : binary_crossentropy
- Optimizer               : Adam
- Metrics                 : ['loss', 'compile_metrics']
