Importing Libraries

In [None]:
# 1️⃣ Import Required Libraries
import pandas as pd
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from transformers import DistilBertTokenizer, TFDistilBertModel
import nltk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import joblib

In [None]:
# 2️⃣ Load Dataset
# Read the IMDB reviews CSV from the Colab working directory. The file is
# decoded as ISO-8859-1 and malformed rows are skipped instead of raising.
df = pd.read_csv(
    '/content/IMDB Dataset.csv',
    on_bad_lines='skip',
    encoding='ISO-8859-1',
)
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# 3️⃣ Understand Dataset
df.nunique() # Number of distinct values in each column

Unnamed: 0,0
review,49582
sentiment,2


In [None]:
# Check whether the dataset is imbalanced by counting the reviews per sentiment label
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


Processing the Data

In [None]:
# 4️⃣ Text Preprocessing
nltk.download('stopwords')
# A set gives O(1) membership tests; the check runs once per word of every review.
stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.PorterStemmer()

def preprocess(review):
    """Normalize a raw review into a space-separated string of stemmed tokens.

    Steps, in order: lowercase the text, drop HTML line-break tags, replace
    every character outside a-z with a space, remove English stopwords, and
    Porter-stem the remaining words.

    Args:
        review: Raw review text as a str.

    Returns:
        str: The cleaned tokens joined by single spaces (may be empty).
    """
    review = review.lower()  # Convert to lowercase
    # The reviews contain "<br />" markup; without this step the tag name would
    # survive the next substitution as a meaningless "br" token.
    review = re.sub(r'<br\s*/?>', ' ', review)
    review = re.sub('[^a-z]', ' ', review)  # Remove non-alphabetic characters
    words = [stemmer.stem(word) for word in review.split() if word not in stopwords]  # Remove stopwords & stem
    return ' '.join(words)

df['processed_review'] = df['review'].apply(preprocess)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# 5️⃣ Load TensorFlow-based DistilBERT Model & Tokenizer
# Tokenizer and encoder are loaded from the same pretrained checkpoint so the
# token ids produced by one match the vocabulary expected by the other.
checkpoint_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint_name)
bert_model = TFDistilBertModel.from_pretrained(checkpoint_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g.

In [None]:
# 6️⃣ Function to Get BERT Embeddings (Batch Processing for Efficiency)
def get_bert_embeddings_batch(texts, batch_size=16, max_length=128):
    """Encode texts into DistilBERT [CLS] embeddings, one batch at a time.

    Uses the notebook-level `tokenizer` and `bert_model`.

    Args:
        texts: A single string or an iterable of strings (list, tuple,
            pandas Series, ...) to embed.
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Maximum number of tokens kept per text; longer inputs
            are truncated by the tokenizer.

    Returns:
        np.ndarray: One row per input text, holding the hidden state of the
        first ([CLS]) token. Empty input yields an array with zero rows and
        the model's hidden size as its width.
    """
    if isinstance(texts, str):
        texts = [texts]  # Treat a lone string as one document, not a sequence of characters
    else:
        texts = list(texts)  # Positional slicing and a plain list for the tokenizer

    if not texts:
        # np.vstack raises on an empty list, so return an empty matrix of the right width.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    embeddings = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='tf', truncation=True, padding=True, max_length=max_length)

        outputs = bert_model(inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].numpy()  # Extract [CLS] token embedding
        embeddings.append(batch_embeddings)

    return np.vstack(embeddings)  # Stack all embeddings into a single numpy array

In [None]:
# 7️⃣ Convert Reviews to BERT Embeddings
# Embed every preprocessed review; rows of X_bert follow the row order of df.
processed_reviews = df['processed_review'].to_list()
X_bert = get_bert_embeddings_batch(
    processed_reviews,
    batch_size=16,
)

In [None]:
# 8️⃣ Prepare Labels for Training (Binary Classification)
# Encode the sentiment strings as small integers: positive -> 1, negative -> 0.
sentiment_to_label = {'positive': 1, 'negative': 0}
y = df['sentiment'].map(sentiment_to_label).astype(np.int8)

In [None]:
# 9️⃣ Split Data into Training & Test Sets
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_bert,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# 🔟 Define Deep Learning Model (Binary Classification)
# Small feed-forward head on top of the frozen DistilBERT embeddings.
model = Sequential([
    # Declare the input explicitly instead of passing input_shape to the first
    # Dense layer, which Keras warns about for Sequential models.
    tf.keras.Input(shape=(768,)),  # BERT output size = 768
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # 1 neuron for binary classification
])

# Single sigmoid output -> binary cross-entropy; predictions are probabilities in [0, 1].
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# 1️⃣1️⃣ Train Deep Learning Model
# Train the dense head; 20% of the training rows are set aside for validation.
model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=53,
)

Epoch 1/53
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.6900 - loss: 0.5789 - val_accuracy: 0.7544 - val_loss: 0.5083
Epoch 2/53
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.7485 - loss: 0.5100 - val_accuracy: 0.7600 - val_loss: 0.5018
Epoch 3/53
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7551 - loss: 0.5000 - val_accuracy: 0.7624 - val_loss: 0.4944
Epoch 4/53
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.7554 - loss: 0.4922 - val_accuracy: 0.7635 - val_loss: 0.4892
Epoch 5/53
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.7647 - loss: 0.4854 - val_accuracy: 0.7635 - val_loss: 0.4875
Epoch 6/53
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7629 - loss: 0.4887 - val_accuracy: 0.7615 - val_loss: 0.4866
Epoch 7/53
[1m1

<keras.src.callbacks.history.History at 0x7edd57562110>

In [None]:
# 1️⃣2️⃣ Train Logistic Regression Model
# Logistic regression baseline on the same embeddings; the raised iteration
# cap gives the solver more room to converge.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
clf

In [None]:
# 1️⃣3️⃣ Train Random Forest Model
# Random forest with 200 trees and a fixed seed.
clf_rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
).fit(X_train, y_train)
clf_rf

In [None]:
# 1️⃣4️⃣ Train Gradient Boosting
# subsample < 1.0 makes each boosting stage draw a random subset of rows, so a
# fixed random_state is needed for repeatable results (same seed as the
# random forest and the train/test split).
clf_gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    subsample=0.8,
    random_state=42,
)
clf_gb.fit(X_train, y_train)

In [None]:
# 1️⃣5️⃣ Compute Training Accuracy for All Models
# The network ends in a single sigmoid unit, so model.predict returns one
# probability per row in a one-column array. argmax over axis=1 of such an
# array is always 0, which labels every sample as negative; threshold the
# probability at 0.5 instead (same rule as the test-set evaluation).
y_train_pred_dl = (model.predict(X_train) > 0.5).astype("int32").flatten()  # Deep Learning Model
y_train_pred_lr = clf.predict(X_train)  # Logistic Regression
y_train_pred_gb = clf_gb.predict(X_train)  # Gradient Boosting
y_train_pred_rf = clf_rf.predict(X_train)  # Random Forest

# Print Training Accuracy for Each Model
print(f"Deep Learning Model Train Accuracy: {accuracy_score(y_train, y_train_pred_dl):.4f}")
print(f"Logistic Regression Train Accuracy: {accuracy_score(y_train, y_train_pred_lr):.4f}")
print(f"Gradient Boosting Train Accuracy: {accuracy_score(y_train, y_train_pred_gb):.4f}")
print(f"Random Forest Train Accuracy: {accuracy_score(y_train, y_train_pred_rf):.4f}")


[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
Deep Learning Model Train Accuracy: 0.5010
Logistic Regression Train Accuracy: 0.7904
Gradient Boosting Train Accuracy: 0.7611
Random Forest Train Accuracy: 1.0000


In [None]:
# 1️⃣6️⃣ Predict & Evaluate All Models on Test Data
# Keras returns probabilities; turn them into hard 0/1 labels at the 0.5 cut-off.
dl_test_probs = model.predict(X_test)
y_pred_dl = (dl_test_probs > 0.5).astype("int32").flatten()

# The scikit-learn estimators already return class labels.
y_pred_lr = clf.predict(X_test)
y_pred_rf = clf_rf.predict(X_test)
y_pred_gb = clf_gb.predict(X_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [None]:
# 1️⃣6️⃣ Predict & Evaluate All Models on Test Data
# Report held-out accuracy for each model, using the predictions computed in the previous cell
print(f"Deep Learning Model Test Accuracy: {accuracy_score(y_test, y_pred_dl):.4f}")
print(f"Logistic Regression Test Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
print(f"Gradient Boosting Test Accuracy: {accuracy_score(y_test, y_pred_gb):.4f}")
print(f"Random Forest Test Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")

Deep Learning Model Test Accuracy: 0.7786
Logistic Regression Test Accuracy: 0.7798
Gradient Boosting Test Accuracy: 0.7389
Random Forest Test Accuracy: 0.7335


In [None]:
# 1️⃣7️⃣ Save All Models
# Persist every trained model to the current working directory.
# NOTE(review): the .h5 extension presumably selects Keras' HDF5 format — confirm it is the intended format.
model.save('sentiment_dl_model.h5')  # Save Deep Learning Model
# The scikit-learn estimators are serialized with joblib.
joblib.dump(clf, 'sentiment_lr_model.pkl')  # Save Logistic Regression Model
joblib.dump(clf_rf, 'sentiment_rf_model.pkl')  # Save Random Forest Model
joblib.dump(clf_gb, 'sentiment_gb_model.pkl')  # Save Gradient Boosting Model



['sentiment_gb_model.pkl']

In [None]:
# 1️⃣8️⃣ Example Predictions on New Data
# New reviews must go through the same preprocessing and embedding steps as
# the training data before the logistic regression model can score them.
reviews = [
    'Pure cinematic magic!',
    'This film left me speechless',
    'Boring, even with explosions.',
]
cleaned_reviews = list(map(preprocess, reviews))
test_embeddings = get_bert_embeddings_batch(cleaned_reviews, batch_size=16)
predictions = clf.predict(test_embeddings)
print(predictions)  # Display predictions

[1 0 0]
