Importing Libraries

In [1]:
# 1️⃣ Import Required Libraries
import pandas as pd
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from transformers import DistilBertTokenizer, TFDistilBertModel
import nltk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import joblib

In [4]:
# 2️⃣ Load Dataset
df = pd.read_csv('/content/IMDB Dataset.csv', encoding='ISO-8859-1', on_bad_lines='skip')
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Processing the Data

In [5]:
# 3️⃣ Text Preprocessing
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
stemmer = nltk.PorterStemmer()

def preprocess(review):
    review = review.lower()  # Convert to lowercase
    review = re.sub('[^a-z]', ' ', review)  # Remove non-alphabetic characters
    words = [stemmer.stem(word) for word in review.split() if word not in stopwords]  # Remove stopwords & stem
    return ' '.join(words)

df['processed_review'] = df['review'].apply(preprocess)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
# # Apply preprocessing to reviews
# nltk.download('stopwords')
# stopwords = nltk.corpus.stopwords.words('english')
# processed_reviews = df['review'].apply(preprocess).tolist()
# y_train = df['sentiment'].map({'positive': 1, 'negative': 0, 'neutral': 2}).astype(np.int8)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


KeyboardInterrupt: 

In [7]:
# 4️⃣ Load TensorFlow-based DistilBERT Model & Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [8]:
# 5️⃣ Function to Get BERT Embeddings (Batch Processing for Efficiency)
def get_bert_embeddings_batch(texts, batch_size=16):
    embeddings = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='tf', truncation=True, padding=True, max_length=128)

        outputs = bert_model(inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].numpy()  # Extract [CLS] token embedding
        embeddings.append(batch_embeddings)

    return np.vstack(embeddings)  # Stack all embeddings into a single numpy array

In [9]:
# 6️⃣ Convert Reviews to BERT Embeddings
processed_reviews = df['processed_review'].tolist()
X_bert = get_bert_embeddings_batch(processed_reviews, batch_size=16)

In [10]:
# 7️⃣ Prepare Labels for Training
y = df['sentiment'].map({'positive': 1, 'negative': 0, 'neutral': 2}).astype(np.int8)

In [11]:
# 8️⃣ Split Data into Training & Test Sets
X_train, X_test, y_train, y_test = train_test_split(X_bert, y, test_size=0.2, random_state=42)

In [12]:
# 9️⃣ Define Deep Learning Model
model = Sequential([
    Dense(128, activation='relu', input_shape=(768,)),  # BERT output size = 768
    Dropout(0.3),
    Dense(3, activation='softmax')  # 3 classes: Positive, Negative, Neutral
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [13]:
# 🔟 Convert labels to TensorFlow format
y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.int32)

In [25]:
# 1️⃣1️⃣ Train Deep Learning Model
model.fit(X_train, y_train_tensor, epochs=65, batch_size=32, validation_split=0.2)

Epoch 1/65
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8561 - loss: 0.3184 - val_accuracy: 0.7689 - val_loss: 0.5432
Epoch 2/65
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8541 - loss: 0.3197 - val_accuracy: 0.7656 - val_loss: 0.5375
Epoch 3/65
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8575 - loss: 0.3157 - val_accuracy: 0.7644 - val_loss: 0.5297
Epoch 4/65
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.8583 - loss: 0.3160 - val_accuracy: 0.7689 - val_loss: 0.5316
Epoch 5/65
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.8519 - loss: 0.3181 - val_accuracy: 0.7640 - val_loss: 0.5275
Epoch 6/65
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.8587 - loss: 0.3145 - val_accuracy: 0.7709 - val_loss: 0.5349
Epoch 7/65
[1m1

<keras.src.callbacks.history.History at 0x795e171b6b90>

In [26]:
# 1️⃣2️⃣ Train Logistic Regression Model for Comparison
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

In [27]:
# 1️⃣3️⃣ Train Gradient Boosting Classifier
clf2 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, subsample=0.8)
clf2.fit(X_train, y_train)

In [28]:
# 1️⃣4️⃣ Predict & Evaluate Performance
y_pred_dl = np.argmax(model.predict(X_test), axis=1)  # Deep Learning Model Predictions
y_pred_lr = clf.predict(X_test)  # Logistic Regression Predictions
y_pred_gb = clf2.predict(X_test)  # Gradient Boosting Predictions

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [29]:
print(f"Deep Learning Model Test Accuracy: {accuracy_score(y_test, y_pred_dl):.4f}")
print(f"Logistic Regression Test Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
print(f"Gradient Boosting Test Accuracy: {accuracy_score(y_test, y_pred_gb):.4f}")

Deep Learning Model Test Accuracy: 0.7605
Logistic Regression Test Accuracy: 0.7794
Gradient Boosting Test Accuracy: 0.7415


In [31]:
# Training Accuracy for Deep Learning Model
y_train_pred_dl = np.argmax(model.predict(X_train), axis=1)
print(f"Deep Learning Model Train Accuracy: {accuracy_score(y_train, y_train_pred_dl):.4f}")
# Training Accuracy for Logistic Regression
y_train_pred_lr = clf.predict(X_train)
print(f"Logistic Regression Train Accuracy: {accuracy_score(y_train, y_train_pred_lr):.4f}")
# Training Accuracy for Gradient Boosting
y_train_pred_gb = clf2.predict(X_train)
print(f"Gradient Boosting Train Accuracy: {accuracy_score(y_train, y_train_pred_gb):.4f}")


[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
Deep Learning Model Train Accuracy: 0.8843
Logistic Regression Train Accuracy: 0.7901
Gradient Boosting Train Accuracy: 0.7622


In [32]:
# 1️⃣5️⃣ Save Trained Deep Learning Model
model.save('sentiment_dl_model.h5')  # Save Deep Learning Model
joblib.dump(clf2, 'sentiment_gb_model.pkl')  # Save Gradient Boosting Model



['sentiment_gb_model.pkl']

**THE END**

In [None]:
# Train Logistic Regression Model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_bert, y_train)

In [None]:
# Evaluate on test set
y_pred = clf.predict(X_test_bert)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Test Accuracy: 0.7787


In [None]:
# Saving the models
joblib.dump(clf, 'sentiment_model.joblib')  # Save Logistic Regression model
torch.save(model.state_dict(), 'distilbert_model.pth')  # Save BERT model

In [None]:
# Example predictions
reviews = ['Pure cinematic magic!', 'This film left me speechless', 'Boring, even with explosions.']
test_embeddings = get_bert_embeddings_batch([preprocess(r) for r in reviews], batch_size=16)
predictions = clf.predict(test_embeddings)
print(predictions)  # Display predictions

[1 0 0]


In [None]:
clf1 = LogisticRegression(max_iter=2000, C=0.5, solver='liblinear')


In [None]:
clf1.fit(X_train_bert, y_train)

In [None]:
# Evaluate on test set
y_pred_1 = clf1.predict(X_test_bert)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Test Accuracy: 0.7787


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

clf2 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, subsample= 0.8)



In [None]:
clf2.fit(X_train_bert, y_train)

In [None]:
y_pred_2 = clf2.predict(X_test_bert)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Test Accuracy: 0.7787
