Importing Libraries

In [1]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib
from transformers import DistilBertTokenizer, DistilBertModel
import time
import nltk
import re

In [7]:
# Load and preprocess dataset
df = pd.read_csv('/content/IMDB Dataset.csv', encoding='ISO-8859-1', on_bad_lines='skip')
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Processing the Data

In [8]:
# Preprocess function
def preprocess(review):
    review = review.lower()
    review = re.sub('[^a-z]', ' ', review)
    words = [nltk.PorterStemmer().stem(word) for word in review.split() if word not in stopwords]
    return ' '.join(words)

In [9]:
# Apply preprocessing to reviews
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
processed_reviews = df['review'].apply(preprocess).tolist()
y_train = df['sentiment'].map({'positive': 1, 'negative': 0, 'neutral': 2}).astype(np.int8)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(processed_reviews, y_train, test_size=0.2, random_state=42)

In [11]:
# Load BERT Model and Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [12]:
# Function to get BERT embeddings in batches
def get_bert_embeddings_batch(texts, batch_size=16):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=128)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)
    return np.vstack(embeddings)

Limited usage of memory separately running X_train_bert.npy and X_test_bert.npy

In [13]:
# Get BERT embeddings for train and test data
X_train_bert = get_bert_embeddings_batch(X_train, batch_size=16)
# Save to disk
np.save("X_train_bert.npy", X_train_bert)
print("Saved X_train_bert.npy")

Saved X_train_bert.npy


In [14]:
X_test_bert = get_bert_embeddings_batch(X_test, batch_size=16)
# Save to disk
np.save("X_train_bert.npy", X_train_bert)
print("Saved X_train_bert.npy")

Saved X_train_bert.npy


In [15]:
# Train Logistic Regression Model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_bert, y_train)

In [16]:
# Evaluate on test set
y_pred = clf.predict(X_test_bert)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Test Accuracy: 0.7787


In [17]:
# Saving the models
joblib.dump(clf, 'sentiment_model.joblib')  # Save Logistic Regression model
torch.save(model.state_dict(), 'distilbert_model.pth')  # Save BERT model

In [18]:
# Example predictions
reviews = ['Pure cinematic magic!', 'This film left me speechless', 'Boring, even with explosions.']
test_embeddings = get_bert_embeddings_batch([preprocess(r) for r in reviews], batch_size=16)
predictions = clf.predict(test_embeddings)
print(predictions)  # Display predictions

[1 0 0]


In [20]:
clf1 = LogisticRegression(max_iter=2000, C=0.5, solver='liblinear')


In [21]:
clf1.fit(X_train_bert, y_train)

In [22]:
# Evaluate on test set
y_pred_1 = clf1.predict(X_test_bert)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Test Accuracy: 0.7787


In [25]:
from sklearn.ensemble import GradientBoostingClassifier

clf2 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, subsample= 0.8)



In [26]:
clf2.fit(X_train_bert, y_train)

In [27]:
y_pred_2 = clf2.predict(X_test_bert)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Test Accuracy: 0.7787
