In [1]:
!pip install -q transformers accelerate scikit-learn pandas torch

In [2]:
import pandas as pd
import os
import json
import numpy as np
import torch
import seaborn as sns
import matplotlib.pyplot as plt

from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix


In [3]:
# Load the preprocessed reviews and keep only the text and label columns,
# dropping any row where either of them is missing.
reviews = pd.read_csv('../data/processed/preprocessed_reviews.csv')
reviews = reviews[['cleaned_text', 'sentiment_binary']].dropna()

# Discard rows whose text is empty once surrounding whitespace is stripped.
has_text = reviews['cleaned_text'].str.strip().astype(bool)
df = reviews[has_text]
df.head()


Unnamed: 0,cleaned_text,sentiment_binary
0,cannot open app anymore,0.0
1,begging refund app month nobody replying,0.0
2,costly premium version approx indian rupees pe...,0.0
3,used keep organized updates made mess things c...,0.0
4,dan birthday oct,0.0


In [4]:
# Hold out 20% of the rows for validation, stratifying on the label column.
texts = df['cleaned_text']
targets = df['sentiment_binary']
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    targets,
    test_size=0.2,
    random_state=42,
    stratify=targets,
)


In [5]:
# Tokenize both splits with the 'bert-base-uncased' tokenizer, using the same
# settings for each (truncation on, padding on, at most 128 tokens).
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

tokenize_options = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(list(train_texts), **tokenize_options)
val_encodings = tokenizer(list(val_texts), **tokenize_options)


In [6]:
# convert to PyTorch dataset
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with one label per sample.

    Args:
        encodings: Mapping with ``'input_ids'`` and ``'attention_mask'``
            entries (and optionally ``'token_type_ids'``), each indexable by
            sample position.
        labels: One label per sample. May be a pandas Series/DataFrame column
            (read positionally through ``.iloc``) or any plain sequence such
            as a list or NumPy array.

    Raises:
        ValueError: If the number of encoded samples differs from the number
            of labels.
    """

    def __init__(self, encodings, labels):
        n_samples = len(encodings['input_ids'])
        if n_samples != len(labels):
            raise ValueError(
                f"encodings hold {n_samples} samples but {len(labels)} labels were given"
            )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of ``torch.long`` tensors."""
        item = {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx], dtype=torch.long),
        }
        if 'token_type_ids' in self.encodings:
            item['token_type_ids'] = torch.tensor(self.encodings['token_type_ids'][idx], dtype=torch.long)
        # pandas objects must be read by position (.iloc): after a shuffled
        # split their index labels no longer line up with sample positions.
        # Plain sequences are indexed directly.
        if hasattr(self.labels, 'iloc'):
            label = self.labels.iloc[idx]
        else:
            label = self.labels[idx]
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)


# Wrap each split's encodings and labels in a SentimentDataset.
train_dataset = SentimentDataset(encodings=train_encodings, labels=train_labels)
val_dataset = SentimentDataset(encodings=val_encodings, labels=val_labels)


In [7]:
# Load the 'bert-base-uncased' checkpoint with a two-label classification head.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Output locations, logging cadence and run hyperparameters for the Trainer.
training_args = TrainingArguments(
    # where results and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # training schedule and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
)

def compute_metrics(eval_pred):
    """Score evaluation predictions against their reference labels.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the argmax of ``logits`` along axis 1.

    Returns:
        dict: ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``, each
        produced by the scikit-learn scorer of the same name.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }
    return {name: scorer(labels, predicted) for name, scorer in scorers.items()}


In [9]:
# Bundle the model, run configuration, both datasets and the metric function.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_options)

In [10]:
# Run training; the returned value is shown as the cell's result.
train_output = trainer.train()
train_output




Step,Training Loss
10,0.7082
20,0.6043
30,0.4851
40,0.4445
50,0.5133
60,0.4618
70,0.4685
80,0.4581
90,0.5434
100,0.6086




TrainOutput(global_step=1557, training_loss=0.276961836664816, metrics={'train_runtime': 3890.4823, 'train_samples_per_second': 6.403, 'train_steps_per_second': 0.4, 'total_flos': 1638655652782080.0, 'train_loss': 0.276961836664816, 'epoch': 3.0})

In [11]:
# Evaluate on the validation split & save metrics + plots

# One output location for everything below: the directory that gets created
# must be the same one the artifacts are written to.
eval_dir = "../evaluation"

# 1) Get predictions
pred_out = trainer.predict(val_dataset)
preds    = np.argmax(pred_out.predictions, axis=1)
labels   = val_labels.values

# 2) Compute metrics
acc  = accuracy_score(labels, preds)
prec = precision_score(labels, preds)
rec  = recall_score(labels, preds)
f1   = f1_score(labels, preds)

clf_rep = classification_report(
    labels, preds,
    target_names=["Negative", "Positive"],
    digits=4
)

# 3) Create evaluation directory (same path the files are saved under)
os.makedirs(eval_dir, exist_ok=True)

# 4) Save numeric metrics to JSON
# float() keeps the values plain Python numbers for the JSON encoder.
metrics = {
    "model": "bert",
    "accuracy": float(acc),
    "precision": float(prec),
    "recall": float(rec),
    "f1": float(f1)
}
with open(os.path.join(eval_dir, "bert_metrics.json"), "w") as fp:
    json.dump(metrics, fp, indent=2)

# 5) Save the per-class classification report as plain text
with open(os.path.join(eval_dir, "bert_classification_report.txt"), "w") as fp:
    fp.write(clf_rep)

# 6) Confusion matrix plot
cm = confusion_matrix(labels, preds)
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["Neg","Pos"],
    yticklabels=["Neg","Pos"],
    ax=ax
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("BERT Confusion Matrix")
fig.tight_layout()
fig.savefig(os.path.join(eval_dir, "bert_confusion_matrix.png"))
# Close the figure so it is only written to disk, not rendered inline.
plt.close(fig)

print("Evaluation metrics & plots saved to /evaluation/")




Evaluation metrics & plots saved to /evaluation/


In [12]:
# Write the model and the tokenizer files into the same directory.
model_dir = '../models/bert-sentiment'
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)


('../models/bert-sentiment/tokenizer_config.json',
 '../models/bert-sentiment/special_tokens_map.json',
 '../models/bert-sentiment/vocab.txt',
 '../models/bert-sentiment/added_tokens.json',
 '../models/bert-sentiment/tokenizer.json')