In [None]:
!pip install torchtext==0.18.0

In [None]:

!pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121

In [None]:
import torch
import pandas as pd
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
import random
import time
import torch.nn.functional as F

In [None]:
# --- Reproducibility ---------------------------------------------------------
RANDOM_SEED = 42
random.seed(RANDOM_SEED)        # Python's built-in RNG (module is imported above)
torch.manual_seed(RANDOM_SEED)  # PyTorch RNG

# Request deterministic cuDNN kernels and switch off the cuDNN auto-tuner,
# which may otherwise pick different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# --- Hyperparameters / model dimensions --------------------------------------
NUM_EPOCHS = 15
BATCH_SIZE = 128
LEARNING_RATE = 0.005
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
OUTPUT_DIM = 2       # number of target classes
VOCAB_SIZE = 20000

# --- Compute device ----------------------------------------------------------
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE)

In [None]:
# torchtext tokenizer using the spaCy backend with the `en_core_web_sm` language.
# NOTE(review): the name `tokenizer` is rebound to a BERT tokenizer in a later
# cell, so anything that reads this global after that point gets the BERT one.
tokenizer=get_tokenizer('spacy',language='en_core_web_sm')

In [None]:
def yeild_tokens(data_iter, tokenize=None):
    """Yield the tokenized text of every ``(label, text)`` pair in ``data_iter``.

    Args:
        data_iter: Iterable of 2-item records. The first item is ignored and
            the second item is passed to the tokenizer.
        tokenize: Optional callable applied to each text. When omitted, the
            notebook-global ``tokenizer`` is used (the original behaviour).
            Passing it explicitly protects against that global being rebound
            to a different tokenizer by a later cell.

    Yields:
        The tokenizer's result for each text, in input order.
    """
    if tokenize is None:
        # Resolved lazily (generator body runs on first iteration), so the
        # default is whatever `tokenizer` refers to when iteration begins.
        tokenize = tokenizer
    for _, text in data_iter:
        yield tokenize(text)

In [None]:
from transformers import BertTokenizerFast, BertForSequenceClassification

# Hugging Face checkpoint identifier used to load the tokenizer below.
model_name = "bert-base-uncased"
# Rebinds the global `tokenizer` name to the fast BERT tokenizer for this checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(model_name)

class IMDBBertDataset(Dataset):
    """Map-style dataset that tokenizes IMDB reviews on access.

    The CSV must provide a ``review`` column (the text) and a ``sentiment``
    column whose values are ``'positive'`` or ``'negative'``.

    Args:
        csv_path: Location of the CSV file, passed to ``pandas.read_csv``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=max_len, return_tensors="pt")``.
            Its result must expose ``"input_ids"`` and ``"attention_mask"``
            tensors with a leading batch dimension of size 1.
        max_len: Sequence length every sample is truncated/padded to.

    Raises:
        ValueError: If the ``sentiment`` column contains a missing value or
            any value other than ``'positive'`` / ``'negative'``.
    """

    # Text label -> integer class index.
    LABEL_MAP = {'positive': 1, 'negative': 0}

    def __init__(self, csv_path, tokenizer, max_len=256):
        self.data = pd.read_csv(csv_path)

        # Validate labels up front: Series.map turns unknown labels into NaN,
        # which would otherwise only surface later as a confusing int(NaN)
        # failure in the middle of training.
        encoded = self.data['sentiment'].map(self.LABEL_MAP)
        unmapped = encoded.isna()
        if unmapped.any():
            bad_values = self.data.loc[unmapped, 'sentiment'].unique().tolist()
            raise ValueError(
                f"Unexpected sentiment label(s) {bad_values!r}; "
                f"expected one of {sorted(self.LABEL_MAP)}"
            )
        self.data['sentiment'] = encoded.astype('int64')

        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized review and its label for row position ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (batch dimension
            removed) and ``labels`` (scalar ``torch.long`` tensor).
        """
        # Scalar column access; avoids materialising a whole row Series twice.
        text = self.data['review'].iat[idx]
        label = int(self.data['sentiment'].iat[idx])

        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        # return_tensors="pt" yields a batch of one; drop that leading dim so
        # the DataLoader's own batching produces [batch, max_len] tensors.
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only); the dataset CSV is read
# from a path under this mount point in a later cell.
drive.mount('/content/drive')

In [None]:
# Load the full IMDB CSV from the mounted Google Drive.
full_dataset = IMDBBertDataset(
    '/content/drive/My Drive/IMDB_sentiment_analysis/IMDB Dataset.csv',
    tokenizer=tokenizer,
    max_len=256
)

# Split sizes: 80/20 (train+val)/test, then 85/15 train/val inside the first part.
num_examples = len(full_dataset)
train_val_size = int(0.8 * num_examples)
test_size = num_examples - train_val_size
train_size = int(0.85 * train_val_size)
val_size = train_val_size - train_size

# A single three-way split driven by a dedicated, explicitly seeded generator:
# the partition no longer depends on how much of the global RNG stream earlier
# cells consumed, so re-running this cell reproduces the same split. It also
# yields direct Subsets of `full_dataset` instead of a Subset of a Subset.
split_generator = torch.Generator().manual_seed(RANDOM_SEED)
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    full_dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

# The three parts must partition the dataset exactly.
assert len(train_dataset) + len(val_dataset) + len(test_dataset) == num_examples

print(len(train_dataset), len(val_dataset), len(test_dataset))


In [None]:
!pip install transformers datasets torch accelerate

In [None]:
from transformers import BertTokenizerFast, BertForSequenceClassification

# Checkpoint shared by the tokenizer and the classifier.
model_name = "bert-base-uncased"

tokenizer = BertTokenizerFast.from_pretrained(model_name)

# Load the checkpoint with a classification head sized to OUTPUT_DIM labels,
# then place the model on the selected device.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=OUTPUT_DIM)
model = model.to(DEVICE)

In [None]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration, grouped by concern.
args = TrainingArguments(
    # where checkpoints go, and how many to keep
    output_dir="./bert-finetuned",
    save_total_limit=2,
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # logging
    logging_steps=100,
    report_to="none",
)

# Wire the model, the configuration and the train/validation splits together.
trainer = Trainer(
    model=model, args=args,
    train_dataset=train_dataset, eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Kept as the last expression of the cell so its result is displayed.
trainer.train()


In [None]:
import numpy as np

# Run inference over the validation split.
pred_output = trainer.predict(val_dataset)

# Raw class scores (shape [N, 2]) and reference labels (shape [N]).
logits, labels = pred_output.predictions, pred_output.label_ids

In [None]:
# Hard class prediction per example: index of the largest logit along axis 1.
preds = np.argmax(logits, axis=1)

In [None]:
# Softmax over the class axis. Each row is shifted by its maximum before
# exponentiating: the result is mathematically unchanged, but np.exp can no
# longer overflow on large logits, and the exponentials are computed only once.
shifted_logits = logits - np.max(logits, axis=1, keepdims=True)
exp_logits = np.exp(shifted_logits)
probs = exp_logits / exp_logits.sum(axis=1, keepdims=True)

# Probability assigned to class index 1 (the 'positive' label).
pos_probs = probs[:, 1]

In [None]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)

# Threshold-based metrics are computed from the hard predictions ...
accuracy = accuracy_score(labels, preds)
precision = precision_score(labels, preds)
recall = recall_score(labels, preds)
f1 = f1_score(labels, preds)
# ... while ROC-AUC is computed from the positive-class probabilities.
roc_auc = roc_auc_score(labels, pos_probs)

# Assemble the report first, then emit it in one go.
report_lines = [
    f"Accuracy : {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall   : {recall:.4f}",
    f"F1-score : {f1:.4f}",
    f"ROC-AUC  : {roc_auc:.4f}",
]
print("\n".join(report_lines))


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# ROC curve points and the area under it, from the positive-class probabilities.
fpr, tpr, thresholds = roc_curve(labels, pos_probs)
roc_auc = roc_auc_score(labels, pos_probs)

print("ROC-AUC:", roc_auc)

import matplotlib.pyplot as plt

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc:.4f})")
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], linestyle="--", label="Random Classifier")

ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
# The title previously contained a mis-encoded en dash; it is written as an
# escape here so the source stays safe under any file encoding.
ax.set_title("ROC Curve \u2013 BERT Sentiment Classifier (IMDB)")
ax.legend(loc="lower right")
ax.grid(True)

plt.show()
