<a href="https://colab.research.google.com/github/ONION4520x/FYP/blob/main/FYP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ====================================
# 1. Install & Import Dependencies
# ====================================
!pip install torch torchvision transformers scikit-learn tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, BertModel, get_scheduler
from tqdm import tqdm
import json
from sklearn.metrics import accuracy_score, f1_score, classification_report




In [2]:
# ====================================
# 2. Label Definitions
# ====================================
# Class vocabularies. The encoders in this file turn a name into an integer
# via ``list.index``, so the order of each list fixes the class ids.
SENTIMENT_LABELS = [
    "Negative",
    "Neutral",
    "Positive",
]
ROLE_LABELS = [
    "bystander",
    "cyberbully",
    "supporter",
    "victim",
]
TYPE_LABELS = [
    "Comparison-based Attack",
    "Defamation",
    "Harassment/Threats",
    "Insult",
    "Sarcasm",
    "not_cyberbully",
]

def encode_sentiment(label: str) -> int:
    """Return the position of ``label`` inside ``SENTIMENT_LABELS``.

    Raises:
        ValueError: if ``label`` is not one of the listed sentiment names
            (propagated from ``list.index``).
    """
    class_id = SENTIMENT_LABELS.index(label)
    return class_id

def encode_role(label: str) -> int:
    """Return the position of ``label`` inside ``ROLE_LABELS``.

    Raises:
        ValueError: if ``label`` is not one of the listed role names
            (propagated from ``list.index``).
    """
    class_id = ROLE_LABELS.index(label)
    return class_id

def encode_type(labels: list) -> torch.Tensor:
    """Build a multi-hot float vector over ``TYPE_LABELS``.

    Args:
        labels: Type names attached to one sample. A single ``str`` is
            accepted and treated as a one-element list; without this, the
            string would be iterated character by character and silently
            yield an all-zero vector.

    Returns:
        A 1-D float tensor of length ``len(TYPE_LABELS)`` holding 1.0 at the
        position of every name that appears in ``TYPE_LABELS`` and 0.0
        elsewhere. Names that are not in ``TYPE_LABELS`` are ignored.
    """
    if isinstance(labels, str):
        labels = [labels]

    vec = torch.zeros(len(TYPE_LABELS), dtype=torch.float)
    for name in labels:
        if name in TYPE_LABELS:
            vec[TYPE_LABELS.index(name)] = 1.0
    return vec


In [3]:
# ====================================
# 3. Dataset Class
# ====================================
class CyberbullyingDataset(Dataset):
    """Map-style dataset backed by a JSON file of annotated text records.

    Each record is read through the keys ``input_text``, ``sentiment``,
    ``role`` and ``type``.
    """

    def __init__(self, json_path, tokenizer, max_len=256):
        """Load every record from ``json_path`` and remember tokenizer settings.

        Args:
            json_path: Path of a UTF-8 JSON file; its parsed content is stored
                as ``self.data`` and indexed by position.
            tokenizer: Callable applied to each record's text in ``__getitem__``.
            max_len: Value forwarded to the tokenizer as ``max_length``.
        """
        self.tokenizer = tokenizer
        self.max_len = max_len
        with open(json_path, "r", encoding="utf-8") as handle:
            self.data = json.load(handle)

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and encode its three label fields.

        Returns:
            A dict with keys ``input_ids``, ``attention_mask``, ``sentiment``,
            ``role`` and ``type``.
        """
        record = self.data[idx]

        tokens = self.tokenizer(
            record["input_text"],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        sentiment_id = encode_sentiment(record["sentiment"])
        role_id = encode_role(record["role"])
        type_target = encode_type(record["type"])

        # squeeze(0) removes the first axis of the tokenizer tensors
        # (presumably a batch axis of size one -- TODO confirm).
        sample = {key: tokens[key].squeeze(0) for key in ("input_ids", "attention_mask")}
        sample["sentiment"] = torch.tensor(sentiment_id, dtype=torch.long)
        sample["role"] = torch.tensor(role_id, dtype=torch.long)
        sample["type"] = type_target
        return sample


In [4]:
# ====================================
# 4. Model + Focal Loss
# ====================================
class MultiTaskClassifier(nn.Module):
    """Shared BERT encoder feeding three linear heads: sentiment, role, type."""

    def __init__(self, model_name="bert-base-uncased", num_sentiment=3, num_role=4, num_type=6, dropout=0.1):
        """Load the pretrained encoder and create one linear head per task.

        Args:
            model_name: Identifier passed to ``BertModel.from_pretrained``.
            num_sentiment: Output width of the sentiment head.
            num_role: Output width of the role head.
            num_type: Output width of the type head.
            dropout: Dropout probability applied to the pooled encoder output.
        """
        super().__init__()
        self.encoder = BertModel.from_pretrained(model_name)
        width = self.encoder.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        # Heads are created in a fixed order (sentiment, role, type).
        self.sentiment_head = nn.Linear(width, num_sentiment)
        self.role_head = nn.Linear(width, num_role)
        self.type_head = nn.Linear(width, num_type)

    def forward(self, input_ids, attention_mask):
        """Encode the inputs and return each head's raw outputs.

        Returns:
            A dict with keys ``sentiment``, ``role`` and ``type``; each value
            is the corresponding linear head applied to the dropped-out
            ``pooler_output`` of the encoder.
        """
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        features = self.dropout(encoded.pooler_output)
        heads = (
            ("sentiment", self.sentiment_head),
            ("role", self.role_head),
            ("type", self.type_head),
        )
        return {task: head(features) for task, head in heads}

class FocalLoss(nn.Module):
    """Binary focal loss computed from logits.

    The per-element loss is ``(1 - pt) ** gamma * bce`` where ``bce`` is the
    element-wise binary cross-entropy with logits and ``pt = exp(-bce)``.

    Args:
        alpha: Optional multiplier applied to every per-element loss (a
            scalar, or anything that broadcasts against the loss tensor).
            ``None`` disables the scaling.
        gamma: Exponent of the ``(1 - pt)`` modulating factor.
        reduction: ``'mean'``, ``'sum'`` or ``'none'``. ``'none'`` returns the
            per-element loss tensor unreduced.

    Raises:
        ValueError: if ``reduction`` is not one of the three accepted values.
    """

    _REDUCTIONS = ("mean", "sum", "none")

    def __init__(self, alpha=None, gamma=2, reduction='mean'):
        super().__init__()
        if reduction not in self._REDUCTIONS:
            raise ValueError(
                f"reduction must be one of {self._REDUCTIONS}, got {reduction!r}"
            )
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, targets):
        """Return the focal loss of ``logits`` against float ``targets``."""
        bce_loss = nn.functional.binary_cross_entropy_with_logits(logits, targets, reduction='none')
        # exp(-bce) equals the probability the model assigns to the true label.
        pt = torch.exp(-bce_loss)
        focal_loss = (1 - pt) ** self.gamma * bce_loss
        if self.alpha is not None:
            focal_loss = self.alpha * focal_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        if self.reduction == 'sum':
            return focal_loss.sum()
        if self.reduction == 'none':
            return focal_loss
        # Reached only if the attribute was changed after construction.
        raise ValueError(
            f"reduction must be one of {self._REDUCTIONS}, got {self.reduction!r}"
        )


In [5]:
# ====================================
# 5. Metrics Function
# ====================================
def compute_metrics(outputs, labels, task="sentiment", average="weighted", label_names=None):
    """Compute validation metrics for one task from raw model outputs.

    Args:
        outputs: Tensor of raw scores. For ``"sentiment"``/``"role"`` the
            prediction is the argmax over dim 1; for ``"type"`` each score is
            passed through a sigmoid and thresholded at 0.5.
        labels: Tensor of targets: class ids for the single-label tasks, a
            0/1 indicator matrix for ``"type"``.
        task: ``"sentiment"``, ``"role"`` or ``"type"``.
        average: Currently unused; weighted/macro (single-label) and
            micro/macro (multi-label) F1 are always reported. Kept so existing
            callers keep working.
        label_names: Optional class names, ordered by class id. When given,
            a ``per_class_f1`` mapping (name -> F1) is added to the result.

    Returns:
        A dict of metrics: ``accuracy``/``f1_weighted``/``f1_macro`` for the
        single-label tasks, ``subset_acc``/``f1_micro``/``f1_macro`` for
        ``"type"``, plus ``per_class_f1`` when ``label_names`` is given.

    Raises:
        ValueError: if ``task`` is not one of the supported names.
    """
    outputs = outputs.detach().cpu()
    labels = labels.detach().cpu()

    if task in ("sentiment", "role"):
        y_true = labels.numpy()
        y_pred = torch.argmax(outputs, dim=1).numpy()
        result = {
            "accuracy": accuracy_score(y_true, y_pred),
            "f1_weighted": f1_score(y_true, y_pred, average="weighted", zero_division=0),
            "f1_macro": f1_score(y_true, y_pred, average="macro", zero_division=0),
        }
        # Pin the reported classes to every known id: without ``labels=`` the
        # report infers classes from the data and rejects ``target_names``
        # whenever a class is missing from both targets and predictions.
        report_kwargs = {}
        if label_names is not None:
            report_kwargs["labels"] = list(range(len(label_names)))
    elif task == "type":
        pred_tensor = (torch.sigmoid(outputs) > 0.5).int()
        # Exact-match ratio: a sample counts only if every type bit is right.
        subset_acc = (pred_tensor == labels).all(dim=1).float().mean().item()
        y_true = labels.numpy()
        y_pred = pred_tensor.numpy()
        result = {
            "subset_acc": subset_acc,
            "f1_micro": f1_score(y_true, y_pred, average="micro", zero_division=0),
            "f1_macro": f1_score(y_true, y_pred, average="macro", zero_division=0),
        }
        report_kwargs = {}
    else:
        raise ValueError(
            f"Unknown task {task!r}; expected 'sentiment', 'role' or 'type'"
        )

    if label_names is not None:
        report = classification_report(
            y_true,
            y_pred,
            target_names=label_names,
            output_dict=True,
            zero_division=0,
            **report_kwargs,
        )
        result["per_class_f1"] = {name: report[name]["f1-score"] for name in label_names}
    return result


In [9]:
# ====================================
# 6. Training Config
# ====================================
# Hyperparameters.
BATCH_SIZE, EPOCHS = 16, 10
LR_HEAD, LR_ENCODER = 5e-5, 3e-5

# Use the GPU when one is visible to PyTorch, otherwise the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Dataset files, resolved relative to the working directory.
train_path, val_path = "train.json", "val.json"

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train_dataset = CyberbullyingDataset(train_path, tokenizer)
val_dataset = CyberbullyingDataset(val_path, tokenizer)

# Only the training split is shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [10]:
# ====================================
# 7. Class Weights
# ====================================
def get_class_weights(dataset, attr, num_classes):
    """Compute inverse-frequency class weights for one label field.

    Args:
        dataset: Iterable of samples; ``sample[attr]`` must be an integer
            class id (a plain ``int`` or a 0-dim tensor). Note that iterating
            a dataset object runs its ``__getitem__`` for every sample.
        attr: Key of the label field inside each sample.
        num_classes: Minimum number of classes to report weights for.

    Returns:
        A float tensor of weights proportional to ``1 / count`` for every
        class that occurs, scaled so the weights sum to ``num_classes``.
        Classes that never occur get weight 0 instead of a near-infinite
        ``1 / epsilon`` weight that would push every observed class to ~0
        after normalization. If there are no samples at all, uniform weights
        of 1 are returned.
    """
    # int() accepts both plain ints and 0-dim tensors.
    class_ids = torch.tensor([int(sample[attr]) for sample in dataset], dtype=torch.long)
    counts = torch.bincount(class_ids, minlength=num_classes).float()

    observed = counts > 0
    if not observed.any():
        return torch.ones_like(counts)

    weights = torch.zeros_like(counts)
    weights[observed] = 1.0 / counts[observed]
    return weights / weights.sum() * num_classes

# Per-class loss weights from the training split, moved to the compute device.
sentiment_weights = get_class_weights(
    dataset=train_dataset,
    attr="sentiment",
    num_classes=len(SENTIMENT_LABELS),
).to(DEVICE)
role_weights = get_class_weights(
    dataset=train_dataset,
    attr="role",
    num_classes=len(ROLE_LABELS),
).to(DEVICE)


In [11]:
# ====================================
# 8. Model, Loss, Optimizer
# ====================================
# One output unit per label name for each task.
model = MultiTaskClassifier(
    model_name="bert-base-uncased",
    num_sentiment=len(SENTIMENT_LABELS),
    num_role=len(ROLE_LABELS),
    num_type=len(TYPE_LABELS),
)
model = model.to(DEVICE)

# Weighted cross-entropy for the two single-label tasks, focal loss for the
# multi-label type task.
criterion_sentiment = nn.CrossEntropyLoss(weight=sentiment_weights)
criterion_role = nn.CrossEntropyLoss(weight=role_weights)
criterion_type = FocalLoss()

# Freeze the encoder so that only the remaining parameters are optimized.
model.encoder.requires_grad_(False)

trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=LR_HEAD)
num_training_steps = EPOCHS * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

best_val_loss = float("inf")


In [None]:
# ====================================
# 9. Training Loop
# ====================================
# Runs EPOCHS rounds of training followed by validation. The summed loss of
# the three tasks drives both optimization and best-checkpoint selection.
for epoch in range(EPOCHS):
    # Unfreeze encoder after 3 epochs
    if epoch == 3:
        for param in model.encoder.parameters():
            param.requires_grad = True
        # A fresh optimizer is built over all parameters at LR_ENCODER; the
        # state of the previous optimizer is discarded.
        optimizer = AdamW(model.parameters(), lr=LR_ENCODER)
        # The new schedule covers the remaining epochs, counted in optimizer steps.
        lr_scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=(EPOCHS - epoch) * len(train_loader),
        )
        print("🔓 Unfroze BERT encoder")

    # ---- Training ----
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        sentiment_labels = batch["sentiment"].to(DEVICE)
        role_labels = batch["role"].to(DEVICE)
        type_labels = batch["type"].to(DEVICE)

        outputs = model(input_ids, attention_mask)
        loss_sentiment = criterion_sentiment(outputs["sentiment"], sentiment_labels)
        loss_role = criterion_role(outputs["role"], role_labels)
        loss_type = criterion_type(outputs["type"], type_labels.float())
        loss = loss_sentiment + loss_role + loss_type

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # The schedulers are sized as epochs * len(train_loader), i.e. in
        # optimizer steps, so the schedule must advance once per batch.
        # Stepping once per epoch would leave the learning rate almost flat.
        lr_scheduler.step()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch {epoch+1} Train Loss: {avg_train_loss:.4f}")

    # ---- Validation ----
    model.eval()
    val_loss = 0
    # Raw model outputs and targets are collected per batch and concatenated
    # below so metrics are computed over the whole validation split.
    all_sentiment_preds, all_sentiment_labels = [], []
    all_role_preds, all_role_labels = [], []
    all_type_preds, all_type_labels = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} Validation"):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            sentiment_labels = batch["sentiment"].to(DEVICE)
            role_labels = batch["role"].to(DEVICE)
            type_labels = batch["type"].to(DEVICE)

            outputs = model(input_ids, attention_mask)
            loss_sentiment = criterion_sentiment(outputs["sentiment"], sentiment_labels)
            loss_role = criterion_role(outputs["role"], role_labels)
            loss_type = criterion_type(outputs["type"], type_labels.float())
            loss = loss_sentiment + loss_role + loss_type
            val_loss += loss.item()

            all_sentiment_preds.append(outputs["sentiment"].cpu())
            all_sentiment_labels.append(sentiment_labels.cpu())
            all_role_preds.append(outputs["role"].cpu())
            all_role_labels.append(role_labels.cpu())
            all_type_preds.append(outputs["type"].cpu())
            all_type_labels.append(type_labels.cpu())

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1} Val Loss: {avg_val_loss:.4f}")

    all_sentiment_preds = torch.cat(all_sentiment_preds)
    all_sentiment_labels = torch.cat(all_sentiment_labels)
    all_role_preds = torch.cat(all_role_preds)
    all_role_labels = torch.cat(all_role_labels)
    all_type_preds = torch.cat(all_type_preds)
    all_type_labels = torch.cat(all_type_labels)

    sentiment_metrics = compute_metrics(all_sentiment_preds, all_sentiment_labels, task="sentiment", label_names=SENTIMENT_LABELS)
    role_metrics = compute_metrics(all_role_preds, all_role_labels, task="role", label_names=ROLE_LABELS)
    type_metrics = compute_metrics(all_type_preds, all_type_labels, task="type", label_names=TYPE_LABELS)

    print(f"Sentiment Val: {sentiment_metrics}")
    print(f"Role Val: {role_metrics}")
    print(f"Type Val: {type_metrics}")

    # Keep only the weights of the epoch with the lowest validation loss.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "best_model.pt")
        print("✅ Saved new best model")


Epoch 1 Training:  26%|██▋       | 33/125 [11:35<31:39, 20.65s/it]