In [None]:
# -------------------- Imports --------------------
import os
import json
import random
import pickle
import time
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from scipy.stats import mode

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import (
    DebertaV2Tokenizer,
    DebertaV2ForSequenceClassification,
    get_scheduler
)

from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    accuracy_score,
    precision_recall_fscore_support,
    roc_auc_score
)
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

import joblib


# -------------------- Configuration --------------------
# Module-level hyperparameters and paths shared by the code in this cell.
SEED = 42  # default seed used by set_seed()
BATCH_SIZE = 16
VAL_BATCH_SIZE = 32
MAX_LENGTH = 128  # NOTE(review): CryptoDataset defaults to max_length=512; pass MAX_LENGTH explicitly if 128 is intended
EPOCHS = 3  # maximum number of passes train_model_for_level makes over train_loader
LEARNING_RATE = 4e-5  # AdamW learning rate in train_model_for_level
PATIENCE = 1  # NOTE(review): not read by train_model_for_level in this cell, which takes its own `patience` argument
MODEL_NAME = 'microsoft/deberta-v3-small'  # checkpoint name used for both the tokenizer and the classifier
OUTPUT_DIR = "/content/drive/MyDrive/FIRE/outputs"  # root folder for the saved tokenizer and the per-run plots/logs

# Create the output root up front; a no-op when it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# -------------------- Seed Setup --------------------
def set_seed(seed=SEED):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU plus all CUDA
    devices) and puts cuDNN into deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed; defaults to the module-level ``SEED``.
    """
    # cuDNN switches first; they are independent of the seeding calls below.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Seed all RNGs once, with the default SEED, before any model or data code runs.
set_seed()
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# -------------------- Tokenizer --------------------
# Load the tokenizer that matches MODEL_NAME and store a copy under
# OUTPUT_DIR/tokenizer.
tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "tokenizer"))

# -------------------- Dataset --------------------
class CryptoDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Each item is a dict holding the tokenizer's output tensors (with the
    leading batch dimension removed) plus a ``labels`` tensor of dtype long.
    Every text is padded/truncated to ``max_length`` tokens.

    Args:
        texts: Iterable of raw texts (list, NumPy array, pandas Series, ...).
        labels: Iterable of integer class ids, aligned with ``texts``.
        tokenizer: Callable accepting ``(text, truncation, padding,
            max_length, return_tensors)`` and returning a mapping of tensors
            shaped ``(1, seq_len)``.
        max_length: Length every sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Materialise as plain lists so __getitem__ is always positional.
        # A pandas Series is indexed by *label*, so a filtered or split Series
        # (index no longer 0..n-1) would raise KeyError for the positions a
        # DataLoader asks for.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # str() guards against non-string cells (e.g. NaN read from a CSV).
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt"
        )
        # The tokenizer returns (1, seq_len) tensors; drop the batch axis so
        # the DataLoader can stack items itself.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

def collate_fn(batch):
    """Stack per-sample tensors into batch tensors.

    Only ``input_ids``, ``attention_mask`` and ``labels`` are kept; any other
    key present on the samples is dropped.

    Args:
        batch: List of sample dicts whose tensors share a shape per key.

    Returns:
        Dict with the three keys above, each stacked along a new first axis.
    """
    kept_keys = ('input_ids', 'attention_mask', 'labels')
    return {
        key: torch.stack([sample[key] for sample in batch])
        for key in kept_keys
    }


# -------------------- Confusion Matrix --------------------
def plot_confusion_matrix(labels, preds, classes, title, save_path):
    """Render a confusion-matrix heatmap and write it to ``save_path``.

    Args:
        labels: Ground-truth class ids.
        preds: Predicted class ids.
        classes: Tick labels used on both axes.
        title: Figure title.
        save_path: Destination image path.
    """
    matrix = confusion_matrix(labels, preds)

    fig = plt.figure(figsize=(8, 6))
    ax = fig.gca()
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=classes,
        yticklabels=classes,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(save_path)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

# -------------------- Contrastive Supervision --------------------
def apply_contrastive_supervision(features, labels, temperature=0.1):
    """Supervised contrastive loss over one batch of embeddings.

    For every anchor, samples sharing its label are positives and all other
    samples (anchor excluded) form the softmax denominator. The loss is the
    negative mean log-probability of the positives, averaged over all
    anchors; anchors without any positive contribute zero.

    The denominator is computed with a masked ``logsumexp``. Exponentiating
    the raw logits overflows once ``1 / temperature`` exceeds what ``exp``
    can represent (the self-similarity on the diagonal is always
    ``1 / temperature``), and the resulting ``inf * 0`` turns the whole loss
    into NaN.

    Args:
        features: Tensor of shape ``(batch, dim)``; L2-normalised here.
        labels: Tensor with ``batch`` integer class ids.
        temperature: Softmax temperature applied to the cosine similarities.

    Returns:
        Scalar loss tensor (zero when the batch has fewer than two samples).
    """
    batch_size = features.size(0)
    if batch_size < 2:
        # No pairs can be formed; keep the result attached to the graph.
        return features.sum() * 0.0

    features = F.normalize(features, dim=1)
    logits = torch.matmul(features, features.T) / temperature

    labels = labels.contiguous().view(-1, 1)
    self_mask = torch.eye(batch_size, dtype=torch.bool, device=features.device)
    positive_mask = torch.eq(labels, labels.T).to(features.device) & ~self_mask

    # log sum_{k != i} exp(logit_ik), evaluated without ever forming exp(logit).
    log_denominator = torch.logsumexp(
        logits.masked_fill(self_mask, float("-inf")), dim=1, keepdim=True
    )
    log_prob = logits - log_denominator

    positives_per_anchor = positive_mask.sum(1)
    summed_pos_log_prob = log_prob.masked_fill(~positive_mask, 0.0).sum(1)
    # clamp avoids 0/0 for anchors with no positive; their numerator is 0.
    mean_log_prob_pos = summed_pos_log_prob / positives_per_anchor.clamp(min=1)

    return -mean_log_prob_pos.mean()

# -------------------- Training Losses --------------------
class FocalLoss(nn.Module):
    """Focal loss built on ``nll_loss`` with optional class weights.

    The log-probabilities are scaled by ``(1 - p) ** gamma`` before the
    negative log-likelihood is taken, which down-weights well-classified
    samples.

    Args:
        alpha: Either a scalar multiplier applied to the final loss, or a
            1-D tensor of per-class weights. A tensor is treated exactly like
            ``weight`` (a per-class tensor multiplied into an already reduced
            loss would produce a non-scalar result that cannot be
            back-propagated).
        gamma: Focusing exponent; ``0`` reduces to (weighted) cross-entropy.
        weight: Optional 1-D tensor of per-class weights for ``nll_loss``.
        reduction: ``'mean'``, ``'sum'`` or ``'none'``.
        label_smoothing: Amount of uniform label smoothing in ``[0, 1)``,
            combined the same way ``F.cross_entropy`` combines it.

    Raises:
        ValueError: If per-class weights are given through both ``alpha`` and
            ``weight``, or ``label_smoothing`` is outside ``[0, 1)``.
    """

    def __init__(self, alpha=1, gamma=2, weight=None, reduction='mean', label_smoothing=0.0):
        super(FocalLoss, self).__init__()
        if torch.is_tensor(alpha) and alpha.dim() > 0:
            if weight is not None:
                raise ValueError(
                    "Pass per-class weights through either `alpha` or `weight`, not both."
                )
            weight, alpha = alpha, 1.0
        if not 0.0 <= label_smoothing < 1.0:
            raise ValueError(f"label_smoothing must be in [0, 1), got {label_smoothing}")
        self.alpha = alpha
        self.gamma = gamma
        self.weight = weight
        self.reduction = reduction
        self.label_smoothing = label_smoothing

    def forward(self, input, target):
        logpt = F.log_softmax(input, dim=1)
        pt = torch.exp(logpt)
        # Focal modulation applied to every class log-probability.
        focal_logpt = (1 - pt) ** self.gamma * logpt

        weight = self.weight
        if weight is not None:
            weight = weight.to(device=focal_logpt.device, dtype=focal_logpt.dtype)

        loss = F.nll_loss(focal_logpt, target, weight=weight, reduction=self.reduction)

        if self.label_smoothing > 0:
            # Uniform-target term, reduced with the same normaliser nll_loss uses.
            if weight is None:
                smooth = -focal_logpt.sum(dim=1)
                normaliser = target.numel()
            else:
                smooth = -(focal_logpt * weight.unsqueeze(0)).sum(dim=1)
                normaliser = weight[target].sum()
            if self.reduction == 'mean':
                smooth = smooth.sum() / normaliser
            elif self.reduction == 'sum':
                smooth = smooth.sum()
            num_classes = input.size(1)
            loss = (1 - self.label_smoothing) * loss \
                + (self.label_smoothing / num_classes) * smooth

        return self.alpha * loss

def dice_loss(logits, targets, smooth=1):
    """Soft multi-class Dice loss, averaged over classes.

    Args:
        logits: Tensor of shape ``(batch, num_classes)``.
        targets: Integer class ids of shape ``(batch,)``.
        smooth: Additive smoothing term used in numerator and denominator.

    Returns:
        Scalar tensor ``1 - mean(per-class Dice coefficient)``.
    """
    n_classes = logits.size(1)
    one_hot = F.one_hot(targets, num_classes=n_classes).float().to(logits.device)
    class_probs = F.softmax(logits, dim=1)

    # Per-class statistics are accumulated over the batch axis.
    overlap = torch.sum(class_probs * one_hot, dim=0)
    total_mass = torch.sum(class_probs, dim=0) + torch.sum(one_hot, dim=0)

    per_class_dice = (2. * overlap + smooth) / (total_mass + smooth)
    return 1. - per_class_dice.mean()

def smoothed_cross_entropy(logits, target, smoothing=0.1):
    """Cross-entropy against a label-smoothed target distribution.

    The true class receives probability ``1 - smoothing``; the remaining
    ``smoothing`` mass is split evenly over the other ``num_classes - 1``
    classes.

    Args:
        logits: Tensor of shape ``(batch, num_classes)``.
        target: Integer class ids of shape ``(batch,)``.
        smoothing: Probability mass moved away from the true class.

    Returns:
        Scalar tensor with the batch-mean loss.
    """
    n_classes = logits.size(1)
    off_value = smoothing / (n_classes - 1)
    on_value = 1.0 - smoothing

    # The target distribution is a constant; keep it out of autograd.
    with torch.no_grad():
        soft_targets = torch.full_like(logits, off_value)
        soft_targets.scatter_(1, target.data.unsqueeze(1), on_value)

    log_probs = F.log_softmax(logits, dim=1)
    per_sample = torch.sum(-soft_targets * log_probs, dim=1)
    return torch.mean(per_sample)

# -------------------- Helper --------------------
def get_preds_from_logits(logits):
    """Turn logits into hard predictions and class probabilities.

    Args:
        logits: Tensor whose last axis indexes classes.

    Returns:
        Tuple ``(preds, probs)``: argmax class ids and the softmax
        probabilities over the last axis.
    """
    class_probs = torch.softmax(logits, dim=-1)
    return class_probs.argmax(dim=-1), class_probs

#  Updated train_model_for_level with support for:
# - AMP
# - Gradient Checkpointing (optional)
# - Flexible loss combinations
# - Early stopping
# - Training history saving

def train_model_for_level(
    num_labels, train_loader, val_loader, save_path, level_name="level",
    y_train_labels=None, loss_type="focal+dice+contrastive", contrastive_weight=0.2,
    label_smoothing=0.0, gradient_checkpointing=False, use_amp=True, patience=2
):
    """Fine-tune a DeBERTa sequence classifier for one label level.

    Trains for at most ``EPOCHS`` epochs with AdamW and a linear schedule,
    evaluates on ``val_loader`` after every epoch, keeps the checkpoint with
    the best weighted validation F1 at ``save_path`` and stops early after
    ``patience`` epochs without improvement. A training-curve plot and a JSON
    history are written to a fresh ``run_<timestamp>`` folder under
    ``OUTPUT_DIR``.

    Args:
        num_labels: Number of output classes.
        train_loader: DataLoader yielding dicts with ``input_ids``,
            ``attention_mask`` and ``labels``.
        val_loader: Validation DataLoader with the same batch layout.
        save_path: File the best ``state_dict`` is written to.
        level_name: Tag used in progress bars and output file names.
        y_train_labels: Training labels for the balanced class weights; when
            ``None`` they are collected by iterating ``train_loader`` once.
        loss_type: Substring flags: ``"focal"``, ``"dice"``,
            ``"contrastive"``. Without ``"focal"`` the base loss is smoothed
            cross-entropy (if ``label_smoothing > 0``) or class-weighted
            cross-entropy.
        contrastive_weight: Multiplier for the contrastive term.
        label_smoothing: Label-smoothing amount.
        gradient_checkpointing: Enable gradient checkpointing on the model.
        use_amp: Use mixed precision; only honoured when running on CUDA.
        patience: Epochs without validation-F1 improvement before stopping.

    Returns:
        The model with the best checkpoint's weights loaded.
    """
    import json
    from sklearn.utils.class_weight import compute_class_weight
    from sklearn.metrics import f1_score, accuracy_score, precision_recall_fscore_support
    from transformers import get_scheduler, DebertaV2ForSequenceClassification
    # Use torch's AdamW: the copy once exported by `transformers` is
    # deprecated and missing from recent releases.
    from torch.optim import AdamW
    from torch.cuda.amp import GradScaler
    import matplotlib.pyplot as plt

    # Mixed precision is only meaningful on CUDA; on CPU run in full precision.
    amp_enabled = bool(use_amp) and device.type == "cuda"

    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = os.path.join(OUTPUT_DIR, f"run_{run_id}")
    os.makedirs(os.path.join(run_dir, "plots"), exist_ok=True)
    os.makedirs(os.path.join(run_dir, "logs"), exist_ok=True)

    model = DebertaV2ForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels).to(device)
    if gradient_checkpointing:
        model.gradient_checkpointing_enable()
    # Hidden states feed the contrastive term.
    model.config.output_hidden_states = True

    if y_train_labels is not None:
        class_weights = compute_class_weight('balanced', classes=np.unique(y_train_labels), y=y_train_labels)
    else:
        all_train_labels = [label.item() for batch in train_loader for label in batch['labels']]
        class_weights = compute_class_weight('balanced', classes=np.unique(all_train_labels), y=all_train_labels)
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

    focal = None
    if "focal" in loss_type:
        try:
            focal = FocalLoss(alpha=class_weights_tensor, gamma=2.0, label_smoothing=label_smoothing)
        except TypeError:
            # This notebook defines FocalLoss variants with different
            # signatures; fall back to the one that takes class weights via
            # `weight` and has no label-smoothing support.
            if label_smoothing > 0:
                print(" FocalLoss in scope does not support label_smoothing; ignoring it.")
            focal = FocalLoss(gamma=2.0, weight=class_weights_tensor)

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=EPOCHS * len(train_loader))
    scaler = GradScaler(enabled=amp_enabled)

    # Start below any reachable F1 so the first epoch always writes a
    # checkpoint; otherwise the final torch.load could hit a missing file.
    best_f1, patience_counter = -1.0, 0
    train_losses, train_accuracies, train_f1s = [], [], []
    val_accuracies, val_f1s = [], []
    best_metrics = {}

    for epoch in range(EPOCHS):
        model.train()
        total_loss, all_preds, all_labels = 0.0, [], []

        for batch in tqdm(train_loader, desc=f"[{level_name}] Epoch {epoch+1}/{EPOCHS}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad(set_to_none=True)

            with torch.autocast(device_type=device.type, enabled=amp_enabled):
                outputs = model(**batch)
                logits = outputs.logits

                if focal is not None:
                    loss = focal(logits, batch['labels'])
                elif label_smoothing > 0:
                    loss = smoothed_cross_entropy(logits, batch['labels'], smoothing=label_smoothing)
                else:
                    loss = F.cross_entropy(logits, batch['labels'], weight=class_weights_tensor)

                if "dice" in loss_type:
                    loss = loss + dice_loss(logits, batch['labels'])
                if "contrastive" in loss_type:
                    # First-token embedding of the last hidden layer.
                    hidden_states = outputs.hidden_states[-1][:, 0, :]
                    if hidden_states.size(0) > 1:
                        loss = loss + contrastive_weight * apply_contrastive_supervision(hidden_states, batch['labels'])

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            total_loss += loss.item()
            preds, _ = get_preds_from_logits(logits.detach())
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch['labels'].cpu().numpy())

        train_acc = accuracy_score(all_labels, all_preds)
        train_f1 = f1_score(all_labels, all_preds, average='weighted')
        train_losses.append(total_loss)
        train_accuracies.append(train_acc)
        train_f1s.append(train_f1)

        # Validation
        model.eval()
        val_preds, val_labels, val_probs = [], [], []
        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                with torch.autocast(device_type=device.type, enabled=amp_enabled):
                    outputs = model(**batch)
                preds, probs = get_preds_from_logits(outputs.logits.float())
                val_preds.extend(preds.cpu().numpy())
                val_labels.extend(batch['labels'].cpu().numpy())
                val_probs.extend(probs.cpu().numpy())

        val_acc = accuracy_score(val_labels, val_preds)
        val_f1 = f1_score(val_labels, val_preds, average='weighted')
        # Shared helper; returns None (and prints the reason) on failure.
        roc_auc = compute_roc_auc(val_labels, val_probs, num_labels)

        print("=" * 80)
        print(f" Epoch {epoch+1}/{EPOCHS}")
        print(f" Train Loss: {total_loss:.4f}")
        print(f" Train Acc: {train_acc:.4f} |  Train F1: {train_f1:.4f}")
        print(f" Val Acc:   {val_acc:.4f} |  Val F1:   {val_f1:.4f}")
        if roc_auc is not None:
            print(f" ROC AUC:   {roc_auc:.4f}")
        print("=" * 80, flush=True)

        val_accuracies.append(val_acc)
        val_f1s.append(val_f1)

        if val_f1 > best_f1:
            best_f1 = val_f1
            patience_counter = 0
            torch.save(model.state_dict(), save_path)
            precision, recall, f1_metric, _ = precision_recall_fscore_support(val_labels, val_preds, average='weighted')
            best_metrics = {
                "val_precision_weighted": float(precision),
                "val_recall_weighted": float(recall),
                "val_f1_weighted": float(f1_metric),
                "roc_auc": None if roc_auc is None else float(roc_auc)
            }
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(" Early stopping.")
                break

    # Plot Training Curve
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label="Train Loss")
    plt.plot(train_f1s, label="Train F1")
    plt.plot(train_accuracies, label="Train Accuracy")
    plt.plot(val_f1s, label="Val F1")
    plt.plot(val_accuracies, label="Val Accuracy")
    plt.legend()
    plt.grid(True)
    plt.title(f" Training Curve - {level_name}")
    plt.savefig(f"{run_dir}/plots/loss_f1_curve_{level_name}.png")
    plt.close()

    # Save history log (plain floats keep the JSON encoder happy)
    history = {
        "train_losses": [float(v) for v in train_losses],
        "train_accuracies": [float(v) for v in train_accuracies],
        "train_f1s": [float(v) for v in train_f1s],
        "val_f1s": [float(v) for v in val_f1s],
        "val_accuracies": [float(v) for v in val_accuracies],
        **best_metrics
    }
    with open(os.path.join(run_dir, "logs", f"history_{level_name}.json"), "w") as f:
        json.dump(history, f, indent=4)

    # Restore the best checkpoint rather than returning last-epoch weights.
    model.load_state_dict(torch.load(save_path, map_location=device))
    return model



# -------------------- Evaluation --------------------


def compute_roc_auc(y_true, y_probs, num_labels):
    """
    Compute ROC AUC from class probabilities.

    For three or more classes the labels are one-hot encoded and the
    macro-averaged one-vs-rest AUC is returned. With exactly two classes
    ``label_binarize`` yields a single column, which does not line up with a
    two-column probability matrix, so the positive-class column is scored
    directly instead.

    Args:
        y_true: Integer class ids in ``range(num_labels)``.
        y_probs: Per-sample class probabilities, ``(n_samples, num_labels)``.
        num_labels: Total number of classes.

    Returns:
        The ROC AUC, or ``None`` if it cannot be computed (for example when a
        class has no samples in ``y_true``); the reason is printed.
    """
    try:
        y_true = np.asarray(y_true)
        y_probs = np.asarray(y_probs)
        if num_labels == 2:
            # Binary case: score the probability of class 1.
            positive_scores = y_probs[:, 1] if y_probs.ndim == 2 else y_probs
            return roc_auc_score(y_true, positive_scores)
        y_true_bin = label_binarize(y_true, classes=list(range(num_labels)))
        return roc_auc_score(y_true_bin, y_probs, average="macro", multi_class="ovr")
    except Exception as e:
        # Best-effort metric: report and carry on rather than abort evaluation.
        print(f" ROC AUC computation failed: {e}")
        return None


def evaluate_saved_model(model_path, dataloader, num_labels, class_names=None, return_outputs=False):
    """
    Load a saved model, evaluate it on the provided dataloader, and print metrics.

    Prints a classification report and, when it can be computed, the
    macro one-vs-rest ROC AUC.

    Args:
        model_path (str): Path to .pt checkpoint.
        dataloader (DataLoader): Validation/test DataLoader yielding dicts
            that include a ``labels`` tensor.
        num_labels (int): Number of output classes.
        class_names (list or None): Label names.
        return_outputs (bool): Whether to return the collected outputs.

    Returns:
        ``(preds, labels, probs, roc_auc)`` when ``return_outputs`` is true,
        otherwise ``None``. ``roc_auc`` is ``None`` if it could not be
        computed.
    """
    # Same loading steps as everywhere else: build, load weights, move, eval.
    model = load_model_for_inference(num_labels, model_path, device)

    preds, labels, probs = [], [], []

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            pred, prob = get_preds_from_logits(outputs.logits)
            preds.extend(pred.cpu().numpy())
            labels.extend(batch['labels'].cpu().numpy())
            probs.extend(prob.cpu().numpy())

    # Print classification report
    try:
        report = classification_report(labels, preds, target_names=class_names, digits=4)
    except Exception as e:
        # Typically class_names does not match the classes actually present;
        # fall back to numeric labels instead of failing the evaluation.
        # (`except Exception` lets KeyboardInterrupt/SystemExit propagate.)
        print(f" classification_report with class names failed ({e}); using numeric labels.")
        report = classification_report(labels, preds, digits=4)
    print(report)

    # Compute ROC AUC
    roc_auc = compute_roc_auc(labels, probs, num_labels)
    if roc_auc is not None:
        print(f" ROC AUC (macro, OVR): {roc_auc:.4f}")

    if return_outputs:
        return preds, labels, probs, roc_auc
    return None


def load_model_for_inference(num_labels, path, device):
    """
    Build a classifier from ``MODEL_NAME`` and load fine-tuned weights into it.

    Args:
        num_labels: Number of output classes the checkpoint was trained with.
        path: Path to a saved ``state_dict``.
        device: Device the weights are mapped to and the model is moved onto.

    Returns:
        The model on ``device``, switched to evaluation mode.
    """
    classifier = DebertaV2ForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=num_labels
    )
    weights = torch.load(path, map_location=device)
    classifier.load_state_dict(weights)
    # Both calls return the module itself, so they can be chained.
    return classifier.to(device).eval()


def visualize_model_performance(true_labels, pred_labels, class_names, title, save_path):
    """
    Save classification report and plot confusion matrix.

    The report is written next to the image as ``<stem>_report.txt``, where
    ``<stem>`` is ``save_path`` without its extension. Deriving the name from
    the extension (rather than replacing the literal ``".png"``) keeps the
    report from landing on the image path when ``save_path`` uses another
    extension or none at all.

    Args:
        true_labels (List[int]): Ground truth labels.
        pred_labels (List[int]): Model predictions.
        class_names (List[str]): Names of the classes.
        title (str): Plot title.
        save_path (str): File path to save the confusion matrix.
    """
    try:
        report = classification_report(true_labels, pred_labels, target_names=class_names, digits=4)
    except Exception as e:
        # Usually a mismatch between class_names and the classes present;
        # fall back to numeric labels and say so instead of failing silently.
        print(f" classification_report with class names failed ({e}); using numeric labels.")
        report = classification_report(true_labels, pred_labels, digits=4)

    print(report)

    # Save classification report to .txt
    report_path = os.path.splitext(save_path)[0] + "_report.txt"
    with open(report_path, "w") as f:
        f.write(report)

    # Plot confusion matrix
    plot_confusion_matrix(true_labels, pred_labels, class_names, title, save_path)


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

# **LEVEL-2 CODE**

In [None]:
import os, pickle, random
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score, accuracy_score
from transformers import DebertaV2Tokenizer
from torch.utils.data import DataLoader
import torch

# ==================== ENV SETUP ====================
# Cell-local settings; these rebind any same-named globals from earlier cells.
SEED = 42
BATCH_SIZE = 16
VAL_BATCH_SIZE = 32
OUTPUT_DIR = "/content/drive/MyDrive/FIRE"  # the execution block below creates run_<timestamp> folders under this root
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Seed the Python, NumPy and PyTorch RNGs.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# ========== DATASET CLASS & COLLATE ==========
class CryptoDataset(torch.utils.data.Dataset):
    """Dataset of raw ``(text, label)`` pairs; tokenization happens in ``collate_fn``.

    Args:
        texts: Iterable of raw texts (list, NumPy array, pandas Series, ...).
        labels: Iterable of integer class ids, aligned with ``texts``.
        tokenizer: Stored for reference only; items are not tokenized here.
        max_len: Stored for reference only.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Materialise as lists so __getitem__ is always positional. A pandas
        # Series is indexed by *label*, so passing one whose index is not
        # 0..n-1 (e.g. after filtering) would raise KeyError.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return {
            # str() guards against non-string cells (e.g. NaN from a CSV).
            "text": str(self.texts[idx]),
            "label": self.labels[idx]
        }

def collate_fn(batch):
    """Tokenize a list of raw samples into one padded batch.

    Uses the module-level ``tokenizer``; sequences are truncated to 128
    tokens and padded to the longest sequence in the batch.

    Args:
        batch: List of dicts with ``"text"`` and ``"label"`` entries.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors.
    """
    raw_texts, raw_labels = [], []
    for sample in batch:
        raw_texts.append(sample["text"])
        raw_labels.append(sample["label"])

    label_tensor = torch.tensor(raw_labels, dtype=torch.long)
    encoded = tokenizer(
        raw_texts, padding=True, truncation=True, return_tensors="pt", max_length=128
    )

    collated = {key: encoded[key] for key in ("input_ids", "attention_mask")}
    collated["labels"] = label_tensor
    return collated

# ========== FOCAL + CONTRASTIVE ==========
class FocalLoss(torch.nn.Module):
    """Focal loss on top of (optionally label-smoothed) cross-entropy.

    Per sample: ``alpha[target] * (1 - pt) ** gamma * ce`` with
    ``pt = exp(-ce)``; the batch mean is returned.

    Args:
        alpha: Optional per-class weights (list, NumPy array or tensor).
        gamma: Focusing exponent.
        label_smoothing: Passed to ``cross_entropy``.

    NOTE(review): with ``label_smoothing > 0`` the value ``pt = exp(-ce)`` is
    derived from the *smoothed* cross-entropy and is therefore not exactly
    the true-class probability; kept as-is to preserve training behaviour.
    """

    def __init__(self, alpha=None, gamma=2.0, label_smoothing=0.1):
        super().__init__()
        # as_tensor accepts lists, arrays and tensors alike; the weights are
        # moved to the logits' device lazily in forward() instead of being
        # pinned to a module-level device at construction time.
        self.alpha = (
            torch.as_tensor(alpha, dtype=torch.float32).detach().clone()
            if alpha is not None else None
        )
        self.gamma = gamma
        self.smoothing = label_smoothing

    def forward(self, logits, target):
        ce = torch.nn.functional.cross_entropy(
            logits, target, reduction='none', label_smoothing=self.smoothing
        )
        pt = torch.exp(-ce)
        focal = (1 - pt) ** self.gamma * ce
        if self.alpha is not None:
            if self.alpha.device != logits.device:
                # Cache the moved copy so the transfer happens only once.
                self.alpha = self.alpha.to(logits.device)
            at = self.alpha.to(focal.dtype)[target]
            focal = at * focal
        return focal.mean()

def compute_supervised_contrastive_loss(cls_emb, labels, temperature=0.3):
    """Supervised contrastive loss over one batch of embeddings.

    Samples sharing the anchor's label are positives; every other sample
    (anchor excluded) enters the softmax denominator. The result is the
    negative mean log-probability of the positives, averaged over all
    anchors; anchors without a positive contribute zero.

    The denominator uses a masked ``logsumexp``: exponentiating the raw
    logits overflows for small temperatures (the diagonal always equals
    ``1 / temperature``) and the subsequent ``inf * 0`` yields NaN. All
    helper tensors are created on ``cls_emb``'s device, so the function does
    not depend on a module-level ``device``.

    Args:
        cls_emb: Tensor of shape ``(batch, dim)``; L2-normalised here.
        labels: Tensor with ``batch`` integer class ids.
        temperature: Softmax temperature applied to cosine similarities.

    Returns:
        Scalar loss tensor (zero when the batch has fewer than two samples).
    """
    batch_size = cls_emb.size(0)
    if batch_size < 2:
        # No pairs can be formed; keep the result attached to the graph.
        return cls_emb.sum() * 0.0

    normalized = torch.nn.functional.normalize(cls_emb, dim=1)
    similarity_matrix = torch.matmul(normalized, normalized.T) / temperature

    labels = labels.contiguous().view(-1, 1)
    self_mask = torch.eye(batch_size, dtype=torch.bool, device=cls_emb.device)
    positive_mask = torch.eq(labels, labels.T).to(cls_emb.device) & ~self_mask

    # log sum_{k != i} exp(sim_ik) without materialising exp(sim).
    log_denominator = torch.logsumexp(
        similarity_matrix.masked_fill(self_mask, float("-inf")), dim=1, keepdim=True
    )
    log_prob = similarity_matrix - log_denominator

    positives_per_anchor = positive_mask.sum(1)
    summed_pos_log_prob = log_prob.masked_fill(~positive_mask, 0.0).sum(1)
    # clamp avoids 0/0 for anchors with no positive; their numerator is 0.
    mean_log_prob_pos = summed_pos_log_prob / positives_per_anchor.clamp(min=1)
    return -mean_log_prob_pos.mean()

# ========== TRAIN FUNCTION ==========
def train_model_for_level(
    num_labels, train_loader, val_loader, save_path, level_name,
    y_train_labels, loss_type="focal+dice+contrastive", contrastive_weight=0.3,
    label_smoothing=0.1, use_amp=True, gradient_checkpointing=False,
    patience=3, epochs=5
):
    """Fine-tune ``microsoft/deberta-v3-small`` with focal (+ contrastive) loss.

    Uses AdamW (lr 2e-5) with a cosine schedule and 10% warm-up, clips the
    gradient norm to 1.0, evaluates after every epoch and keeps the
    checkpoint with the best weighted validation F1 at ``save_path``.
    Training stops early after ``patience`` epochs without improvement.

    Args:
        num_labels: Number of output classes.
        train_loader: DataLoader yielding ``input_ids``/``attention_mask``/
            ``labels`` batches.
        val_loader: Validation DataLoader with the same batch layout.
        save_path: File the best ``state_dict`` is written to.
        level_name: Tag used in log lines and progress bars.
        y_train_labels: Integer training labels, used for the class weights.
        loss_type: Only the ``"contrastive"`` flag is inspected here; the
            focal loss is always applied and a ``"dice"`` flag has no effect
            in this function.
        contrastive_weight: Multiplier for the contrastive term.
        label_smoothing: Label smoothing passed to the focal loss.
        use_amp: Use mixed precision; only honoured when running on CUDA.
        gradient_checkpointing: Enable gradient checkpointing on the model.
        patience: Epochs without validation-F1 improvement before stopping.
        epochs: Maximum number of epochs.

    Returns:
        The model with the best checkpoint's weights loaded.
    """
    from transformers import DebertaV2ForSequenceClassification, get_cosine_schedule_with_warmup
    from torch.cuda.amp import GradScaler, autocast

    # Mixed precision is only meaningful on CUDA.
    use_amp = bool(use_amp) and device.type == "cuda"

    model = DebertaV2ForSequenceClassification.from_pretrained(
        "microsoft/deberta-v3-small", num_labels=num_labels, output_hidden_states=True
    ).to(device)
    if gradient_checkpointing:
        model.gradient_checkpointing_enable()
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    total_steps = len(train_loader) * epochs
    scheduler = get_cosine_schedule_with_warmup(optimizer, int(0.1 * total_steps), total_steps)
    scaler = GradScaler(enabled=use_amp)

    # minlength keeps one weight per class even when the highest class ids
    # are missing from this split; otherwise alpha[target] could go out of
    # range for labels that only show up later.
    class_counts = np.bincount(np.asarray(y_train_labels), minlength=num_labels)
    # Inverse-log-frequency weights, normalised to sum to 1.
    class_weights = 1.0 / (np.log(1.01 + class_counts))
    class_weights = class_weights / class_weights.sum()
    loss_fn = FocalLoss(alpha=class_weights.tolist(), gamma=2.0, label_smoothing=label_smoothing)

    best_f1 = -1
    patience_counter = 0
    for epoch in range(epochs):
        print(f"\n🚂 {level_name} Epoch {epoch+1}/{epochs}")
        model.train()
        total_loss = 0
        preds, targets = [], []
        for batch in tqdm(train_loader, desc=f"[{level_name}] Training"):
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()
            with autocast(enabled=use_amp):
                outputs = model(**batch)
                logits = outputs.logits
                loss = loss_fn(logits, batch["labels"])
                if "contrastive" in loss_type:
                    # First-token embedding averaged over the last four layers.
                    cls_emb = torch.stack(outputs.hidden_states[-4:], dim=0).mean(0)[:, 0]
                    if batch["labels"].unique().numel() > 1:
                        con_loss = compute_supervised_contrastive_loss(cls_emb, batch["labels"])
                        loss = loss + contrastive_weight * con_loss
            scaler.scale(loss).backward()
            # Gradients are still multiplied by the loss scale at this point;
            # unscale them first so the 1.0 threshold applies to true norms.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            total_loss += loss.item()
            preds += logits.argmax(dim=-1).cpu().tolist()
            targets += batch["labels"].cpu().tolist()

        train_acc = accuracy_score(targets, preds)
        train_f1 = f1_score(targets, preds, average="weighted")
        print(f" Train Loss: {total_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}")

        model.eval()
        val_preds, val_targets = [], []
        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                with autocast(enabled=use_amp):
                    logits = model(**batch).logits
                val_preds += logits.argmax(dim=-1).cpu().tolist()
                val_targets += batch["labels"].cpu().tolist()
        val_acc = accuracy_score(val_targets, val_preds)
        val_f1 = f1_score(val_targets, val_preds, average="weighted")
        print(f" Val Acc: {val_acc:.4f} | F1: {val_f1:.4f}")

        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), save_path)
            print(f" Saved model: {save_path}")
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(" Early Stopping")
                break

    # Restore the best checkpoint rather than returning last-epoch weights.
    model.load_state_dict(torch.load(save_path, map_location=device))
    return model


# ========== LEVEL 2 EXECUTION ==========
if __name__ == "__main__":
    # Level-2 pipeline: 5-fold cross-validated training on the SUBJECTIVE
    # training rows, followed by a majority-vote ensemble of the five fold
    # models on the held-out validation CSV.

    def _predict_labels(model, loader):
        """Return (predictions, targets) as lists for every batch in `loader`."""
        model.eval()
        predicted, expected = [], []
        with torch.no_grad():
            for batch in loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                logits = model(**batch).logits
                predicted += logits.argmax(dim=-1).cpu().tolist()
                expected += batch["labels"].cpu().tolist()
        return predicted, expected

    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = os.path.join(OUTPUT_DIR, f"run_{run_id}")
    os.makedirs(run_dir, exist_ok=True)
    for folder in ["models", "logs", "plots", "encoders", "ensembles"]:
        os.makedirs(os.path.join(run_dir, folder), exist_ok=True)

    train_df = pd.read_csv("/content/drive/MyDrive/FIRE/crypto_task1_train.csv")
    val_df = pd.read_csv("/content/drive/MyDrive/FIRE/crypto_task1_val.csv")

    # --- Fix label encoding ---
    label1_map = {0: "NOISE", 1: "OBJECTIVE", 2: "SUBJECTIVE"}
    train_df["level_1_str"] = train_df["level_1"].map(label1_map)
    val_df["level_1_str"] = val_df["level_1"].map(label1_map)

    le1 = LabelEncoder()
    train_df["level_1_enc"] = le1.fit_transform(train_df["level_1_str"])
    val_df["level_1_enc"] = le1.transform(val_df["level_1_str"])

    # SAVE TO FIXED PATH
    # NOTE(review): this points into an earlier run's folder, not `run_dir`;
    # confirm that overwriting that encoder on every execution is intended.
    fixed_level1_encoder_path = "/content/drive/MyDrive/FIRE/run_20250628_034630/encoders/label_encoder_level_1.pkl"
    os.makedirs(os.path.dirname(fixed_level1_encoder_path), exist_ok=True)
    with open(fixed_level1_encoder_path, "wb") as f:
        pickle.dump(le1, f)

    # Module-level name: collate_fn reads this tokenizer.
    tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-small")

    # Level 2 is only defined for SUBJECTIVE rows.
    subjective_code = le1.transform(["SUBJECTIVE"])[0]
    train_l2_df = train_df[train_df["level_1_enc"] == subjective_code].copy()
    val_l2_df = val_df[val_df["level_1_enc"] == subjective_code].copy()

    le2 = LabelEncoder()
    train_l2_df["level_2_enc"] = le2.fit_transform(train_l2_df["level_2"])
    val_l2_df["level_2_enc"] = le2.transform(val_l2_df["level_2"])
    with open(f"{run_dir}/encoders/label_encoder_level_2.pkl", "wb") as f:
        pickle.dump(le2, f)

    # One shared hold-out set for every fold model. Predictions made on the
    # per-fold validation splits cover *different* samples in each fold, so
    # they cannot be majority-voted against each other. Lists are passed
    # because the filtered frame no longer has a 0..n-1 index.
    holdout_loader = DataLoader(
        CryptoDataset(val_l2_df["text"].tolist(), val_l2_df["level_2_enc"].tolist(), tokenizer),
        batch_size=VAL_BATCH_SIZE, shuffle=False, collate_fn=collate_fn
    )

    n_splits = 5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    level2_preds = []             # per fold: predictions on the shared hold-out set
    oof_preds, oof_labels = [], []  # out-of-fold predictions over the training rows
    true_labels = np.array([], dtype=int)

    for fold, (train_idx, val_idx) in enumerate(skf.split(train_l2_df, train_l2_df["level_2_enc"])):
        print(f"\n Fold {fold + 1}/{n_splits}")
        fold_train = train_l2_df.iloc[train_idx].reset_index(drop=True)
        fold_val = train_l2_df.iloc[val_idx].reset_index(drop=True)
        train_loader = DataLoader(CryptoDataset(fold_train["text"], fold_train["level_2_enc"], tokenizer), batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
        val_loader = DataLoader(CryptoDataset(fold_val["text"], fold_val["level_2_enc"], tokenizer), batch_size=VAL_BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
        save_path = f"{run_dir}/models/level2_fold{fold+1}.pth"

        model = train_model_for_level(
            num_labels=len(le2.classes_),
            train_loader=train_loader,
            val_loader=val_loader,
            save_path=save_path,
            level_name=f"level2_fold{fold+1}",
            y_train_labels=fold_train["level_2_enc"].values,
            loss_type="focal+contrastive",
            label_smoothing=0.1,
            epochs=10,
            patience=4
        )

        # Out-of-fold predictions: every training row is scored exactly once,
        # by the model that did not train on it.
        fold_preds, fold_labels = _predict_labels(model, val_loader)
        oof_preds += fold_preds
        oof_labels += fold_labels

        # Hold-out predictions of this fold's model, for the ensemble vote.
        holdout_preds, holdout_labels = _predict_labels(model, holdout_loader)
        level2_preds.append(holdout_preds)
        true_labels = np.array(holdout_labels, dtype=int)

    oof_acc = accuracy_score(oof_labels, oof_preds)
    oof_f1 = f1_score(oof_labels, oof_preds, average="weighted")
    print(f"\n Level 2 Out-of-fold Accuracy: {oof_acc:.4f} | F1: {oof_f1:.4f}")

    if len(true_labels) > 0:
        # Rows = fold models, columns = hold-out samples. bincount/argmax
        # picks the most frequent class per sample (lowest id wins ties) and
        # does not depend on the SciPy version's `mode` return shape.
        vote_matrix = np.array(level2_preds, dtype=int)
        n_classes = len(le2.classes_)
        majority_preds = np.array(
            [np.bincount(votes, minlength=n_classes).argmax() for votes in vote_matrix.T]
        )
        acc = accuracy_score(true_labels, majority_preds)
        f1 = f1_score(true_labels, majority_preds, average="weighted")
        print(f"\n Level 2 Ensemble Accuracy: {acc:.4f} | F1: {f1:.4f}")
    else:
        print(" No SUBJECTIVE rows in the validation split; skipping ensemble evaluation.")


In [None]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score,
    precision_score,
    recall_score
)
from scipy.stats import mode
from transformers import DebertaV2ForSequenceClassification, AutoTokenizer
import pickle

# === CONFIG ===
# Directory of the training run whose fold checkpoints and label encoder
# are evaluated in this cell.
_eval_run_dir = "/content/drive/MyDrive/FIRE/run_20250629_090953"

# One Level-2 checkpoint per fold (fold1 .. fold5), kept in fold order.
model_paths = [
    f"{_eval_run_dir}/models/level2_fold{fold_no}.pth"
    for fold_no in range(1, 6)
]

# Base architecture the checkpoints are loaded into.
model_name = "microsoft/deberta-v3-small"

# Pickled Level-2 label encoder saved alongside the checkpoints.
label_encoder_path = f"{_eval_run_dir}/encoders/label_encoder_level_2.pkl"

# Validation CSV to score, and the folder that receives every output file.
val_csv_path = "/content/drive/MyDrive/FIRE/crypto_task1_val.csv"
save_dir = "/content/drive/MyDrive/FIRE/outputs/ensemble_level2_eval"
os.makedirs(save_dir, exist_ok=True)

# === Load label encoder ===
# NOTE: pickle.load can execute arbitrary code while deserialising; only point
# label_encoder_path at an encoder file written by your own training run.
with open(label_encoder_path, "rb") as f:
    le2 = pickle.load(f)

# === Load and preprocess validation data ===
val_df = pd.read_csv(val_csv_path)
print(" Columns in validation CSV:", val_df.columns.tolist())

# Add [SOURCE] token to text. Sources outside this mapping map to NaN.
val_df['source_token'] = val_df['source'].str.upper().map({
    'REDDIT': '[REDDIT]',
    'TWITTER': '[TWITTER]',
    'YOUTUBE': '[YOUTUBE]'
})
val_df['text'] = val_df['source_token'] + ' ' + val_df['text']

# Only SUBJECTIVE samples are relevant for Level 2
subjective_mask = val_df['level_1'] == 2
val_df = val_df[subjective_mask].copy()

# An unmapped/missing source or a missing text makes the concatenation above
# yield NaN. Left alone, that only surfaces later as an obscure tokenizer
# failure, so stop here and report the offending source values instead.
missing_text = val_df['text'].isna()
if missing_text.any():
    offending_sources = sorted(
        val_df.loc[missing_text, 'source'].astype(str).unique()
    )
    raise ValueError(
        f"{int(missing_text.sum())} subjective validation rows have no usable "
        f"text (missing text or unrecognised source); "
        f"source values on those rows: {offending_sources}"
    )

# Encode the Level-2 targets with the encoder loaded above; true_labels is the
# integer ground truth used by every metric below.
val_df["level_2_enc"] = le2.transform(val_df["level_2"])
true_labels = val_df["level_2_enc"].values

# === Tokenize the validation texts and wrap them in a DataLoader ===
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The whole validation set is encoded in one call: sequences are truncated to
# at most 128 tokens and padded to a common length, returned as PyTorch tensors.
tokenizer_kwargs = dict(
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt',
)
encodings = tokenizer(val_df['text'].tolist(), **tokenizer_kwargs)

# Each dataset item is (input_ids, attention_mask, label). No shuffle argument
# is passed to the loader, so predictions come back in val_df row order.
labels = torch.tensor(true_labels)
val_dataset = torch.utils.data.TensorDataset(
    encodings['input_ids'],
    encodings['attention_mask'],
    labels,
)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32)

# === Inference ===
# Run every fold checkpoint over the same validation loader and keep one
# vector of predicted class ids per fold.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
all_fold_preds = []

for fold, model_path in enumerate(model_paths):
    print(f"\n Fold {fold+1} — loading model")

    # Instantiate the base model with a head sized for the Level-2 classes,
    # then load this fold's fine-tuned weights into it.
    model = DebertaV2ForSequenceClassification.from_pretrained(
        model_name, num_labels=len(le2.classes_)
    )
    fold_state = torch.load(model_path, map_location=device)
    model.load_state_dict(fold_state)
    model.to(device)
    model.eval()

    fold_preds = []
    progress = tqdm(val_loader, desc=f"Predicting Fold {fold+1}")
    with torch.no_grad():
        # The third element of each batch is the label tensor; it is not
        # needed for prediction, so it is ignored here.
        for input_ids, attention_mask, _ in progress:
            outputs = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
            )
            batch_preds = outputs.logits.argmax(dim=1).cpu().numpy()
            fold_preds.extend(batch_preds)

    all_fold_preds.append(np.array(fold_preds))

# === Majority Voting ===
# Stack the per-fold predictions into (n_folds, n_samples) and take, for each
# sample, the class id predicted by the most folds.
fold_votes = np.array(all_fold_preds)

# Depending on the SciPy version, `.mode` comes back shaped (n_samples,) or
# (1, n_samples). reshape(-1) flattens both to 1-D and, unlike squeeze(), keeps
# a single-sample result 1-D so that inverse_transform still accepts it.
# NOTE(review): on ties SciPy returns a single value (the smallest class id in
# current implementations) — confirm if tie handling matters for this task.
ensemble_preds = np.asarray(mode(fold_votes, axis=0).mode).reshape(-1)
pred_labels = le2.inverse_transform(ensemble_preds)

# Save predictions: encoded ids in "preds", decoded class values in
# "pred_labels", written next to the other evaluation outputs.
val_df["preds"] = ensemble_preds
val_df["pred_labels"] = pred_labels
val_df.to_csv(os.path.join(save_dir, "level2_val_predictions.csv"), index=False)

# === Overall Metrics ===
# Headline scores of the majority-vote ensemble on the full validation subset.
acc = accuracy_score(true_labels, ensemble_preds)
f1_weighted, f1_macro, f1_micro = [
    f1_score(true_labels, ensemble_preds, average=averaging)
    for averaging in ('weighted', 'macro', 'micro')
]
prec_macro = precision_score(true_labels, ensemble_preds, average='macro')
recall_macro = recall_score(true_labels, ensemble_preds, average='macro')

# Per-class report: first try with the encoder's full class list as row names;
# if that call raises, fall back to a report over whatever labels occur.
try:
    report = classification_report(
        true_labels,
        ensemble_preds,
        labels=list(range(len(le2.classes_))),
        target_names=list(map(str, le2.classes_)),
        digits=4,
    )
except Exception as err:
    print(" Error generating classification report:", err)
    report = classification_report(true_labels, ensemble_preds, digits=4)

metrics_text = f"""
Ensemble Accuracy:        {acc:.4f}
F1 Score (Weighted):      {f1_weighted:.4f}
F1 Score (Macro):         {f1_macro:.4f}
F1 Score (Micro):         {f1_micro:.4f}
Precision (Macro):        {prec_macro:.4f}
Recall (Macro):           {recall_macro:.4f}

Classification Report:
{report}
"""


# Show the summary in the notebook and keep a copy with the other outputs.
print(metrics_text)
metrics_file_path = os.path.join(save_dir, "metrics.txt")
with open(metrics_file_path, "w") as metrics_file:
    metrics_file.write(metrics_text)

# === Confusion Matrix ===
# Pin the label set to the encoder's classes (same set the classification
# report uses) so the matrix always has one row/column per class, even when a
# class is absent from both the true and the predicted labels, and tick the
# axes with the class names instead of positional indices.
cm_labels = list(range(len(le2.classes_)))
cm_tick_names = [str(c) for c in le2.classes_]
cm = confusion_matrix(true_labels, ensemble_preds, labels=cm_labels)

plt.figure(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=cm_tick_names,
    yticklabels=cm_tick_names,
)
plt.title("Confusion Matrix (Level 2 Ensemble)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.savefig(os.path.join(save_dir, "confusion_matrix.png"))
# Close the figure so it is not rendered again or kept in memory.
plt.close()

# === Platform-wise Evaluation ===
# Repeat the headline metrics and the per-class report separately for the
# rows coming from each source platform. Results are printed only.
print("\nPlatform-wise Evaluation:")
platforms = ["youtube", "reddit", "twitter"]
source_lower = val_df['source'].str.lower()

for platform in platforms:
    # Select this platform's rows, then pull truth and ensemble predictions.
    mask = source_lower == platform
    platform_rows = val_df[mask]
    y_true = platform_rows["level_2_enc"].values
    y_pred = platform_rows["preds"].values

    print(f"\nPlatform: {platform.upper()}")
    if not len(y_true):
        # Nothing to score for this platform.
        print(f"Warning: No samples for {platform.upper()}. Skipping.")
        continue

    print(f"Accuracy:     {accuracy_score(y_true, y_pred):.4f}")
    print(f"F1 Weighted:  {f1_score(y_true, y_pred, average='weighted'):.4f}")
    print(f"F1 Macro:     {f1_score(y_true, y_pred, average='macro'):.4f}")

    # Prefer a report over the encoder's full class list; if that call raises,
    # fall back to a report over the labels that actually occur.
    try:
        platform_report = classification_report(
            y_true,
            y_pred,
            labels=list(range(len(le2.classes_))),
            target_names=list(map(str, le2.classes_)),
            digits=4,
        )
    except Exception as err:
        print("Error generating report:", err)
        platform_report = classification_report(y_true, y_pred, digits=4)

    print(f"Platform Report:\n{platform_report}")

print(f"\nAll outputs saved to: {save_dir}")


✅ Columns in validation CSV: ['text', 'level_1', 'level_2', 'level_3', 'source']





🔁 Fold 1 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 1: 100%|██████████| 29/29 [00:03<00:00,  7.29it/s]



🔁 Fold 2 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 2: 100%|██████████| 29/29 [00:04<00:00,  7.17it/s]



🔁 Fold 3 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 3: 100%|██████████| 29/29 [00:04<00:00,  7.12it/s]



🔁 Fold 4 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 4: 100%|██████████| 29/29 [00:04<00:00,  7.05it/s]



🔁 Fold 5 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 5: 100%|██████████| 29/29 [00:04<00:00,  6.94it/s]



✅ Ensemble Accuracy: 0.8190
🎯 F1 (Weighted): 0.8001
📏 F1 (Macro):    0.6440
📐 F1 (Micro):    0.8190
🎯 Precision (Macro): 0.7272
📌 Recall (Macro):    0.6103

📋 Classification Report:
              precision    recall  f1-score   support

         0.0     0.8363    0.9545    0.8915       637
         1.0     0.8049    0.6439    0.7154       205
         2.0     0.5405    0.2326    0.3252        86

    accuracy                         0.8190       928
   macro avg     0.7272    0.6103    0.6440       928
weighted avg     0.8020    0.8190    0.8001       928



🔎 Platform-wise Evaluation:

📦 Platform: YOUTUBE
Accuracy: 0.8254
F1 Weighted: 0.8154
F1 Macro:    0.6098
Platform Report:
              precision    recall  f1-score   support

         0.0     0.8204    0.9510    0.8809       245
         1.0     0.8730    0.6962    0.7746       158
         2.0     0.2500    0.1333    0.1739        15

    accuracy                         0.8254       418
   macro avg     0.6478    0.5935    0.