In [None]:
import os
import json
import random
import pickle
import time
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from scipy.stats import mode

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import (
    DebertaV2Tokenizer,
    DebertaV2ForSequenceClassification,
    get_scheduler
)

from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    accuracy_score,
    precision_recall_fscore_support,
    roc_auc_score
)
from sklearn.model_selection import StratifiedKFold

import joblib



# -------------------- Configuration --------------------
# Seed handed to set_seed() and to the StratifiedKFold split further down.
SEED = 42
# Mini-batch sizes for the training and validation DataLoaders respectively.
BATCH_SIZE = 16
VAL_BATCH_SIZE = 32
# Intended maximum sequence length in tokens.
# NOTE(review): confirm it is actually forwarded wherever texts are tokenized.
MAX_LENGTH = 128
# Upper bound on training epochs; also used to size the linear LR schedule.
EPOCHS = 3
# Learning rate given to AdamW in train_model_for_level.
LEARNING_RATE = 1e-5
# Number of consecutive epochs without a validation weighted-F1 improvement
# after which training stops early.
PATIENCE = 2
# Hugging Face checkpoint used for the classification models and the tokenizer.
MODEL_NAME = 'microsoft/deberta-v3-small'
# Root directory for every artifact (tokenizer copy, plots, logs, runs).
# NOTE(review): the /content/drive prefix suggests a mounted Google Drive in Colab — confirm.
OUTPUT_DIR = "/content/drive/MyDrive/FIRE/outputs"

# Create the artifact root up front so later writes do not fail on a missing directory.
os.makedirs(OUTPUT_DIR, exist_ok=True)
# -------------------- Seed Setup --------------------
def set_seed(seed=SEED):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with ``seed``.

    cuDNN is additionally switched to deterministic kernels with autotuning
    disabled.

    Args:
        seed: Integer seed; defaults to the module-level ``SEED``.
    """
    # Deterministic cuDNN kernels, no benchmark-driven algorithm selection.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # One call per random number generator used in this file.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Seed all RNGs once, before any model or data loader is created.
set_seed()
# Prefer the GPU when one is visible; every model and batch is moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# -------------------- Tokenizer --------------------
# Load the tokenizer matching MODEL_NAME and store a copy under
# OUTPUT_DIR/tokenizer so later cells can reload it from disk.
tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "tokenizer"))

# -------------------- Dataset --------------------
class CryptoDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Sequence of raw texts (list, NumPy array or pandas Series).
        labels: Sequence of integer class ids aligned with ``texts``.
        tokenizer: Callable accepting ``(text, truncation=, padding=,
            max_length=, return_tensors=)`` and returning a mapping of
            tensors with a leading batch dimension of 1.
        max_length: Length every example is truncated / padded to.

    Each item is a dict holding the tokenizer outputs (batch dimension
    removed) plus a ``'labels'`` entry of dtype ``torch.long``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _positional(seq, idx):
        """Return the ``idx``-th element of ``seq`` by position.

        ``series[idx]`` on a pandas Series is label-based, so a Series whose
        index is not 0..n-1 (e.g. a filtered frame that was not re-indexed)
        would raise ``KeyError`` or return the wrong row. ``.iloc`` is used
        whenever the container offers it.
        """
        indexer = getattr(seq, "iloc", None)
        return seq[idx] if indexer is None else indexer[idx]

    def __getitem__(self, idx):
        text = str(self._positional(self.texts, idx))
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt"
        )
        # The tokenizer returns tensors shaped (1, max_length); drop the batch axis
        # so the collate function can stack items into (batch, max_length).
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self._positional(self.labels, idx), dtype=torch.long)
        return item

def collate_fn(batch):
    """Stack a list of per-sample tensor dicts into one dict of batched tensors.

    The field names are taken from the first sample; every sample is expected
    to provide the same fields with equally shaped tensors.
    """
    collated = {}
    for field in batch[0].keys():
        collated[field] = torch.stack([sample[field] for sample in batch])
    return collated


# -------------------- Confusion Matrix --------------------
def plot_confusion_matrix(labels, preds, classes, title, save_path):
    """Render the confusion matrix of ``preds`` vs ``labels`` and save it.

    Args:
        labels: Ground-truth class ids.
        preds: Predicted class ids.
        classes: Tick labels for both heatmap axes.
        title: Figure title.
        save_path: Destination file for the image.
    """
    counts = confusion_matrix(labels, preds)

    fig = plt.figure(figsize=(8, 6))
    # With no explicit axes, seaborn draws on the current figure's axes
    # and hands them back.
    ax = sns.heatmap(
        counts,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=classes,
        yticklabels=classes,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(title)

    fig.tight_layout()
    fig.savefig(save_path)
    plt.close(fig)

# -------------------- Contrastive Supervision --------------------
def apply_contrastive_supervision(features, labels, temperature=0.1):
    """Supervised contrastive loss over one batch of embeddings.

    Args:
        features: 2-D tensor, one embedding per row; rows are L2-normalized here.
        labels: Class id per row; rows sharing an id are treated as positives.
        temperature: Divisor applied to the cosine similarities.

    Returns:
        Scalar tensor: the negated mean, over all rows, of the average
        log-probability of that row's positives. Rows without any positive
        contribute 0 to the mean.
    """
    unit = F.normalize(features, dim=1)

    # Pairwise "same class" indicator, then knock out the diagonal so a sample
    # is never its own positive and never appears in its own denominator.
    label_col = labels.contiguous().view(-1, 1)
    same_class = (label_col == label_col.T).float().to(unit.device)
    off_diagonal = torch.ones_like(same_class) - torch.eye(same_class.size(0), device=same_class.device)
    positives = same_class * off_diagonal

    scaled_sim = (unit @ unit.T) / temperature
    denominator = (scaled_sim.exp() * off_diagonal).sum(dim=1, keepdim=True)
    # 1e-9 keeps log() and the division finite for degenerate rows.
    log_prob = scaled_sim - (denominator + 1e-9).log()
    per_anchor = (positives * log_prob).sum(dim=1) / (positives.sum(dim=1) + 1e-9)

    return -per_anchor.mean()

# -------------------- Training Losses --------------------
class FocalLoss(nn.Module):
    """Focal loss built on top of ``F.nll_loss``.

    The log-probabilities are scaled by ``(1 - p) ** gamma`` before the
    negative log-likelihood is taken, so confidently predicted classes
    contribute less.

    Args:
        alpha: Scalar multiplier applied to the final loss.
        gamma: Focusing exponent; ``0`` reduces to (weighted) cross-entropy.
        weight: Optional per-class weights forwarded to ``F.nll_loss``.
        reduction: Reduction mode forwarded to ``F.nll_loss``.
    """

    def __init__(self, alpha=1, gamma=2, weight=None, reduction='mean'):
        super().__init__()
        self.alpha, self.gamma = alpha, gamma
        self.weight, self.reduction = weight, reduction

    def forward(self, input, target):
        log_probs = F.log_softmax(input, dim=1)
        # Modulating factor (1 - p)^gamma, computed for every class entry.
        modulating = (1 - log_probs.exp()).pow(self.gamma)
        nll = F.nll_loss(
            modulating * log_probs,
            target,
            weight=self.weight,
            reduction=self.reduction,
        )
        return self.alpha * nll


def dice_loss(logits, targets, smooth=1):
    """Soft Dice loss averaged over classes.

    Args:
        logits: Tensor of shape (batch, num_classes).
        targets: Integer class ids, one per row of ``logits``.
        smooth: Additive smoothing term used in numerator and denominator.

    Returns:
        ``1 - mean(dice_per_class)``, where the per-class Dice coefficient is
        accumulated over the batch dimension.
    """
    class_probs = F.softmax(logits, dim=1)
    one_hot = F.one_hot(targets, num_classes=logits.size(1)).float().to(logits.device)

    # Per-class overlap and total mass, both summed over the batch.
    overlap = torch.sum(class_probs * one_hot, dim=0)
    total_mass = class_probs.sum(dim=0) + one_hot.sum(dim=0)

    per_class_dice = (2. * overlap + smooth) / (total_mass + smooth)
    return 1. - per_class_dice.mean()


def smoothed_cross_entropy(logits, target, smoothing=0.1):
    """Cross-entropy against label-smoothed targets.

    The true class receives probability ``1 - smoothing``; the remaining
    ``smoothing`` mass is split evenly over the other ``num_classes - 1``
    classes.

    Args:
        logits: Tensor of shape (batch, num_classes).
        target: Integer class ids, one per row of ``logits``.
        smoothing: Probability mass moved away from the true class.

    Returns:
        Scalar tensor: mean over the batch of the per-sample loss.
    """
    n_classes = logits.size(1)

    # The soft targets are constants; build them outside the autograd graph.
    with torch.no_grad():
        soft_targets = torch.full_like(logits, smoothing / (n_classes - 1))
        soft_targets.scatter_(1, target.data.unsqueeze(1), 1.0 - smoothing)

    per_sample = -(soft_targets * F.log_softmax(logits, dim=1)).sum(dim=1)
    return per_sample.mean()


# -------------------- Helper --------------------
def get_preds_from_logits(logits):
    """Convert logits to ``(predicted_class_ids, softmax_probabilities)``.

    Both the softmax and the argmax are taken over the last dimension.
    """
    probabilities = torch.softmax(logits, dim=-1)
    return probabilities.argmax(dim=-1), probabilities


def train_model_for_level(
    num_labels, train_loader, val_loader, save_path, level_name="level",
    y_train_labels=None, loss_type="focal+dice", contrastive_weight=0.2, label_smoothing=0.0
):
    """Fine-tune a sequence classifier for one label level and return the best model.

    A fresh ``MODEL_NAME`` checkpoint is trained for at most ``EPOCHS`` epochs.
    After every epoch the model is scored on ``val_loader``; whenever the
    weighted validation F1 improves, the state dict is written to ``save_path``.
    Training stops early after ``PATIENCE`` consecutive epochs without
    improvement. A training-curve plot and a JSON history are written under
    ``OUTPUT_DIR/plots`` and ``OUTPUT_DIR/logs``.

    Args:
        num_labels: Number of target classes.
        train_loader: DataLoader yielding dict batches that include ``'labels'``.
        val_loader: DataLoader with the same batch layout, used for model selection.
        save_path: File the best state dict is written to (and reloaded from).
        level_name: Tag used in progress output and artifact file names.
        y_train_labels: Training labels used for balanced class weights. When
            ``None`` the labels are collected by iterating ``train_loader`` once.
        loss_type: Free-form string; the substrings ``'focal'``, ``'dice'`` and
            ``'contrastive'`` each switch on the corresponding loss term.
        contrastive_weight: Multiplier for the contrastive term.
        label_smoothing: Smoothing for the cross-entropy term. Only used when
            ``'focal'`` is NOT part of ``loss_type``; the smoothed cross-entropy
            does not apply class weights.

    Returns:
        The model with the best checkpoint (by validation weighted F1) loaded.
    """
    # Ensure plot and log directories exist
    os.makedirs(os.path.join(OUTPUT_DIR, "plots"), exist_ok=True)
    os.makedirs(os.path.join(OUTPUT_DIR, "logs"), exist_ok=True)

    model = DebertaV2ForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels).to(device)
    model.gradient_checkpointing_enable()
    # Hidden states are needed for the contrastive term (first-token embedding).
    model.config.output_hidden_states = True

    # Compute class weights
    if y_train_labels is not None:
        class_weights = compute_class_weight('balanced', classes=np.unique(y_train_labels), y=y_train_labels)
    else:
        # Fallback: walks (and therefore tokenizes) the whole training set once.
        all_train_labels = [label.item() for batch in train_loader for label in batch['labels']]
        class_weights = compute_class_weight('balanced', classes=np.unique(all_train_labels), y=all_train_labels)

    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
    focal = FocalLoss(weight=class_weights_tensor)

    # The focal branch takes precedence over label smoothing in the loss
    # selection below; say so instead of dropping the argument silently.
    if label_smoothing > 0 and "focal" in loss_type:
        print(f" Note: label_smoothing={label_smoothing} is ignored because 'focal' is in loss_type.")

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=EPOCHS * len(train_loader))

    # Start below any reachable F1 so the first epoch always writes a checkpoint;
    # otherwise a run whose validation F1 stays at 0 would leave nothing at
    # save_path and the reload at the end would fail.
    best_f1 = float("-inf")
    patience_counter = 0
    train_losses, train_accuracies, train_f1s = [], [], []
    val_accuracies, val_f1s = [], []
    best_metrics = {}

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        all_preds, all_labels = [], []

        for batch in tqdm(train_loader, desc=f"[{level_name}] Epoch {epoch+1}/{EPOCHS}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits

            # ===== Loss Calculation =====
            # Exactly one classification term (focal > smoothed CE > weighted CE),
            # plus the optional dice and contrastive terms.
            loss = 0
            if "focal" in loss_type:
                loss += focal(logits, batch['labels'])
            elif label_smoothing > 0:
                loss += smoothed_cross_entropy(logits, batch['labels'], smoothing=label_smoothing)
            else:
                loss += F.cross_entropy(logits, batch['labels'], weight=class_weights_tensor)

            if "dice" in loss_type:
                loss += dice_loss(logits, batch['labels'])

            if "contrastive" in loss_type:
                # First-token vector of the last hidden layer serves as the sentence embedding.
                hidden_states = outputs.hidden_states[-1][:, 0, :]
                loss += contrastive_weight * apply_contrastive_supervision(hidden_states, batch['labels'])

            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            total_loss += loss.item()
            preds, _ = get_preds_from_logits(logits)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch['labels'].cpu().numpy())

        train_f1 = f1_score(all_labels, all_preds, average='weighted')
        train_acc = accuracy_score(all_labels, all_preds)
        # NOTE: this is the per-epoch SUM of batch losses, not the mean.
        train_losses.append(total_loss)
        train_accuracies.append(train_acc)
        train_f1s.append(train_f1)

        # ===== Validation =====
        model.eval()
        val_preds, val_labels, val_probs = [], [], []
        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                logits = outputs.logits
                preds, probs = get_preds_from_logits(logits)
                val_preds.extend(preds.cpu().numpy())
                val_labels.extend(batch['labels'].cpu().numpy())
                val_probs.extend(probs.cpu().numpy())

        val_f1 = f1_score(val_labels, val_preds, average='weighted')
        val_acc = accuracy_score(val_labels, val_preds)

        # ROC AUC is best-effort: any failure is reported and recorded as None
        # rather than aborting training.
        try:
            val_labels_bin = label_binarize(val_labels, classes=list(range(num_labels)))
            roc_auc = roc_auc_score(val_labels_bin, val_probs, average='macro', multi_class='ovr')
        except Exception as e:
            print(f" ROC AUC calculation failed: {e}")
            roc_auc = None

        print(f" Epoch {epoch+1} | Loss: {total_loss:.4f} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f} | Train F1: {train_f1:.4f} | Val F1: {val_f1:.4f}")

        val_accuracies.append(val_acc)
        val_f1s.append(val_f1)

        if val_f1 > best_f1:
            best_f1 = val_f1
            patience_counter = 0

            #  Save best checkpoint
            with open(save_path, "wb") as f:
                torch.save(model.state_dict(), f)

            precision, recall, f1_metric, _ = precision_recall_fscore_support(val_labels, val_preds, average='weighted')
            best_metrics = {
                "val_precision_weighted": precision,
                "val_recall_weighted": recall,
                "val_f1_weighted": f1_metric,
                "roc_auc": roc_auc
            }
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                print(" Early stopping.")
                break

    # ===== Plot Training Curve =====
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label="Train Loss")
    plt.plot(train_f1s, label="Train F1")
    plt.plot(train_accuracies, label="Train Accuracy")
    plt.plot(val_f1s, label="Val F1")
    plt.plot(val_accuracies, label="Val Accuracy")
    plt.legend()
    plt.grid(True)
    plt.title(f" Training Curve - {level_name}")
    plt.savefig(f"{OUTPUT_DIR}/plots/loss_f1_curve_{level_name}.png")
    plt.close()

    # ===== Save Training Log =====
    history = {
        "train_losses": train_losses,
        "train_accuracies": train_accuracies,
        "train_f1s": train_f1s,
        "val_f1s": val_f1s,
        "val_accuracies": val_accuracies,
        **best_metrics
    }
    with open(f"{OUTPUT_DIR}/logs/history_{level_name}.json", "w") as f:
        json.dump(history, f, indent=4)

    #  Return the best model (loaded from disk)
    # map_location keeps the reload on the device in use, as load_model_for_inference does.
    model.load_state_dict(torch.load(save_path, map_location=device))
    return model




# -------------------- Evaluation --------------------
def evaluate_saved_model(model_path, dataloader, num_labels):
    """Load a saved checkpoint, score it on ``dataloader`` and print the report.

    Args:
        model_path: Path to a state dict saved with ``torch.save``.
        dataloader: DataLoader yielding dict batches that include ``'labels'``.
        num_labels: Number of classes the checkpoint was trained with.

    Returns:
        The classification report text (also printed).
    """
    # Reuse the shared loader: it maps the checkpoint onto the active device, so
    # a checkpoint written on a GPU machine can also be evaluated on CPU.
    model = load_model_for_inference(num_labels, model_path, device)

    preds, labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            preds.extend(torch.argmax(outputs.logits, dim=-1).cpu().numpy())
            labels.extend(batch['labels'].cpu().numpy())

    report = classification_report(labels, preds, digits=4)
    print(report)
    return report

def load_model_for_inference(num_labels, path, device):
    """Build a ``MODEL_NAME`` classifier, load the weights at ``path`` and return it in eval mode.

    Args:
        num_labels: Size of the classification head.
        path: Checkpoint file holding a state dict.
        device: Target device; also used as ``map_location`` when reading the file.
    """
    classifier = DebertaV2ForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=num_labels,
    )
    weights = torch.load(path, map_location=device)
    classifier.load_state_dict(weights)
    classifier.to(device)
    classifier.eval()
    return classifier


def visualize_model_performance(true_labels, pred_labels, class_names, title, save_path):
    """
    Visualizes classification performance using a confusion matrix and saves the classification report.

    The report is printed and written next to the image as
    ``<save_path without extension>_report.txt``.

    Args:
        true_labels (list or np.array): Ground truth labels.
        pred_labels (list or np.array): Predicted labels from the ensemble.
        class_names (list): List of class label names (e.g., label_encoder.classes_).
        title (str): Title for the plot and report.
        save_path (str): Path to save the confusion matrix image.
    """
    from sklearn.metrics import classification_report

    # Generate classification report
    report = classification_report(true_labels, pred_labels, target_names=class_names, digits=4)
    print(report)

    # Save report
    # Derive the name from the extension only: str.replace(".png", ...) would also
    # rewrite ".png" inside directory names and, for a non-.png image path, would
    # leave the report path equal to the image path.
    report_path = os.path.splitext(save_path)[0] + "_report.txt"
    with open(report_path, "w") as f:
        f.write(report)

    # Plot confusion matrix
    plot_confusion_matrix(true_labels, pred_labels, class_names, title, save_path)


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

In [None]:
import os
import pickle
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, WeightedRandomSampler
import torch
from transformers import DebertaV2Tokenizer
from scipy.stats import mode

# ==== Assumed Pre-defined: SEED, OUTPUT_DIR, BATCH_SIZE, VAL_BATCH_SIZE, CryptoDataset, collate_fn,
# train_model_for_level, load_model_for_inference, visualize_model_performance ====

# === FIXED PATH FOR LEVEL 1 ENCODER ===
# The Level-1 label encoder is written to this fixed run directory (assumed to
# already exist), not to the timestamped run_dir created below.
LEVEL1_ENCODER_PATH = "/content/drive/MyDrive/FIRE/outputs/run_20250628_034630/encoders/label_encoder_level_1.pkl"

if __name__ == "__main__":
    # Timestamped directory tree for this session's artifacts.
    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = os.path.join(OUTPUT_DIR, f"run_{run_id}")
    os.makedirs(run_dir, exist_ok=True)
    for subfolder in ["models", "plots", "logs", "encoders", "ensembles"]:
        os.makedirs(os.path.join(run_dir, subfolder), exist_ok=True)

    # === Load Data ===
    train_df = pd.read_csv("/content/drive/MyDrive/FIRE/crypto_task1_train.csv")
    val_df = pd.read_csv("/content/drive/MyDrive/FIRE/crypto_task1_val.csv")

    print(f"\n Train size: {len(train_df)}, Validation size: {len(val_df)}")
    print(" Train Source Distribution:\n", train_df['source'].value_counts())
    print("Validation Source Distribution:\n", val_df['source'].value_counts())

    # === Add [SOURCE] token ===
    def add_source_token(df):
        """Return a copy of ``df`` whose ``text`` is prefixed with a platform tag.

        The tag is looked up from the upper-cased ``source`` column; a source
        outside the three known platforms yields a missing tag (and thus a
        missing text) for that row.
        """
        df = df.copy()
        df['source_token'] = df['source'].str.upper().map({
            'REDDIT': '[REDDIT]',
            'TWITTER': '[TWITTER]',
            'YOUTUBE': '[YOUTUBE]'
        })
        df['text'] = df['source_token'] + ' ' + df['text']
        return df

    train_df = add_source_token(train_df)
    val_df = add_source_token(val_df)

    # Reload the tokenizer copy that was saved under OUTPUT_DIR/tokenizer.
    tokenizer = DebertaV2Tokenizer.from_pretrained(os.path.join(OUTPUT_DIR, "tokenizer"))

    def save_ensemble_model(preds_list, save_path):
        """Persist the per-model prediction arrays as a single ``.npy`` file."""
        np.save(save_path, np.array(preds_list))

    def save_platform_reports(val_sources, platforms, true_labels, majority_preds, label_classes, level_num):
        """Print and save one classification report per platform under ``run_dir/logs``."""
        for platform in platforms:
            mask = val_sources == platform
            platform_true = true_labels[mask]
            platform_pred = majority_preds[mask]
            report = classification_report(platform_true, platform_pred, target_names=label_classes, digits=4)
            print(f"\n Level {level_num} - Platform: {platform}")
            print(report)
            with open(f"{run_dir}/logs/level{level_num}_report_{platform}.txt", "w") as f:
                f.write(report)

    # -------- LEVEL 1 --------
    le1 = LabelEncoder()
    train_df['level_1_enc'] = le1.fit_transform(train_df['level_1'])
    val_df['level_1_enc'] = le1.transform(val_df['level_1'])

    # Save label encoder to specified path (context manager closes the handle).
    with open(LEVEL1_ENCODER_PATH, "wb") as f:
        pickle.dump(le1, f)

    skf1 = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(skf1.split(train_df, train_df['level_1_enc'])):
        if fold != 4:
            continue  #  Skip all folds except Fold 5

        print(f"\n Level 1 Fold {fold + 1}/5")
        train_fold_df = train_df.iloc[train_idx].reset_index(drop=True)
        val_fold_df = train_df.iloc[val_idx].reset_index(drop=True)

        # Per-sample sampling weights by platform. The CSV stores `source` in lower
        # case while the table is keyed in upper case, so normalise the case first;
        # without that every lookup misses and all weights fall back to 1.0.
        source_weights = train_fold_df['source'].str.upper().map({
            'YOUTUBE': 2.5,
            'REDDIT': 2.5,
            'TWITTER': 6.5
        }).fillna(1.0).astype(float).values
        source_weights = torch.tensor(source_weights, dtype=torch.double)
        sampler = WeightedRandomSampler(source_weights, num_samples=len(source_weights), replacement=True)

        # Tokenize to the configured MAX_LENGTH instead of the dataset default of
        # 512, matching the 128-token limit used when the folds are evaluated.
        train_loader = DataLoader(
            CryptoDataset(train_fold_df['text'], train_fold_df['level_1_enc'], tokenizer, max_length=MAX_LENGTH),
            batch_size=BATCH_SIZE, sampler=sampler, collate_fn=collate_fn)
        val_loader = DataLoader(
            CryptoDataset(val_fold_df['text'], val_fold_df['level_1_enc'], tokenizer, max_length=MAX_LENGTH),
            batch_size=VAL_BATCH_SIZE, collate_fn=collate_fn)

        model_path = "/content/drive/MyDrive/FIRE/outputs/run_20250628_034630/models/level1_fold5.pth"
        # NOTE: with 'focal' in loss_type the label_smoothing argument has no
        # effect inside train_model_for_level.
        model = train_model_for_level(
            num_labels=len(le1.classes_),
            train_loader=train_loader,
            val_loader=val_loader,
            save_path=model_path,
            level_name="level1_fold5",
            y_train_labels=train_fold_df['level_1_enc'],
            loss_type="focal+dice+contrastive",
            label_smoothing=0.1
        )



✅ Train size: 12859, Validation size: 1429
🔎 Train Source Distribution:
 source
reddit     4500
youtube    4500
twitter    3859
Name: count, dtype: int64
🔎 Validation Source Distribution:
 source
reddit     500
youtube    500
twitter    429
Name: count, dtype: int64

📂 Level 1 Fold 5/5


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[level1_fold5] Epoch 1/3: 100%|██████████| 643/643 [14:32<00:00,  1.36s/it]


📊 Epoch 1 | Loss: 815.7788 | Train Acc: 0.7367 | Val Acc: 0.8044 | Train F1: 0.7459 | Val F1: 0.8076


[level1_fold5] Epoch 2/3: 100%|██████████| 643/643 [14:31<00:00,  1.36s/it]


📊 Epoch 2 | Loss: 640.2732 | Train Acc: 0.8406 | Val Acc: 0.8285 | Train F1: 0.8456 | Val F1: 0.8317


[level1_fold5] Epoch 3/3: 100%|██████████| 643/643 [14:31<00:00,  1.36s/it]


📊 Epoch 3 | Loss: 580.1549 | Train Acc: 0.8688 | Val Acc: 0.8312 | Train F1: 0.8724 | Val F1: 0.8360


  plt.savefig(f"{OUTPUT_DIR}/plots/loss_f1_curve_{level_name}.png")


In [None]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score,
    precision_score,
    recall_score
)
from scipy.stats import mode
from transformers import DebertaV2ForSequenceClassification, AutoTokenizer
import pickle

# === CONFIG ===
# Evaluates a 5-fold Level-1 ensemble on the validation CSV: every fold model
# predicts the whole set, the folds are combined by majority vote, and overall
# plus per-platform metrics are reported.
model_paths = [
    "/content/drive/MyDrive/FIRE/outputs/run_20250628_034630/models/level1_fold1.pth",
    "/content/drive/MyDrive/FIRE/outputs/run_20250628_034630/models/level1_fold2.pth",
    "/content/drive/MyDrive/FIRE/outputs/run_20250628_034630/models/level1_fold3.pth",
    "/content/drive/MyDrive/FIRE/outputs/run_20250628_034630/models/level1_fold4.pth",
    "/content/drive/MyDrive/FIRE/outputs/run_20250628_034630/models/level1_fold5.pth",
]
model_name = "microsoft/deberta-v3-small"
label_encoder_path = "/content/drive/MyDrive/FIRE/outputs/run_20250628_034630/encoders/label_encoder_level_1.pkl"
val_csv_path = "/content/drive/MyDrive/FIRE/crypto_task1_val.csv"
save_dir = "/content/drive/MyDrive/FIRE/outputs/ensemble_level1_eval"
os.makedirs(save_dir, exist_ok=True)

# === Load label encoder ===
# NOTE: pickle executes code on load; only point this at files you produced yourself.
with open(label_encoder_path, "rb") as f:
    le1 = pickle.load(f)

# Encoded class ids and their display names, shared by all reports below.
class_ids = list(range(len(le1.classes_)))
class_names = [str(c) for c in le1.classes_]

# === Load and preprocess validation data ===
val_df = pd.read_csv(val_csv_path)
print(" Columns in validation CSV:", val_df.columns.tolist())

# Add [SOURCE] token to text
val_df['source_token'] = val_df['source'].str.upper().map({
    'REDDIT': '[REDDIT]',
    'TWITTER': '[TWITTER]',
    'YOUTUBE': '[YOUTUBE]'
})
val_df['text'] = val_df['source_token'] + ' ' + val_df['text']

# Encode labels
if 'level_1_enc' not in val_df.columns:
    if 'level_1' in val_df.columns:
        val_df['level_1_enc'] = le1.transform(val_df['level_1'])
    else:
        raise ValueError(" 'level_1_enc' or 'level_1' must be present in CSV.")

# Tokenize
# The whole validation set is tokenized once, padded to the longest example
# (capped at 128 tokens), and reused for every fold.
tokenizer = AutoTokenizer.from_pretrained(model_name)
encodings = tokenizer(
    list(val_df['text']),
    padding=True,
    truncation=True,
    return_tensors='pt',
    max_length=128
)

labels = torch.tensor(val_df['level_1_enc'].values)
true_labels = labels.numpy()

val_dataset = torch.utils.data.TensorDataset(
    encodings['input_ids'], encodings['attention_mask'], labels
)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32)

# === Inference ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
all_fold_preds = []

for fold, model_path in enumerate(model_paths):
    print(f"\n Fold {fold+1} — loading model")
    model = DebertaV2ForSequenceClassification.from_pretrained(
        model_name, num_labels=len(le1.classes_)
    )
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    fold_preds = []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Predicting Fold {fold+1}"):
            input_ids, attention_mask, _ = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            fold_preds.extend(preds)

    all_fold_preds.append(np.array(fold_preds))

# === Majority Voting ===
# Column-wise mode over the (n_folds, n_samples) matrix. reshape(-1) always
# yields a 1-D vector: it removes the leading axis some SciPy versions keep and,
# unlike squeeze(), does not collapse a single-sample result to a 0-d array.
ensemble_preds = np.asarray(mode(np.array(all_fold_preds), axis=0).mode).reshape(-1)
pred_labels = le1.inverse_transform(ensemble_preds)

# Save predictions
val_df["preds"] = ensemble_preds
val_df["pred_labels"] = pred_labels
val_df.to_csv(os.path.join(save_dir, "level1_val_predictions.csv"), index=False)

# === Overall Metrics ===
acc = accuracy_score(true_labels, ensemble_preds)
f1_weighted = f1_score(true_labels, ensemble_preds, average='weighted')
f1_macro = f1_score(true_labels, ensemble_preds, average='macro')
f1_micro = f1_score(true_labels, ensemble_preds, average='micro')
prec_macro = precision_score(true_labels, ensemble_preds, average='macro')
recall_macro = recall_score(true_labels, ensemble_preds, average='macro')

# Prefer the report with named classes; fall back to the default one if that fails.
try:
    report = classification_report(
        true_labels,
        ensemble_preds,
        labels=class_ids,
        target_names=class_names,
        digits=4
    )
except Exception as e:
    print(" Error generating classification report:", e)
    report = classification_report(true_labels, ensemble_preds, digits=4)

metrics_text = f"""
 Ensemble Accuracy: {acc:.4f}
 F1 (Weighted): {f1_weighted:.4f}
 F1 (Macro):    {f1_macro:.4f}
 F1 (Micro):    {f1_micro:.4f}
 Precision (Macro): {prec_macro:.4f}
 Recall (Macro):    {recall_macro:.4f}

 Classification Report:
{report}
"""

print(metrics_text)
with open(os.path.join(save_dir, "metrics.txt"), "w") as f:
    f.write(metrics_text)

# === Confusion Matrix ===
# Passing labels= pins the matrix to one row/column per encoder class even if a
# class is absent from this split, so the class-name ticks always line up.
plt.figure(figsize=(6, 5))
cm = confusion_matrix(true_labels, ensemble_preds, labels=class_ids)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title("Confusion Matrix (Level 1 Ensemble)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.savefig(os.path.join(save_dir, "confusion_matrix.png"))
plt.close()

# ===  Platform-wise Evaluation — using val_df['source'].str.lower()
print("\n Platform-wise Evaluation:")
platforms = ["youtube", "reddit", "twitter"]

for platform in platforms:
    # Convert to a plain boolean array before indexing the NumPy vectors so the
    # selection is purely positional.
    mask = (val_df['source'].str.lower() == platform).to_numpy()
    y_true = true_labels[mask]
    y_pred = ensemble_preds[mask]

    print(f"\n Platform: {platform.upper()}")
    if len(y_true) == 0:
        print(f" No samples for {platform.upper()}. Skipping.")
        continue

    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"F1 Weighted: {f1_score(y_true, y_pred, average='weighted'):.4f}")
    print(f"F1 Macro:    {f1_score(y_true, y_pred, average='macro'):.4f}")

    try:
        platform_report = classification_report(
            y_true, y_pred,
            labels=class_ids,
            target_names=class_names,
            digits=4
        )
    except Exception as e:
        print(" Error generating report:", e)
        platform_report = classification_report(y_true, y_pred, digits=4)

    print(f"Platform Report:\n{platform_report}")

print(f"\n All outputs saved to: {save_dir}")


✅ Columns in validation CSV: ['text', 'level_1', 'level_2', 'level_3', 'source']





🔁 Fold 1 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 1: 100%|██████████| 45/45 [00:05<00:00,  7.65it/s]



🔁 Fold 2 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 2: 100%|██████████| 45/45 [00:05<00:00,  7.63it/s]



🔁 Fold 3 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 3: 100%|██████████| 45/45 [00:05<00:00,  7.67it/s]



🔁 Fold 4 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 4: 100%|██████████| 45/45 [00:05<00:00,  7.64it/s]



🔁 Fold 5 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 5: 100%|██████████| 45/45 [00:05<00:00,  7.62it/s]



✅ Ensemble Accuracy: 0.8383
🎯 F1 (Weighted): 0.8424
📏 F1 (Macro):    0.7832
📐 F1 (Micro):    0.8383
🎯 Precision (Macro): 0.7669
📌 Recall (Macro):    0.8122

📋 Classification Report:
              precision    recall  f1-score   support

           0     0.7074    0.6871    0.6971       278
           1     0.6457    0.8744    0.7429       223
           2     0.9475    0.8750    0.9098       928

    accuracy                         0.8383      1429
   macro avg     0.7669    0.8122    0.7832      1429
weighted avg     0.8537    0.8383    0.8424      1429



🔎 Platform-wise Evaluation:

📦 Platform: YOUTUBE
Accuracy: 0.9080
F1 Weighted: 0.9077
F1 Macro:    0.6915
Platform Report:
              precision    recall  f1-score   support

           0     0.7250    0.7342    0.7296        79
           1     0.5000    0.3333    0.4000         3
           2     0.9450    0.9450    0.9450       418

    accuracy                         0.9080       500
   macro avg     0.7233    0.6708    0.