In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# === 1. Load datasets ===
# One training CSV per platform. The absolute paths point under
# /content/drive/MyDrive (presumably a Colab-mounted Google Drive — adjust
# when running elsewhere).
reddit_path = "/content/drive/MyDrive/FIRE/CRYPTO-REDDIT-OPINION-TRAIN.csv"
twitter_path = "/content/drive/MyDrive/FIRE/CRYPTO-TWITTER-OPINION-TRAIN.csv"
youtube_path = "/content/drive/MyDrive/FIRE/CRYPTO-YOUTUBE-OPINION-TRAIN.csv"

# The frames still carry their platform-specific column names here; the steps
# below normalize and unify them.
df_reddit = pd.read_csv(reddit_path)
df_twitter = pd.read_csv(twitter_path)
df_youtube = pd.read_csv(youtube_path)

# === 2. Normalize column names ===
def normalize_columns(df):
    """Lower-case and whitespace-trim every column label of ``df``.

    The frame is modified in place and also returned, so the call can be used
    either as a statement or in an assignment.
    """
    lowered_labels = df.columns.str.lower()
    df.columns = lowered_labels.str.strip()
    return df

# Normalize the column labels of every platform frame.
df_reddit, df_twitter, df_youtube = (
    normalize_columns(frame) for frame in (df_reddit, df_twitter, df_youtube)
)

# Shared by all platforms: the label columns lose their space, and each frame
# is reduced to the same five columns so the frames can be stacked.
LEVEL_RENAMES = {f'level {n}': f'level_{n}' for n in (1, 2, 3)}
UNIFIED_COLUMNS = ['text', 'level_1', 'level_2', 'level_3', 'source']

# === 3. Standardize Reddit ===
# Reddit has no single text column: title, selftext and main are joined with
# single spaces (missing parts count as empty) and the result is trimmed.
reddit_title, reddit_selftext, reddit_main = (
    df_reddit[part].fillna('') for part in ('title', 'selftext', 'main')
)
df_reddit['text'] = (reddit_title + ' ' + reddit_selftext + ' ' + reddit_main).str.strip()
df_reddit = df_reddit.rename(columns=LEVEL_RENAMES)
df_reddit['source'] = 'reddit'
df_reddit = df_reddit[UNIFIED_COLUMNS]

# === 4. Standardize Twitter ===
df_twitter = df_twitter.rename(columns={'tweet': 'text', **LEVEL_RENAMES})
df_twitter['source'] = 'twitter'
df_twitter = df_twitter[UNIFIED_COLUMNS]

# === 5. Standardize YouTube ===
df_youtube = df_youtube.rename(columns={'comment': 'text', **LEVEL_RENAMES})
df_youtube['source'] = 'youtube'
df_youtube = df_youtube[UNIFIED_COLUMNS]

# === 6. Combine all datasets ===
platform_frames = [df_reddit, df_twitter, df_youtube]
df_all = pd.concat(platform_frames, ignore_index=True)

# === 7. Filter/clean valid samples ===
def preprocess_task1_dataset(df):
    """Keep only the rows whose label hierarchy is complete enough to train on.

    A row is kept when:
      * ``level_1`` is 0 or 1, or
      * ``level_1`` is 2 and ``level_2`` is 1 or 2, or
      * ``level_1`` is 2, ``level_2`` is 0 and ``level_3`` holds an integer.

    Every other row is dropped, including rows whose ``level_1`` is missing
    or cannot be converted with ``int()``.

    Args:
        df: DataFrame with ``level_1``, ``level_2`` and ``level_3`` columns.
            A missing column is treated as all-missing values.

    Returns:
        A new DataFrame holding the kept rows, with a fresh 0..n-1 index.
        An empty input yields an empty frame with the same columns.
    """
    def parse_level(value):
        """Return ``value`` as an int, or None if it is missing or unparseable."""
        try:
            if pd.isna(value):
                return None
            return int(value)
        # Only conversion failures are swallowed; a bare ``except`` would also
        # hide KeyboardInterrupt and genuine programming errors.
        except (TypeError, ValueError, OverflowError):
            return None

    def should_keep(row):
        lvl1 = parse_level(row.get('level_1'))
        if lvl1 in (0, 1):
            return True
        if lvl1 != 2:
            return False

        lvl2 = parse_level(row.get('level_2'))
        if lvl2 == 0:
            # Only the level_2 == 0 branch requires a third-level label.
            return parse_level(row.get('level_3')) is not None
        return lvl2 in (1, 2)

    # Build the mask explicitly: DataFrame.apply(axis=1) does not reliably
    # return a boolean Series when the frame has no rows.
    keep_flags = [should_keep(row) for _, row in df.iterrows()]
    keep_mask = pd.Series(keep_flags, index=df.index, dtype=bool).to_numpy()
    return df[keep_mask].reset_index(drop=True)

# Drop the rows whose label hierarchy is incomplete.
df_cleaned = preprocess_task1_dataset(df_all)

# === 7.1 Sanitize column types ===
# Coerce the label columns to numeric; anything unparseable becomes NaN.
for col in ['level_1', 'level_2', 'level_3']:
    df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')

# level_1 is mandatory, so rows without it are removed before the int cast.
df_cleaned = df_cleaned[df_cleaned['level_1'].notna()].reset_index(drop=True)
df_cleaned['level_1'] = df_cleaned['level_1'].astype(int)
# level_2 / level_3 may legitimately be missing, so only their non-null cells are cast.
# NOTE(review): while NaNs remain in these columns the column dtype presumably
# stays float64, so these two assignments look like no-ops — verify, and
# consider the nullable 'Int64' dtype if integer labels are required.
df_cleaned.loc[df_cleaned['level_2'].notna(), 'level_2'] = df_cleaned.loc[df_cleaned['level_2'].notna(), 'level_2'].astype(int)
df_cleaned.loc[df_cleaned['level_3'].notna(), 'level_3'] = df_cleaned.loc[df_cleaned['level_3'].notna(), 'level_3'].astype(int)

# Row counts before and after filtering, as a quick sanity check.
print("\n✅ Total before cleaning:", len(df_all))
print("✅ Total after cleaning:", len(df_cleaned))

# === 8. Hierarchical Stratified Split per source ===
def hierarchical_split_by_source(df, test_size=0.1, random_state=42):
    """Split ``df`` into train and validation frames, one split per source.

    Each distinct value of ``df['source']`` is split on its own with
    ``train_test_split``, stratified on ``level_1``, so both outputs keep the
    per-source ``level_1`` proportions. The per-source pieces are then
    concatenated, shuffled and re-indexed.

    Despite the name, only ``level_1`` drives the stratification; ``level_2``
    and ``level_3`` are not consulted. (Earlier placeholder checks on those
    columns did nothing and have been removed.)

    Args:
        df: DataFrame with at least ``source`` and ``level_1`` columns.
        test_size: Fraction (or absolute count) of each source sent to validation.
        random_state: Seed used for both the split and the final shuffle.

    Returns:
        ``(train_df, val_df)``, each with a fresh 0..n-1 index. An input with
        no rows yields two empty frames with the same columns.

    Raises:
        ValueError: propagated from ``train_test_split``, e.g. when a source
            has a ``level_1`` class with too few rows to stratify.
    """
    train_parts, val_parts = [], []
    for source in df['source'].unique():
        df_src = df[df['source'] == source]
        train_src, val_src = train_test_split(
            df_src, test_size=test_size, stratify=df_src['level_1'], random_state=random_state
        )
        train_parts.append(train_src)
        val_parts.append(val_src)

    # pd.concat([]) raises, so an input without any rows is handled explicitly.
    if not train_parts:
        return df.iloc[0:0].copy(), df.iloc[0:0].copy()

    train_df = pd.concat(train_parts).sample(frac=1, random_state=random_state).reset_index(drop=True)
    val_df = pd.concat(val_parts).sample(frac=1, random_state=random_state).reset_index(drop=True)
    return train_df, val_df

# Hold out 10% of every source for validation.
train_df, val_df = hierarchical_split_by_source(df_cleaned, test_size=0.1)

# === 9. Save cleaned + split data ===
# Written without the index so the files read back with a default RangeIndex.
train_df.to_csv("/content/drive/MyDrive/FIRE/crypto_task1_train.csv", index=False)
val_df.to_csv("/content/drive/MyDrive/FIRE/crypto_task1_val.csv", index=False)

# NOTE(review): the message says "Balanced", but the visible split only
# stratifies on level_1 per source; no class re-balancing happens here.
print("\n✅ Balanced Train/Val CSVs saved.")
print("Train size:", len(train_df))
print("Val size:", len(val_df))
print("Train source breakdown:\n", train_df['source'].value_counts())
print("Val source breakdown:\n", val_df['source'].value_counts())


In [None]:
import os
import json
import random
import pickle
import time
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from scipy.stats import mode

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import (
    DebertaV2Tokenizer,
    DebertaV2ForSequenceClassification,
    get_scheduler
)

from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    accuracy_score,
    precision_recall_fscore_support,
    roc_auc_score
)
from sklearn.model_selection import StratifiedKFold

import joblib



# -------------------- Configuration --------------------
SEED = 42  # default seed for set_seed()
BATCH_SIZE = 16
VAL_BATCH_SIZE = 32
# NOTE(review): MAX_LENGTH is not referenced by any code visible in this chunk;
# CryptoDataset falls back to its own default of 512 unless max_length is
# passed explicitly — confirm which limit is intended.
MAX_LENGTH = 128
EPOCHS = 3  # upper bound on epochs in train_model_for_level
LEARNING_RATE = 1e-5  # AdamW learning rate in train_model_for_level
PATIENCE = 2  # epochs without a validation-F1 improvement before early stopping
MODEL_NAME = 'microsoft/deberta-v3-small'
OUTPUT_DIR = "/content/drive/MyDrive/FIRE/outputs"

# Create the output root up front so later saves do not fail on a missing directory.
os.makedirs(OUTPUT_DIR, exist_ok=True)
# -------------------- Seed Setup --------------------
def set_seed(seed=SEED):
    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs.

    Also sets cuDNN's ``deterministic`` flag and clears its ``benchmark`` flag.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed every RNG once, with the default SEED, before any model or data work.
set_seed()
# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# -------------------- Tokenizer --------------------
# Load the tokenizer for MODEL_NAME and store a copy under OUTPUT_DIR/tokenizer
# so it can be reloaded from there later.
tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "tokenizer"))

# -------------------- Dataset --------------------
class CryptoDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and attaches its label.

    Texts and labels are copied into plain lists at construction time so that
    ``dataset[i]`` always means "the i-th sample". Indexing a pandas Series
    with ``series[i]`` is a label lookup, which fails (or returns the wrong
    row) for a Series whose index is not exactly 0..n-1, e.g. a filtered or
    shuffled frame that was not re-indexed.

    Args:
        texts: Sequence (list, array or pandas Series) of raw texts.
        labels: Sequence of integer class ids aligned with ``texts``.
        tokenizer: Callable accepting ``text``, ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` and returning a mapping of
            tensors with a leading batch dimension of 1.
        max_length: Length every item is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs (batch dim removed) plus a ``labels`` tensor."""
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt"
        )
        # return_tensors="pt" adds a leading batch dimension of 1; drop it so
        # the collate function can stack the items.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

def collate_fn(batch):
    """Stack a list of per-sample tensor dicts into one dict of batched tensors.

    The field names are taken from the first sample; every sample is expected
    to provide the same fields with stackable shapes.
    """
    batched = {}
    for field in batch[0].keys():
        batched[field] = torch.stack([sample[field] for sample in batch])
    return batched


# -------------------- Confusion Matrix --------------------
def plot_confusion_matrix(labels, preds, classes, title, save_path):
    """Render the confusion matrix of ``labels`` vs ``preds`` as a heatmap and save it.

    ``classes`` supplies the tick labels for both axes; the figure is written
    to ``save_path`` and then closed.
    """
    matrix = confusion_matrix(labels, preds)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=classes,
        yticklabels=classes,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(save_path)
    plt.close(fig)

# -------------------- Contrastive Supervision --------------------
def apply_contrastive_supervision(features, labels, temperature=0.1):
    """Supervised contrastive loss over one batch of embeddings.

    Rows of ``features`` are L2-normalized, pairwise similarities are divided
    by ``temperature``, and for each anchor the mean log-probability of the
    other samples sharing its label (self excluded) is computed. The negated
    mean over all anchors is returned; an anchor with no same-label partner
    contributes zero.
    """
    unit_features = F.normalize(features, dim=1)
    scaled_similarity = torch.matmul(unit_features, unit_features.T) / temperature

    label_column = labels.contiguous().view(-1, 1)
    same_label = torch.eq(label_column, label_column.T).float().to(unit_features.device)

    # 1 everywhere except the diagonal, so a sample is never compared with itself.
    off_diagonal = torch.ones_like(same_label) - torch.eye(same_label.size(0), device=same_label.device)
    positive_pairs = same_label * off_diagonal

    denominator = (torch.exp(scaled_similarity) * off_diagonal).sum(1, keepdim=True)
    log_prob = scaled_similarity - torch.log(denominator + 1e-9)
    per_anchor = (positive_pairs * log_prob).sum(1) / (positive_pairs.sum(1) + 1e-9)

    return -per_anchor.mean()

# -------------------- Training Losses --------------------
class FocalLoss(nn.Module):
    """Focal loss built on ``F.nll_loss``.

    Each class log-probability is scaled by ``(1 - p) ** gamma`` before the
    negative log-likelihood is taken, which down-weights well-classified
    samples. ``weight`` and ``reduction`` are forwarded to ``F.nll_loss`` and
    the result is multiplied by ``alpha``.
    """

    def __init__(self, alpha=1, gamma=2, weight=None, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.weight = weight
        self.reduction = reduction

    def forward(self, input, target):
        log_probs = F.log_softmax(input, dim=1)
        modulating_factor = (1 - log_probs.exp()) ** self.gamma
        focal_log_probs = modulating_factor * log_probs
        nll = F.nll_loss(focal_log_probs, target, weight=self.weight, reduction=self.reduction)
        return self.alpha * nll


def dice_loss(logits, targets, smooth=1):
    """Soft Dice loss averaged over classes.

    Softmax probabilities are compared with the one-hot targets; a Dice
    coefficient is computed per class across the batch (with ``smooth`` added
    to numerator and denominator) and ``1 - mean`` of those is returned.
    """
    n_classes = logits.size(1)
    class_probs = F.softmax(logits, dim=1)
    one_hot = F.one_hot(targets, num_classes=n_classes).float().to(logits.device)

    overlap = (class_probs * one_hot).sum(dim=0)
    total_mass = class_probs.sum(dim=0) + one_hot.sum(dim=0)
    per_class_dice = (2. * overlap + smooth) / (total_mass + smooth)
    return 1. - per_class_dice.mean()


def smoothed_cross_entropy(logits, target, smoothing=0.1):
    """Cross-entropy against label-smoothed targets, averaged over the batch.

    The true class receives probability ``1 - smoothing`` and every other
    class receives ``smoothing / (num_classes - 1)``.
    """
    n_classes = logits.size(1)
    off_value = smoothing / (n_classes - 1)
    on_value = 1.0 - smoothing
    with torch.no_grad():
        # The soft targets are constants; no gradient flows through them.
        soft_targets = torch.full_like(logits, off_value)
        soft_targets.scatter_(1, target.unsqueeze(1), on_value)
    log_probs = F.log_softmax(logits, dim=1)
    per_sample = (-soft_targets * log_probs).sum(dim=1)
    return per_sample.mean()


# -------------------- Helper --------------------
def get_preds_from_logits(logits):
    """Convert logits to ``(predicted class indices, softmax probabilities)``.

    Both are computed along the last dimension.
    """
    probabilities = logits.softmax(dim=-1)
    return probabilities.argmax(dim=-1), probabilities


def train_model_for_level(
    num_labels, train_loader, val_loader, save_path, level_name="level",
    y_train_labels=None, loss_type="focal+dice", contrastive_weight=0.2, label_smoothing=0.0
):
    """Fine-tune a fresh ``MODEL_NAME`` classifier and return it with its best weights.

    Trains for up to ``EPOCHS`` epochs with AdamW and a linear schedule,
    validates after every epoch, checkpoints the weights whenever the weighted
    validation F1 improves, and stops early after ``PATIENCE`` epochs without
    improvement. A training-curve plot and a JSON history are written under
    ``OUTPUT_DIR/plots`` and ``OUTPUT_DIR/logs``.

    Args:
        num_labels: Number of output classes.
        train_loader: Iterable of batches (dicts of tensors containing a
            ``'labels'`` entry) used for optimization.
        val_loader: Iterable of batches in the same format used for validation.
        save_path: File the best ``state_dict`` is written to.
        level_name: Tag used in the progress bar and in plot/log file names.
        y_train_labels: Labels used to derive balanced class weights. When
            None, they are collected by iterating ``train_loader`` once.
        loss_type: String checked for the substrings ``"focal"``, ``"dice"``
            and ``"contrastive"``; every matching term is added to the loss.
            Without ``"focal"`` the base term is label-smoothed cross-entropy
            (if ``label_smoothing > 0``) or class-weighted cross-entropy.
        contrastive_weight: Multiplier applied to the contrastive term.
        label_smoothing: Smoothing factor; used only when ``loss_type`` does
            not contain ``"focal"``.

    Returns:
        The model with the best checkpoint loaded. If no checkpoint was
        written (no epoch ran), the model is returned as-is.
    """
    # Ensure plot and log directories exist
    os.makedirs(os.path.join(OUTPUT_DIR, "plots"), exist_ok=True)
    os.makedirs(os.path.join(OUTPUT_DIR, "logs"), exist_ok=True)

    model = DebertaV2ForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels).to(device)
    model.gradient_checkpointing_enable()
    # The contrastive term reads the last hidden layer, so hidden states must be returned.
    model.config.output_hidden_states = True

    # Make the silent precedence explicit: the focal branch wins over smoothing.
    if "focal" in loss_type and label_smoothing > 0:
        print(f" [{level_name}] label_smoothing={label_smoothing} is ignored because loss_type contains 'focal'.")

    # Compute class weights
    if y_train_labels is not None:
        class_weights = compute_class_weight('balanced', classes=np.unique(y_train_labels), y=y_train_labels)
    else:
        all_train_labels = [label.item() for batch in train_loader for label in batch['labels']]
        class_weights = compute_class_weight('balanced', classes=np.unique(all_train_labels), y=all_train_labels)

    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
    focal = FocalLoss(weight=class_weights_tensor)

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=EPOCHS * len(train_loader))

    # Start below any attainable F1 so the first epoch always writes a
    # checkpoint; starting at 0 left save_path unwritten when F1 stayed at 0.
    best_f1 = float("-inf")
    checkpoint_saved = False
    patience_counter = 0
    train_losses, train_accuracies, train_f1s = [], [], []
    val_accuracies, val_f1s = [], []
    best_metrics = {}

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        all_preds, all_labels = [], []

        for batch in tqdm(train_loader, desc=f"[{level_name}] Epoch {epoch+1}/{EPOCHS}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits

            # ===== Loss Calculation =====
            loss = 0
            if "focal" in loss_type:
                loss += focal(logits, batch['labels'])
            elif label_smoothing > 0:
                loss += smoothed_cross_entropy(logits, batch['labels'], smoothing=label_smoothing)
            else:
                loss += F.cross_entropy(logits, batch['labels'], weight=class_weights_tensor)

            if "dice" in loss_type:
                loss += dice_loss(logits, batch['labels'])

            if "contrastive" in loss_type:
                # First-token embedding of the last hidden layer.
                hidden_states = outputs.hidden_states[-1][:, 0, :]
                loss += contrastive_weight * apply_contrastive_supervision(hidden_states, batch['labels'])

            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            total_loss += loss.item()
            preds, _ = get_preds_from_logits(logits)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch['labels'].cpu().numpy())

        train_f1 = f1_score(all_labels, all_preds, average='weighted')
        train_acc = accuracy_score(all_labels, all_preds)
        # Note: this is the loss summed over the epoch's batches, not the mean.
        train_losses.append(total_loss)
        train_accuracies.append(train_acc)
        train_f1s.append(train_f1)

        # ===== Validation =====
        model.eval()
        val_preds, val_labels, val_probs = [], [], []
        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                logits = outputs.logits
                preds, probs = get_preds_from_logits(logits)
                val_preds.extend(preds.cpu().numpy())
                val_labels.extend(batch['labels'].cpu().numpy())
                val_probs.extend(probs.cpu().numpy())

        val_f1 = f1_score(val_labels, val_preds, average='weighted')
        val_acc = accuracy_score(val_labels, val_preds)

        # ROC AUC is best-effort: a failure is reported and recorded as None
        # instead of aborting training.
        try:
            val_labels_bin = label_binarize(val_labels, classes=list(range(num_labels)))
            roc_auc = roc_auc_score(val_labels_bin, val_probs, average='macro', multi_class='ovr')
        except Exception as e:
            print(f" ROC AUC calculation failed: {e}")
            roc_auc = None

        print(f" Epoch {epoch+1} | Loss: {total_loss:.4f} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f} | Train F1: {train_f1:.4f} | Val F1: {val_f1:.4f}")

        val_accuracies.append(val_acc)
        val_f1s.append(val_f1)

        if val_f1 > best_f1:
            best_f1 = val_f1
            patience_counter = 0

            # Save best checkpoint
            with open(save_path, "wb") as f:
                torch.save(model.state_dict(), f)
            checkpoint_saved = True

            precision, recall, f1_metric, _ = precision_recall_fscore_support(val_labels, val_preds, average='weighted')
            best_metrics = {
                "val_precision_weighted": precision,
                "val_recall_weighted": recall,
                "val_f1_weighted": f1_metric,
                "roc_auc": roc_auc
            }
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                print(" Early stopping.")
                break

    # ===== Plot Training Curve =====
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label="Train Loss")
    plt.plot(train_f1s, label="Train F1")
    plt.plot(train_accuracies, label="Train Accuracy")
    plt.plot(val_f1s, label="Val F1")
    plt.plot(val_accuracies, label="Val Accuracy")
    plt.legend()
    plt.grid(True)
    plt.title(f" Training Curve - {level_name}")
    plt.savefig(f"{OUTPUT_DIR}/plots/loss_f1_curve_{level_name}.png")
    plt.close()

    # ===== Save Training Log =====
    history = {
        "train_losses": train_losses,
        "train_accuracies": train_accuracies,
        "train_f1s": train_f1s,
        "val_f1s": val_f1s,
        "val_accuracies": val_accuracies,
        **best_metrics
    }
    with open(f"{OUTPUT_DIR}/logs/history_{level_name}.json", "w") as f:
        json.dump(history, f, indent=4)

    # Return the best model (loaded from disk). map_location keeps the load
    # valid whatever device the checkpoint tensors were saved from.
    if checkpoint_saved:
        model.load_state_dict(torch.load(save_path, map_location=device))
    return model




# -------------------- Evaluation --------------------
def evaluate_saved_model(model_path, dataloader, num_labels):
    """Load a saved checkpoint and print a classification report for ``dataloader``.

    Args:
        model_path: Path to a ``state_dict`` written with ``torch.save``.
        dataloader: Iterable of batches (dicts of tensors containing ``'labels'``).
        num_labels: Number of output classes the checkpoint was trained with.
    """
    model = DebertaV2ForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
    # map_location lets a checkpoint saved on GPU be evaluated on a CPU-only
    # host, matching load_model_for_inference.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    preds, labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            preds.extend(torch.argmax(outputs.logits, dim=-1).cpu().numpy())
            labels.extend(batch['labels'].cpu().numpy())

    report = classification_report(labels, preds, digits=4)
    print(report)

def load_model_for_inference(num_labels, path, device):
    """Build a ``MODEL_NAME`` classifier, load the weights at ``path`` and prepare it for inference.

    The state dict is mapped onto ``device`` while loading; the returned model
    lives on ``device`` and is in eval mode.
    """
    classifier = DebertaV2ForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
    saved_weights = torch.load(path, map_location=device)
    classifier.load_state_dict(saved_weights)
    return classifier.to(device).eval()


def visualize_model_performance(true_labels, pred_labels, class_names, title, save_path):
    """
    Prints and saves a classification report, then saves a confusion-matrix plot.

    Args:
        true_labels (list or np.array): Ground truth labels.
        pred_labels (list or np.array): Predicted labels from the ensemble.
        class_names (list): Class label names (e.g., label_encoder.classes_);
            non-string entries are converted with ``str``.
        title (str): Title for the plot.
        save_path (str): Path to save the confusion matrix image. The report is
            written next to it as ``<save_path without extension>_report.txt``.
    """
    # classification_report measures the length of each target name, so
    # integer class names (e.g. encoder classes fitted on ints) must be
    # stringified first.
    display_names = [str(name) for name in class_names]

    # Generate classification report
    report = classification_report(true_labels, pred_labels, target_names=display_names, digits=4)
    print(report)

    # Save report. Deriving the name from the extension keeps the report and
    # the image separate even when save_path does not end in ".png"; a plain
    # str.replace left both paths identical in that case.
    path_root, _ = os.path.splitext(save_path)
    report_path = f"{path_root}_report.txt"
    with open(report_path, "w") as f:
        f.write(report)

    # Plot confusion matrix
    plot_confusion_matrix(true_labels, pred_labels, display_names, title, save_path)


# **LEVEL-1**

In [None]:
import os
import pickle
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, WeightedRandomSampler
import torch
from transformers import DebertaV2Tokenizer
from scipy.stats import mode

# ==== Assumed Pre-defined: SEED, OUTPUT_DIR, BATCH_SIZE, VAL_BATCH_SIZE,
# CryptoDataset, collate_fn, train_model_for_level, load_model_for_inference, visualize_model_performance ====

# NOTE(review): this path is pinned to one specific earlier run directory
# (run_20250628_034630), not to the run_dir created below, so every run writes
# its level-1 encoder to the same file — confirm this is intended.
LEVEL1_ENCODER_PATH = "/content/drive/MyDrive/FIRE/outputs/run_20250628_034630/encoders/label_encoder_level_1.pkl"

if __name__ == "__main__":
    # Each execution gets its own timestamped directory under OUTPUT_DIR.
    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = os.path.join(OUTPUT_DIR, f"run_{run_id}")
    os.makedirs(run_dir, exist_ok=True)
    # Pre-create the sub-folders that later steps write into.
    for subfolder in ["models", "plots", "logs", "encoders", "ensembles"]:
        os.makedirs(os.path.join(run_dir, subfolder), exist_ok=True)

    # === Load Data ===
    # The train/validation CSVs written by the preparation cell.
    train_df = pd.read_csv("/content/drive/MyDrive/FIRE/crypto_task1_train.csv")
    val_df = pd.read_csv("/content/drive/MyDrive/FIRE/crypto_task1_val.csv")

    # Quick sanity check of sizes and per-platform counts.
    print(f"\n Train size: {len(train_df)}, Validation size: {len(val_df)}")
    print(" Train Source Distribution:\n", train_df['source'].value_counts())
    print(" Validation Source Distribution:\n", val_df['source'].value_counts())

    # === Add [SOURCE] token ===
    def add_source_token(df):
        """Return a copy of ``df`` whose ``text`` starts with a platform marker.

        ``source`` is upper-cased and mapped to ``[REDDIT]``, ``[TWITTER]`` or
        ``[YOUTUBE]``; the marker is stored in ``source_token`` and prepended
        to ``text`` with a single space. Missing text is treated as an empty
        string: empty texts come back from CSV as NaN, and string + NaN would
        otherwise turn the whole value into NaN (later stringified as the
        literal ``"nan"`` with the marker lost). The input frame is not
        modified.
        """
        df = df.copy()
        df['source_token'] = df['source'].str.upper().map({
            'REDDIT': '[REDDIT]',
            'TWITTER': '[TWITTER]',
            'YOUTUBE': '[YOUTUBE]'
        })
        text_or_empty = df['text'].fillna('').astype(str)
        df['text'] = df['source_token'] + ' ' + text_or_empty
        return df

    # Prefix every text with its platform marker in both splits.
    train_df = add_source_token(train_df)
    val_df = add_source_token(val_df)

    # Reload the tokenizer copy stored under OUTPUT_DIR/tokenizer.
    # NOTE(review): no visible code registers the [REDDIT]/[TWITTER]/[YOUTUBE]
    # markers as tokens, so they are presumably split into ordinary
    # sub-words — confirm this is intended.
    tokenizer = DebertaV2Tokenizer.from_pretrained(os.path.join(OUTPUT_DIR, "tokenizer"))

    def save_ensemble_model(preds_list, save_path):
        """Convert ``preds_list`` to a NumPy array and write it to ``save_path`` with ``np.save``."""
        stacked_preds = np.array(preds_list)
        np.save(save_path, stacked_preds)

    def save_platform_reports(val_sources, platforms, true_labels, majority_preds, label_classes, level_num):
        """Print and save one classification report per platform.

        For each entry of ``platforms`` the samples whose ``val_sources`` value
        equals it are selected and a report is written to
        ``{run_dir}/logs/level{level_num}_report_{platform}.txt``.

        Args:
            val_sources: Per-sample platform names, aligned with the labels.
            platforms: Platform names to report on.
            true_labels: Ground-truth class ids.
            majority_preds: Predicted class ids.
            label_classes: Class names, where position ``i`` names class id
                ``i`` (as with ``LabelEncoder.classes_``) — this assumes the
                labels are encoded as 0..n-1.
            level_num: Level number used in the heading and file name.
        """
        # Plain arrays make boolean masking work for lists and pandas objects alike.
        val_sources = np.asarray(val_sources)
        true_labels = np.asarray(true_labels)
        majority_preds = np.asarray(majority_preds)

        # classification_report needs string names, and an explicit label list
        # keeps it working when a platform subset lacks one of the classes.
        class_names = [str(name) for name in label_classes]
        class_ids = np.arange(len(class_names))

        for platform in platforms:
            mask = val_sources == platform
            if not mask.any():
                # Nothing to score for this platform; a report on zero samples would fail.
                print(f"\n Level {level_num} - Platform: {platform} (no samples, skipped)")
                continue
            platform_true = true_labels[mask]
            platform_pred = majority_preds[mask]
            report = classification_report(
                platform_true, platform_pred,
                labels=class_ids, target_names=class_names, digits=4, zero_division=0
            )
            print(f"\n Level {level_num} - Platform: {platform}")
            print(report)
            with open(f"{run_dir}/logs/level{level_num}_report_{platform}.txt", "w") as f:
                f.write(report)

    # -------- LEVEL 1 --------
    # Fit the encoder on the training labels only, then apply it to validation.
    le1 = LabelEncoder()
    train_df['level_1_enc'] = le1.fit_transform(train_df['level_1'])
    val_df['level_1_enc'] = le1.transform(val_df['level_1'])

    # Save label encoder. The target directory is created first because
    # LEVEL1_ENCODER_PATH is not inside the run_dir tree created above, and
    # the file is opened in a context manager so the handle is always closed.
    encoder_dir = os.path.dirname(LEVEL1_ENCODER_PATH)
    if encoder_dir:
        os.makedirs(encoder_dir, exist_ok=True)
    with open(LEVEL1_ENCODER_PATH, "wb") as encoder_file:
        pickle.dump(le1, encoder_file)

    skf1 = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(skf1.split(train_df, train_df['level_1_enc'])):
        print(f"\n Level 1 Fold {fold + 1}/5")
        train_fold_df = train_df.iloc[train_idx].reset_index(drop=True)
        val_fold_df = train_df.iloc[val_idx].reset_index(drop=True)

        # Weighted Sampling based on 'source'
        # The stored source values are lower-case ('reddit', 'twitter',
        # 'youtube'), so they are upper-cased before the lookup; without this
        # every row missed the map and fell back to 1.0, making the sampler
        # uniform. Unknown or missing sources still get weight 1.0.
        source_weights = train_fold_df['source'].str.upper().map({
            'YOUTUBE': 2.5,
            'REDDIT': 2.5,
            'TWITTER': 6.5
        }).fillna(1.0).astype(float).values
        source_weights = torch.tensor(source_weights, dtype=torch.double)
        sampler = WeightedRandomSampler(source_weights, num_samples=len(source_weights), replacement=True)

        # Loaders
        # Training batches are drawn through the weighted sampler; validation
        # batches are read in order.
        # NOTE(review): max_length is not passed, so CryptoDataset uses its
        # default of 512 rather than the MAX_LENGTH config value — confirm.
        train_loader = DataLoader(
            CryptoDataset(train_fold_df['text'], train_fold_df['level_1_enc'], tokenizer),
            batch_size=BATCH_SIZE, sampler=sampler, collate_fn=collate_fn)
        val_loader = DataLoader(
            CryptoDataset(val_fold_df['text'], val_fold_df['level_1_enc'], tokenizer),
            batch_size=VAL_BATCH_SIZE, collate_fn=collate_fn)

        # One checkpoint file per fold inside this run's models/ folder.
        model_path = os.path.join(run_dir, f"models/level1_fold{fold + 1}.pth")

        # NOTE(review): train_model_for_level consults label_smoothing only
        # when loss_type does not contain "focal", so the 0.1 passed here has
        # no effect on the loss with this loss_type — confirm intent.
        model = train_model_for_level(
            num_labels=len(le1.classes_),
            train_loader=train_loader,
            val_loader=val_loader,
            save_path=model_path,
            level_name=f"level1_fold{fold + 1}",
            y_train_labels=train_fold_df['level_1_enc'],
            loss_type="focal+dice+contrastive",
            label_smoothing=0.1
        )
