# **LEVEL1**

In [1]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score,
    precision_score,
    recall_score
)
from scipy.stats import mode
from transformers import DebertaV2ForSequenceClassification, AutoTokenizer
import pickle

# === CONFIG ===
# Fine-tuned checkpoints, one per cross-validation fold; together they form
# the Level 1 ensemble.
model_paths = [
    "/content/drive/MyDrive/FIRE/run_20250628_034630/models/level1_fold1.pth",
    "/content/drive/MyDrive/FIRE/run_20250628_034630/models/level1_fold2.pth",
    "/content/drive/MyDrive/FIRE/run_20250628_034630/models/level1_fold3.pth",
    "/content/drive/MyDrive/FIRE/run_20250628_034630/models/level1_fold4.pth",
    "/content/drive/MyDrive/FIRE/run_20250628_034630/models/level1_fold5.pth",
]
model_name = "microsoft/deberta-v3-small"
label_encoder_path = "/content/drive/MyDrive/FIRE/run_20250628_034630/encoders/label_encoder_level_1.pkl"
val_csv_path = "/content/drive/MyDrive/FIRE/crypto_task1_val.csv"
save_dir = "/content/drive/MyDrive/FIRE/outputs/ensemble_level1_eval"
os.makedirs(save_dir, exist_ok=True)

# === Load label encoder ===
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load encoders written by your own training run.
with open(label_encoder_path, "rb") as f:
    le1 = pickle.load(f)

# === Load and preprocess validation data ===
val_df = pd.read_csv(val_csv_path)
print(" Columns in validation CSV:", val_df.columns.tolist())

# Add [SOURCE] token to text
source_tokens = {
    'REDDIT': '[REDDIT]',
    'TWITTER': '[TWITTER]',
    'YOUTUBE': '[YOUTUBE]',
}
val_df['source_token'] = val_df['source'].str.upper().map(source_tokens)

# A missing/unsupported source maps to NaN, and NaN + text is NaN; the
# tokenizer would then fail with an error that does not mention the real cause.
unmapped = val_df['source_token'].isna()
if unmapped.any():
    bad_sources = sorted(val_df.loc[unmapped, 'source'].astype(str).unique())
    raise ValueError(
        f"{int(unmapped.sum())} row(s) have a missing or unsupported 'source' "
        f"(expected one of {sorted(source_tokens)}): {bad_sources}"
    )
missing_text = val_df['text'].isna()
if missing_text.any():
    raise ValueError(f"{int(missing_text.sum())} row(s) have a missing 'text'.")

val_df['text'] = val_df['source_token'] + ' ' + val_df['text']

# Encode labels (skipped when the CSV already ships an encoded column)
if 'level_1_enc' not in val_df.columns:
    if 'level_1' not in val_df.columns:
        raise ValueError(" 'level_1_enc' or 'level_1' must be present in CSV.")
    val_df['level_1_enc'] = le1.transform(val_df['level_1'])

# Tokenize
# The whole split is tokenized in one call: sequences are truncated to
# max_length and padded to the longest remaining sequence.
tokenizer = AutoTokenizer.from_pretrained(model_name)
encodings = tokenizer(
    val_df['text'].tolist(),
    padding=True,
    truncation=True,
    return_tensors='pt',
    max_length=128
)

labels = torch.tensor(val_df['level_1_enc'].values)
true_labels = labels.numpy()

val_dataset = torch.utils.data.TensorDataset(
    encodings['input_ids'], encodings['attention_mask'], labels
)
# No shuffling: predictions are later written back to val_df by position.
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32)

# === Inference ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
all_fold_preds = []  # one 1-D array of predicted class ids per fold

for fold, model_path in enumerate(model_paths):
    print(f"\n Fold {fold+1} — loading model")
    # from_pretrained builds the architecture with a freshly initialised
    # classification head; the fold checkpoint then overwrites the weights.
    model = DebertaV2ForSequenceClassification.from_pretrained(
        model_name, num_labels=len(le1.classes_)
    )
    # NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    fold_preds = []
    with torch.no_grad():
        for input_ids, attention_mask, _ in tqdm(val_loader, desc=f"Predicting Fold {fold+1}"):
            # Gold labels are not needed for prediction, so only the model
            # inputs are moved to the device.
            outputs = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
            )
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            fold_preds.extend(preds)

    all_fold_preds.append(np.array(fold_preds))

# === Majority Voting ===
# Rows are folds, columns are samples: take the most frequent class per column.
fold_matrix = np.array(all_fold_preds)
# reshape(-1) rather than squeeze(): the result is always 1-D, whether or not
# SciPy keeps the reduced axis, and also when the set holds a single sample
# (squeeze() would collapse that to a 0-d array and break inverse_transform).
ensemble_preds = np.asarray(mode(fold_matrix, axis=0).mode).reshape(-1)
pred_labels = le1.inverse_transform(ensemble_preds)

# Save predictions
val_df["preds"] = ensemble_preds
val_df["pred_labels"] = pred_labels
val_df.to_csv(os.path.join(save_dir, "level1_val_predictions.csv"), index=False)

# === Overall Metrics ===
# Full label set of the encoder, so the report and the confusion matrix always
# cover every class, including classes that never occur in a subset.
class_ids = list(range(len(le1.classes_)))
class_names = [str(c) for c in le1.classes_]

# zero_division=0 yields the same scores as sklearn's default for classes that
# are never predicted or never present, without emitting UndefinedMetricWarning.
acc = accuracy_score(true_labels, ensemble_preds)
f1_weighted = f1_score(true_labels, ensemble_preds, average='weighted', zero_division=0)
f1_macro = f1_score(true_labels, ensemble_preds, average='macro', zero_division=0)
f1_micro = f1_score(true_labels, ensemble_preds, average='micro', zero_division=0)
prec_macro = precision_score(true_labels, ensemble_preds, average='macro', zero_division=0)
recall_macro = recall_score(true_labels, ensemble_preds, average='macro', zero_division=0)

try:
    report = classification_report(
        true_labels,
        ensemble_preds,
        labels=class_ids,
        target_names=class_names,
        digits=4,
        zero_division=0
    )
except Exception as e:
    # Best effort: fall back to a report without class names.
    print(" Error generating classification report:", e)
    report = classification_report(true_labels, ensemble_preds, digits=4, zero_division=0)

metrics_text = f"""
 Ensemble Accuracy: {acc:.4f}
 F1 (Weighted): {f1_weighted:.4f}
 F1 (Macro):    {f1_macro:.4f}
 F1 (Micro):    {f1_micro:.4f}
 Precision (Macro): {prec_macro:.4f}
 Recall (Macro):    {recall_macro:.4f}

 Classification Report:
{report}
"""

print(metrics_text)
with open(os.path.join(save_dir, "metrics.txt"), "w", encoding="utf-8") as f:
    f.write(metrics_text)

# === Confusion Matrix ===
plt.figure(figsize=(6, 5))
# Explicit labels keep the matrix shape fixed; tick labels name the classes.
cm = confusion_matrix(true_labels, ensemble_preds, labels=class_ids)
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=class_names, yticklabels=class_names
)
plt.title("Confusion Matrix (Level 1 Ensemble)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.savefig(os.path.join(save_dir, "confusion_matrix.png"))
plt.close()

# ===  Platform-wise Evaluation — using val_df['source'].str.lower()
print("\n Platform-wise Evaluation:")
platforms = ["youtube", "reddit", "twitter"]
source_lower = val_df['source'].str.lower()

for platform in platforms:
    # Plain boolean array so the NumPy label arrays are indexed by position.
    mask = (source_lower == platform).to_numpy()
    y_true = true_labels[mask]
    y_pred = ensemble_preds[mask]

    print(f"\n Platform: {platform.upper()}")
    if len(y_true) == 0:
        print(f" No samples for {platform.upper()}. Skipping.")
        continue

    # NOTE: these averages span only the classes present in this subset, so the
    # macro F1 can differ from the report's "macro avg", which spans all classes.
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"F1 Weighted: {f1_score(y_true, y_pred, average='weighted', zero_division=0):.4f}")
    print(f"F1 Macro:    {f1_score(y_true, y_pred, average='macro', zero_division=0):.4f}")

    try:
        platform_report = classification_report(
            y_true, y_pred,
            labels=class_ids,
            target_names=class_names,
            digits=4,
            zero_division=0
        )
    except Exception as e:
        print(" Error generating report:", e)
        platform_report = classification_report(y_true, y_pred, digits=4, zero_division=0)

    print(f"Platform Report:\n{platform_report}")

print(f"\n All outputs saved to: {save_dir}")


 Columns in validation CSV: ['text', 'level_1', 'level_2', 'level_3', 'source']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]




 Fold 1 — loading model


pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/286M [00:00<?, ?B/s]

Predicting Fold 1: 100%|██████████| 45/45 [00:06<00:00,  6.50it/s]



 Fold 2 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 2: 100%|██████████| 45/45 [00:05<00:00,  7.69it/s]



 Fold 3 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 3: 100%|██████████| 45/45 [00:06<00:00,  7.49it/s]



 Fold 4 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 4: 100%|██████████| 45/45 [00:06<00:00,  7.25it/s]



 Fold 5 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 5: 100%|██████████| 45/45 [00:06<00:00,  7.38it/s]



 Ensemble Accuracy: 0.8383
 F1 (Weighted): 0.8424
 F1 (Macro):    0.7832
 F1 (Micro):    0.8383
 Precision (Macro): 0.7669
 Recall (Macro):    0.8122

 Classification Report:
              precision    recall  f1-score   support

           0     0.7074    0.6871    0.6971       278
           1     0.6457    0.8744    0.7429       223
           2     0.9475    0.8750    0.9098       928

    accuracy                         0.8383      1429
   macro avg     0.7669    0.8122    0.7832      1429
weighted avg     0.8537    0.8383    0.8424      1429



 Platform-wise Evaluation:

 Platform: YOUTUBE
Accuracy: 0.9080
F1 Weighted: 0.9077
F1 Macro:    0.6915
Platform Report:
              precision    recall  f1-score   support

           0     0.7250    0.7342    0.7296        79
           1     0.5000    0.3333    0.4000         3
           2     0.9450    0.9450    0.9450       418

    accuracy                         0.9080       500
   macro avg     0.7233    0.6708    0.6915     

# **LEVEL2**

In [2]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score,
    precision_score,
    recall_score
)
from scipy.stats import mode
from transformers import DebertaV2ForSequenceClassification, AutoTokenizer
import pickle

# === CONFIG ===
# Fine-tuned checkpoints, one per cross-validation fold; together they form
# the Level 2 ensemble.
model_paths = [
    "/content/drive/MyDrive/FIRE/run_20250629_090953/models/level2_fold1.pth",
    "/content/drive/MyDrive/FIRE/run_20250629_090953/models/level2_fold2.pth",
    "/content/drive/MyDrive/FIRE/run_20250629_090953/models/level2_fold3.pth",
    "/content/drive/MyDrive/FIRE/run_20250629_090953/models/level2_fold4.pth",
    "/content/drive/MyDrive/FIRE/run_20250629_090953/models/level2_fold5.pth",
]
model_name = "microsoft/deberta-v3-small"
label_encoder_path = "/content/drive/MyDrive/FIRE/run_20250629_090953/encoders/label_encoder_level_2.pkl"
val_csv_path = "/content/drive/MyDrive/FIRE/crypto_task1_val.csv"
save_dir = "/content/drive/MyDrive/FIRE/outputs/ensemble_level2_eval"
os.makedirs(save_dir, exist_ok=True)

# === Load label encoder ===
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load encoders written by your own training run.
with open(label_encoder_path, "rb") as f:
    le2 = pickle.load(f)

# === Load and preprocess validation data ===
val_df = pd.read_csv(val_csv_path)
print("Columns in validation CSV:", val_df.columns.tolist())

# Add [SOURCE] token to text
source_tokens = {
    'REDDIT': '[REDDIT]',
    'TWITTER': '[TWITTER]',
    'YOUTUBE': '[YOUTUBE]',
}
val_df['source_token'] = val_df['source'].str.upper().map(source_tokens)
val_df['text'] = val_df['source_token'] + ' ' + val_df['text']

# Only SUBJECTIVE samples are relevant for Level 2
subjective_mask = val_df['level_1'] == 2
val_df = val_df[subjective_mask].copy()
if val_df.empty:
    raise ValueError("No rows with level_1 == 2 in the validation CSV; nothing to evaluate for Level 2.")

# Validated after filtering so that problems in rows this level never
# evaluates cannot abort the run. A missing text or an unsupported source
# leaves NaN in 'text', which the tokenizer cannot handle.
unusable = val_df['text'].isna()
if unusable.any():
    raise ValueError(
        f"{int(unusable.sum())} Level 2 row(s) have a missing 'text' or a "
        f"missing/unsupported 'source' (expected one of {sorted(source_tokens)})."
    )

val_df["level_2_enc"] = le2.transform(val_df["level_2"])
true_labels = val_df["level_2_enc"].values

# Tokenize
# The whole subset is tokenized in one call: sequences are truncated to
# max_length and padded to the longest remaining sequence.
tokenizer = AutoTokenizer.from_pretrained(model_name)
encodings = tokenizer(
    val_df['text'].tolist(),
    padding=True,
    truncation=True,
    return_tensors='pt',
    max_length=128
)

labels = torch.tensor(true_labels)
val_dataset = torch.utils.data.TensorDataset(
    encodings['input_ids'], encodings['attention_mask'], labels
)
# No shuffling: predictions are later written back to val_df by position.
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32)

# === Inference ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
all_fold_preds = []  # one 1-D array of predicted class ids per fold

for fold, model_path in enumerate(model_paths):
    print(f"\nFold {fold+1} — loading model")
    # from_pretrained builds the architecture with a freshly initialised
    # classification head; the fold checkpoint then overwrites the weights.
    model = DebertaV2ForSequenceClassification.from_pretrained(
        model_name, num_labels=len(le2.classes_)
    )
    # NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    fold_preds = []
    with torch.no_grad():
        for input_ids, attention_mask, _ in tqdm(val_loader, desc=f"Predicting Fold {fold+1}"):
            # Gold labels are not needed for prediction, so only the model
            # inputs are moved to the device.
            outputs = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
            )
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            fold_preds.extend(preds)

    all_fold_preds.append(np.array(fold_preds))

# === Majority Voting ===
# Rows are folds, columns are samples: take the most frequent class per column.
fold_matrix = np.array(all_fold_preds)
# reshape(-1) rather than squeeze(): the result is always 1-D, whether or not
# SciPy keeps the reduced axis, and also when the set holds a single sample
# (squeeze() would collapse that to a 0-d array and break inverse_transform).
ensemble_preds = np.asarray(mode(fold_matrix, axis=0).mode).reshape(-1)
pred_labels = le2.inverse_transform(ensemble_preds)

# Save predictions
val_df["preds"] = ensemble_preds
val_df["pred_labels"] = pred_labels
val_df.to_csv(os.path.join(save_dir, "level2_val_predictions.csv"), index=False)

# === Overall Metrics ===
# Full label set of the encoder, so the report and the confusion matrix always
# cover every class, including classes that never occur in a subset.
class_ids = list(range(len(le2.classes_)))
class_names = [str(c) for c in le2.classes_]

# zero_division=0 yields the same scores as sklearn's default for classes that
# are never predicted or never present, without emitting UndefinedMetricWarning.
acc = accuracy_score(true_labels, ensemble_preds)
f1_weighted = f1_score(true_labels, ensemble_preds, average='weighted', zero_division=0)
f1_macro = f1_score(true_labels, ensemble_preds, average='macro', zero_division=0)
f1_micro = f1_score(true_labels, ensemble_preds, average='micro', zero_division=0)
prec_macro = precision_score(true_labels, ensemble_preds, average='macro', zero_division=0)
recall_macro = recall_score(true_labels, ensemble_preds, average='macro', zero_division=0)

try:
    report = classification_report(
        true_labels,
        ensemble_preds,
        labels=class_ids,
        target_names=class_names,
        digits=4,
        zero_division=0
    )
except Exception as e:
    # Best effort: fall back to a report without class names.
    print("Error generating classification report:", e)
    report = classification_report(true_labels, ensemble_preds, digits=4, zero_division=0)

metrics_text = f"""
Ensemble Accuracy: {acc:.4f}
F1 (Weighted): {f1_weighted:.4f}
F1 (Macro):    {f1_macro:.4f}
F1 (Micro):    {f1_micro:.4f}
Precision (Macro): {prec_macro:.4f}
Recall (Macro):    {recall_macro:.4f}

Classification Report:
{report}
"""

print(metrics_text)
with open(os.path.join(save_dir, "metrics.txt"), "w", encoding="utf-8") as f:
    f.write(metrics_text)

# === Confusion Matrix ===
plt.figure(figsize=(6, 5))
# Explicit labels keep the matrix shape fixed; tick labels name the classes.
cm = confusion_matrix(true_labels, ensemble_preds, labels=class_ids)
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=class_names, yticklabels=class_names
)
plt.title("Confusion Matrix (Level 2 Ensemble)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.savefig(os.path.join(save_dir, "confusion_matrix.png"))
plt.close()

# === Platform-wise Evaluation ===
print("\nPlatform-wise Evaluation:")
platforms = ["youtube", "reddit", "twitter"]
source_lower = val_df['source'].str.lower()

for platform in platforms:
    # Plain boolean array: val_df was filtered, so its index is no longer
    # 0..n-1 and the NumPy arrays below must be indexed by position.
    mask = (source_lower == platform).to_numpy()
    y_true = val_df["level_2_enc"].values[mask]
    y_pred = val_df["preds"].values[mask]

    print(f"\nPlatform: {platform.upper()}")
    if len(y_true) == 0:
        print(f"No samples for {platform.upper()}. Skipping.")
        continue

    # NOTE: these averages span only the classes present in this subset, so the
    # macro F1 can differ from the report's "macro avg", which spans all classes.
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"F1 Weighted: {f1_score(y_true, y_pred, average='weighted', zero_division=0):.4f}")
    print(f"F1 Macro:    {f1_score(y_true, y_pred, average='macro', zero_division=0):.4f}")

    try:
        platform_report = classification_report(
            y_true, y_pred,
            labels=class_ids,
            target_names=class_names,
            digits=4,
            zero_division=0
        )
    except Exception as e:
        print("Error generating report:", e)
        platform_report = classification_report(y_true, y_pred, digits=4, zero_division=0)

    print(f"Platform Report:\n{platform_report}")

print(f"\nAll outputs saved to: {save_dir}")


Columns in validation CSV: ['text', 'level_1', 'level_2', 'level_3', 'source']





Fold 1 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 1: 100%|██████████| 29/29 [00:04<00:00,  7.24it/s]



Fold 2 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 2: 100%|██████████| 29/29 [00:04<00:00,  7.06it/s]



Fold 3 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 3: 100%|██████████| 29/29 [00:04<00:00,  7.24it/s]



Fold 4 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 4: 100%|██████████| 29/29 [00:03<00:00,  7.29it/s]



Fold 5 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 5: 100%|██████████| 29/29 [00:03<00:00,  7.27it/s]



Ensemble Accuracy: 0.8190
F1 (Weighted): 0.8001
F1 (Macro):    0.6440
F1 (Micro):    0.8190
Precision (Macro): 0.7272
Recall (Macro):    0.6103

Classification Report:
              precision    recall  f1-score   support

         0.0     0.8363    0.9545    0.8915       637
         1.0     0.8049    0.6439    0.7154       205
         2.0     0.5405    0.2326    0.3252        86

    accuracy                         0.8190       928
   macro avg     0.7272    0.6103    0.6440       928
weighted avg     0.8020    0.8190    0.8001       928



Platform-wise Evaluation:

Platform: YOUTUBE
Accuracy: 0.8254
F1 Weighted: 0.8154
F1 Macro:    0.6098
Platform Report:
              precision    recall  f1-score   support

         0.0     0.8204    0.9510    0.8809       245
         1.0     0.8730    0.6962    0.7746       158
         2.0     0.2500    0.1333    0.1739        15

    accuracy                         0.8254       418
   macro avg     0.6478    0.5935    0.6098       418
wei

# **LEVEL3**

In [3]:
# === CONFIRMED LEVEL 3 INFERENCE & EVALUATION SCRIPT ===
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score,
    precision_score,
    recall_score
)
from scipy.stats import mode
from transformers import DebertaV2ForSequenceClassification, AutoTokenizer
import pickle

# === CONFIG ===
# Fine-tuned checkpoints, one per cross-validation fold; together they form
# the Level 3 ensemble.
model_paths = [
    "/content/drive/MyDrive/FIRE/run_20250629_121221/models/level3_fold1.pth",
    "/content/drive/MyDrive/FIRE/run_20250629_121221/models/level3_fold2.pth",
    "/content/drive/MyDrive/FIRE/run_20250629_121221/models/level3_fold3.pth",
    "/content/drive/MyDrive/FIRE/run_20250629_121221/models/level3_fold4.pth",
    "/content/drive/MyDrive/FIRE/run_20250629_121221/models/level3_fold5.pth",
]
model_name = "microsoft/deberta-v3-small"
label_encoder_path = "/content/drive/MyDrive/FIRE/run_20250629_121221/encoders/label_encoder_level_3.pkl"
val_csv_path = "/content/drive/MyDrive/FIRE/crypto_task1_val.csv"
save_dir = "/content/drive/MyDrive/FIRE/outputs/ensemble_level3_eval"
os.makedirs(save_dir, exist_ok=True)

# === Load label encoder ===
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load encoders written by your own training run.
with open(label_encoder_path, "rb") as f:
    le3 = pickle.load(f)

# === Load and preprocess validation data ===
val_df = pd.read_csv(val_csv_path)
print("Columns in validation CSV:", val_df.columns.tolist())

# Add [SOURCE] token to text
source_tokens = {
    'REDDIT': '[REDDIT]',
    'TWITTER': '[TWITTER]',
    'YOUTUBE': '[YOUTUBE]',
}
val_df['source_token'] = val_df['source'].str.upper().map(source_tokens)
val_df['text'] = val_df['source_token'] + ' ' + val_df['text']

# Only NEUTRAL under SUBJECTIVE are valid for Level 3
level3_mask = (val_df['level_1'] == 2) & (val_df['level_2'] == 0)
val_df = val_df[level3_mask].copy()
if val_df.empty:
    raise ValueError(
        "No rows with level_1 == 2 and level_2 == 0 in the validation CSV; "
        "nothing to evaluate for Level 3."
    )

# Validated after filtering so that problems in rows this level never
# evaluates cannot abort the run. A missing text or an unsupported source
# leaves NaN in 'text', which the tokenizer cannot handle.
unusable = val_df['text'].isna()
if unusable.any():
    raise ValueError(
        f"{int(unusable.sum())} Level 3 row(s) have a missing 'text' or a "
        f"missing/unsupported 'source' (expected one of {sorted(source_tokens)})."
    )

val_df["level_3_enc"] = le3.transform(val_df["level_3"])
true_labels = val_df["level_3_enc"].values

# Tokenize
# The whole subset is tokenized in one call: sequences are truncated to
# max_length and padded to the longest remaining sequence.
tokenizer = AutoTokenizer.from_pretrained(model_name)
encodings = tokenizer(
    val_df['text'].tolist(),
    padding=True,
    truncation=True,
    return_tensors='pt',
    max_length=128
)

labels = torch.tensor(true_labels)
val_dataset = torch.utils.data.TensorDataset(
    encodings['input_ids'], encodings['attention_mask'], labels
)
# No shuffling: predictions are later written back to val_df by position.
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32)

# === Inference ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
all_fold_preds = []  # one 1-D array of predicted class ids per fold

for fold, model_path in enumerate(model_paths):
    print(f"\nFold {fold+1} — loading model")
    # from_pretrained builds the architecture with a freshly initialised
    # classification head; the fold checkpoint then overwrites the weights.
    model = DebertaV2ForSequenceClassification.from_pretrained(
        model_name, num_labels=len(le3.classes_)
    )
    # NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    fold_preds = []
    with torch.no_grad():
        for input_ids, attention_mask, _ in tqdm(val_loader, desc=f"Predicting Fold {fold+1}"):
            # Gold labels are not needed for prediction, so only the model
            # inputs are moved to the device.
            outputs = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
            )
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            fold_preds.extend(preds)

    all_fold_preds.append(np.array(fold_preds))

# === Majority Voting ===
# Rows are folds, columns are samples: take the most frequent class per column.
fold_matrix = np.array(all_fold_preds)
# reshape(-1) rather than squeeze(): the result is always 1-D, whether or not
# SciPy keeps the reduced axis, and also when the set holds a single sample
# (squeeze() would collapse that to a 0-d array and break inverse_transform).
ensemble_preds = np.asarray(mode(fold_matrix, axis=0).mode).reshape(-1)
pred_labels = le3.inverse_transform(ensemble_preds)

# Save predictions
val_df["preds"] = ensemble_preds
val_df["pred_labels"] = pred_labels
val_df.to_csv(os.path.join(save_dir, "level3_val_predictions.csv"), index=False)

# === Overall Metrics ===
# Full label set of the encoder, so the report and the confusion matrix always
# cover every class, including classes that never occur in a subset.
class_ids = list(range(len(le3.classes_)))
class_names = [str(c) for c in le3.classes_]

# zero_division=0 yields the same scores as sklearn's default for classes that
# are never predicted or never present, without emitting UndefinedMetricWarning.
acc = accuracy_score(true_labels, ensemble_preds)
f1_weighted = f1_score(true_labels, ensemble_preds, average='weighted', zero_division=0)
f1_macro = f1_score(true_labels, ensemble_preds, average='macro', zero_division=0)
f1_micro = f1_score(true_labels, ensemble_preds, average='micro', zero_division=0)
prec_macro = precision_score(true_labels, ensemble_preds, average='macro', zero_division=0)
recall_macro = recall_score(true_labels, ensemble_preds, average='macro', zero_division=0)

try:
    report = classification_report(
        true_labels,
        ensemble_preds,
        labels=class_ids,
        target_names=class_names,
        digits=4,
        zero_division=0
    )
except Exception as e:
    # Best effort: fall back to a report without class names.
    print("Error generating classification report:", e)
    report = classification_report(true_labels, ensemble_preds, digits=4, zero_division=0)

metrics_text = f"""
Ensemble Accuracy: {acc:.4f}
F1 (Weighted): {f1_weighted:.4f}
F1 (Macro):    {f1_macro:.4f}
F1 (Micro):    {f1_micro:.4f}
Precision (Macro): {prec_macro:.4f}
Recall (Macro):    {recall_macro:.4f}

Classification Report:
{report}
"""

print(metrics_text)
with open(os.path.join(save_dir, "metrics.txt"), "w", encoding="utf-8") as f:
    f.write(metrics_text)

# === Confusion Matrix ===
plt.figure(figsize=(6, 5))
# Explicit labels keep the matrix shape fixed; tick labels name the classes.
cm = confusion_matrix(true_labels, ensemble_preds, labels=class_ids)
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=class_names, yticklabels=class_names
)
plt.title("Confusion Matrix (Level 3 Ensemble)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.savefig(os.path.join(save_dir, "confusion_matrix.png"))
plt.close()

# === Platform-wise Evaluation
print("\nPlatform-wise Evaluation:")
platforms = ["youtube", "reddit", "twitter"]
source_lower = val_df['source'].str.lower()

for platform in platforms:
    # Plain boolean array: val_df was filtered, so its index is no longer
    # 0..n-1 and the NumPy arrays below must be indexed by position.
    mask = (source_lower == platform).to_numpy()
    y_true = val_df["level_3_enc"].values[mask]
    y_pred = val_df["preds"].values[mask]

    print(f"\nPlatform: {platform.upper()}")
    if len(y_true) == 0:
        print(f"No samples for {platform.upper()}. Skipping.")
        continue

    # NOTE: these averages span only the classes present in this subset, so the
    # macro F1 can differ from the report's "macro avg", which spans all classes.
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"F1 Weighted: {f1_score(y_true, y_pred, average='weighted', zero_division=0):.4f}")
    print(f"F1 Macro:    {f1_score(y_true, y_pred, average='macro', zero_division=0):.4f}")

    try:
        platform_report = classification_report(
            y_true, y_pred,
            labels=class_ids,
            target_names=class_names,
            digits=4,
            zero_division=0
        )
    except Exception as e:
        print("Error generating report:", e)
        platform_report = classification_report(y_true, y_pred, digits=4, zero_division=0)

    print(f"Platform Report:\n{platform_report}")

print(f"\nAll outputs saved to: {save_dir}")


Columns in validation CSV: ['text', 'level_1', 'level_2', 'level_3', 'source']





Fold 1 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 1: 100%|██████████| 20/20 [00:02<00:00,  6.85it/s]



Fold 2 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 2: 100%|██████████| 20/20 [00:02<00:00,  7.24it/s]



Fold 3 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 3: 100%|██████████| 20/20 [00:02<00:00,  7.33it/s]



Fold 4 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 4: 100%|██████████| 20/20 [00:02<00:00,  7.15it/s]



Fold 5 — loading model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Fold 5: 100%|██████████| 20/20 [00:02<00:00,  7.17it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Ensemble Accuracy: 0.7598
F1 (Weighted): 0.7099
F1 (Macro):    0.4122
F1 (Micro):    0.7598
Precision (Macro): 0.6279
Recall (Macro):    0.4531

Classification Report:
              precision    recall  f1-score   support

         0.0     0.5916    0.9200    0.7202       200
         1.0     0.9200    0.8768    0.8979       341
         2.0     1.0000    0.0156    0.0308        64
         3.0     0.0000    0.0000    0.0000        32

    accuracy                         0.7598       637
   macro avg     0.6279    0.4531    0.4122       637
weighted avg     0.7787    0.7598    0.7099       637



Platform-wise Evaluation:

Platform: YOUTUBE
Accuracy: 0.8898
F1 Weighted: 0.8829
F1 Macro:    0.5951
Platform Report:
              precision    recall  f1-score   support

         0.0     0.8291    1.0000    0.9066       131
         1.0     1.0000    0.7838    0.8788       111
         2.0     0.0000    0.0000    0.0000         0
         3.0     0.0000    0.0000    0.0000         3

   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

# **TASK2**

In [4]:
import os
import json
import pandas as pd
from tabulate import tabulate

# Base path
base_path = "/content/drive/MyDrive/FIRE/Task2/fold_outputs"

# Valid class labels: a report uses either numeric or descriptive class names.
possible_labels = [["0", "1"], ["Not Relevant", "Relevant"]]


def detect_label_keys(report, candidates):
    """Return the first label set in *candidates* fully present in *report*.

    Every label of the set must be a key of the report: checking only the
    first one would let a partial report through and fail later with KeyError.
    Returns None when no candidate set matches.
    """
    for labels in candidates:
        if all(label in report for label in labels):
            return labels
    return None


def build_report_rows(report, label_keys):
    """Flatten a classification-report dict into table rows.

    Produces one row per class in *label_keys*, then an accuracy row, then the
    "macro avg" and "weighted avg" rows. Scores are rounded to 4 decimals.
    """
    rows = [
        {
            "Class": label,
            "Precision": round(report[label]["precision"], 4),
            "Recall": round(report[label]["recall"], 4),
            "F1-Score": round(report[label]["f1-score"], 4),
            "Support": int(report[label]["support"]),
        }
        for label in label_keys
    ]

    # Accuracy is a single number; it is shown in the F1 column and its
    # support is the total over all classes.
    rows.append({
        "Class": "accuracy",
        "Precision": "",
        "Recall": "",
        "F1-Score": round(report["accuracy"], 4),
        "Support": sum(int(report[label]["support"]) for label in label_keys),
    })

    for avg in ["macro avg", "weighted avg"]:
        rows.append({
            "Class": avg,
            "Precision": round(report[avg]["precision"], 4),
            "Recall": round(report[avg]["recall"], 4),
            "F1-Score": round(report[avg]["f1-score"], 4),
            "Support": int(report[avg]["support"]),
        })
    return rows


# Loop over folds 1 to 5
for fold in range(1, 6):
    print(f"\n{'='*60}")
    print(f"Fold {fold} Report\n")

    fold_path = os.path.join(base_path, f"fold_{fold}")
    report_path = os.path.join(fold_path, "final_report.json")

    if not os.path.exists(report_path):
        print(f"Missing report: {report_path}")
        continue

    # Load report
    with open(report_path, "r", encoding="utf-8") as f:
        report = json.load(f)

    # Determine which label format to use
    label_keys = detect_label_keys(report, possible_labels)

    if not label_keys:
        print("Could not detect label keys in report.")
        continue

    rows = build_report_rows(report, label_keys)

    # Print as table
    df = pd.DataFrame(rows)
    print(tabulate(df, headers="keys", tablefmt="github", showindex=False))

    # Summary
    print(f"\nFold {fold} Final Metrics:")
    print(f"Accuracy       : {report['accuracy']:.4f}")
    print(f"Macro F1       : {report['macro avg']['f1-score']:.4f}")
    print(f"Weighted F1    : {report['weighted avg']['f1-score']:.4f}")
    print(f"Macro Precision: {report['macro avg']['precision']:.4f}")
    print(f"Macro Recall   : {report['macro avg']['recall']:.4f}")



Fold 1 Report

| Class        | Precision   | Recall   |   F1-Score |   Support |
|--------------|-------------|----------|------------|-----------|
| 0            | 0.9019      | 0.97     |     0.9347 |      4161 |
| 1            | 0.6356      | 0.3318   |     0.436  |       657 |
| accuracy     |             |          |     0.8829 |      4818 |
| macro avg    | 0.7687      | 0.6509   |     0.6853 |      4818 |
| weighted avg | 0.8656      | 0.8829   |     0.8667 |      4818 |

Fold 1 Final Metrics:
Accuracy       : 0.8829
Macro F1       : 0.6853
Weighted F1    : 0.8667
Macro Precision: 0.7687
Macro Recall   : 0.6509

Fold 2 Report

| Class        | Precision   | Recall   |   F1-Score |   Support |
|--------------|-------------|----------|------------|-----------|
| 0            | 0.9074      | 0.9613   |     0.9336 |      4161 |
| 1            | 0.6073      | 0.379    |     0.4667 |       657 |
| accuracy     |             |          |     0.8819 |      4818 |
| macro avg    | 0.75