# **IMPORTS**

In [None]:
import os
import random

import kagglehub
import numpy as np
import pandas as pd
import torch
import wandb
from datasets import Dataset
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    set_seed
)

# Single seed value reused by every random number generator in the notebook.
SEED = 42

# Seed through the transformers helper first, then seed Python's `random`,
# NumPy and PyTorch explicitly, in that order.
set_seed(SEED)
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Also seed all CUDA devices when a GPU is available.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)


In [None]:
# Load the competition's training and test tables, in that order.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/2025-sep-dl-gen-ai-project/train.csv',
        '/kaggle/input/2025-sep-dl-gen-ai-project/test.csv',
    )
)

In [None]:
# SECURITY: the W&B API key must not be hard-coded in the notebook, because
# anyone who can read the file could then use the account. The key is read
# from the WANDB_API_KEY environment variable instead; when the variable is
# unset, `key=None` is passed and wandb is left to resolve credentials itself.
# NOTE(review): the key that used to be written on this line should be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"), relogin=True)

# **Preparing Dataset**
This block converts the emotion label columns into integer format and splits the data into training (90%) and validation (10%) sets. It then creates HuggingFace Dataset objects from the pandas DataFrames and adds a `labels` field to each training and validation example, which stores all five emotion labels as a list of floats. These processed datasets are later used for model training and evaluation.

In [None]:
# Emotion target columns, in the order used everywhere else in the notebook.
label_cols = ['anger', 'fear', 'joy', 'sadness', 'surprise']

# Cast the targets to plain integers before splitting.
train_df[label_cols] = train_df[label_cols].astype(int)

# Hold out 10% of the rows for validation (seeded shuffle) and renumber the
# rows of both partitions from zero.
train_df, val_df = (
    part.reset_index(drop=True)
    for part in train_test_split(
        train_df, test_size=0.1, random_state=SEED, shuffle=True
    )
)

# Wrap the frames as HuggingFace datasets; only `id` and `text` are taken
# from the test frame.
hf_train = Dataset.from_pandas(train_df[['id', 'text', *label_cols]])
hf_val = Dataset.from_pandas(val_df[['id', 'text', *label_cols]])
hf_test = Dataset.from_pandas(test_df[['id', 'text']])

def add_labels(example):
    """Attach a combined ``labels`` vector to one dataset example.

    The value of every column named in the module-level ``label_cols`` is
    converted to ``float`` and the results are stored, in ``label_cols``
    order, under the ``"labels"`` key. The example is modified in place and
    the same object is returned.
    """
    example["labels"] = list(map(float, (example[col] for col in label_cols)))
    return example

# Add the combined `labels` vector to the two labelled splits.
hf_train, hf_val = (split.map(add_labels) for split in (hf_train, hf_val))

# **Tokenization and Data Preparation for DeBERTa**

This step initializes the DeBERTa tokenizer and applies it to the train, validation, and test datasets. The text is converted into token IDs and truncated to a maximum length of 256 tokens; no padding is applied at this stage. The individual emotion columns are then removed from the train and validation datasets, because the combined `labels` field already carries them. A DataCollatorWithPadding is also created to dynamically pad batches during training.

In [None]:
# Checkpoint to fine-tune and the per-example token limit used when tokenizing.
MODEL_NAME = "microsoft/deberta-v3-base"
MAX_LENGTH = 256

# Fast tokenizer that belongs to the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tokenize(batch):
    """Tokenize the ``"text"`` field of a batch.

    Sequences are truncated to ``MAX_LENGTH`` tokens and are not padded here.
    Returns whatever the module-level ``tokenizer`` returns for the batch.
    """
    encode_options = {
        "truncation": True,
        "padding": False,
        "max_length": MAX_LENGTH,
    }
    return tokenizer(batch["text"], **encode_options)

# Tokenize each split in batches. The labelled splits then drop their
# individual emotion columns; the combined `labels` field is kept.
hf_train = hf_train.map(tokenize, batched=True).remove_columns(label_cols)
hf_val = hf_val.map(tokenize, batched=True).remove_columns(label_cols)
hf_test = hf_test.map(tokenize, batched=True)

# Collator built from the tokenizer; it is handed to the Trainer later.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# **Model Initialization and Evaluation Metrics**

This section loads the DeBERTa model for multi-label emotion classification and defines the evaluation metrics used during training. Predictions are converted into probabilities using a sigmoid activation, thresholded at 0.5, and evaluated using Macro F1-score along with class-wise F1-scores for all five emotions.

In [None]:
# Sequence-classification model with one output per emotion column,
# configured as a multi-label problem.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    problem_type="multi_label_classification",
    num_labels=len(label_cols),
)

def compute_metrics(eval_pred):
    """Compute macro and per-emotion F1 from an evaluation prediction.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The logits are passed through
            a sigmoid and thresholded at 0.5; the labels are cast to ``int``.

    Returns:
        A dict with ``"macro_f1"`` followed by one F1 score per emotion
        (``"f1_anger"``, ``"f1_fear"``, ``"f1_joy"``, ``"f1_sadness"``,
        ``"f1_surprise"``), taken from the per-label scores by position.
    """
    logits, labels = eval_pred

    probabilities = torch.sigmoid(torch.tensor(logits))
    predicted = (probabilities > 0.5).int().numpy()
    targets = labels.astype(int)

    metrics = {
        "macro_f1": f1_score(targets, predicted, average="macro", zero_division=0),
    }
    class_scores = f1_score(
        targets, predicted, average=None, zero_division=0
    ).tolist()

    # Position i in the per-label scores belongs to the i-th name below.
    metric_names = ("f1_anger", "f1_fear", "f1_joy", "f1_sadness", "f1_surprise")
    for position, metric_name in enumerate(metric_names):
        metrics[metric_name] = class_scores[position]
    return metrics

# **Training Configuration and Trainer Setup**

This section defines the training hyperparameters such as batch size, learning rate, number of epochs, and evaluation strategy using HuggingFace's TrainingArguments. We also initialize a Weights & Biases run for experiment tracking and create a Trainer object that combines the model, datasets, tokenizer, data collator, and evaluation metrics for training and validation.

In [None]:
training_args = TrainingArguments(
    # Where checkpoints are written, and the run's seed.
    output_dir="./deberta_v3_base_emotion",
    seed=SEED,

    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,

    # Batch sizes; no gradient accumulation.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,

    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),

    # Evaluate and checkpoint on the same 500-step cadence; log every 100 steps.
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,

    # Reload the checkpoint selected by validation macro F1 once training ends.
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",

    # Experiment tracking.
    report_to="wandb",
    run_name="DeBERTa-v3-emotion",
)

# Open the W&B run before the Trainer is created.
wandb.init(project="23f1001420-t32025", name="deberta-v3-base")

# Trainer wiring: model and arguments, the two labelled splits, and the
# tokenizer, collator and metric function defined earlier.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train,
    eval_dataset=hf_val,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the configuration held by `trainer`.
trainer.train()
# Save the model; no path is given, so the Trainer picks the destination.
trainer.save_model()

In [None]:
# Close the W&B run that was opened with `wandb.init` before training.
wandb.finish()

# **Saving and Uploading the Trained Model**

This step saves the fine-tuned DeBERTa model and tokenizer into a local directory. After saving, the complete model folder is uploaded to Kaggle Models using kagglehub, allowing easy reuse for inference or future experiments.

In [None]:
# Local folder that receives both the model and the tokenizer files.
save_dir = "deberta_full_model"
os.makedirs(save_dir, exist_ok=True)

# Write the model first, then the tokenizer, into the same folder.
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print("Saved full model folder:", os.listdir(save_dir))

# Kaggle Models handle under which the saved folder is uploaded.
user = "somya2611"
Deberta_handle = f"{user}/DebertaFull/pyTorch/v1"
kagglehub.model_upload(Deberta_handle, save_dir)

# **Downloading the Saved Model from Kaggle**

This section sets the appropriate compute device (CPU or GPU) and downloads the previously uploaded model from Kaggle Models using `kagglehub`. Once downloaded, the contents of the model folder are listed to confirm successful retrieval.


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Kaggle Models handle of the uploaded checkpoint.
user = "somya2611"
Deberta_handle = f"{user}/DebertaFull/pyTorch/v1"

# Download the model folder and list its contents as a quick check.
model_folder = kagglehub.model_download(Deberta_handle)
print("Files downloaded:", os.listdir(model_folder))


# **Loading the Fine-Tuned Model for Inference**

This section loads the tokenizer and the fine-tuned DeBERTa model from the saved model folder downloaded from Kaggle. The model is moved to the appropriate device (CPU/GPU) and set to evaluation mode so it can generate predictions without performing any training steps.

In [None]:
# Reload the tokenizer and the fine-tuned weights from the downloaded folder.
tokenizer = AutoTokenizer.from_pretrained(model_folder)
model = AutoModelForSequenceClassification.from_pretrained(
    model_folder,
    problem_type="multi_label_classification",
)

# Move the model to the selected device, then switch it to evaluation mode.
model = model.to(device)
model.eval()

# **Running Inference and Generating Submission File**

In this step, a lightweight Trainer is created for inference, and predictions are generated on the test dataset. The model outputs logits, which are converted to probabilities and then thresholded to obtain binary emotion labels. Finally, a submission file is created in the required format and saved as submission.csv.

In [None]:
# Prediction-only Trainer. Explicit arguments are passed so that
# `report_to="none"` switches off the logging integrations: the W&B run was
# already closed with `wandb.finish()`, and inference should not report to it.
inference_args = TrainingArguments(
    output_dir="tmp_trainer",
    report_to="none",
)
inference_trainer = Trainer(
    model=model,
    args=inference_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Logits -> sigmoid probabilities -> binary labels at the same 0.5 threshold
# that `compute_metrics` uses during validation.
preds = inference_trainer.predict(hf_test)
logits = preds.predictions
probs = torch.sigmoid(torch.tensor(logits)).numpy()
binary_preds = (probs > 0.5).astype(int)

# One column per emotion, with the test ids inserted as the first column.
submission = pd.DataFrame(binary_preds, columns=label_cols)
submission.insert(0, "id", test_df["id"].values)

submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")
submission.head()
