# **IMPORTS**

In [None]:
import os

import kagglehub
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import wandb
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup

In [None]:
# Load the three competition CSVs (labelled training rows, unlabelled test
# rows, and the sample submission) from the Kaggle input directory.
train, test, sample = map(
    pd.read_csv,
    (
        '/kaggle/input/2025-sep-dl-gen-ai-project/train.csv',
        '/kaggle/input/2025-sep-dl-gen-ai-project/test.csv',
        "/kaggle/input/2025-sep-dl-gen-ai-project/sample_submission.csv",
    ),
)

In [None]:
# Authenticate with Weights & Biases.
# The API key is read from the WANDB_API_KEY environment variable instead of
# being embedded in the notebook source.
# NOTE(review): the key that used to be hard-coded here should be treated as
# leaked and revoked in the W&B account settings.
# When the variable is unset, key=None is passed and wandb falls back to its
# own credential handling (presumably an interactive prompt, given
# relogin=True - confirm against the wandb documentation).
wandb.login(key=os.environ.get("WANDB_API_KEY"), relogin=True)

# Start the run and record the hyperparameters; `config` is read by later
# cells (e.g. for the learning rate and the number of epochs).
wandb.init(
    project="23f1001420-t32025",
    name="roberta_trainer",
    config={
        "model_name": "roberta-base",
        "batch_size": 16,
        "lr": 2e-5,
        "epochs": 3,
        "max_len": 128,
    },
)
config = wandb.config

# **Custom Dataset Class**

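This block defines a PyTorch `Dataset` class that tokenizes each text (padded or truncated to `max_len` tokens) and, when `is_train=True`, returns the input IDs, the attention mask, and the corresponding multi-label emotion targets. With `is_train=False` it returns only the tokenized inputs, which is the mode used for inference. The same cell also holds out 10% of the training data as a validation split.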

In [None]:
# Target columns; the dataset builds each label vector in this order.
label_cols = ["anger", "fear", "joy", "sadness", "surprise"]

# Hold out 10% of the labelled rows for validation. The fixed seed keeps the
# split identical between runs.
X_train, X_val = train_test_split(
    train,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)
class EmotionDataset(Dataset):
    """Tokenize DataFrame rows on the fly for multi-label emotion classification.

    Each item reads the ``"text"`` column of one row and tokenizes it with
    ``padding="max_length"`` and truncation to ``max_len``.

    Args:
        df: DataFrame with a ``"text"`` column and, when ``is_train`` is true,
            one column per label.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")``
            whose result exposes ``input_ids`` and ``attention_mask``.
        max_len: Length every sequence is padded or truncated to.
        is_train: When true, items also carry the label vector.
        labels: Optional label column names. When omitted, the module-level
            ``label_cols`` list is used.
    """

    def __init__(self, df, tokenizer, max_len=128, is_train=True, labels=None):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_train = is_train
        # Resolved lazily in __getitem__ so inference-only datasets never need
        # any label columns to be defined.
        self.labels = None if labels is None else list(labels)

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask[, labels])`` for row ``idx``.

        The label tensor is float32 and is included only when ``is_train``
        is true.
        """
        row = self.df.iloc[idx]
        text = str(row["text"])

        enc = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # squeeze(0) removes only the leading batch axis added by
        # return_tensors="pt". A bare squeeze() would also collapse the
        # sequence axis whenever max_len == 1.
        input_ids = enc.input_ids.squeeze(0)
        attention_mask = enc.attention_mask.squeeze(0)

        if not self.is_train:
            return input_ids, attention_mask

        cols = self.labels if self.labels is not None else label_cols
        targets = torch.tensor(row[cols].values.astype("float32"))
        return input_ids, attention_mask, targets


# **Custom RoBERTa-Based Emotion Classifier and DataLoaders**

This section defines a PyTorch nn.Module for multi-label emotion classification using a pre-trained RoBERTa encoder, followed by a dropout layer and a linear classifier. It also creates EmotionDataset instances for training, validation, and test sets, and wraps them in PyTorch DataLoaders for efficient batching during training and evaluation.

In [None]:
class EmotionClassifier(nn.Module):
    """Pre-trained transformer encoder with a dropout + linear multi-label head.

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        dropout: Dropout probability applied to the pooled representation.
        num_labels: Number of output logits. Defaults to 5, the size that was
            previously hard-coded.
    """

    def __init__(self, model_name="roberta-base", dropout=0.3, num_labels=5):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(self.encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return raw, un-activated logits for each input sequence.

        The hidden state at sequence position 0 is used as the pooled
        representation, then passed through dropout and the linear head.
        """
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = out.last_hidden_state[:, 0]  # first token of every sequence
        return self.fc(self.dropout(pooled))


tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Sequence length and batch size come from the W&B run config so that the
# hyperparameters recorded for the run are the ones actually used.
train_ds = EmotionDataset(X_train, tokenizer, max_len=config.max_len, is_train=True)
# The validation split keeps is_train=True because it still carries labels.
val_ds   = EmotionDataset(X_val, tokenizer, max_len=config.max_len, is_train=True)
test_ds  = EmotionDataset(test, tokenizer, max_len=config.max_len, is_train=False)

# Only the training loader shuffles. Validation and test keep row order, and
# the submission relies on test predictions lining up with the test rows.
train_loader = DataLoader(train_ds, batch_size=config.batch_size, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=config.batch_size, shuffle=False)
test_loader  = DataLoader(test_ds, batch_size=config.batch_size, shuffle=False)


# **Training and Validation Loops**

This section defines the training loop, which minimises `BCEWithLogitsLoss` (multi-label classification) using the AdamW optimizer and a linear learning-rate schedule whose first 10% of steps are warmup. During training, every batch loss is logged to Weights & Biases; after each epoch, the model is evaluated on the validation set by thresholding the sigmoid outputs at 0.5 and reporting the Macro F1-score.

In [None]:
# Use the GPU when one is visible, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The schedule length is one optimizer step per training batch, per epoch.
epochs = config.epochs
total_steps = epochs * len(train_loader)

model = EmotionClassifier()
model = model.to(device)

# The model emits raw logits; BCEWithLogitsLoss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr)

# Linear schedule with the first 10% of all steps used as warmup.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps,
)

def train_model():
    """Fine-tune the module-level ``model`` for ``epochs`` passes over ``train_loader``.

    Uses the module-level ``optimizer``, ``scheduler``, ``criterion`` and
    ``device``. Every batch loss and every mean epoch loss is logged to
    Weights & Biases, and ``validate`` is called at the end of each epoch.
    """
    n_batches = len(train_loader)

    for epoch in range(epochs):
        model.train()
        running_loss = 0

        for ids, mask, targets in tqdm(train_loader):
            ids = ids.to(device)
            mask = mask.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            loss = criterion(model(ids, mask), targets)
            loss.backward()
            optimizer.step()
            # The schedule advances once per batch, not once per epoch.
            scheduler.step()

            batch_loss = loss.item()
            running_loss += batch_loss
            wandb.log({"batch_loss": batch_loss})

        epoch_loss = running_loss / n_batches
        print(f"Epoch {epoch+1}/{epochs} - Loss: {epoch_loss:.4f}")

        wandb.log({"train_epoch_loss": epoch_loss})

        validate(epoch)

def validate(epoch):
    """Evaluate the module-level ``model`` on ``val_loader`` and log macro F1.

    Args:
        epoch: Zero-based index of the epoch that just finished. It is logged
            (one-based) next to the score.

    Returns:
        The macro-averaged F1 score over all label columns, with sigmoid
        outputs thresholded at 0.5.
    """
    from sklearn.metrics import f1_score

    model.eval()
    prob_batches = []
    target_batches = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            logits = model(input_ids, attention_mask)
            prob_batches.append(torch.sigmoid(logits).cpu().numpy())
            target_batches.append(labels.cpu().numpy())

    probs = np.vstack(prob_batches)
    targets = np.vstack(target_batches)

    binary_preds = (probs > 0.5).astype(int)
    # zero_division=0 scores a label with no positives as 0 without a warning.
    f1 = f1_score(targets, binary_preds, average="macro", zero_division=0)
    print("Validation Macro F1:", f1)
    wandb.log({"val_macro_f1": f1, "epoch": epoch + 1})
    return f1


In [None]:
# Run the fine-tuning loop; it also validates after every epoch.
train_model()

In [None]:
# Close the Weights & Biases run now that training is complete.
wandb.finish()

# **Saving and Uploading the Trained Model**

In [None]:
# Save only the weights (state_dict). Reloading therefore requires
# re-creating an EmotionClassifier first.
save_dir = "roberta_emotion_model"
os.makedirs(save_dir, exist_ok=True)

model_path = os.path.join(save_dir, "model.pth")
torch.save(model.state_dict(), model_path)
print("Saved:", model_path)

user = "somya2611"
upload_handle = f"{user}/roberta-emotion/pyTorch/v1"

# Upload the directory that was just written, via save_dir rather than a
# repeated string literal, so the two cannot drift apart.
kagglehub.model_upload(upload_handle, save_dir)

print("Model uploaded to:", upload_handle)

# **Downloading the Saved Model from Kaggle**

In [None]:
# Re-derive the device so this cell also works in a fresh session.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Same handle that the upload cell published to.
user = "somya2611"
download_handle = f"{user}/roberta-emotion/pyTorch/v1"

# model_download returns the location of the fetched files.
model_folder = kagglehub.model_download(download_handle)
print("Downloaded model folder:", model_folder)

In [None]:
# Build a fresh classifier; the downloaded weights are loaded into it below.
inference_model = EmotionClassifier().to(device)
state_dict_path = f"{model_folder}/model.pth"

# weights_only=True restricts unpickling to tensors and plain containers.
# The file was downloaded, so arbitrary pickled objects must not be executed.
state_dict = torch.load(state_dict_path, map_location=device, weights_only=True)
inference_model.load_state_dict(state_dict)
# Switch to eval mode so dropout is disabled during prediction.
inference_model.eval()

print("Model loaded for inference!")

# **Running Inference and Generating Submission File**

In [None]:
# Rebuild the tokenizer and the unshuffled test loader so this cell can run
# on its own after the model has been downloaded.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
test_ds = EmotionDataset(test, tokenizer, is_train=False)
test_loader = DataLoader(test_ds, batch_size=16, shuffle=False)

batch_preds = []
with torch.no_grad():
    for ids, mask in test_loader:
        logits = inference_model(ids.to(device), mask.to(device))
        scores = torch.sigmoid(logits).cpu().numpy()
        # A label is predicted when its probability exceeds 0.5.
        batch_preds.append((scores > 0.5).astype(int))

# One row per test example, one column per label.
final_preds = np.vstack(batch_preds)



# Name the prediction columns from label_cols, the same list the dataset uses
# to build training targets, so column order cannot drift from the order of
# the model's outputs. A mismatch between the number of prediction columns
# and the number of labels now raises instead of being silently mis-mapped.
submission = pd.DataFrame(final_preds, columns=label_cols)
submission.insert(0, "id", test["id"].to_numpy())

submission.to_csv("submission.csv", index=False)
submission.head()
