<a href="https://colab.research.google.com/github/Shi-pra-19/dl-genai-project/blob/main/deberta_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install wandb transformers datasets

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
Aborted!


In [None]:
# Authenticate this runtime with Weights & Biases; prompts interactively for an API key.
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset, DatasetDict
import wandb

In [None]:
# Read the training CSV into a DataFrame.
df = pd.read_csv("/content/augmented_train.csv")

In [None]:
# --- Model / task definition ---
DEBERTA_MODEL = "microsoft/deberta-v3-large"
LABEL_COLS = ["anger", "joy", "fear", "surprise", "sadness"]
NUM_LABELS = len(LABEL_COLS)  # 5: one output per label column

# --- Training hyper-parameters ---
MAX_LEN = 512  # intended cap on sequence length, in tokens
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 1e-5

# --- Hardware: use the GPU when one is visible, otherwise fall back to CPU ---
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Start the W&B run and record the hyper-parameters that define it.
run_config = dict(
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
)
wandb.init(project="23f3001910-t32025", name="deberta-large", config=run_config)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Rebuild `df` so it holds only the text column (cast to str) followed by the
# label columns, on a fresh default index.
texts = df["text"].astype(str).tolist()
labels = df[LABEL_COLS].values

df = pd.DataFrame({
    "text": texts,
    **{col: labels[:, i] for i, col in enumerate(LABEL_COLS)},
})

In [None]:
# 80 / 10 / 10 split: first carve off 20% as a holdout, then halve the holdout
# into validation and test. Both splits use the same fixed seed.
train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=42)

print(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")

Train: 12501 | Val: 1563 | Test: 1563


In [None]:
# Wrap the three pandas splits as Hugging Face datasets under their split names.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (
        ("train", train_df),
        ("validation", val_df),
        ("test", test_df),
    )
})

In [None]:
tokenizer = AutoTokenizer.from_pretrained(DEBERTA_MODEL)


def tokenize_batch(examples):
    """Tokenize a batch of rows, truncating each text to at most MAX_LEN tokens.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; only its
            ``"text"`` field is read.

    Returns:
        The tokenizer output for the batch (unpadded; padding is left to the
        data collator).
    """
    # max_length must be given explicitly: with truncation=True alone this
    # tokenizer warns "no maximum length is provided ... Default to no
    # truncation", so nothing was actually being truncated.
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LEN)


tokenized_datasets = dataset.map(tokenize_batch, batched=True)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Map:   0%|          | 0/12501 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1563 [00:00<?, ? examples/s]

Map:   0%|          | 0/1563 [00:00<?, ? examples/s]

In [None]:
def set_labels(example):
    """Attach a ``labels`` list of floats, one per LABEL_COLS entry, to the example.

    The values are read from the example's own label columns, in LABEL_COLS
    order, and cast to float. The (mutated) example is returned.
    """
    example["labels"] = list(map(float, (example[col] for col in LABEL_COLS)))
    return example

# Collator that pads each batch at collation time (dynamic padding per batch).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Add the `labels` vector to every example of every split.
tokenized_datasets = tokenized_datasets.map(set_labels)

Map:   0%|          | 0/12501 [00:00<?, ? examples/s]

Map:   0%|          | 0/1563 [00:00<?, ? examples/s]

Map:   0%|          | 0/1563 [00:00<?, ? examples/s]

In [None]:
def compute_metrics(eval_pred):
    """Compute macro-averaged and per-label F1 for multi-label predictions.

    Args:
        eval_pred: A ``(logits, labels)`` pair. Both are assumed to be
            array-likes of shape (n_samples, len(LABEL_COLS)) — the per-label
            scores are zipped against LABEL_COLS.

    Returns:
        dict with ``"macro_f1"`` plus one ``"f1_<label>"`` entry per label in
        LABEL_COLS.
    """
    logits, labels = eval_pred
    # A label is predicted positive when its sigmoid probability is >= 0.5.
    preds = (torch.sigmoid(torch.tensor(logits)) >= 0.5).int().numpy()
    labels = labels.astype(int)

    per_label_f1 = f1_score(labels, preds, average=None, zero_division=0)
    result = {"macro_f1": f1_score(labels, preds, average="macro", zero_division=0)}
    result.update({f"f1_{lbl}": score for lbl, score in zip(LABEL_COLS, per_label_f1)})

    # Deliberately no wandb.log() here: the Trainer already reports the
    # returned metrics (with its own prefix) through report_to=["wandb"].
    # Logging again from inside the metric function duplicated every value and
    # mixed validation and test results under the same unprefixed keys.
    return result

In [None]:
# Load the pretrained encoder with a fresh classification head sized for
# NUM_LABELS outputs, configured for multi-label classification.
model = AutoModelForSequenceClassification.from_pretrained(
    DEBERTA_MODEL,
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification",
)

pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation settings.
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint with
    # the highest validation macro-F1.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    # Reporting.
    logging_steps=50,
    report_to=["wandb"],
)

In [None]:
# Wire the model, data splits, collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `processing_class` replaces the deprecated `tokenizer=` argument; the
    # tokenizer is still saved alongside the model checkpoints.
    processing_class=tokenizer,
    data_collator=data_collator,  # dynamic padding per batch
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; per training_args, evaluation and checkpointing run once per epoch.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Macro F1,F1 Anger,F1 Joy,F1 Fear,F1 Surprise,F1 Sadness
1,0.4888,0.442732,0.692256,0.62069,0.721239,0.807359,0.666027,0.645963
2,0.3463,0.399735,0.752827,0.716518,0.772679,0.811725,0.743847,0.719368
3,0.259,0.394102,0.768991,0.726862,0.768386,0.834248,0.765694,0.749764


TrainOutput(global_step=2346, training_loss=0.39764517835339014, metrics={'train_runtime': 1965.6112, 'train_samples_per_second': 19.08, 'train_steps_per_second': 1.194, 'total_flos': 3331206688789386.0, 'train_loss': 0.39764517835339014, 'epoch': 3.0})

In [None]:
# Evaluate on the held-out test split. The explicit "test" prefix keeps these
# numbers separate from the validation metrics: with the default "eval" prefix
# they were reported under the same eval_* / eval/* keys and overwrote the
# validation values in the W&B run summary.
metrics = trainer.evaluate(tokenized_datasets["test"], metric_key_prefix="test")
print(metrics)
wandb.log({"test_metrics": metrics})

{'eval_loss': 0.4191151261329651, 'eval_macro_f1': 0.7434603819944055, 'eval_f1_anger': 0.7331042382588774, 'eval_f1_joy': 0.7426778242677824, 'eval_f1_fear': 0.7976744186046512, 'eval_f1_surprise': 0.7238454288407163, 'eval_f1_sadness': 0.72, 'eval_runtime': 19.7411, 'eval_samples_per_second': 79.175, 'eval_steps_per_second': 4.964, 'epoch': 3.0}


In [None]:
# Persist the fine-tuned model and its tokenizer side by side, then upload the
# whole directory to W&B as a model artifact.
export_dir = "./deberta_large_v1"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)

artifact = wandb.Artifact("deberta_large_v1", type="model")
artifact.add_dir(export_dir)
wandb.log_artifact(artifact)

[34m[1mwandb[0m: Adding directory to artifact (deberta_large_v1)... Done. 61.2s


<Artifact deberta_large_v1>

In [None]:
# Finish the active W&B run; the run history and summary are printed below.
wandb.finish()

0,1
eval/f1_anger,▁▇██
eval/f1_fear,▃▄█▁
eval/f1_joy,▁█▇▄
eval/f1_sadness,▁▆█▆
eval/f1_surprise,▁▆█▅
eval/loss,█▂▁▅
eval/macro_f1,▁▇█▆
eval/runtime,▄█▅▁
eval/samples_per_second,▅▁▄█
eval/steps_per_second,▅▁▄█

0,1
eval/f1_anger,0.7331
eval/f1_fear,0.79767
eval/f1_joy,0.74268
eval/f1_sadness,0.72
eval/f1_surprise,0.72385
eval/loss,0.41912
eval/macro_f1,0.74346
eval/runtime,19.7411
eval/samples_per_second,79.175
eval/steps_per_second,4.964


In [None]:
# Reload the saved tokenizer and model from disk, move the model to DEVICE and
# switch it to evaluation mode for inference.
MODEL_DIR = "./deberta_large_v1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).to(DEVICE)
model.eval()

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNo

In [None]:
# Load the test CSV and pull its text column out as a list of str.
TEST_PATH = "/content/test_clean.csv"
test = pd.read_csv(TEST_PATH)
text_column = test["text"].astype(str)
texts = text_column.tolist()

In [None]:
BATCH_SIZE = 16
all_preds = []  # one probability vector (one value per label) per input text

# Gradients are never needed for inference, so disable them once for the loop.
with torch.no_grad():
    for start in range(0, len(texts), BATCH_SIZE):
        batch_texts = texts[start:start + BATCH_SIZE]
        # max_length is required for truncation to take effect: with
        # truncation=True alone the tokenizer warns "Default to no truncation"
        # and over-long inputs are passed through at full length.
        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
        ).to(DEVICE)

        outputs = model(**inputs)
        # Independent per-label probabilities (multi-label), moved back to the CPU.
        probs = torch.sigmoid(outputs.logits).cpu().numpy()
        all_preds.extend(probs)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
import numpy as np

# Binarise the predicted probabilities at a fixed cut-off, then write one 0/1
# column per label onto the test frame, matching columns to labels by
# LABEL_COLS order.
threshold = 0.5
prob_matrix = np.array(all_preds)
preds_binary = (prob_matrix >= threshold).astype(int)

for col_idx, label_name in enumerate(LABEL_COLS):
    test[label_name] = preds_binary[:, col_idx]

In [None]:
# Keep only the id and the five prediction columns, in this order, and preview.
columns_to_keep = ["id", "anger", "fear", "joy", "sadness", "surprise"]
test = test.loc[:, columns_to_keep]
print(test.head())

   id  anger  fear  joy  sadness  surprise
0   0      1     1    0        0         0
1   1      0     0    0        0         0
2   2      1     1    0        0         0
3   3      0     1    0        0         0
4   4      0     1    0        0         1


In [None]:
# Write the predictions to submission.csv, omitting the DataFrame index column.
test.to_csv("submission.csv", index=False)