In [None]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from datasets import Dataset

In [None]:
# Read the training CSV and expose the 'score' column under the name 'labels',
# which is the column name the later cells (tokenization, metrics) refer to.
data = pd.read_csv('./data/final_data/train.csv').rename(columns={'score': 'labels'})

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts) as a quick sanity check.
data.info()

In [None]:
# Show the distinct label values before any shifting.
print(sorted(data['labels'].unique()))

# Shift labels to zero-based class ids. The guard makes the cell idempotent:
# re-running it after the labels already start at 0 must not shift them again
# (that would produce negative class ids).
# NOTE(review): assumes the raw scores start at 1 — confirm against the printed values.
if data['labels'].min() >= 1:
    data['labels'] = data['labels'] - 1
# print(sorted(data['labels'].unique()))

# Size the label space from the largest class id rather than the number of
# distinct values, so a score level that happens to be absent from the data
# does not produce a classification head that is too small.
num_classes = int(data["labels"].max()) + 1
print(num_classes)

In [None]:
# print(sorted(data['score'].unique()))
# data['score'] = data['score'] - 1
# # print(sorted(data['score'].unique()))

# num_classes = data["score"].nunique()
# print(num_classes)

In [None]:
from transformers import AutoTokenizer, AutoModel
# Tokenizer for the 'microsoft/deberta-v3-base' checkpoint; preprocess_function below reads it as a global.
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')
# Alternative checkpoint kept for reference:
# tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize a batch of essays and carry their labels along.

    Args:
        examples: mapping with a 'full_text' entry (the texts to tokenize)
            and a 'labels' entry (the matching class ids).

    Returns:
        The tokenizer output for 'full_text', padded/truncated to 512 tokens,
        with the input 'labels' stored under the 'labels' key.
    """
    encoded = tokenizer(
        examples['full_text'],
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    encoded['labels'] = examples['labels']
    return encoded

# Build a Hugging Face Dataset from the frame and tokenize it.
print(data.columns)
# NOTE(review): iloc[:, 1:] drops the first column by position — presumably an
# id column; confirm against the columns printed above.
dataset = Dataset.from_pandas(data.iloc[:,1:])
# batched=True passes preprocess_function a batch (dict of column lists) per call.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Inspect the column schema of the tokenized dataset.
print(tokenized_dataset.features)


In [None]:
# Hold out 20% of the tokenized examples for evaluation; the fixed seed keeps
# the shuffled split the same across runs.
split_datasets = tokenized_dataset.train_test_split(seed=46, shuffle=True, test_size=0.2)
train_dataset, eval_dataset = split_datasets['train'], split_datasets['test']

In [None]:
# Show the schema of both splits, one per line.
print(train_dataset.features, eval_dataset.features, sep="\n")


In [None]:
from sklearn.metrics import accuracy_score, cohen_kappa_score

def compute_metrics(p):
    """Compute accuracy and quadratic-weighted kappa for one evaluation pass.

    Args:
        p: a pair ``(predictions, labels)``; ``predictions`` holds per-class
            scores along axis 1 and ``labels`` holds the true class ids.

    Returns:
        dict with keys 'accuracy' and 'kappa'.
    """
    scores, labels = p
    # The predicted class is the index of the highest score in each row.
    predicted = np.argmax(scores, axis=1).flatten()
    expected = labels.flatten()
    return {
        'accuracy': accuracy_score(expected, predicted),
        'kappa': cohen_kappa_score(expected, predicted, weights="quadratic"),
    }


In [None]:
import math
from transformers import TrainerCallback

class CosineAnnealingScheduler(TrainerCallback):
    """Trainer callback that sets the optimizer's learning rate before every step.

    The rate is scaled linearly from 0 up to the base rate over
    ``num_warmup_steps`` steps, then follows a cosine curve over the remaining
    steps up to ``num_training_steps``.

    Args:
        optimizer: optimizer whose ``param_groups`` have their ``'lr'`` rewritten.
        num_warmup_steps: number of linear warmup steps.
        num_training_steps: total number of steps the schedule spans.
        num_cycles: number of cosine cycles after warmup (0.5 = decay from the
            base rate down to 0 once).
        last_epoch: stored on the instance; not read by this class.
    """

    def __init__(self, optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1):
        self.num_warmup_steps = num_warmup_steps
        self.num_training_steps = num_training_steps
        self.num_cycles = num_cycles
        self.last_epoch = last_epoch
        self.optimizer = optimizer
        # on_step_begin scales each group's 'initial_lr'. A plain optimizer has
        # no such key until some torch LR scheduler attaches to it, so record
        # the base rate here. setdefault keeps a value that is already present.
        for group in self.optimizer.param_groups:
            group.setdefault('initial_lr', group['lr'])

    def _lr_scale(self, step):
        """Return the multiplier applied to the base learning rate at ``step``."""
        if step < self.num_warmup_steps:
            return float(step) / float(max(1, self.num_warmup_steps))
        progress = float(step - self.num_warmup_steps) / float(max(1, self.num_training_steps - self.num_warmup_steps))
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * self.num_cycles * 2.0 * progress)))

    def on_step_begin(self, args, state, control, **kwargs):
        """ Called right before a training step in the main training loop. """
        lr_scale = self._lr_scale(state.global_step)

        for group in self.optimizer.param_groups:
            group['lr'] = lr_scale * group['initial_lr']


In [None]:
from transformers import TrainerCallback

class MetricsCallback(TrainerCallback):
    """Callback that stores the training loss and validation metrics seen in the logs.

    Attributes are plain lists, appended to in the order the log events arrive:
    ``training_losses`` ('loss'), ``validation_losses`` ('eval_loss'),
    ``validation_accuracy`` ('eval_accuracy') and ``validation_kappa``
    ('eval_kappa').
    """

    def __init__(self):
        super().__init__()
        self.training_losses = []
        self.validation_losses = []
        self.validation_accuracy = []
        self.validation_kappa = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record any of the tracked keys present in ``logs``."""
        # The signature allows logs=None; treat that (or an empty dict) as
        # "nothing to record" instead of failing on the membership checks.
        if not logs:
            return
        if 'loss' in logs:
            self.training_losses.append(logs['loss'])
        if 'eval_loss' in logs:
            self.validation_losses.append(logs['eval_loss'])
        if 'eval_accuracy' in logs:
            self.validation_accuracy.append(logs['eval_accuracy'])
        if 'eval_kappa' in logs:
            self.validation_kappa.append(logs['eval_kappa'])


In [None]:
from transformers import AdamW, TrainingArguments, Trainer, AutoModelForSequenceClassification

# Number of passes over the training split; also used for the plot x-axes below.
num_epochs = 7

training_args = TrainingArguments(
    # Run length and batch sizes.
    num_train_epochs=num_epochs,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Evaluate, checkpoint and log once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    logging_steps=10,
    # Optimization settings handed to the Trainer.
    # NOTE(review): a custom optimizer and an LR callback are supplied further
    # down, so these two values may not be the ones that take effect — confirm.
    warmup_steps=500,
    weight_decay=0.01,
    # Output locations.
    output_dir='./results',
    logging_dir='./logs',
    # Reload the checkpoint that scored best on the "kappa" metric when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="kappa",
)

# Load the pretrained 'microsoft/deberta-v3-base' checkpoint for sequence classification with num_classes output labels.
model = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-base', num_labels=num_classes)

optimizer = AdamW(model.parameters(), lr=1e-5)

# Total optimizer steps the cosine schedule should span. The last partial batch
# of every epoch still costs a step, so round up per epoch instead of flooring
# over the whole run. Use the effective batch size and gradient accumulation
# from the training arguments so the count matches what the Trainer executes.
steps_per_epoch = math.ceil(len(train_dataset) / training_args.train_batch_size)
steps_per_epoch = max(1, steps_per_epoch // training_args.gradient_accumulation_steps)
num_training_steps = num_epochs * steps_per_epoch

# Callback that rewrites the optimizer LR before each step (warmup, then cosine decay).
scheduler_callback = CosineAnnealingScheduler(optimizer, num_warmup_steps=100, num_training_steps=num_training_steps)
# Callback that collects per-log losses and validation metrics for the plots below.
metrics_callback = MetricsCallback()

# Assemble the Trainer from the model, arguments, data splits, metric function and callbacks.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    # Custom optimizer, no explicit LR scheduler object; the learning rate is
    # rewritten each step by scheduler_callback.
    # NOTE(review): presumably the Trainer still builds its default scheduler when given None — confirm.
    optimizers=(optimizer, None),
    compute_metrics=compute_metrics,
    callbacks=[scheduler_callback, metrics_callback]
)

In [None]:
# Run fine-tuning; evaluation, checkpointing and logging follow the strategies set in training_args.
trainer.train()

In [None]:
# Pull the series recorded by the metrics callback into notebook-level names
# used by the plotting cells.
training_losses, validation_losses, validation_accuracy = (
    metrics_callback.training_losses,
    metrics_callback.validation_losses,
    metrics_callback.validation_accuracy,
)

print("Training Losses:", training_losses)
print("Validation Losses:", validation_losses)
print("Validation Accuracy:", validation_accuracy)

In [None]:
# Loss curves, one point per epoch (x-axis is 1..num_epochs).
epoch_axis = list(range(1, num_epochs + 1))

plt.plot(epoch_axis, validation_losses, label="Validation Loss")
plt.plot(epoch_axis, training_losses, label="Training Loss")
plt.title("Loss vs Epoch")
plt.xlabel("Epochs")
plt.ylabel("Loss Value")
plt.legend()
plt.show()

In [None]:
# Kappa scores kept from a recorded run (presumably copied from an earlier
# training output). Used only when this session's trainer has logged no
# evaluation results.
recorded_kappa_values = [0.761299, 0.809548, 0.798076, 0.810284, 0.817964, 0.805913, 0.807585]

# Prefer the scores logged by the training run in this session, so the figure
# reflects the model trained above rather than hard-coded numbers.
logged_kappa_values = [entry['eval_kappa'] for entry in trainer.state.log_history if 'eval_kappa' in entry]
kappa_values = logged_kappa_values or recorded_kappa_values

# Size the x-axis from the data so the plot cannot fail on a length mismatch.
plt.plot(list(range(1, len(kappa_values) + 1)), kappa_values, label="Kappa Score")
plt.xlabel("Epochs")
plt.ylabel("Kappa score")
plt.title("Kappa Score vs Epoch")
plt.legend()
plt.show()

In [None]:
# Side-by-side summary: loss curves on the left, kappa score on the right.
epoch_axis = list(range(1, num_epochs + 1))
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# Left panel: training and validation loss per epoch.
ax1.plot(epoch_axis, training_losses, label="Training Loss")
ax1.plot(epoch_axis, validation_losses, label="Validation Loss")
ax1.set(xlabel="Epochs", ylabel="Loss Value", title="Loss vs Epoch")
ax1.legend()

# Right panel: kappa score per epoch.
ax2.plot(epoch_axis, kappa_values, label="Kappa Score")
ax2.set(xlabel="Epochs", ylabel="Kappa Score", title="Kappa Score vs Epoch")
ax2.legend()

fig.suptitle("DeBERTa-v3 Training and Performance Analysis")
fig.tight_layout()

plt.show()