In [141]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from datasets import Dataset

In [142]:
data = pd.read_csv('./data/final_data/train.csv')
data = data.rename(columns={'score': 'labels'})

In [143]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17307 entries, 0 to 17306
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   essay_id   17307 non-null  object
 1   full_text  17307 non-null  object
 2   labels     17307 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 405.8+ KB


In [144]:
print(sorted(data['labels'].unique()))
data['labels'] = data['labels'] - 1
# print(sorted(data['labels'].unique()))

num_classes = data["labels"].nunique()
print(num_classes)

[1, 2, 3, 4, 5, 6]
6


In [145]:
# print(sorted(data['score'].unique()))
# data['score'] = data['score'] - 1
# # print(sorted(data['score'].unique()))

# num_classes = data["score"].nunique()
# print(num_classes)

In [146]:
from transformers import AutoTokenizer, AutoModel
# tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")


def preprocess_function(examples):
    # Assuming 'text' and 'label' are column names in your dataset
    result = tokenizer(examples['full_text'], padding="max_length", truncation=True, max_length=512)
    result['labels'] = examples['labels']
    return result

# Apply the function across the dataset
print(data.columns)
dataset = Dataset.from_pandas(data.iloc[:,1:])
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Index(['essay_id', 'full_text', 'labels'], dtype='object')


Map: 100%|██████████| 17307/17307 [00:04<00:00, 4150.17 examples/s]


In [147]:
print(tokenized_dataset.features)


{'full_text': Value(dtype='string', id=None), 'labels': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


In [148]:
split_datasets = tokenized_dataset.train_test_split(test_size=0.2, shuffle=True, seed=46)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

In [149]:
print(train_dataset.features)
print(eval_dataset.features)


{'full_text': Value(dtype='string', id=None), 'labels': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}
{'full_text': Value(dtype='string', id=None), 'labels': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


In [150]:
from sklearn.metrics import accuracy_score, cohen_kappa_score

# def compute_metrics(p):
#     predictions, labels = p
#     predictions = np.argmax(predictions, axis=1)
#     return {"accuracy": accuracy_score(labels, predictions)}

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels.flatten(), predictions.flatten())
    kappa = cohen_kappa_score(labels.flatten(), predictions.flatten(), weights="quadratic")
    return {
        'accuracy': accuracy,
        'kappa': kappa
    }


In [151]:
import math
from transformers import TrainerCallback

class CosineAnnealingScheduler(TrainerCallback):
    """ Custom LR Scheduler that implements a cosine annealing schedule with warmup. """

    def __init__(self, optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1):
        self.num_warmup_steps = num_warmup_steps
        self.num_training_steps = num_training_steps
        self.num_cycles = num_cycles
        self.last_epoch = last_epoch
        self.optimizer = optimizer

    def on_step_begin(self, args, state, control, **kwargs):
        """ Called right before a training step in the main training loop. """
        step = state.global_step
        if step < self.num_warmup_steps:
            lr_scale = float(step) / float(max(1, self.num_warmup_steps))
        else:
            progress = float(step - self.num_warmup_steps) / float(max(1, self.num_training_steps - self.num_warmup_steps))
            lr_scale = max(0.0, 0.5 * (1.0 + math.cos(math.pi * self.num_cycles * 2.0 * progress)))
        
        for group in self.optimizer.param_groups:
            group['lr'] = lr_scale * group['initial_lr']


In [152]:
from transformers import TrainerCallback

class MetricsCallback(TrainerCallback):
    "A callback that stores all intermediate training, validation losses and validation accuracy."

    def __init__(self):
        super().__init__()
        self.training_losses = []
        self.validation_losses = []
        self.validation_accuracy = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        # Logs appear as a dictionary. Check if loss and eval_loss are in the dictionary and append them.
        if 'loss' in logs:
            self.training_losses.append(logs['loss'])
        if 'eval_loss' in logs:
            self.validation_losses.append(logs['eval_loss'])
        if 'eval_accuracy' in logs:
            self.validation_accuracy.append(logs['eval_accuracy'])


In [153]:
from transformers import AdamW, TrainingArguments, Trainer, AutoModelForSequenceClassification, AutoConfig

num_epochs = 7

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",     # evaluation is done at the end of each epoch
    save_strategy="epoch",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_strategy="epoch",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="kappa"
)

# model = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-base', num_labels=num_classes)
# model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", num_labels=num_classes)

config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased", num_labels=num_classes, hidden_dropout_prob=0.30, attention_probs_dropout_prob=0.30)
# config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased", num_labels=num_classes)

model = AutoModelForSequenceClassification.from_config(config)
# model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", config=config)

for name, param in model.named_parameters():
    if name.startswith("distilbert.transformer.layer") and int(name.split('.')[3]) < 5:  # Adjust the layer numbers as needed
        param.requires_grad = False
        print(f"Layer {name} is frozen.")
    else:
        print(f"Layer {name} is trainable.")

optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.1)
# num_training_steps = 3 * len(train_dataset) // training_args.per_device_train_batch_size
num_training_steps = num_epochs * len(train_dataset) // training_args.per_device_train_batch_size
scheduler_callback = CosineAnnealingScheduler(optimizer, num_warmup_steps=100, num_training_steps=num_training_steps)
metrics_callback = MetricsCallback()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    optimizers=(optimizer, None),
    compute_metrics=compute_metrics,
    callbacks=[scheduler_callback, metrics_callback]
)



Layer distilbert.embeddings.word_embeddings.weight is trainable.
Layer distilbert.embeddings.position_embeddings.weight is trainable.
Layer distilbert.embeddings.LayerNorm.weight is trainable.
Layer distilbert.embeddings.LayerNorm.bias is trainable.
Layer distilbert.transformer.layer.0.attention.q_lin.weight is frozen.
Layer distilbert.transformer.layer.0.attention.q_lin.bias is frozen.
Layer distilbert.transformer.layer.0.attention.k_lin.weight is frozen.
Layer distilbert.transformer.layer.0.attention.k_lin.bias is frozen.
Layer distilbert.transformer.layer.0.attention.v_lin.weight is frozen.
Layer distilbert.transformer.layer.0.attention.v_lin.bias is frozen.
Layer distilbert.transformer.layer.0.attention.out_lin.weight is frozen.
Layer distilbert.transformer.layer.0.attention.out_lin.bias is frozen.
Layer distilbert.transformer.layer.0.sa_layer_norm.weight is frozen.
Layer distilbert.transformer.layer.0.sa_layer_norm.bias is frozen.
Layer distilbert.transformer.layer.0.ffn.lin1.weig



In [154]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [155]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
training_losses = metrics_callback.training_losses
validation_losses = metrics_callback.validation_losses
validation_accuracy = metrics_callback.validation_accuracy

print("Training Losses:", training_losses)
print("Validation Losses:", validation_losses)
print("Validation Accuracy:", validation_accuracy)

In [None]:
plt.plot(list(range(1,num_epochs+1)),validation_losses)
plt.xlabel("Epochs")
plt.ylabel("Validation loss")
plt.title("Validation Loss vs Epoch")
plt.show()