In [1]:
!pip install transformers



In [2]:
!gdown --id '1LABaYT-2gWthtNnW7PKlG9pM8Mh3NvuA' --output DATA.zip
!unzip DATA.zip

Downloading...
From: https://drive.google.com/uc?id=1LABaYT-2gWthtNnW7PKlG9pM8Mh3NvuA
To: /content/DATA.zip
100% 1.89M/1.89M [00:00<00:00, 54.7MB/s]
Archive:  DATA.zip
   creating: data/
  inflating: data/data_test.csv      
  inflating: data/data_train.csv     


#Training part

In [3]:
import os
import pandas as pd
import torch
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

from torch.utils.data import Dataset

os.environ["WANDB_MODE"] = "disabled"

###Load Dataset and model

In [4]:
# Load the dataset
df = pd.read_csv('data/data_train.csv')

# Concatenate context, question, and answer columns for BERT input
df['input_text'] = df['context'] + " [SEP] " + df['question'] + " [SEP] " + df['answer0'] + " [SEP] " + df['answer1'] + " [SEP] " + df['answer2']

# Split into training and validation sets (80-20 split)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification, Trainer, TrainingArguments

# Load the DeBERTa v3 base model and tokenizer
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-base')
model = DebertaV2ForSequenceClassification.from_pretrained('microsoft/deberta-v3-base', num_labels=3)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Utilities

In [8]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, is_test=False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len
        self.is_test = is_test

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        inputs = self.tokenizer(
            row['input_text'],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        if self.is_test:
            return {
                'input_ids': inputs['input_ids'].squeeze(),
                'attention_mask': inputs['attention_mask'].squeeze()
            }
        else:
            label = torch.tensor(row['label'], dtype=torch.long)
            return {
                'input_ids': inputs['input_ids'].squeeze(),
                'attention_mask': inputs['attention_mask'].squeeze(),
                'labels': label
            }


train_dataset = CustomDataset(train_df, tokenizer, max_len=128)
val_dataset = CustomDataset(val_df, tokenizer, max_len=128)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


###Params

In [17]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,  # Train for more epochs
    per_device_train_batch_size=8,  # Reduce batch size to avoid memory issues
    per_device_eval_batch_size=8,
    warmup_steps=500,  # Warm-up for better convergence
    weight_decay=0.01,  # Regularization
    logging_dir='./logs',
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    learning_rate=2e-5,  # Smaller learning rate for fine-tuning large models
    report_to="none",
    fp16=True,  # Use mixed precision for faster training if supported
    gradient_accumulation_steps=2,  # Accumulate gradients to simulate larger batch sizes
    lr_scheduler_type="linear",  # Linear learning rate scheduling
    label_smoothing_factor=0.1  # Helps with generalization
)




In [18]:
from sklearn.metrics import accuracy_score
import numpy as np

# Define the compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred  # Extract the model's predictions and ground truth labels
    predictions = np.argmax(logits, axis=-1)  # Get the predicted class (argmax along the logits)
    accuracy = accuracy_score(labels, predictions)  # Compute accuracy
    return {'accuracy': accuracy}

In [19]:
# Initialize the Hugging Face Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model on the validation set
eval_results = trainer.evaluate()
print("Validation accuracy:", eval_results['eval_accuracy'])

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,1.07163,0.42874
2,0.935100,1.030659,0.543581
4,0.565700,1.158655,0.590106
6,0.448800,1.269929,0.600707
8,0.339800,1.28744,0.61543
9,0.322200,1.286922,0.612485


Validation accuracy: 0.6124852767962309


In [20]:
'''
# Save the model and tokenizer
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

# Load the model and tokenizer for inference
loaded_model = RobertaForSequenceClassification.from_pretrained("./saved_model")
loaded_tokenizer = RobertaTokenizer.from_pretrained("./saved_model")
'''


'\n# Save the model and tokenizer\nmodel.save_pretrained("./saved_model")\ntokenizer.save_pretrained("./saved_model")\n\n# Load the model and tokenizer for inference\nloaded_model = RobertaForSequenceClassification.from_pretrained("./saved_model")\nloaded_tokenizer = RobertaTokenizer.from_pretrained("./saved_model")\n'

#Testing part

In [21]:
from transformers import Trainer, DataCollatorWithPadding
import numpy as np
import pandas as pd

# Load the saved model and tokenizer for inference
loaded_model = DebertaV2ForSequenceClassification.from_pretrained("saved_model")
loaded_tokenizer = DebertaV2Tokenizer.from_pretrained("saved_model")

# Load new test data
new_df = pd.read_csv('data/data_test.csv')
# Prepare input text for the model (concatenate context, question, and answers)
new_df['input_text'] = new_df['context'] + " [SEP] " + new_df['question'] + " [SEP] " + new_df['answer0'] + " [SEP] " + new_df['answer1'] + " [SEP] " + new_df['answer2']

# Create the prediction dataset (is_test=True skips label extraction)
predict_dataset = CustomDataset(new_df, loaded_tokenizer, max_len=128, is_test=True)

# Define data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=loaded_tokenizer)

# Initialize Trainer for prediction (include model, tokenizer, and data collator)
predict_trainer = Trainer(
    model=loaded_model,
    tokenizer=loaded_tokenizer,
    data_collator=data_collator
)

# Make predictions
predictions = predict_trainer.predict(predict_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=-1)

# Save predictions to a CSV file
prediction_df = pd.DataFrame({
    'id': new_df['id'],
    'label': predicted_labels
})

prediction_df.to_csv('prediction.csv', index=False)

print("Predictions saved to 'prediction.csv'.")

  predict_trainer = Trainer(


Predictions saved to 'prediction.csv'.
