In [None]:
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
import json
import numpy as np

# Load the CSV file into a DataFrame.
df = pd.read_csv('/content/COVID-QA.csv')
# Drop incomplete rows first, then cast every column to str.
# `astype` returns a new DataFrame, so its result must be assigned; casting
# before `dropna` would turn missing values into the literal string 'nan'
# and they would no longer be dropped.
df = df.dropna().astype(str)

# Load the RoBERTa tokenizer (the fast variant is required for offset mappings)
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

# Define your QADataset class
class QADataset(Dataset):
    """Extractive-QA dataset that tokenizes (question, context) pairs lazily.

    Each row of ``dataframe`` must provide ``context``, ``question`` and
    ``answer`` columns. The answer is located in the context by substring
    search and converted to start/end *token* indices of the encoded pair.

    Args:
        dataframe: pandas DataFrame holding the examples.
        tokenizer: a fast tokenizer; its output must support
            ``['offset_mapping']`` and ``.sequence_ids()``.
        max_length: length every example is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _locate_answer(offsets, sequence_ids, answer_start, answer_length):
        """Map a character span of the context to (start_token, end_token).

        Only tokens whose sequence id is 1 (the second segment, i.e. the
        context) are considered, because character offsets restart at 0 for
        each segment and question tokens would otherwise produce false
        matches. Returns ``(0, 0)`` when the answer was not found in the
        context or is not fully covered by the (possibly truncated) tokens.
        """
        if answer_start < 0 or answer_length <= 0:
            return 0, 0

        # Index of the last character that belongs to the answer (inclusive),
        # so a token that merely starts where the answer ends cannot match.
        last_char = answer_start + answer_length - 1
        start_token = None
        end_token = None
        for i, (seq_id, (tok_start, tok_end)) in enumerate(zip(sequence_ids, offsets)):
            if seq_id != 1:
                continue
            if start_token is None and tok_start <= answer_start < tok_end:
                start_token = i
            if tok_start <= last_char < tok_end:
                end_token = i

        if start_token is None or end_token is None:
            return 0, 0
        return start_token, end_token

    def __getitem__(self, idx):
        """Return the tensors for example ``idx``.

        Returns:
            dict with ``input_ids``, ``attention_mask``, ``start_positions``
            and ``end_positions`` as ``torch.long`` tensors. Both positions
            are 0 when the answer span could not be located.
        """
        item = self.data.iloc[idx]
        # Coerce to str so numeric-looking cells do not break `.find`.
        context = str(item['context'])
        question = str(item['question'])
        # Strip surrounding whitespace: it can prevent a substring match and
        # (presumably, depending on the tokenizer) is not covered by any
        # token's offsets.
        answer = str(item['answer']).strip()

        # Tokenize the question/context pair
        encoding = self.tokenizer(
            question,
            context,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_offsets_mapping=True
        )

        # Search once, outside the per-token loop.
        start_position, end_position = self._locate_answer(
            encoding['offset_mapping'],
            encoding.sequence_ids(),
            context.find(answer),
            len(answer),
        )

        return {
            'input_ids': torch.tensor(encoding['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoding['attention_mask'], dtype=torch.long),
            'start_positions': torch.tensor(start_position, dtype=torch.long),
            'end_positions': torch.tensor(end_position, dtype=torch.long)
        }

# Hold out 10% of the rows for testing, then carve 10% of what is left for
# validation (about 81% train / 9% validation / 10% test overall).
train_df, test_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
)
train_df, val_df = train_test_split(
    train_df,
    test_size=0.1,
    random_state=42,
)

# Wrap every split in a QADataset that shares the same tokenizer.
train_dataset, val_dataset, test_dataset = (
    QADataset(split, tokenizer) for split in (train_df, val_df, test_df)
)

# Pre-trained RoBERTa encoder with a question-answering head on top.
model = RobertaForQuestionAnswering.from_pretrained('roberta-base')

# Define training arguments.
# Evaluation, checkpointing and warmup are expressed relative to the run
# length (per epoch / as a ratio) instead of fixed step counts. With a small
# dataset the whole run can be shorter than 500 optimizer steps, in which
# case step-based evaluation never fires, no checkpoint is written,
# `load_best_model_at_end` has nothing to load, and a 500-step warmup never
# completes.
training_args = TrainingArguments(
    output_dir='./fine_tuned_roberta',         # Output directory
    num_train_epochs=10,                       # Number of training epochs
    per_device_train_batch_size=8,             # Batch size for training
    per_device_eval_batch_size=8,              # Batch size for evaluation
    warmup_ratio=0.1,                          # Warm up over the first 10% of training steps
    weight_decay=0.01,                         # Strength of weight decay
    logging_dir='./logs_roberta',              # Directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",               # Evaluate at the end of every epoch
    save_strategy="epoch",                     # Must match the evaluation strategy for best-model loading
    save_total_limit=2,                        # Limit the total amount of checkpoints
    load_best_model_at_end=True,               # Load the best model at the end of training
    metric_for_best_model="eval_loss",         # Lower validation loss selects the best checkpoint
)

# Metrics callback: accuracy and weighted F1 for the start and end positions
def compute_metrics(p):
    """Score start/end position predictions and log the scores to disk.

    Args:
        p: a pair ``(predictions, labels)`` where ``predictions`` is
            ``(start_logits, end_logits)`` and ``labels`` is
            ``(start_labels, end_labels)``.

    Returns:
        dict with ``start_accuracy``, ``end_accuracy``, ``start_f1`` and
        ``end_f1``. The same dict is appended as one JSON line to
        ``metrics.json`` in the current working directory.
    """
    (start_logits, end_logits), (start_true, end_true) = p

    # The predicted position is the index with the highest logit.
    start_hat = np.argmax(start_logits, axis=1)
    end_hat = np.argmax(end_logits, axis=1)

    sides = (
        ('start', start_true, start_hat),
        ('end', end_true, end_hat),
    )

    # Accuracies first, then F1 scores, so the key order stays stable.
    metrics = {
        f'{side}_accuracy': accuracy_score(truth, guess)
        for side, truth, guess in sides
    }
    metrics.update({
        f'{side}_f1': f1_score(truth, guess, average='weighted')
        for side, truth, guess in sides
    })

    # Append one JSON document per evaluation call.
    with open('metrics.json', 'a') as f:
        f.write(json.dumps(metrics) + '\n')

    return metrics


# Wire the model, arguments, datasets and metrics callback into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Run fine-tuning
trainer.train()

# Persist the fine-tuned weights and the tokenizer files side by side
model.save_pretrained('./fine_tuned_roberta')
tokenizer.save_pretrained('./fine_tuned_roberta')

# Score the held-out test split and show the resulting metrics
results = trainer.evaluate(eval_dataset=test_dataset)
print(results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mchristian23[0m ([33mchristian23-president-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss,Validation Loss


{'eval_loss': 3.779172420501709, 'eval_start_accuracy': 0.1956521739130435, 'eval_end_accuracy': 0.34782608695652173, 'eval_start_f1': 0.2294685990338164, 'eval_end_f1': 0.3659420289855072, 'eval_runtime': 1.3073, 'eval_samples_per_second': 35.186, 'eval_steps_per_second': 4.589, 'epoch': 10.0}
