In [None]:
!pip install datasets transformers torch

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/542.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m532.5/542.0 kB[0m [31m9.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.

In [None]:
import torch
from torch.utils.data import DataLoader, random_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

In [None]:
# Read the two CSV splits; note that the dev split is kept under the name `test_df`
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/NLU_Files/train.csv",
        "/content/drive/MyDrive/NLU_Files/dev.csv",
    )
)

# Pretrained tokenizer used by tokenize_data and saved alongside the final model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
def tokenize_data(df):
    """Tokenize the ``Claim`` and ``Evidence`` columns of ``df`` as text pairs.

    Uses the module-level ``tokenizer`` with truncation and padding enabled,
    ``max_length=512`` and ``return_tensors="pt"``.

    Args:
        df: DataFrame with ``Claim`` and ``Evidence`` columns.

    Returns:
        Whatever the tokenizer call returns for the paired inputs.
    """
    claims = df['Claim'].tolist()
    evidence = df['Evidence'].tolist()
    tokenizer_options = dict(truncation=True, padding=True, max_length=512, return_tensors="pt")
    return tokenizer(claims, evidence, **tokenizer_options)

# Tokenize both splits once up front, in the same order as before (train, then dev)
train_encodings, test_encodings = map(tokenize_data, (train_df, test_df))


In [None]:
# Dataset class to help us initialise and load datasets
class EDDataset(torch.utils.data.Dataset):
    """Dataset over pre-tokenized encodings with optional labels.

    ``encodings`` is a mapping from field name to an indexable tensor (it must
    contain ``'input_ids'`` when no labels are given, since that field is used
    for the length). ``labels`` is an optional indexable sequence; when it is
    provided, each item also carries a ``'labels'`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # Without labels, fall back to the number of tokenized rows.
        if self.labels is None:
            return len(self.encodings['input_ids'])
        return len(self.labels)

    def __getitem__(self, idx):
        # Copy each field's row so callers never hold a view into the shared encodings.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx].clone().detach()
        if self.labels is None:
            return sample
        sample['labels'] = self.labels[idx]
        return sample


In [None]:
# Wrap the tokenized training data together with its labels
train_dataset = EDDataset(train_encodings, train_df['label'].tolist())

# Split the training dataset into training and validation subsets (80:20)
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size

# Seed the split explicitly: without a generator, random_split draws from the
# global RNG, so every run would validate on a different subset and the
# reported accuracies could not be reproduced or compared across runs.
split_generator = torch.Generator().manual_seed(42)
train_data, val_data = random_split(
    train_dataset, [train_size, val_size], generator=split_generator
)
assert len(train_data) + len(val_data) == len(train_dataset)

# Prepare data loaders: shuffle only the training batches
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False)


In [None]:
# Model training for hyperparameter selection: trains a fresh model at a given
# learning rate and reports its validation accuracy
def train_model(train_loader, val_loader, learning_rate, epochs=3):
    """Fine-tune a fresh ``roberta-base`` classifier and measure validation accuracy.

    Args:
        train_loader: Iterable of training batches. Each batch is a dict of
            tensors that is passed to the model as keyword arguments; it must
            include ``labels`` so that the model output carries a loss.
        val_loader: Iterable of validation batches in the same format.
        learning_rate: Learning rate passed to the AdamW optimizer.
        epochs: Number of passes over ``train_loader``. Defaults to 3, the
            value used for hyperparameter selection.

    Returns:
        Validation accuracy (correct predictions / total predictions)
        measured after the last epoch.

    Raises:
        ValueError: If ``epochs`` is smaller than 1 (there would be no
            accuracy to return).
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for epoch in range(epochs):
        # Training phase. Train mode must be re-entered every epoch: the
        # validation phase below switches the model to eval mode, and without
        # this call every epoch after the first would train with dropout disabled.
        model.train()
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # Validation phase
        model.eval()
        total, correct = 0, 0
        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                predictions = torch.argmax(outputs.logits, dim=-1)
                correct += (predictions == batch['labels']).sum().item()
                total += batch['labels'].size(0)

        # Calculate validation accuracy out of total predictions
        accuracy = correct / total
        print(f"Epoch {epoch + 1}, LR {learning_rate}, Val Accuracy: {accuracy}")

    return accuracy

In [None]:
# Candidate learning rates for the sweep.
# The learning rate is the only hyperparameter tuned, due to computational limitations.
learning_rates = [1e-6, 1e-5, 1e-4]
best_lr, best_accuracy = None, 0

# Train once per candidate and remember the first one with the highest
# final-epoch validation accuracy (ties keep the earlier candidate)
for lr in learning_rates:
    print(f"Testing LR {lr}")
    accuracy = train_model(train_loader, val_loader, lr)
    if accuracy <= best_accuracy:
        continue
    best_lr, best_accuracy = lr, accuracy

print(f"Best Learning Rate: {best_lr}")

Testing LR 1e-06


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, LR 1e-06, Val Accuracy: 0.7245306897279055
Epoch 2, LR 1e-06, Val Accuracy: 0.8403290445053786
Epoch 3, LR 1e-06, Val Accuracy: 0.8597342332841172
Testing LR 1e-05


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, LR 1e-05, Val Accuracy: 0.86753849398861
Epoch 2, LR 1e-05, Val Accuracy: 0.8757646066230753
Epoch 3, LR 1e-05, Val Accuracy: 0.874077198903185
Testing LR 0.0001


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, LR 0.0001, Val Accuracy: 0.7245306897279055
Epoch 2, LR 0.0001, Val Accuracy: 0.7245306897279055
Epoch 3, LR 0.0001, Val Accuracy: 0.7245306897279055
Best Learning Rate: 1e-05


In [None]:
# Final training routine: no validation phase, returns the trained model
def train_final_model(train_loader, learning_rate, epochs=5):
    """Fine-tune a fresh ``roberta-base`` classifier and return it.

    Args:
        train_loader: Iterable of batches; each batch is a dict of tensors
            passed to the model as keyword arguments (it must include
            ``labels`` so the model output carries a loss).
        learning_rate: Learning rate passed to the AdamW optimizer.
        epochs: Number of passes over ``train_loader`` (default 5).

    Returns:
        The trained model, left on the device it was trained on.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    model.to(device)
    model.train()

    for epoch in range(epochs):
        for raw_batch in train_loader:
            # Move every tensor of the batch to the training device
            inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            model(**inputs).loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        print(f"Epoch {epoch + 1}, LR {learning_rate}")

    return model

In [None]:
# The final model is trained on the whole training set (train + validation subsets)
full_train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Train the final model with the learning rate selected by the sweep instead of
# a hand-copied constant, so the two cannot silently drift apart when the data
# or the candidate list changes.
if best_lr is None:
    raise RuntimeError("The learning-rate sweep did not select a learning rate (best_lr is None).")
final_model = train_final_model(full_train_loader, best_lr)

# Save the trained model together with its tokenizer in the same directory
model_path = '/content/drive/MyDrive/NLU_Files/final_roberta_optimal_lr'
final_model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

print("Final model training complete. Model and tokenizer saved.")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, LR 1e-05
Epoch 2, LR 1e-05
Epoch 3, LR 1e-05
Epoch 4, LR 1e-05
Epoch 5, LR 1e-05
Final model training complete. Model and tokenizer saved.


In [None]:
# Show the architecture, then count parameters in a single pass over the model
print(final_model)
param_sizes = [(param.numel(), param.requires_grad) for param in final_model.parameters()]
total_params = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, is_trainable in param_sizes if is_trainable)
print(f"Total Parameters: {total_params}")
print(f"Trainable Parameters: {trainable_params}")

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             