<a href="https://colab.research.google.com/github/Pooret/inspiritai/blob/main/squad_deberta_cosine_annealing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets transformers sentencepiece

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, dill, multiprocess, datasets
Successfully installed datasets-2.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import torch
from torch import tensor
from datasets import load_dataset
from transformers import DebertaV2TokenizerFast, DebertaV2ForQuestionAnswering

# Load the fast tokenizer and model
tokenizer = DebertaV2TokenizerFast.from_pretrained('microsoft/deberta-v3-base')
model = DebertaV2ForQuestionAnswering.from_pretrained('microsoft/deberta-v3-base')

dataset = load_dataset('squad')

# Function to convert a batch of samples to tensors
def convert_to_tensors(batch):
    return {k: tensor(v) for k, v in batch.items()}

# Function to tokenize and format a single example
def prepare_train_features(examples):
    # Tokenize the questions and contexts
    tokenized_inputs = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",  # Truncate only the context (second part of the input)
        max_length=512,
        stride=128,  # Overlap between chunks if context is longer than max length
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    # Adjust answer start and end positions
    sample_mapping = tokenized_inputs.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_inputs.pop("offset_mapping")

    tokenized_inputs["start_positions"] = []
    tokenized_inputs["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_inputs["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_inputs.sequence_ids(i)

        sample_index = sample_mapping[i]
        answer = examples["answers"][sample_index]
        if len(answer["answer_start"]) == 0:
            tokenized_inputs["start_positions"].append(cls_index)
            tokenized_inputs["end_positions"].append(cls_index)
        else:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])

            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            if offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char:
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_inputs["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_inputs["end_positions"].append(token_end_index + 1)
            else:
                tokenized_inputs["start_positions"].append(cls_index)
                tokenized_inputs["end_positions"].append(cls_index)

    return tokenized_inputs

# Apply the function to the dataset
tokenized_train = dataset['train'].map(prepare_train_features, batched=True, remove_columns=dataset["train"].column_names)
tokenized_val = dataset['validation'].map(prepare_train_features, batched=True, remove_columns=dataset["validation"].column_names)

# Convert the tokenized dataset to tensors
tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'start_positions', 'end_positions'])
tokenized_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading readme:   0%|          | 0.00/7.83k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [4]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from tqdm import tqdm
import os
import json

best_model_path = '/content/drive/MyDrive/models/deberta_squad_weight_decay_cosine_annealing_best_model.pt'

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Convert the dataset to a DataLoader for easier batching
train_loader = DataLoader(tokenized_train, batch_size=12, shuffle=True)
val_loader = DataLoader(tokenized_val, batch_size=8, shuffle=False)

# Initial setup
num_epochs = 10
total_steps = len(train_loader) * num_epochs
warmup_steps = total_steps * 0.1  # 10% of total steps for warmup
initial_lr = 1e-7
max_lr = 5e-5

# Use PyTorch's AdamW optimizer
optimizer = AdamW(model.parameters(), lr=initial_lr, weight_decay=0.03)

# Early stopping
patience = 5
best_val_loss = float('inf')
patience_counter = 0

# Cosine Annealing Learning Rate Scheduler
scheduler = CosineAnnealingLR(optimizer, T_max=10, eta_min=0.00001)

for epoch in range(num_epochs):
    # Training loop
    model.train()
    train_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs} Training", leave=True)

    for batch_idx, batch in enumerate(progress_bar):
        current_step = epoch * len(train_loader) + batch_idx

        # Warmup phase
        if current_step < warmup_steps:
            lr_scale = min(1., float(current_step + 1) / warmup_steps)
            for pg in optimizer.param_groups:
                pg['lr'] = lr_scale * max_lr

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        progress_bar.set_postfix(avg_loss=train_loss/(batch_idx + 1))

    avg_train_loss = train_loss / len(train_loader)

    # Validation loop
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            start_positions=start_positions,
                            end_positions=end_positions)

            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)

    # Step the scheduler if warmup is completed
    if current_step >= warmup_steps:
        scheduler.step()

    print(f"Validation Loss: {avg_val_loss:.4f}")

    # Checkpoint for the best model
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        print(f"Saving new best model at epoch {epoch + 1}")
        model.save_pretrained(best_model_path)
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter == patience:
        print(f"Early stopping at epoch {epoch + 1}")
        break

    progress_bar.set_description(f"Epoch {epoch + 1}/{num_epochs} Training (Val Loss: {avg_val_loss:.4f})")
    progress_bar.set_postfix(avg_train_loss=avg_train_loss, avg_val_loss=avg_val_loss)

    if epoch < num_epochs - 1:
        progress_bar.set_description(f"Epoch {epoch + 2}/{num_epochs} Training (Prev Val Loss: {avg_val_loss:.4f})")

# Save the fine-tuned model
model.save_pretrained("/content/drive/MyDrive/models/fine_tuned_deberta_squad_weight_decay_cosine_annealing")


Epoch 1/10 Training: 100%|██████████| 7309/7309 [1:00:38<00:00,  2.01it/s, avg_loss=1.25]


Validation Loss: 0.8535
Saving new best model at epoch 1


Epoch 2/10 Training: 100%|██████████| 7309/7309 [1:00:36<00:00,  2.01it/s, avg_loss=0.747]


Validation Loss: 0.8312
Saving new best model at epoch 2


Epoch 3/10 Training: 100%|██████████| 7309/7309 [1:00:36<00:00,  2.01it/s, avg_loss=0.589]


Validation Loss: 0.9232


Epoch 4/10 Training: 100%|██████████| 7309/7309 [1:00:35<00:00,  2.01it/s, avg_loss=0.457]


Validation Loss: 0.9458


Epoch 5/10 Training: 100%|██████████| 7309/7309 [1:00:35<00:00,  2.01it/s, avg_loss=0.341]


Validation Loss: 1.0631


Epoch 6/10 Training: 100%|██████████| 7309/7309 [1:00:35<00:00,  2.01it/s, avg_loss=0.242]


Validation Loss: 1.1715


Epoch 7/10 Training:  57%|█████▋    | 4146/7309 [34:23<26:14,  2.01it/s, avg_loss=0.16]


KeyboardInterrupt: 

In [None]:
### NOTES
# Weight decay seems to have a big impact on performance, try playing around with this