In [2]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizer,
    AutoConfig,
    AutoModel,
)
from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence


In [3]:
model_id = "roberta-base"
# dataset_id = "qiaojin/PubMedQA"
dataset = load_dataset("qiaojin/PubMedQA","pqa_artificial")

In [4]:
dataset = dataset["train"].select(range(18000))
dataset
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
label2id = {"yes": 0, "no": 1, "maybe": 2}
id2label = {v: k for k, v in label2id.items()}

In [5]:
def preprocess(example):

    final_prompt = f"{example['question']}\n{example['long_answer']}"
    inputs = tokenizer(
        final_prompt,
    )
    inputs["labels"] = label2id[example["final_decision"]]
    return inputs

encoded_dataset = dataset.map(preprocess)
# encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
encoded_dataset

Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 18000
})

In [6]:

id2label

{0: 'yes', 1: 'no', 2: 'maybe'}

In [7]:
class RoBERTaModel(torch.nn.Module):
  def __init__(self, num_labels):
    super().__init__()
    self.base_model = AutoModel.from_pretrained('distilroberta-base')
    self.classifier = torch.nn.Linear(self.base_model.config.hidden_size, num_labels)

  def forward(self, input_ids, attention_mask):
    print("forward loop")
    outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
    print("check 1")
    hidden_states = outputs.last_hidden_state
    print("check 2")
    pooled_output = hidden_states[:, 0, :]
    print("check 3")
    logits = self.classifier(pooled_output)
    print("check 4")
    return logits 

In [8]:
# Update the model's configuration with the id2label mapping
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label})

In [9]:
model = RoBERTaModel(num_labels=len(label2id))


In [10]:
def finetune_roberta_classifier(model, train_loader, num_epochs, lr, weight_decay, device="cuda"):
    """
    Function to fine-tune a distilRoberta model for a classification task
    Args:
    model: instance of distilRoberta
    train_loader: Dataloader for the BoolQ training set
    num_epochs: Number of epochs for training
    lr: Learning rate
    weight_decay: Weight decay
    ...: Any other arguments you may need

    Returns:
    model: Fine-tuned model
    batch_losses: List of losses for each mini-batch
    """
    #<FILL IN>
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    batch_losses = []

    # use binary cross entropy loss
    criterion = torch.nn.CrossEntropyLoss()
    model.to(device)



    for epoch in range(num_epochs):
      for batch in tqdm(train_loader):

        print("Epoch batch check")

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.autocast(device_type=device, dtype=torch.float16):
          print("Input ids check")
          logits = model(input_ids=input_ids, attention_mask=attention_mask)

          print("Autocast out")
          print(logits.shape)

          optimizer.zero_grad()
          # logits = logits.view(-1)
          # labels = labels.type(torch.long)



          loss = criterion(logits, labels)
          batch_losses.append(loss.item())
          loss.backward()
          optimizer.step()

    return model, batch_losses


In [11]:
def make_tokenize(tokenizer):
    def tokenize_boolq_evaluation(examples):
        return tokenizer(f"{examples['question']}.\n{examples['long_answer']}",truncation=True)
    return tokenize_boolq_evaluation

def roberta_collator(batch, tokenizer):
  input_ids = pad_sequence([torch.tensor(x['input_ids']) for x in batch], batch_first=True,padding_value=tokenizer.pad_token_id)
  attention_mask = pad_sequence([torch.tensor(x['attention_mask']) for x in batch], batch_first=True, padding_value=0)
  labels = torch.tensor([x['labels'] for x in batch])

  return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


In [12]:

tokenized_train_set = dataset.map(make_tokenize(tokenizer), batched=False)
train_loader = DataLoader(encoded_dataset, batch_size=1, collate_fn=lambda batch: roberta_collator(batch, tokenizer))
model, batch_losses = finetune_roberta_classifier(model, train_loader, num_epochs=1, lr=1e-5, weight_decay=2e-4)

  0%|          | 0/18000 [00:00<?, ?it/s]

Epoch batch check
Input ids check
forward loop


: 

In [4]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import os
from torch.nn.utils.rnn import pad_sequence

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load dataset
dataset = load_dataset("pubmed_qa", "pqa_artificial")
dataset = dataset["train"].select(range(18000)) 

# Load pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Define label mapping
label_map = {"yes": 0, "no": 1, "maybe": 2}
id2label = {v: k for k, v in label_map.items()}
label2id = {k: v for v, k in label_map.items()}

def preprocess_function(example):
    # Use only questions, no context
    
    final_prompt = f"{example['question']}\n{example['long_answer']}"
    inputs = tokenizer(
        final_prompt,
        truncation=True,
        max_length=512
    )
    inputs["labels"] = label_map[example["final_decision"]]
    return inputs

def train_collator(batch):
    input_ids = pad_sequence(
        [torch.tensor(x['input_ids']) for x in batch],
        batch_first=True,
        padding_value=tokenizer.pad_token_id
    )
    attention_mask = pad_sequence(
        [torch.tensor(x['attention_mask']) for x in batch],
        batch_first=True,
        padding_value=0
    )
    labels = torch.tensor([x['labels'] for x in batch])
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

# Preprocess datasets
tokenized_datasets = dataset.map(
    preprocess_function,
    remove_columns=dataset.column_names  # Change to dataset.column_names
)

# Create dataloaders with custom collator
train_dataloader = DataLoader(
    tokenized_datasets,  # Change to tokenized_datasets
    shuffle=True,
    batch_size=16,
    collate_fn=train_collator
)

# eval_dataloader = DataLoader(
#     tokenized_datasets["validation"],
#     batch_size=16,  # Increased batch size since sequences are shorter
#     collate_fn=train_collator
# )

# Initialize model
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=3,  # PubMedQA has 3 labels: yes, no, maybe
    id2label=id2label,
    label2id=label_map
)
model.to(device)

def finetune_roberta_classifier(model, train_loader, num_epochs, lr=2e-5, weight_decay=0.01, device="cuda"):
    """
    Function to fine-tune a RoBERTa model for classification
    """
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = torch.nn.CrossEntropyLoss()  # For multi-class classification
    batch_losses = []
    best_f1 = 0

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        model.train()
        epoch_losses = []

        for batch in tqdm(train_loader, desc="Training"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Change device_type to device.type
            with torch.autocast(device_type=device.type, dtype=torch.float16): 
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                loss = criterion(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_losses.append(loss.item())
            epoch_losses.append(loss.item())

        avg_epoch_loss = sum(epoch_losses) / len(epoch_losses)
        print(f"Training loss: {avg_epoch_loss:.4f}")

    return model, batch_losses


# Run training
model, batch_losses = finetune_roberta_classifier(
    model=model,
    train_loader=train_dataloader,
    num_epochs=1,
    lr=2e-5,
    weight_decay=0.001,
    device=device
)

Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1


Training: 100%|██████████| 1125/1125 [01:51<00:00, 10.09it/s]

Training loss: 0.1539



