# Parameter-Efficient Fine-Tuning on GPT-2

## 1. Loading Dataset

In [1]:
# Import Libraries
from pprint import pprint
from datasets import load_dataset

In [2]:
# Import Dataset
dataset = load_dataset("AdaptLLM/medicine-tasks", "PubMedQA", split="test")

In [3]:
dataset

Dataset({
    features: ['input', 'id', 'gold_index', 'options'],
    num_rows: 1000
})

In [4]:
split_dataset = dataset.train_test_split(test_size=.2, shuffle=True, seed=42)

In [5]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'id', 'gold_index', 'options'],
        num_rows: 800
    })
    test: Dataset({
        features: ['input', 'id', 'gold_index', 'options'],
        num_rows: 200
    })
})

In [6]:
# Inspect data point
split_dataset['train'][0]

{'input': 'Context: (Background and objective) Optimization of the preoperative hemoglobin (Hb) level is an effective way to reduce allogeneic transfusion in total knee arthroplasty (TKA) though the procedure is expensive, requires close monitoring and is often inconvenient for patients with reduced mobility. Our aim was to investigate the value of preoperative Hb levels to predict transfusion and thereby tailoring Hb optimization to patient characteristics.\n(Materials and methods) All consecutive patients who undergone primary TKA in our center over 2\xa0years, and received tranexamic acid intraoperatively, were reviewed. The adjusted association between preoperative Hb levels and transfusion was assessed by multivariate logistic regression, and the estimated probability of transfusion for individual patients was derived from the logistic model.\n(Results) Out of the 784 patients who meet the inclusion criteria, risk of transfusion was associated with poorer performance status, as me

### 1.1. Dataset Interpretation
The dataset contains 1,000 data points containing questions (`input`) prefixed by some biomedical context and their answers (`gold_index`) of "no", "maybe", or "yes".

### 1.2. Dataset Pre-Processing

In [7]:
from transformers import AutoTokenizer, GPT2Tokenizer

In [8]:
# Import tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [9]:
# Fix issue of inexistent pad_token 
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [10]:
tokenized_dataset = dict()

def preprocess_fn(data):
    del data['options']
    del data['id']
    data['labels'] = data['gold_index']
    del data['gold_index']

    return tokenizer(data["input"], padding='max_length', truncation=True, return_tensors='pt')

for split in ['train', 'test']:
    tokenized_dataset[split] = split_dataset[split].map(preprocess_fn, batched=True)

In [11]:
# Inspect pre-processed dataset
pprint(tokenized_dataset)
print(tokenized_dataset["train"][0])

{'test': Dataset({
    features: ['input', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 200
}),
 'train': Dataset({
    features: ['input', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 800
})}
{'input': 'Context: (Background and objective) Optimization of the preoperative hemoglobin (Hb) level is an effective way to reduce allogeneic transfusion in total knee arthroplasty (TKA) though the procedure is expensive, requires close monitoring and is often inconvenient for patients with reduced mobility. Our aim was to investigate the value of preoperative Hb levels to predict transfusion and thereby tailoring Hb optimization to patient characteristics.\n(Materials and methods) All consecutive patients who undergone primary TKA in our center over 2\xa0years, and received tranexamic acid intraoperatively, were reviewed. The adjusted association between preoperative Hb levels and transfusion was assessed by multivariate logistic regression, and the estimated probability o

In [12]:
print(tokenized_dataset['train'][0]['labels'])

0


## 2. Loading and Evaluating a Pre-Trained GPT-2

In [13]:
# Import libraries
from transformers import GPT2ForSequenceClassification, AutoModelForSequenceClassification

### 2.1. Loading GPT-2

In [14]:
model = GPT2ForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=3,
    id2label={0: "No", 1: "Maybe", 2: "Yes"},
    label2id={"No": 0, "Maybe": 1, "Yes": 2}
)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
model

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=3, bias=False)
)

### 2.2. Freeze All Parameters

In [16]:
for param in model.base_model.parameters():
    param.requires_grad = False

### 2.3. Train Classifier Layer

In [17]:
# Import Libraries
import numpy as np
import torch
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

In [18]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    return {"accuracy": (predictions == labels).mean()}

In [19]:
# Fix issue of inexistent pad_token 
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

In [20]:
# Train the classifier layer
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./data/biomedical_question",
        learning_rate=2e-4,
        per_device_train_batch_size=5,
        per_device_eval_batch_size=5,
        num_train_epochs=4,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer, padding="max_length"),
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.127955,0.435
2,No log,1.079048,0.48
3,No log,1.025596,0.45
4,1.151500,1.025908,0.43


TrainOutput(global_step=640, training_loss=1.121945571899414, metrics={'train_runtime': 531.0356, 'train_samples_per_second': 6.026, 'train_steps_per_second': 1.205, 'total_flos': 1672314303283200.0, 'train_loss': 1.121945571899414, 'epoch': 4.0})

### 2.4. Evaluate Trainer

In [21]:
trainer.evaluate()

{'eval_loss': 1.0255961418151855,
 'eval_accuracy': 0.45,
 'eval_runtime': 21.2508,
 'eval_samples_per_second': 9.411,
 'eval_steps_per_second': 1.882,
 'epoch': 4.0}

## 3. Performing Parameter-Efficient Fine-Tuning

### 3.1. Creating Configs

#### 3.1.1. Lora Config 1

In [22]:
from peft import LoraConfig, TaskType, get_peft_model

In [23]:
config_1 = LoraConfig(
    r=16,
    lora_alpha=32.0,
    lora_dropout=0.05,
    bias="none",
    target_modules=["c_proj"],
    modules_to_save=["score"],
    task_type=TaskType.TOKEN_CLS
)

#### 3.1.2. Lora Config 2

In [24]:
config_2 = LoraConfig(
    r=8,
    lora_alpha=32.0,
    lora_dropout=0.05,
    bias="none",
    target_modules=["c_proj"],
    modules_to_save=["score"],
    task_type=TaskType.TOKEN_CLS
)

### 3.2. Converting Transformers to PEFT Models

#### 3.2.1. Lora 1 Model

In [25]:
model_1 = GPT2ForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=3,
    id2label={0: "No", 1: "Maybe", 2: "Yes"},
    label2id={"No": 0, "Maybe": 1, "Yes": 2}
)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
model_1.config.pad_token_id = tokenizer.pad_token_id

In [27]:
model_lora_1 = get_peft_model(model_1, config_1)

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [28]:
model_lora_1.print_trainable_parameters()

trainable params: 1,034,496 || all params: 125,476,608 || trainable%: 0.8244532718002705


#### 3.2.2. Lora 2 Model

In [29]:
model_2 = GPT2ForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=3,
    id2label={0: "No", 1: "Maybe", 2: "Yes"},
    label2id={"No": 0, "Maybe": 1, "Yes": 2}
)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
model_2.config.pad_token_id = tokenizer.pad_token_id

In [31]:
model_lora_2 = get_peft_model(model_2, config_2)

In [32]:
model_lora_2.print_trainable_parameters()

trainable params: 518,400 || all params: 124,960,512 || trainable%: 0.41485105310708076


### 3.3. Training with PEFT Models

#### 3.3.1. Training Lora 1 Model

In [33]:
trainer_lora_1 = Trainer(
    model=model_lora_1,
    args=TrainingArguments(
        output_dir="./data/biomedical_question_lora_1",
        learning_rate=2e-4,
        per_device_train_batch_size=6,
        per_device_eval_batch_size=6,
        num_train_epochs=4,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer, padding="max_length"),
    compute_metrics=compute_metrics,
)

trainer_lora_1.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.13867,0.52
2,No log,1.092006,0.525
3,No log,1.002002,0.53
4,1.050500,0.986839,0.53


TrainOutput(global_step=536, training_loss=1.0442770844075218, metrics={'train_runtime': 1652.9833, 'train_samples_per_second': 1.936, 'train_steps_per_second': 0.324, 'total_flos': 1692653322240000.0, 'train_loss': 1.0442770844075218, 'epoch': 4.0})

#### 3.3.2. Training Lora 2 Model

In [35]:
# Train the classifier layer
trainer_lora_2 = Trainer(
    model=model_lora_2,
    args=TrainingArguments(
        output_dir="./data/biomedical_question_lora_2",
        learning_rate=2e-4,
        per_device_train_batch_size=6,
        per_device_eval_batch_size=6,
        num_train_epochs=4,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer, padding="max_length"),
    compute_metrics=compute_metrics,
)

trainer_lora_2.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.047271,0.525
2,No log,0.980976,0.525
3,No log,0.964919,0.535
4,0.970400,0.957475,0.505


TrainOutput(global_step=536, training_loss=0.9684191746498222, metrics={'train_runtime': 1799.5746, 'train_samples_per_second': 1.778, 'train_steps_per_second': 0.298, 'total_flos': 1682506462003200.0, 'train_loss': 0.9684191746498222, 'epoch': 4.0})

### 3.4. Save the PEFT Models

In [36]:
model_lora_1.save_pretrained('gpt_lora_1')

In [37]:
model_lora_2.save_pretrained('gpt_lora_2')

## 4. Perform Inference with PEFT

### 4.1. Load Saved PEFT Models

In [40]:
from peft import AutoPeftModelForSequenceClassification, AutoPeftModelForTokenClassification

In [45]:
gpt_lora_1 = AutoPeftModelForTokenClassification.from_pretrained(
    'gpt_lora_1',
    num_labels=3,
    id2label={0: "No", 1: "Maybe", 2: "Yes"},
    label2id={"No": 0, "Maybe": 1, "Yes": 2}
)

Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
gpt_lora_2 = AutoPeftModelForTokenClassification.from_pretrained(
    'gpt_lora_2',
    num_labels=3,
    id2label={0: "No", 1: "Maybe", 2: "Yes"},
    label2id={"No": 0, "Maybe": 1, "Yes": 2}
)

Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 4.2. Evaluate PEFT Models

In [47]:
trainer_lora_1.evaluate()

{'eval_loss': 0.9868389964103699,
 'eval_accuracy': 0.53,
 'eval_runtime': 23.4249,
 'eval_samples_per_second': 8.538,
 'eval_steps_per_second': 1.451,
 'epoch': 4.0}

In [48]:
trainer_lora_2.evaluate()

{'eval_loss': 0.9574746489524841,
 'eval_accuracy': 0.505,
 'eval_runtime': 24.0212,
 'eval_samples_per_second': 8.326,
 'eval_steps_per_second': 1.415,
 'epoch': 4.0}

## 5. Discussion

Based on the evaluation of the PEFT models, the better one is the model with the first configuration:
```
config_1 = LoraConfig(
    r=16,
    lora_alpha=32.0,
    lora_dropout=0.05,
    bias="none",
    target_modules=["c_proj"],
    modules_to_save=["score"],
    task_type=TaskType.TOKEN_CLS
)
```
The accuracy of the above model is .53, which is an increase from the .45, where the model's parameter were being freezed.