In [1]:
%pip install -U transformers datasets peft evaluate tf-keras sacrebleu rouge_score pycocoevalcap

Collecting transformers
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting tf-keras
  Downloading tf_keras-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pycocoevalcap
  Downloading pycocoevalcap-1.2-py3-none-any.whl.metadata (3.2 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxh

In [67]:
import torch
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config, GPT2LMHeadModel, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from torch.utils.data import DataLoader

In [18]:
# hyperparameters
model_name = "gpt2"
batch_size = 8
max_length = 128

In [8]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [38]:
# dataset
data = load_dataset("wikisql")

# Split into train, validation and test set
train_set = data['train']
val_set = data['validation']
test_set = data['test']

print(f"Size of train set: {len(train_set)}")
print(f"Size of test set: {len(test_set)}")
print(f"Size of validation set: {len(val_set)}")

Size of train set: 56355
Size of test set: 15878
Size of validation set: 8421


In [39]:
# Data Preprocessing (Tokenization)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# padding only works if tokenizer has a padding token specified
tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(sample):
    return tokenizer(sample['question'], truncation=True, padding="max_length", max_length=max_length)

train_set_tok = train_set.map(tokenize_fn, batched=True)
val_set_tok = val_set.map(tokenize_fn, batched=True)
test_set_tok = test_set.map(tokenize_fn, batched=True)

In [40]:
print(f"Query: {train_set_tok[0]['question']}")
print(f"SQL: {train_set_tok[0]['sql']['human_readable']}")

Query: Tell me what the notes are for South Australia 
SQL: SELECT Notes FROM table WHERE Current slogan = SOUTH AUSTRALIA


In [41]:
def convert_to_torch_format(dataset):
    input_ids = torch.tensor(dataset["input_ids"])
    attention_mask = torch.tensor(dataset["attention_mask"])
    labels = input_ids.clone()
    return torch.utils.data.TensorDataset(input_ids, attention_mask, labels)

train_set_tok_torch = convert_to_torch_format(train_set_tok)
val_set_tok_torch = convert_to_torch_format(val_set_tok)
test_set_tok_torch = convert_to_torch_format(test_set_tok)

In [51]:
train_dataloader = DataLoader(train_set_tok_torch, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_set_tok_torch, batch_size=batch_size)
test_dataloader = DataLoader(test_set_tok_torch, batch_size=batch_size)
inputs = next(iter(train_dataloader))


[tensor([[ 2061,   318,   262,  ..., 50256, 50256, 50256],
         [ 2061,   318,   262,  ..., 50256, 50256, 50256],
         [10919,   338,   262,  ..., 50256, 50256, 50256],
         ...,
         [ 2061,   373,   262,  ..., 50256, 50256, 50256],
         [13828,  7756,   373,  ..., 50256, 50256, 50256],
         [ 2061,   318,   262,  ..., 50256, 50256, 50256]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([[ 2061,   318,   262,  ..., 50256, 50256, 50256],
         [ 2061,   318,   262,  ..., 50256, 50256, 50256],
         [10919,   338,   262,  ..., 50256, 50256, 50256],
         ...,
         [ 2061,   373,   262,  ..., 50256, 50256, 50256],
         [13828,  7756,   373,  ..., 50256, 50256, 50256],
         [ 2061,   318,   262,  ..., 50256, 50256, 50256]])]

In [43]:
# model configuration
model = GPT2LMHeadModel.from_pretrained(model_name)
model = model.to(device)

# LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1
)

In [46]:
# OPTION 1: Training with custom training loop
peft_model = get_peft_model(model, peft_config)
peft_model = peft_model.to(device)

# Training configuration
optimizer = torch.optim.AdamW(peft_model.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop
epochs = 3
for epoch in range(epochs):
    peft_model.train()
    train_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        optimizer.zero_grad()

        outputs = peft_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {train_loss / len(train_dataloader)}")

KeyboardInterrupt: 

In [68]:
# OPTION 2: Training with Trainer
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    eval_strategy='epoch',
    #learning_rate=2e-4,
    #weight_decay=0.01,
    #warmup_steps=500,
    load_best_model_at_end=True,
    logging_dir='./logs',
    save_total_limit=2,
    save_strategy='epoch',
    metric_for_best_model='bleu'
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # For causal language modeling, mask language modeling is set to False
)

trainer = Trainer(model=model, args=training_args, 
                  train_dataset=train_set_tok, 
                  eval_dataset=val_set_tok,
                  data_collator=data_collator)

print("Training...")
trainer.train()

Training...


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# OPTION 1: custom Validation loop
peft_model.eval()
val_loss = 0
with torch.no_grad():
    for batch in val_dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        outputs = peft_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        val_loss += loss.item()

print(f"Validation Loss: {val_loss / len(val_dataloader)}")

# Save the model
peft_model.save_pretrained("lora_gpt2_wikisql")
print("Model saved.")

# Sources

https://medium.com/@Shrishml/lora-low-rank-adaptation-from-the-first-principle-7e1adec71541

https://github.com/microsoft/LoRA/tree/main/examples/
