In [1]:
%pip install -U transformers datasets peft evaluate tf-keras sacrebleu rouge_score pycocoevalcap

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config, GPT2LMHeadModel, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from torch.utils.data import DataLoader

2024-12-18 23:24:59.585097: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734564299.598943     628 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734564299.603180     628 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-18 23:24:59.619058: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# hyperparameters
model_name = "gpt2"
batch_size = 8
max_length = 128
dataset_name = "wikisql"

In [4]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
# dataset
data = load_dataset("wikisql")

# Split into train, validation and test set
train_set = data['train']
val_set = data['validation']
test_set = data['test']

print(f"Size of train set: {len(train_set)}")
print(f"Size of test set: {len(test_set)}")
print(f"Size of validation set: {len(val_set)}")

Size of train set: 56355
Size of test set: 15878
Size of validation set: 8421


In [6]:
# Data Preprocessing (Tokenization)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# padding only works if tokenizer has a padding token specified
tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(sample):
    return tokenizer(sample['question'], truncation=True, padding="max_length", max_length=max_length)

train_set_tok = train_set.map(tokenize_fn, batched=True)
val_set_tok = val_set.map(tokenize_fn, batched=True)
test_set_tok = test_set.map(tokenize_fn, batched=True)

In [7]:
print(f"Query: {train_set_tok[0]['question']}")
print(f"SQL: {train_set_tok[0]['sql']['human_readable']}")

Query: Tell me what the notes are for South Australia 
SQL: SELECT Notes FROM table WHERE Current slogan = SOUTH AUSTRALIA


In [8]:
def convert_to_torch_format(dataset):
    input_ids = torch.tensor(dataset["input_ids"])
    attention_mask = torch.tensor(dataset["attention_mask"])
    labels = input_ids.clone()
    return torch.utils.data.TensorDataset(input_ids, attention_mask, labels)

train_set_tok_torch = convert_to_torch_format(train_set_tok)
val_set_tok_torch = convert_to_torch_format(val_set_tok)
test_set_tok_torch = convert_to_torch_format(test_set_tok)

In [10]:
train_dataloader = DataLoader(train_set_tok_torch, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_set_tok_torch, batch_size=batch_size)
test_dataloader = DataLoader(test_set_tok_torch, batch_size=batch_size)
inputs = next(iter(train_dataloader))


In [11]:
# model configuration
model = GPT2LMHeadModel.from_pretrained(model_name)
model = model.to(device)

# LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1
)

In [12]:
# OPTION 1: Training with custom training loop
peft_model = get_peft_model(model, peft_config)
peft_model = peft_model.to(device)

# Training configuration
optimizer = torch.optim.AdamW(peft_model.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop
epochs = 3
for epoch in range(epochs):
    peft_model.train()
    train_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        optimizer.zero_grad()

        outputs = peft_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {train_loss / len(train_dataloader)}")



KeyboardInterrupt: 

In [13]:
def compute_metrics(preds): 
    logits, labels = preds
    predictions = torch.argmax(torch.tensor(logits), dim=-1)

    # accuracy
    correct = (predictions == torch.tensor(labels)).float().sum()
    accuracy = correct / len(labels)

    return {"accuracy": accuracy.item()}

In [14]:
# OPTION 2: Training with Trainer
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    eval_strategy='epoch',
    #learning_rate=2e-4,
    #weight_decay=0.01,
    #warmup_steps=500,
    load_best_model_at_end=True,
    logging_dir='./logs',
    save_total_limit=2,
    save_strategy='epoch',
    metric_for_best_model='bleu'
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # For causal language modeling, mask language modeling is set to False
)

trainer = Trainer(model=model, args=training_args, 
                  train_dataset=train_set_tok, 
                  eval_dataset=val_set_tok,
                  compute_metrics=compute_metrics,
                  data_collator=data_collator)

print("Training...")
trainer.train()

Training...


Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 18.41 GiB. GPU 0 has a total capacty of 44.34 GiB of which 18.17 GiB is free. Process 2002530 has 26.16 GiB memory in use. Of the allocated memory 19.20 GiB is allocated by PyTorch, and 6.65 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Evaluation 
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Perform prediction on test dataset
test_results = trainer.predict(tokenized_val)

decoded_preds = tokenizer.batch_decode(test_results.predictions, skip_special_tokens=True)
decoded_labels = [tokenizer.decode(example["input_ids"], skip_special_tokens=True) for example in tokenized_val]

# Compute additional metrics
bleu_score = bleu.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
meteor_score = meteor.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
rouge_score = rouge.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
nist_score = nist.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
cider_score = cider.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])

# Aggregate metrics
test_metrics = {
    "bleu": bleu_score["bleu"],
    "meteor": meteor_score["meteor"],
    "rouge_l": rouge_score["rougeL"].mid.fmeasure,
    "nist": nist_score["nist_mt"],
    "cider": cider_score["CIDEr"]
}

# Print test metrics
print(f"Test Metrics: {test_metrics}")

# Save the model
peft_model.save_pretrained("lora_gpt2_wikisql")
print("Model saved.")

# Sources

https://medium.com/@Shrishml/lora-low-rank-adaptation-from-the-first-principle-7e1adec71541

https://github.com/microsoft/LoRA/tree/main/examples/
