# Part 2: Data preparation and model selection

## Setup libraries

In [None]:
# Libraries import
import json
import os

import pandas as pd
import torch
from datasets import load_dataset
from google.colab import drive
from peft import LoraConfig, TaskType, get_peft_model, PeftModel
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM


## Device Checking

In [9]:
# Pick the CUDA device when torch reports one as available, otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

## Model Quantization and LoRA

### Quantization & LoRA Configuration

In [14]:
# Quantization settings: 4-bit NF4 weights with double quantization,
# using bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# LoRA adapter settings for causal language modelling
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    r=16,  # adapter rank: drives the number of trainable parameters
    lora_alpha=20,  # scaling applied to the adapter's contribution
    lora_dropout=0.02,
    bias="none",
)

### Phi2 model loading & training preparation

In [None]:
# Base checkpoint identifier (used below for the tokenizer)
model_id = "microsoft/phi-2"

# The weights are read from a local directory, with `bnb_config` supplied as the quantization config.
# NOTE(review): the tokenizer is fetched via `model_id` rather than from this directory —
# confirm that both refer to the same checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    "./models/base_Phi2_model",
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=False,
)

# Slow tokenizer with a BOS token prepended and "[PAD]" registered as the padding token.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=False,
    add_bos_token=True,
    pad_token="[PAD]",
    trust_remote_code=True,
)

# Wrap the loaded model with the adapters described by `lora_config`.
model = get_peft_model(model, lora_config)


Loading checkpoint shards: 100%|██████████| 3/3 [00:19<00:00,  6.48s/it]


In [None]:
# Model training preparation
# Switch the PEFT-wrapped model to training mode.
model.train()
# NOTE(review): presumably required so gradients can reach the adapters while the
# base weights stay frozen — confirm against the transformers documentation.
model.enable_input_require_grads()
# Report trainable vs. total parameter counts (see the cell output).
model.print_trainable_parameters()

trainable params: 23,592,960 || all params: 2,803,276,800 || trainable%: 0.8416


## Setup Data

### Spider data loading

In [17]:
# Load Spider once and take both splits from the returned DatasetDict
# instead of calling `load_dataset` again for each split.
data_spider = load_dataset("xlangai/spider")
data_spider_train = data_spider["train"]
data_spider_test = data_spider["validation"]  # the validation split is used as the test set in this notebook

# Make sure the export folder exists so the cell also works on a fresh checkout.
os.makedirs("./data/spider_json", exist_ok=True)

# Export both splits to the files that the dataset-creation cells read back.
# NOTE: the variable names are kept for compatibility; they hold the return value
# of `to_json`, not the JSON text itself.
train_json = data_spider_train.to_json("./data/spider_json/train_data.json")
test_json = data_spider_test.to_json("./data/spider_json/test_data.json")

Creating json from Arrow format: 100%|██████████| 7/7 [00:00<00:00, 39.52ba/s]
Creating json from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 105.73ba/s]


In [19]:
# SQLDataset class definition
class SQLDataset(Dataset):
    """Question/SQL pairs read from a JSON Lines file and tokenized on access.

    Every non-blank line of ``file_path`` must be a JSON object containing at
    least the keys ``"question"`` and ``"query"``.

    Args:
        file_path: Path of the JSON Lines file to read (opened as UTF-8).
        tokenizer: Callable used to tokenize each prompt. It is invoked with
            ``truncation=True``, ``max_length`` and ``return_tensors="pt"``, and
            its result must expose ``input_ids`` and ``attention_mask``.
        max_length: Maximum sequence length forwarded to the tokenizer.
    """

    def __init__(self, file_path, tokenizer, max_length=2048):
        self.tokenizer = tokenizer
        self.max_length = max_length

        with open(file_path, 'r', encoding='utf-8') as f:
            # Skip blank lines (e.g. a stray trailing newline) instead of
            # letting json.loads fail on an empty string.
            self.data = [json.loads(line) for line in f if line.strip()]

    def __len__(self):
        """Return the number of records loaded from the file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and return its ``input_ids`` and ``attention_mask``."""
        record = self.data[idx]
        question = record["question"]
        query = record["query"]

        # Named `prompt` rather than `input` so the builtin is not shadowed.
        prompt = f"Question: {question}\nAnswer: {query}"

        encoding = self.tokenizer(prompt, truncation=True, max_length=self.max_length, return_tensors="pt")

        # squeeze(0) removes the leading axis so each item is a single sequence.
        return {
            'input_ids': encoding.input_ids.squeeze(0),
            'attention_mask': encoding.attention_mask.squeeze(0),
        }



In [21]:
# Build the training set from the exported Spider train split
file_path = "./data/spider_json/train_data.json"
dataset = SQLDataset(file_path, tokenizer)

In [22]:
# Build the evaluation set from the exported Spider validation split
eval_file_path = "./data/spider_json/test_data.json"
eval_dataset = SQLDataset(eval_file_path, tokenizer)

In [None]:
# Data Collation for Instruction Tuning
# Example prompt in the "Question: ... / Answer:" layout. It is kept for reference
# only and is NOT passed to the collator: it never occurs verbatim in the examples.
instruction = """ Context: You are a Text to SQL Assistant, which goal is to convert Natural Language in to SQL query. 
Question: What are the distinct creation years of the departments managed by a secretary born in state 'Alabama'?
Answer:"""
# Markers matching the prompt layout "Question: {question}\nAnswer: {query}" used by
# SQLDataset. `instruction_template` was previously undefined, so this cell raised a
# NameError on a fresh kernel.
# NOTE(review): the collator is expected to keep only the tokens after the response
# marker in the loss — confirm that both markers tokenize identically inside a prompt.
instruction_template = "Question:"
response_template = "Answer:"
collator = DataCollatorForCompletionOnlyLM(instruction_template=instruction_template, response_template=response_template, tokenizer=tokenizer, mlm=False)

## Training Configuration and Initialization

In [None]:
# Hyper-parameters for a single-epoch run
training_args = TrainingArguments(
    # Output and logging
    output_dir="./training",
    save_strategy="epoch",
    report_to="none",
    # Batching: batch size 1 with 32 accumulation steps per device
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=32,
    gradient_checkpointing=False,
    # Optimisation
    optim="paged_lion_8bit",
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_steps=5,
    num_train_epochs=1,
    bf16=True,
)

# Plain `Trainer` fed by the custom datasets and the completion-only collator
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=collator,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
)

# Kept as the cell's last expression so the resulting TrainOutput is displayed
trainer.train()

Step,Training Loss


TrainOutput(global_step=219, training_loss=0.5453730892373002, metrics={'train_runtime': 6296.336, 'train_samples_per_second': 1.112, 'train_steps_per_second': 0.035, 'total_flos': 6483228464025600.0, 'train_loss': 0.5453730892373002, 'epoch': 1.0})

## Finetuned Model Saving

In [None]:
# Mount Google Drive at /content/drive; the save location below lives under it
drive.mount('/content/drive')

# The model and the tokenizer are written to the same folder
save_dir = "/content/drive/MyDrive/models/Text2SQL_Phi2_model"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)