In [1]:
import os
import sys

In [2]:
import torch

from datasets import load_dataset

# System prompt template for the OAI-style chat messages built during preprocessing.
# The `{schema}` placeholder is substituted with str.format() when each conversation is assembled.
# NOTE(review): "an text" is a grammar slip; it is left untouched here because the prompt text is runtime data.
SYSTEM_MESSAGE = """You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.
SCHEMA:
{schema}"""

In [3]:
# Load the Spider train/test parquet files from ./dataset.
# split="train" is requested for BOTH files: when `data_files` is a bare path, load_dataset
# exposes its rows under the default "train" split, whatever the file is called.
train_dataset = load_dataset("parquet", data_files="./dataset/spider-train.parquet", split="train")
test_dataset = load_dataset("parquet", data_files="./dataset/spider-test.parquet", split="train")

In [4]:
# Single hard-coded CREATE TABLE statement that is injected into every system prompt.
# NOTE(review): the same schema is used for all samples, regardless of which database a
# question targets — confirm this is intended rather than a per-sample schema lookup.
SCHEMA = """CREATE TABLE table_name_83 (bleeding_time VARCHAR, platelet_count VARCHAR, condition VARCHAR)"""

def preprocess_conversations(sample: dict) -> dict:
    """Turn one dataset row into an OAI-style chat conversation.

    Args:
        sample: A row mapping that provides at least the "question" and
            "query" keys.

    Returns:
        A dict with a single "messages" key holding three role/content
        entries: the system prompt (SYSTEM_MESSAGE formatted with SCHEMA),
        the user question, and the assistant's SQL query.
    """
    return {
    "messages": [
      {"role": "system", "content": SYSTEM_MESSAGE.format(schema=SCHEMA)},
      {"role": "user", "content": sample["question"]},
      {"role": "assistant", "content": sample["query"]}
    ]
  }

# Convert every row to the chat format, one row at a time (batched=False).
# The dataset's own feature names are passed as `remove_columns`, so the mapped datasets
# keep only the "messages" column produced by preprocess_conversations.
# Note: both names are rebound here, so re-running this cell without re-running the load
# cell operates on the already-converted datasets.
train_dataset = train_dataset.map(preprocess_conversations, remove_columns=train_dataset.features, batched=False)
test_dataset = test_dataset.map(preprocess_conversations, remove_columns=test_dataset.features, batched=False)

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import setup_chat_format

# Hugging Face model id
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" # or `codellama/CodeLlama-7b-hf` or `mistralai/Mistral-7B-v0.1`

# 4-bit quantization config: NF4 quant type with double quantization, bfloat16 as compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the base model with the quantization config above, plus its tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = 'right' # to prevent warnings

# Set the chat template to OAI ChatML; remove this step if you start from a fine-tuned model.
# setup_chat_format returns the (model, tokenizer) pair, which is rebound here.
# NOTE(review): the selected model id is a "-Chat-" checkpoint — confirm this step should still run for it.
model, tokenizer = setup_chat_format(model, tokenizer)

In [6]:
from transformers import TrainingArguments

# Training hyper-parameters for the fine-tuning run.
args = TrainingArguments(
    output_dir="llm-text-to-sql",              # directory checkpoints and the final model are written to
    num_train_epochs=10,                    # number of training epochs (superseded by max_steps below)
    max_steps=300,                          # hard cap on training steps; a positive value overrides num_train_epochs
    per_device_train_batch_size=3,          # batch size per device during training
    gradient_accumulation_steps=6,          # accumulate gradients over 6 batches before each optimizer update
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                        # log every n steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # constant LR; NOTE(review): warmup_ratio presumably has no effect without "constant_with_warmup" — confirm
    push_to_hub=False,                      # do not push the model to the hub
    report_to="tensorboard",                # report metrics to tensorboard
)

In [7]:
from trl import SFTTrainer
from peft import LoraConfig

# Max sequence length handed to SFTTrainer (used for packing the dataset).
# NOTE(review): confirm 3072 does not exceed the context window of the selected base model.
max_seq_length = 3072 # max sequence length for model and packing of the dataset


# LoRA adapter configuration: adapters are attached to the four attention projections only.
peft_config = LoraConfig(
    r=64,  # the rank of the LoRA matrices
    lora_alpha=128, # LoRA scaling factor (alpha)
    lora_dropout=0.1, # dropout to add to the LoRA layers
    bias="none", # do not train any bias terms
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj","v_proj","o_proj"], # the name of the layers to add LoRA
    modules_to_save=None, # no extra layers of the original pre-trained model are unfrozen
)

# Supervised fine-tuning trainer: wraps the quantized base model with the LoRA config and
# trains on the chat-formatted datasets, packing samples up to `max_seq_length` tokens.
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)

# NOTE: a `dataloader_config = DataLoaderConfiguration(...)` assignment used to follow here.
# It was removed because `DataLoaderConfiguration` was never imported in this notebook
# (NameError on Restart & Run All) and `dataloader_config` was not passed to anything.


In [None]:
# Run the fine-tuning loop configured by `args`.
trainer.train()

# Save the trained model; no path is given, so the trainer's configured output_dir is used.
trainer.save_model()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [None]:
from peft import AutoPeftModelForCausalLM

# Reload the fine-tuned adapter from 'llm-text-to-sql' (the same string used as
# TrainingArguments.output_dir) in float16; this rebinds `model`, replacing the training-time object.
model = AutoPeftModelForCausalLM.from_pretrained('llm-text-to-sql', torch_dtype=torch.float16)

In [None]:
from transformers import pipeline
# Text-generation pipeline around the reloaded model and the tokenizer created in the model-loading cell.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")

# Smoke test with a generic (non-SQL) chat prompt to check that generation works end to end.
messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds in the style of a pirate",
    },
    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
]
# Render the messages to a single prompt string (tokenize=False) ending with the generation prompt.
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
print(outputs[0]["generated_text"])

In [None]:
#### UNCOMMENT TO MERGE PEFT AND BASE MODEL ####
# from peft import AutoPeftModelForCausalLM

# # Load PEFT model on CPU
# model = AutoPeftModelForCausalLM.from_pretrained(
#     args.output_dir,
#     torch_dtype=torch.float16,
#     low_cpu_mem_usage=True,
# )
# # Merge LoRA and base model and save
# merged_model = model.merge_and_unload()
# merged_model.save_pretrained(args.output_dir,safe_serialization=True, max_shard_size="2GB")

In [None]:
from datasets import load_dataset
from random import randint


# Load our test dataset
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
# random.randint() includes BOTH endpoints, so the upper bound must be the last valid
# index; using len(eval_dataset) would occasionally produce an out-of-range index.
rand_idx = randint(0, len(eval_dataset) - 1)

# Test on sample: build the prompt from the first two messages only (system + user),
# so the reference assistant answer is not shown to the model.
prompt = pipe.tokenizer.apply_chat_template(eval_dataset[rand_idx]["messages"][:2], tokenize=False, add_generation_prompt=True)
# Low temperature / top_p keep sampling close to greedy decoding.
outputs = pipe(prompt, 
               max_new_tokens=256, 
               do_sample=True, 
               temperature=0.1, 
               top_k=50, 
               top_p=0.1, 
               eos_token_id=pipe.tokenizer.eos_token_id, 
               pad_token_id=pipe.tokenizer.pad_token_id,
              )

# messages[1] is the user question, messages[2] the reference SQL.
# The generated text is sliced by len(prompt) to drop the leading prompt characters
# (presumably the pipeline echoes the prompt in `generated_text` — confirm).
print(f"Query:\n{eval_dataset[rand_idx]['messages'][1]['content']}")
print(f"Original Answer:\n{eval_dataset[rand_idx]['messages'][2]['content']}")
print(f"Generated Answer:\n{outputs[0]['generated_text'][len(prompt):].strip()}")

In [None]:
from tqdm import tqdm


def evaluate(sample):
    """Score one chat sample by exact string match against its reference answer.

    The prompt is rendered from the first two messages (system + user), a
    completion is sampled from the module-level `pipe`, and the stripped text
    after the prompt is compared with the content of the third message.

    Args:
        sample: Mapping with a "messages" list of at least three
            role/content dicts.

    Returns:
        1 when the generated completion equals the reference exactly, else 0.
    """
    chat_prompt = pipe.tokenizer.apply_chat_template(
        sample["messages"][:2],
        tokenize=False,
        add_generation_prompt=True,
    )
    generations = pipe(
        chat_prompt,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        eos_token_id=pipe.tokenizer.eos_token_id,
        pad_token_id=pipe.tokenizer.pad_token_id,
    )
    # Everything after the first len(chat_prompt) characters is treated as the model's answer.
    completion = generations[0]['generated_text'][len(chat_prompt):].strip()
    reference = sample["messages"][2]["content"]
    return 1 if completion == reference else 0

success_rate = []
number_of_eval_samples = 10
# Never request more rows than the eval set holds: select() raises on out-of-range indices.
n_to_score = min(number_of_eval_samples, len(eval_dataset))
# iterate over a random subset of the eval dataset and predict
for s in tqdm(eval_dataset.shuffle().select(range(n_to_score))):
    success_rate.append(evaluate(s))

# compute accuracy as the share of exact matches (0.0 when nothing was scored,
# instead of dividing by zero)
accuracy = sum(success_rate) / len(success_rate) if success_rate else 0.0

print(f"Accuracy: {accuracy*100:.2f}%")

In [None]:
# # import requests as r
# from transformers import AutoTokenizer
# from datasets import load_dataset
# from random import randint

# # Load our test dataset and Tokenizer again
# tokenizer = AutoTokenizer.from_pretrained("code-llama-7b-text-to-sql")
# eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
# rand_idx = randint(0, len(eval_dataset))

# # generate the same prompt as for the first local test
# prompt = tokenizer.apply_chat_template(eval_dataset[rand_idx]["messages"][:2], tokenize=False, add_generation_prompt=True)
# request= {"inputs":prompt,"parameters":{"temperature":0.2, "top_p": 0.95, "max_new_tokens": 256}}

# # send request to inference server
# resp = r.post("http://127.0.0.1:8080/generate", json=request)

# output = resp.json()["generated_text"].strip()
# time_per_token = resp.headers.get("x-time-per-token")
# time_prompt_tokens = resp.headers.get("x-prompt-tokens")

# # Print results
# print(f"Query:\n{eval_dataset[rand_idx]['messages'][1]['content']}")
# print(f"Original Answer:\n{eval_dataset[rand_idx]['messages'][2]['content']}")
# print(f"Generated Answer:\n{output}")
# print(f"Latency per token: {time_per_token}ms")
# print(f"Latency prompt encoding: {time_prompt_tokens}ms")

In [None]:
# !docker stop tgi

In [None]:
# Debug check: show the EOS and PAD token ids that the generation calls above pass to the pipeline.
print(f"{pipe.tokenizer.eos_token_id=}")
print(f"{pipe.tokenizer.pad_token_id=}")

In [None]:
import pandas as pd

# Scratch exploration of the raw Spider training file.
data = load_dataset("parquet", data_files="dataset/spider-train.parquet", split="train")
# rename_column returns a NEW dataset; `data` itself is not modified. The renamed copy is
# only displayed as the cell output here.
data.rename_column('question', 'question2')

In [None]:
# Scratch exploration: pull the "b-mc2/sql-create-context" train split from the Hub
# and display its summary as the cell output.
dataset = load_dataset("b-mc2/sql-create-context", split="train")
dataset

In [None]:
# Preview only the first few answers; displaying the whole column floods the cell output.
dataset[:5]['answer']

In [None]:
# rename_column returns a new dataset (displayed as the cell output); `dataset` is left unchanged.
dataset.rename_column('question', 'question2')