# **Installing Dependecies**

In [1]:
! pip install torch transformers trl accelerate peft datasets bitsandbytes pandas



In [2]:
from huggingface_hub import login
login(token="")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\Siddharth\.cache\huggingface\token
Login successful


In [3]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

# **Data Processing**

In [4]:
from datasets import load_dataset , Dataset
import pandas as pd

dataset_id = "b-mc2/sql-create-context"
data = load_dataset(dataset_id)

df = pd.DataFrame(data['train'])

Generating train split:   0%|          | 0/78577 [00:00<?, ? examples/s]

In [5]:
def chat_template_for_training(context, answer, question):
    template = f"""\
    <|im_start|>user
    Given the context, generate an SQL query for the following question
    context:{context}
    question:{question}
    <|im_end|>
    <|im_start|>assistant
    {answer}
    <|im_end|>
    """
    # Remove any leading whitespace characters from each line in the template.
    template = "\n".join([line.lstrip() for line in template.splitlines()])
    return template

In [6]:
df["text"] = df.apply(lambda x: chat_template_for_training(x["context"], x["answer"], x["question"]), axis=1)

# Convert the dataframe back to a Dataset object.
formatted_data = Dataset.from_pandas(df)
print(formatted_data)

Dataset({
    features: ['answer', 'question', 'context', 'text'],
    num_rows: 78577
})


# **Training with Peft**

In [7]:
from transformers import AutoTokenizer , AutoModelForCausalLM , BitsAndBytesConfig , TrainingArguments
from peft import LoraConfig , get_peft_model, prepare_model_for_kbit_training
from accelerate import Accelerator
from trl import SFTTrainer
import torch

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)
model.config.use_cache = False
model.config.pretraining_tp = 1
model = prepare_model_for_kbit_training(model)

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [8]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout = 0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model,lora_config)

In [9]:
training_args = TrainingArguments(
    output_dir = r"D:\newenv\codespace\Tiny_Lllama-sqlQuries-fine_tuned/",
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    learning_rate = 2e-4,
    lr_scheduler_type="cosine",
    save_strategy="steps",
    save_steps=2500,
    logging_steps=500,
    num_train_epochs=3,
    max_steps=10000,
    fp16=True,
    push_to_hub=True
)

In [10]:
trainer = SFTTrainer(
    model=model,
    train_dataset = formatted_data,
    dataset_text_field="text",
    peft_config=lora_config,
    args=training_args,
    tokenizer = tokenizer,
    packing=False,
    max_seq_length=1024
)
trainer.train()

Map:   0%|          | 0/78577 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msiddharthmagesh007[0m ([33mvelammal-edu-in[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/10000 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 0.6842, 'grad_norm': 0.9990168213844299, 'learning_rate': 0.00019876883405951377, 'epoch': 0.05}
{'loss': 0.5676, 'grad_norm': 0.813933789730072, 'learning_rate': 0.00019510565162951537, 'epoch': 0.1}
{'loss': 0.5409, 'grad_norm': 0.7951170802116394, 'learning_rate': 0.0001891006524188368, 'epoch': 0.15}
{'loss': 0.5291, 'grad_norm': 0.9597814679145813, 'learning_rate': 0.00018090169943749476, 'epoch': 0.2}
{'loss': 0.5199, 'grad_norm': 0.6460480690002441, 'learning_rate': 0.00017071067811865476, 'epoch': 0.25}




{'loss': 0.5148, 'grad_norm': 0.7363828420639038, 'learning_rate': 0.00015877852522924732, 'epoch': 0.31}
{'loss': 0.5134, 'grad_norm': 0.708634078502655, 'learning_rate': 0.00014539904997395468, 'epoch': 0.36}
{'loss': 0.5058, 'grad_norm': 0.7342403531074524, 'learning_rate': 0.00013090169943749476, 'epoch': 0.41}
{'loss': 0.4988, 'grad_norm': 0.7297015190124512, 'learning_rate': 0.0001156434465040231, 'epoch': 0.46}
{'loss': 0.4982, 'grad_norm': 0.7008655667304993, 'learning_rate': 0.0001, 'epoch': 0.51}




KeyboardInterrupt: 

# **Saving the model**

In [4]:
from peft import PeftModel
from transformers import AutoModelForCausalLM
import torch
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    load_in_8bit=False,
    device_map="auto",
    trust_remote_code=True
)
fine_tuned_path = R"D:\newenv\codespace\Tiny_Lllama-sqlQuries-fine_tuned\checkpoint-5000/"
peft_model= PeftModel.from_pretrained(model,fine_tuned_path,from_transformers=True,device_map="auto")

model = peft_model.merge_and_unload()
model.push_to_hub("siddharth-magesh/Tiny_Lllama-sqlQuries-fine_tuned")



model.safetensors:   0%|          | 0.00/2.41G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/siddharth-magesh/Tiny_Lllama-sqlQuries-fine_tuned/commit/22f1491eef7fc8cc646e021d3afc95773784908c', commit_message='Upload LlamaForCausalLM', commit_description='', oid='22f1491eef7fc8cc646e021d3afc95773784908c', pr_url=None, pr_revision=None, pr_num=None)

# **Inference**

In [2]:
def chat_template(question, context):
    template = f"""\
    <|im_start|>user
    Given the context, generate an SQL query for the following question
    context:{context}
    question:{question}
    <|im_end|>
    <|im_start|>assistant 
    """
    template = "\n".join([line.lstrip() for line in template.splitlines()])
    return template

In [3]:
#inference on CPU
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "siddharth-magesh/Tiny_Lllama-sqlQuries-fine_tuned"
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu")  # Ensure model is on CPU
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
tokenizer.pad_token = tokenizer.eos_token

# Prepare the Prompt
question = "How many heads of the departments are older than 56?"
context = "CREATE TABLE head (age INTEGER)"
prompt = chat_template(question,context)  # Assuming chat_template concatenates the strings.

# Encode the prompt
inputs = tokenizer(prompt, return_tensors="pt").to('cpu')  # Ensure inputs are on CPU

# Generate the output
output = model.generate(**inputs, max_new_tokens=512)

# Decode the output
text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated SQL query
print(text)


<|im_start|>user
Given the context, generate an SQL query for the following question
context:CREATE TABLE head (age INTEGER)
question:How many heads of the departments are older than 56?
<|im_end|>
<|im_start|>assistant 
SELECT COUNT(*) FROM head WHERE age > 56
<|im_end|>
<|im_start|>assistant 
SELECT COUNT(*) FROM head WHERE age > 56
<|im_end|>
<|im_start|>assistant 
SELECT COUNT(*) FROM head WHERE age > 56
<|im_end|>
<|im_start|>assistant 
SELECT COUNT(*) FROM head WHERE age > 56
<|im_end|>
<|im_start|>assistant 
SELECT COUNT(*) FROM head WHERE age > 56
<|im_end|>
<|im_start|>assistant 
SELECT COUNT(*) FROM head WHERE age > 56
<|im_end|>
<|im_start|>assistant 
SELECT COUNT(*) FROM head WHERE age > 56
<|im_end|>
<|im_start|>assistant 
SELECT COUNT(*) FROM head WHERE age > 56
<|im_end|>
<|im_start|>assistant 
SELECT COUNT(*) FROM head WHERE age > 56
<|im_end|>
<|im_start|>assistant 
SELECT COUNT(*) FROM head WHERE age > 56
<|im_end|>
<|im_start|>assistant 
SELECT COUNT(*) FROM head WHE

In [1]:
#inference on GPU , model is greater than 4GB
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "siddharth-magesh/Tiny_Lllama-sqlQuries-fine_tuned"
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto",trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Move the model to GPU
model.to('cuda')

# Prepare the Prompt
question = "How many heads of the departments are older than 56?"
context = "CREATE TABLE head (age INTEGER)"
prompt = f"{question}\n{context}"  # Assuming chat_template concatenates the strings.

# Encode the prompt
inputs = tokenizer(prompt, return_tensors="pt").to('cuda')

# Generate the output
output = model.generate(**inputs, max_new_tokens=512)

# Decode the output
text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated SQL query
print(text)




RuntimeError: You can't move a model that has some modules offloaded to cpu or disk.