<a href="https://colab.research.google.com/github/cs1090218/conv/blob/main/Finetuning_w_Llama3_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup  & Install libraries

Tested this notebook with A100 GPU

In [1]:
base_model_name = "meta-llama/Meta-Llama-3.1-8B"
new_model_name = "code-llama-3-1-8b-text-to-sql"
dataset_name = "b-mc2/sql-create-context"

In [2]:
# Install Pytorch & other libraries
%pip install "torch==2.4.0" torchvision torchaudio tensorboard

# Install Hugging Face libraries
%pip install  --upgrade \
  "transformers==4.44.2" \
  "datasets==2.21.0" \
  "accelerate==0.33.0" \
  "evaluate==0.4.2" \
  "bitsandbytes==0.43.3" \
  "trl==0.9.6" \
  "peft==0.12.0"

Collecting torch==2.4.0
  Downloading torch-2.4.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.4.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.4.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.4.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.4.0)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.4.0)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch==2.4.0)
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-many

In [3]:
import torch
print(torch.__version__)
import accelerate
print(accelerate.__version__)
import bitsandbytes
print(bitsandbytes.__version__)
import datasets
print(datasets.__version__)
import evaluate
print(evaluate.__version__)
import peft
print (peft.__version__)
import transformers
print(transformers.__version__)
import trl
print(trl.__version__)
import torchvision
print(torchvision.__version__)
torchvision.ops.nms


# 2.5.0+cu121
# 1.1.0
# 0.42.0
# 3.1.0
# 0.4.3
# 0.13.2
# 4.46.1
# 0.12.0
# 0.20.0+cu121

2.4.0+cu121
0.33.0
0.43.3
2.21.0
0.4.2
0.12.0
4.44.2
0.9.6
0.19.0+cu121


In [4]:
import torch

use_flash_attention = False
if torch.cuda.get_device_capability()[0] >= 8:
  use_flash_attention = True
  !pip install ninja packaging
  !MAX_JOBS=4 pip install flash-attn --no-build-isolation


Collecting ninja
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl.metadata (5.3 kB)
Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ninja
Successfully installed ninja-1.11.1.1
Collecting flash-attn
  Downloading flash_attn-2.6.3.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.6.3-cp310-cp310-linux_x86_64.whl size=187309225 sha256=237ef9c6157db394e1ddde4ba609a21ebb98382377a27041edc09318801a6f24
  Stored in directory: /root/.cache/pip/wheels/7e/e

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
from huggingface_hub import login
from google.colab import userdata

login(
  token=userdata.get('HF_TOKEN'),
  add_to_git_credential=True
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Prepare Dataset

In [7]:
from datasets import load_dataset

# Convert dataset to OAI messages
system_message = """You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.
SCHEMA:
{schema}"""

def create_conversation(sample):
  return {
    "messages": [
      {"role": "system", "content": system_message.format(schema=sample["context"])},
      {"role": "user", "content": sample["question"]},
      {"role": "assistant", "content": sample["answer"]}
    ]
  }

# Load dataset from the hub
dataset = load_dataset(dataset_name, split="train")
dataset = dataset.shuffle().select(range(12500))

# Convert dataset to OAI messages
dataset = dataset.map(create_conversation, remove_columns=dataset.features,batched=False)
# split dataset into 10,000 training samples and 2,500 test samples
dataset = dataset.train_test_split(test_size=2500/12500)

print(dataset["train"][345]["messages"])

# save datasets to disk
dataset["train"].to_json("train_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")

Downloading readme:   0%|          | 0.00/4.43k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/78577 [00:00<?, ? examples/s]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

[{'content': 'You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\nSCHEMA:\nCREATE TABLE table_name_4 (interview INTEGER, swimsuit VARCHAR, state VARCHAR, average VARCHAR, evening_gown VARCHAR)', 'role': 'system'}, {'content': 'What is the sum of the interview scores from North Dakota that have averages less than 8.697, evening gown scores less than 8.73, and swimsuit scores greater than 8.41?', 'role': 'user'}, {'content': 'SELECT SUM(interview) FROM table_name_4 WHERE average < 8.697 AND evening_gown < 8.73 AND state = "north dakota" AND swimsuit > 8.41', 'role': 'assistant'}]


Creating json from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

1192608

In [8]:
from datasets import load_dataset

# Load jsonl data from disk
dataset = load_dataset("json", data_files="train_dataset.json", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

# Training Setup - Model and params

In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import setup_chat_format

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
if use_flash_attention:
  model = AutoModelForCausalLM.from_pretrained(
      base_model_name,
      device_map="auto",
      attn_implementation="flash_attention_2",
      torch_dtype=torch.bfloat16,
      quantization_config=bnb_config
  )
else:
  model = AutoModelForCausalLM.from_pretrained(
      base_model_name,
      device_map="auto",
      # attn_implementation="flash_attention_2",
      torch_dtype=torch.bfloat16,
      quantization_config=bnb_config
  )
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.padding_side = 'right' # to prevent warnings

# # set chat template to OAI chatML, remove if you start from a fine-tuned model
model, tokenizer = setup_chat_format(model, tokenizer)

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [10]:
from peft import LoraConfig

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
        lora_alpha=128,
        lora_dropout=0.05,
        r=256,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
)

In [11]:
adapter_model_dir = "./" + new_model_name  # "./code-llama-3-1-8b-text-to-sql"

In [12]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir=adapter_model_dir, # directory to save and repository id
    num_train_epochs=3,                     # number of training epochs
    per_device_train_batch_size=1,          # batch size per device during training
    gradient_accumulation_steps=8,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=False,                      # dont push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
)

In [13]:
from trl import SFTTrainer

max_seq_length = 2048 # max sequence length for model and packing of the dataset

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Generating train split: 0 examples [00:00, ? examples/s]

In [14]:
# start training, the model will be automatically saved to the hub and the output directory
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
10,1.1423
20,0.6448
30,0.6002
40,0.5809
50,0.5625
60,0.5554
70,0.5029
80,0.4797
90,0.4729
100,0.4796


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=186, training_loss=0.5195049714016658, metrics={'train_runtime': 2053.3348, 'train_samples_per_second': 0.728, 'train_steps_per_second': 0.091, 'total_flos': 1.4949482493483418e+17, 'train_loss': 0.5195049714016658, 'epoch': 2.9879518072289155})

In [15]:
# Save the adapter model files locally
trainer.save_model(adapter_model_dir)

# Optionally also save it to drive
# trainer.save_model("/content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql")



In [16]:
# free the memory again
del model
del trainer
torch.cuda.empty_cache()

# Cmd for copying model adapter files to or from drive

In [33]:
# !cp ./code-llama-3-1-8b-text-to-sql/adapter_config.json /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql
# !cp ./code-llama-3-1-8b-text-to-sql/adapter_model.safetensors /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql
# !cp ./code-llama-3-1-8b-text-to-sql/config.json /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql
# !cp ./code-llama-3-1-8b-text-to-sql/generation_config.json /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql
# !cp ./code-llama-3-1-8b-text-to-sql/model.safetensors.index.json /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql
# !cp ./code-llama-3-1-8b-text-to-sql/special_tokens_map.json /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql
# !cp ./code-llama-3-1-8b-text-to-sql/tokenizer.json /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql
# !cp ./code-llama-3-1-8b-text-to-sql/tokenizer_config.json /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql
# !cp ./code-llama-3-1-8b-text-to-sql/training_args.bin /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql

In [7]:
# !mkdir code-llama-3-1-8b-text-to-sql
# !cp /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/adapter_config.json ./code-llama-3-1-8b-text-to-sql
# !cp /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/adapter_model.safetensors ./code-llama-3-1-8b-text-to-sql
# !cp /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/config.json ./code-llama-3-1-8b-text-to-sql
# !cp /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/generation_config.json ./code-llama-3-1-8b-text-to-sql
# !cp /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/model.safetensors.index.json ./code-llama-3-1-8b-text-to-sql
# !cp /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/special_tokens_map.json ./code-llama-3-1-8b-text-to-sql
# !cp /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/tokenizer.json ./code-llama-3-1-8b-text-to-sql
# !cp /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/tokenizer_config.json ./code-llama-3-1-8b-text-to-sql
# !cp /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/training_args.bin ./code-llama-3-1-8b-text-to-sql

cp: cannot stat '/content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/config.json': No such file or directory
cp: cannot stat '/content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/generation_config.json': No such file or directory
cp: cannot stat '/content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/model.safetensors.index.json': No such file or directory


# Inference

In [17]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

# Load PEFT model on CPU
model = AutoPeftModelForCausalLM.from_pretrained(
    adapter_model_dir,  # "./code-llama-3-1-8b-text-to-sql"
    torch_dtype=torch.float16,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(adapter_model_dir)

# Merge LoRA and base model and save
merged_model = model.merge_and_unload()
merged_model.resize_token_embeddings(len(tokenizer))

# Save the merged model if needed
# model.save_pretrained(
#     adapter_model_dir,
#     safe_serialization=True, max_shard_size="2GB")

model = merged_model

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [18]:
from datasets import load_dataset
from random import randint


# Load our test dataset
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
rand_idx = randint(1, len(eval_dataset))

Generating train split: 0 examples [00:00, ? examples/s]

In [19]:
# Inference style #1 where we see the special tokens our model generates.

prompt = tokenizer.apply_chat_template(
    eval_dataset[rand_idx]["messages"][:2],
    tokenize=False,
    add_generation_prompt=True
)

# Tokenize with add_special_tokens=False
input_ids = tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids.to(model.device)

# Generate response with special tokens
outputs = model.generate(
    input_ids,
    max_new_tokens=256,
    do_sample=False,
    temperature=0.1,
    top_k=50,
    top_p=0.1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id
)

# Decode the output while skipping special tokens
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

print(f"Query:\n{eval_dataset[rand_idx]['messages'][1]['content']}")
print(f"Original Answer:\n{eval_dataset[rand_idx]['messages'][2]['content']}")
print(f"\nGenerated Answer:\n{generated_text}")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Query:
What episode featured entrepreneur Richard Ernest?
Original Answer:
SELECT first_aired FROM table_name_15 WHERE entrepreneur_s_ = "richard ernest"

Generated Answer:
<|im_start|>system
You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.
SCHEMA:
CREATE TABLE table_name_15 (first_aired VARCHAR, entrepreneur_s_ VARCHAR)<|im_end|>
<|im_start|>user
What episode featured entrepreneur Richard Ernest?<|im_end|>
<|im_start|>assistant
SELECT first_aired FROM table_name_15 WHERE entrepreneur_s_ = "richard ernest"<|im_end|>


In [20]:
# Inference style #2 where we see the cleaned up final output.

from transformers import pipeline

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
prompt = pipe.tokenizer.apply_chat_template(eval_dataset[rand_idx]["messages"][:2], tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)

print(f"Query:\n{eval_dataset[rand_idx]['messages'][1]['content']}")
print(f"Original Answer:\n{eval_dataset[rand_idx]['messages'][2]['content']}")
print(f"Generated Answer:\n{outputs[0]['generated_text'][len(prompt):].strip()}")

Query:
What episode featured entrepreneur Richard Ernest?
Original Answer:
SELECT first_aired FROM table_name_15 WHERE entrepreneur_s_ = "richard ernest"
Generated Answer:
SELECT first_aired FROM table_name_15 WHERE entrepreneur_s_ = "richard ernest"


# Upload the model to HuggingFace Hub

In [None]:
model.push_to_hub("shashankverma590/" + new_model_name, check_pr=True)
tokenizer.push_to_hub("shashankverma590/" + new_model_name,check_pr=True)