# Fine-tune Qwen2.5-3B-Instruct with LLaMA Factory

Please use a **free** Tesla T4 Colab GPU to run this!

Project homepage: https://github.com/hiyouga/LLaMA-Factory

## Install Dependencies

In [None]:
%cd /content/
%rm -rf LLaMA-Factory
!git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
%cd LLaMA-Factory
%ls
!pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1
!pip uninstall -y jax
!pip install -e .[torch,bitsandbytes,liger-kernel]

### Check GPU environment

In [None]:
import torch

# Warn (rather than fail) when no CUDA device is visible.
# A plain `if` is used instead of `assert` wrapped in try/except: assertions are
# removed when Python runs with -O, which would silently drop this warning.
if not torch.cuda.is_available():
  print("Please set up a GPU before using LLaMA Factory: https://medium.com/mlearning-ai/training-yolov4-on-google-colab-316f8fff99c6")

## Update Identity Dataset

In [None]:
import json

%cd /content/LLaMA-Factory/

NAME = "LLM"
AUTHOR = "V"

with open("data/identity.json", "r", encoding="utf-8") as f:
  dataset = json.load(f)

for sample in dataset:
  sample["output"] = sample["output"].replace("{{"+ "name" + "}}", NAME).replace("{{"+ "author" + "}}", AUTHOR)

with open("data/identity.json", "w", encoding="utf-8") as f:
  json.dump(dataset, f, indent=2, ensure_ascii=False)

## Fine-tune model via LLaMA Board

In [None]:
# Disabled: uncomment the two lines below to launch the LLaMA Board web UI
# instead of fine-tuning from the command line.
# %cd /content/LLaMA-Factory/
# !GRADIO_SHARE=1 llamafactory-cli webui

## Fine-tune model via Command Line

Training takes about 30 minutes.

In [None]:
# Disabled: uncomment this whole cell to run LoRA fine-tuning from the command line.
# It writes the arguments below to train_llama3.json and passes that file to
# `llamafactory-cli train`.
# import json

# args = dict(
#   stage="sft",                        # do supervised fine-tuning
#   do_train=True,
#   # model_name_or_path="unsloth/llama-3-8b-Instruct-bnb-4bit", # previous choice: bnb-4bit-quantized Llama-3-8B-Instruct model
#   # https://huggingface.co/unsloth/Qwen2.5-3B-Instruct-bnb-4bit
#   model_name_or_path="unsloth/Qwen2.5-3B-Instruct-bnb-4bit",
#   dataset="output2XOXO",             # custom dataset name — NOTE(review): presumably has to be registered in data/dataset_info.json; confirm
#   template="qwen",                     # use the qwen prompt template (the base model is Qwen2.5)
#   finetuning_type="lora",                   # use LoRA adapters to save memory
#   lora_target="all",                     # attach LoRA adapters to all linear layers
#   output_dir="llama3_lora",                  # the path to save LoRA adapters (the inference cell loads "llama3_lora")
#   per_device_train_batch_size=2,               # the batch size
#   gradient_accumulation_steps=4,               # the gradient accumulation steps
#   lr_scheduler_type="cosine",                 # use cosine learning rate scheduler
#   logging_steps=10,                      # log every 10 steps
#   warmup_ratio=0.1,                      # use warmup scheduler
#   save_steps=1000,                      # save checkpoint every 1000 steps
#   learning_rate=5e-5,                     # the learning rate
#   num_train_epochs=3.0,                    # the epochs of training
#   max_samples=500,                      # use 500 examples in each dataset
#   max_grad_norm=1.0,                     # clip gradient norm to 1.0
#   loraplus_lr_ratio=16.0,                   # use LoRA+ algorithm with lambda=16.0
#   fp16=True,                         # use float16 mixed precision training
#   use_liger_kernel=True,                   # use liger kernel for efficient training
# )

# json.dump(args, open("train_llama3.json", "w", encoding="utf-8"), indent=2)

# %cd /content/LLaMA-Factory/

# !llamafactory-cli train train_llama3.json

In [None]:
# Disabled: uncomment the zip command to archive the trained adapter directory.
# NOTE(review): the "download zip" note below is not followed by any download command.
# # archive /content/LLaMA-Factory/llama3_lora

# !zip -r llama3_lora.zip /content/LLaMA-Factory/llama3_lora
# # download zip


## Infer the fine-tuned model

In [None]:
# Disabled: uncomment to reset the current CUDA device through numba.
# NOTE(review): presumably meant to release GPU memory left over from training
# before the model is loaded for inference — confirm before re-enabling.
# !pip install numba

# from numba import cuda
# device = cuda.get_current_device()
# device.reset()

In [None]:
!gdown https://drive.google.com/file/d/1F1sy6uHNjP_twAPUzSfFOuXGVVodnNXb/view?usp=sharing

In [None]:
!wget https://raw.githubusercontent.com/V3D4N7V2/filedump/refs/heads/main/database_mapping.csv
import pandas as pd
from llamafactory.chat import ChatModel
from llamafactory.extras.misc import torch_gc

# Navigate to the LLaMA-Factory directory
%cd /content/LLaMA-Factory/

# Define model arguments
args = dict(
    model_name_or_path="unsloth/Qwen2.5-3B-Instruct-bnb-4bit",
    template="qwen",
    adapter_name_or_path="llama3_lora",
    finetuning_type="lora",
    quantization_bit=4,
)
chat_model = ChatModel(args)

# Read the CSV file
input_file = "database_mapping.csv"
output_file = "output.csv"
data = pd.read_csv(input_file)

# Prepare a list to store results
results = []

# Iterate through each row in the CSV
for _, row in data.iterrows():
    schema = row['database_structure']
    natural_language_query = row['question']
    instance_id = row['instance_id']
    db_name = row['db']

    # Construct the prompt
    prompt = (
        "Using the following database schema, write an SQL query to perform the operation described below:\n"
        f"Schema:\n{schema}\n"
        f"Natural Language Query: {natural_language_query}\n"
    )

    # Create a message structure for the model
    messages = [{"role": "user", "content": prompt}]

    # Generate the response
    print(f"Processing instance_id: {instance_id}...")
    response = ""
    for new_text in chat_model.stream_chat(messages):
        response += new_text

    # Append the result to the results list
    results.append({
        "instance_id": instance_id,
        "db": db_name,
        "question": natural_language_query,
        "response": response
    })

    torch_gc()  # Clear GPU memory if necessary

# Save the results to a new CSV file
output_df = pd.DataFrame(results)
output_df.to_csv(output_file, index=False)

print(f"Results saved to {output_file}.")


## Merge the LoRA adapter and optionally upload model

NOTE: The free Colab tier has only 12GB of RAM, whereas merging the LoRA adapter into an 8B model needs at least 18GB of RAM, so you **cannot** perform the merge on the free tier.

In [None]:
# Log in to the Hugging Face Hub from the command line (only relevant if you intend to upload the model).
# NOTE(review): this presumably prompts interactively for an access token — confirm, and never paste a token into the notebook itself.
!huggingface-cli login