# LLM Finetuning on Modal Cloud

Write Python code and collaborate in real time. Your code runs in Modal's
**serverless cloud**, and anyone in the same workspace can join.

This notebook comes with some common Python libraries installed. Run
cells with `Shift+Enter`.

In [11]:
!pip uninstall -y transformers torch accelerate peft
!pip install \
peft==0.12.0 \
  torch==2.2.2 \
  transformers==4.45.2 \
  accelerate==0.34.2 \
    sentencepiece \
  bitsandbytes


Found existing installation: transformers 4.45.2
Uninstalling transformers-4.45.2:
  Successfully uninstalled transformers-4.45.2
Found existing installation: torch 2.2.2
Uninstalling torch-2.2.2:
  Successfully uninstalled torch-2.2.2
Found existing installation: accelerate 0.34.2
Uninstalling accelerate-0.34.2:
  Successfully uninstalled accelerate-0.34.2
Found existing installation: peft 0.18.0
Uninstalling peft-0.18.0:
  Successfully uninstalled peft-0.18.0
Collecting peft==0.12.0
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting torch==2.2.2
  Downloading torch-2.2.2-cp312-cp312-manylinux1_x86_64.whl.metadata (25 kB)
Collecting transformers==4.45.2
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
Collecting accelerate==0.34.2
  Downloading accelerate-0.34.2-py3-none-any.whl.metadata (19 kB)
Downloading peft-0.12.0-py3-none-any.whl (296 kB)
Downloading torch-2.2.2-cp312-cp312-manylinux1_x86_64.whl (755.5 MB)
[?25l   [9

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

os.environ["HF_TOKEN"] = "your-hf-api-ke"

MODEL_ID = "meta-llama/Llama-3.2-3B"

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    use_fast=True,
    token=os.environ["HF_TOKEN"],
    
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=os.environ["HF_TOKEN"],
    
)


# bf16 is numerically stable on A100
# use_cache=False avoids gradient bugs
# device_map="auto" works correctly on Modal GPUs


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.12/site-packages/ipykernel/__main__.py", line 5, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/usr/local/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.12/site-packages/tornado/

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [4]:
!pip install datasets

Collecting datasets
  Downloading datasets-4.4.2-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Collecting multiprocess<0.70.19 (from datasets)
  Downloading multiprocess-0.70.18-py312-none-any.whl.metadata (7.5 kB)
Downloading datasets-4.4.2-py3-none-any.whl (512 kB)
Downloading dill-0.4.0-py3-none-any.whl (119 kB)
Downloading multiprocess-0.70.18-py312-none-any.whl (150 kB)
Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/47.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━[0m[90m╺

In [5]:
from datasets import load_dataset

In [6]:
dataset = load_dataset("b-mc2/sql-create-context")

# Optional but recommended: limit initially
dataset = dataset["train"]


README.md: 0.00B [00:00, ?B/s]

sql_create_context_v4.json:   0%|          | 0.00/21.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/78577 [00:00<?, ? examples/s]

In [9]:
def format_prompt(example):
    return f"""### Instruction:
{example['question']}

### Context:
{example['context']}

### Response:
{example['answer']}"""


In [10]:
def tokenize(example):
    text = format_prompt(example)
    return tokenizer(
        text,
        truncation=True,
        max_length=2048,
        padding=False
    )

tokenized_dataset = dataset.map(
    tokenize,
    remove_columns=dataset.column_names,
    num_proc=4
)


Map (num_proc=4):   0%|          | 0/78577 [00:00<?, ? examples/s]

In [8]:
pip install --upgrade peft


Collecting peft
  Downloading peft-0.18.0-py3-none-any.whl.metadata (14 kB)
Downloading peft-0.18.0-py3-none-any.whl (556 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/556.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
  Attempting uninstall: peft
    Found existing installation: peft 0.12.0
    Uninstalling peft-0.12.0:
      Successfully uninstalled peft-0.12.0
Successfully installed peft-0.18.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [12]:
from peft import LoraConfig
from peft import get_peft_model

lora_config = LoraConfig(
    r=16,                         # Good balance for 3B
    lora_alpha=32,                # 2 × r is standard
    lora_dropout=0.05,            # Prevents overfitting
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj"
    ]
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 9,175,040 || all params: 3,221,924,864 || trainable%: 0.2848


In [13]:
model.config.use_cache = False
model.gradient_checkpointing_enable()


In [14]:
for name, module in model.named_modules():
    if "lora" in name.lower():
        print(name)
        break


base_model.model.model.layers.0.self_attn.q_proj.lora_dropout


In [15]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="/mnt/models/llama3-sql-lora",

    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,   # effective batch = 16

    learning_rate=2e-4,              # correct for LoRA
    num_train_epochs=3,

    bf16=True,
    logging_steps=50,

    save_steps=500,
    save_total_limit=2,

    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,

    max_grad_norm=1.0,
    report_to="none",

    gradient_checkpointing=True,
)


In [17]:
pip install trl==0.9.6


Collecting trl==0.9.6
  Downloading trl-0.9.6-py3-none-any.whl.metadata (12 kB)
Collecting numpy<2.0.0,>=1.18.2 (from trl==0.9.6)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting tyro>=0.5.11 (from trl==0.9.6)
  Downloading tyro-1.0.3-py3-none-any.whl.metadata (12 kB)
Collecting docstring-parser>=0.15 (from tyro>=0.5.11->trl==0.9.6)
  Downloading docstring_parser-0.17.0-py3-none-any.whl.metadata (3.5 kB)
Collecting typeguard>=4.0.0 (from tyro>=0.5.11->trl==0.9.6)
  Downloading typeguard-4.4.4-py3-none-any.whl.metadata (3.3 kB)
Collecting typing-extensions>=4.8.0 (from torch>=1.4.0->trl==0.9.6)
  Downloading typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Downloading trl-0.9.6-py3-none-any.whl (245 kB)
Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/18.0 MB[0m [31m?[0m eta [36m-:--

TEST RUN

In [56]:
from datasets import load_dataset

raw_ds = load_dataset("b-mc2/sql-create-context", split="train")



In [57]:
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    use_fast=True,
    token=os.environ["HF_TOKEN"],
)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id


In [58]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=os.environ["HF_TOKEN"],
)

model.config.use_cache = False
model.gradient_checkpointing_enable()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [59]:
model = get_peft_model(model, lora_config)


In [62]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="/mnt/finetuned-models/llama3-sql-lora",

    # --------- BATCHING ----------
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,   # effective batch = 16

    # --------- TRAINING ----------
    num_train_epochs=3,              # correct for dataset size
    learning_rate=2e-4,              # LoRA-optimal
    bf16=True,

    # --------- OPTIMIZER ----------
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_grad_norm=1.0,

    # --------- PERFORMANCE ----------
    gradient_checkpointing=True,
    logging_steps=50,

    # --------- CHECKPOINTING ----------
    save_steps=500,
    save_total_limit=2,

    # --------- MISC ----------
    report_to="none",
    remove_unused_columns=True,
)


In [63]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=raw_ds,
    tokenizer=tokenizer,
    formatting_func=format_prompt,
)




Map:   0%|          | 0/78577 [00:00<?, ? examples/s]

Detected kernel version 4.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [64]:
trainer.train()


Step,Training Loss
50,2.9958
100,1.9988
150,1.3392
200,1.24
250,1.2067
300,1.1478
350,1.0513
400,1.0363
450,1.0121
500,1.0281


TrainOutput(global_step=14733, training_loss=0.8054362740915079, metrics={'train_runtime': 22222.2066, 'train_samples_per_second': 10.608, 'train_steps_per_second': 0.663, 'total_flos': 2.904829196499456e+17, 'train_loss': 0.8054362740915079, 'epoch': 2.999923642749879})

In [65]:
FINAL_PATH = "/mnt/finetuned-models/llama3-sql-lora/final"

trainer.model.save_pretrained(FINAL_PATH)
tokenizer.save_pretrained(FINAL_PATH)

print("Final files:", os.listdir(FINAL_PATH))


Final files: ['README.md', 'adapter_config.json', 'adapter_model.safetensors', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer_config.json']


In [66]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

MODEL_ID = "meta-llama/Llama-3.2-3B"
ADAPTER_PATH = "/mnt/finetuned-models/llama3-sql-lora/final"

tokenizer = AutoTokenizer.from_pretrained(ADAPTER_PATH)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

model = PeftModel.from_pretrained(
    base_model,
    ADAPTER_PATH
)

model.eval()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Li

In [67]:
def ask_sql(question, context, max_new_tokens=256):
    prompt = f"""### Instruction:
{question}

### Context:
{context}

### Response:
"""

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,        # IMPORTANT: deterministic SQL
            temperature=0.0,
            top_p=1.0,
            eos_token_id=tokenizer.eos_token_id,
        )

    response = tokenizer.decode(
        output[0][inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True,
    )

    return response.strip()


In [68]:
question = "List the names of employees who joined after 2020."

context = """
Table: employees
Columns:
- id (int)
- name (varchar)
- join_date (date)
"""

print(ask_sql(question, context))


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


SELECT name FROM employees WHERE join_date > 2020 ORDER BY name DESC LIMIT 5


In [69]:
question = "Find customer names and total order amount for customers with more than 3 orders."

context = """
Table: customers
- customer_id (int)
- name (varchar)

Table: orders
- order_id (int)
- customer_id (int)
- amount (decimal)
"""

print(ask_sql(question, context))


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


SELECT T1.name, SUM(T2.amount) FROM customers AS T1 JOIN orders AS T2 ON T1.customer_id = T2.customer_id GROUP BY T1.customer_id HAVING COUNT(*) > 3 ORDER BY SUM(T2.amount) DESC LIMIT 1
