In [None]:
!git clone https://huggingface.co/datasets/Hieu-Pham/kaggle_food_recipes

In [None]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
CUDA available: True
GPU: Tesla T4


In [None]:
model_name = "unsloth/Llama-3.2-3B-Instruct"

max_seq_length = 2048
dtype = None

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,
)

In [None]:
import pandas as pd
from datasets import Dataset
import json

df = pd.read_csv("/content/kaggle_food_recipes/Food Ingredients and Recipe Dataset with Image Name Mapping.csv")
df = df.fillna("")

df['input'] = df['Title'] + "\nIngredients: " + df['Ingredients']
df['output'] = df['Instructions']

def format_prompt(row):
    return f"### Input: {row['input']}\n### Output: {json.dumps(row['output'])}<|endoftext|>"

formatted_data = df.apply(format_prompt, axis=1).tolist()
dataset = Dataset.from_dict({"text": formatted_data})

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r=64,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=128,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments


trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=10,
        num_train_epochs=3,
        max_steps = 100,
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=25,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        save_strategy="epoch",
        save_total_limit=2,
        dataloader_pin_memory=False,
        report_to="none",
    ),
)

In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 13,501 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 97,255,424 of 3,310,005,248 (2.94% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
25,1.5359


Step,Training Loss
25,1.5359
50,1.3279
75,1.295
100,1.2726


In [None]:

FastLanguageModel.for_inference(model)

# Test prompt
messages = [
    {"role": "user", "content": "provide  Instructions for recipe.Crispy Salt and Pepper Potatoes"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")


outputs = model.generate(
    input_ids=inputs,
    max_new_tokens=512,
    use_cache=True,
    temperature=0.7,
    do_sample=True,
    top_p=0.9,
)


response = tokenizer.batch_decode(outputs)[0]
print(response)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 24 Jul 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

provide  Instructions for recipe.Crispy Salt and Pepper Potatoes<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Ingredients: ['2 tablespoons olive oil', '1 1/2 teaspoons salt', '1 1/2 teaspoons freshly ground black pepper', '1/2 teaspoon sugar', '1/4 teaspoon crushed red pepper flakes (optional)', '6-8 medium-sized potatoes, peeled, cut into 1/2-inch cubes', '1/4 cup dry white wine (optional)']
### Cooking: '1. In a large bowl, combine olive oil, salt, black pepper, sugar, and red pepper flakes (if using). Add potatoes and toss to coat. Let sit for 15 minutes to allow potatoes to soak up flavors. If using wine, add it to the bowl and let sit for 15 minutes. Drain potatoes well. (You can let them sit longer, but they will get soggy.)', '2. Preheat oven to 500\u00b0F. Line a large rimmed baking sheet with 

In [None]:
model.save_pretrained_gguf("local_model", tokenizer, quantization_method="q4_k_m")

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.4G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.08 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 28/28 [00:02<00:00,  9.49it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving local_model/pytorch_model-00001-of-00002.bin...
Unsloth: Saving local_model/pytorch_model-00002-of-00002.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at local_model into f16 GGUF format.
The output location will be /content/local_model/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: local_model
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INF

In [15]:
from google.colab import files
import os

gguf_files = [f for f in os.listdir("local_model") if f.endswith(".gguf")]
if gguf_files:
    gguf_file = os.path.join("local_model", gguf_files[0])
    print(f"Downloading: {gguf_file}")
    files.download(gguf_file)

Downloading: local_model/unsloth.Q4_K_M.gguf


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>