In [2]:
import json

#Loading the Training Dataset
file = json.load(open("json_extraction_dataset_500.json", "r"))
print(file[1])

{'input': "Extract the product information:\n<div class='product'><h2>iPad Air</h2><span class='price'>$1344</span><span class='category'>audio</span><span class='brand'>Dell</span></div>", 'output': {'name': 'iPad Air', 'price': '$1344', 'category': 'audio', 'manufacturer': 'Dell'}}


In [3]:
# Install all the required libraries for fine-tuning and optimizing large language models
!pip install unsloth trl peft accelerate bitsandbytes

    # üöÄ Unsloth - A library for *super fast fine-tuning* of LLMs (especially LLaMA, Mistral, etc.)
    # ü§ñ TRL (Transformer Reinforcement Learning) - used for fine-tuning models with reinforcement learning (like PPO, DPO)
    # üß© PEFT (Parameter-Efficient Fine-Tuning) - enables low-rank adaptation (LoRA) and similar lightweight fine-tuning techniques
    # ‚ö° Accelerate - from Hugging Face, helps to easily train on multi-GPU/TPU setups and manage mixed precision
    # üíæ BitsAndBytes - enables *8-bit/4-bit quantization* to reduce memory usage during training and inference


Collecting unsloth
  Downloading unsloth-2025.10.10-py3-none-any.whl.metadata (61 kB)
[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/61.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.5/61.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.24.0-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting unsloth_zoo>=2025.10.11 (from unsloth)
  Downloading unsloth_zoo-2025.10.12-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.35-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.w

In [4]:
# For GPU check
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

CUDA available: True
GPU: Tesla T4


In [5]:
# Import required modules
from unsloth import FastLanguageModel   # üöÄ FastLanguageModel - Unsloth's optimized wrapper for fast loading and fine-tuning of LLMs
import torch                            # ‚öôÔ∏è PyTorch - backend framework for tensor operations and deep learning

# Specify the pre-trained model name from Unsloth's model hub
model_name = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
# üß† This is a lightweight, instruction-tuned version of Phi-3 Mini model,
# quantized to 4-bit (bnb = bitsandbytes) for memory-efficient training/inference.

# Define model parameters
max_seq_length = 2048   # üìè Maximum sequence length (number of tokens model can handle in one go)
dtype = None            # ‚öôÔ∏è Let Unsloth auto-detect the data type (e.g., float16, bfloat16)

# Load the model and tokenizer using Unsloth‚Äôs optimized loader
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,     # üß© Name or path of the model to load
    max_seq_length=max_seq_length,  # üìè Sets token window length
    dtype=dtype,                   # ‚öôÔ∏è Data precision type (auto by default)
    load_in_4bit=True,             # üíæ Loads model weights in 4-bit precision for huge memory savings
)

# ‚úÖ 'model' now holds the quantized Phi-3 model ready for fine-tuning or inference
# ‚úÖ 'tokenizer' will handle text-to-token and token-to-text conversions


ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.




ü¶• Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.10.10: Fast Mistral patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/458 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [6]:
from datasets import Dataset   # Load Dataset utility
import json                    # For JSON formatting

def format_prompt(example):    # Format each data sample
    return f"### Input: {example['input']}\n### Output: {json.dumps(example['output'])}<|endoftext|>"

formatted_data = [format_prompt(item) for item in file]   # Apply formatting to all items
dataset = Dataset.from_dict({"text": formatted_data})     # Create HF dataset from text


In [7]:
dataset[1]

{'text': '### Input: Extract the product information:\n<div class=\'product\'><h2>iPad Air</h2><span class=\'price\'>$1344</span><span class=\'category\'>audio</span><span class=\'brand\'>Dell</span></div>\n### Output: {"name": "iPad Air", "price": "$1344", "category": "audio", "manufacturer": "Dell"}<|endoftext|>'}

In [8]:
# Add LoRA (Low-Rank Adaptation) adapters for lightweight fine-tuning
model = FastLanguageModel.get_peft_model(
    model,
    r=64,  # LoRA rank ‚Äì controls adapter size & learning capacity
    target_modules=[  # Model layers where LoRA is applied
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=128,  # Scaling factor for LoRA updates
    lora_dropout=0,  # Dropout for regularization (0 = faster training)
    bias="none",     # Exclude bias for better optimization speed
    use_gradient_checkpointing="unsloth",  # Saves GPU memory during training
    random_state=3407,  # Ensures reproducible results
    use_rslora=False,   # Disable Rank-Stabilized LoRA
    loftq_config=None,  # No quantization fine-tuning
)


Unsloth 2025.10.10 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [9]:
from trl import SFTTrainer                  # Trainer for supervised fine-tuning (SFT)
from transformers import TrainingArguments  # Config class for all training hyperparameters

# Initialize SFT trainer with optimized settings for Unsloth
trainer = SFTTrainer(
    model=model,                           # LoRA-optimized model
    tokenizer=tokenizer,                   # Tokenizer for text preprocessing
    train_dataset=dataset,                 # Training data
    dataset_text_field="text",             # Field containing text data
    max_seq_length=max_seq_length,         # Max tokens per sample
    dataset_num_proc=2,                    # Parallel data preprocessing

    # Training configuration
    args=TrainingArguments(
        per_device_train_batch_size=2,     # Batch size per GPU
        gradient_accumulation_steps=4,     # Accumulate gradients (effective batch = 8)
        warmup_steps=10,                   # Steps to ramp up LR
        num_train_epochs=3,                # Total epochs
        learning_rate=2e-4,                # Base LR
        fp16=not torch.cuda.is_bf16_supported(),  # Use FP16 if BF16 unsupported
        bf16=torch.cuda.is_bf16_supported(),      # Use BF16 if available
        logging_steps=25,                  # Log progress every 25 steps
        optim="adamw_8bit",                # 8-bit Adam optimizer (memory-efficient)
        weight_decay=0.01,                 # Regularization
        lr_scheduler_type="linear",        # Linear LR decay
        seed=3407,                         # Reproducibility
        output_dir="outputs",              # Save model checkpoints
        save_strategy="epoch",             # Save every epoch
        save_total_limit=2,                # Keep last 2 checkpoints
        dataloader_pin_memory=False,       # Disable pinned memory for stability
        report_to="none",                  # Disable W&B or HF logging
    ),
)


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/500 [00:00<?, ? examples/s]

In [10]:
# Train the model
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 3 | Total steps = 189
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 119,537,664 of 3,940,617,216 (3.03% trained)


Step,Training Loss
25,0.4515
50,0.15
75,0.1354
100,0.1235
125,0.1163
150,0.1118
175,0.1098


In [11]:
# Enable optimized inference mode (2x faster generation)
FastLanguageModel.for_inference(model)

# Sample test prompt (HTML snippet to extract product info)
messages = [
    {"role": "user", "content": "Extract the product information:\n<div class='product'><h2>iPad Air</h2><span class='price'>$1344</span><span class='category'>audio</span><span class='brand'>Dell</span></div>"},
]

# Tokenize the input using chat template for model compatibility
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")  # Move input tensors to GPU

# Generate model response
outputs = model.generate(
    input_ids=inputs,
    max_new_tokens=256,  # Max tokens to generate
    use_cache=True,      # Use KV cache for speed
    temperature=0.7,     # Sampling randomness
    do_sample=True,      # Enable sampling
    top_p=0.9,           # Nucleus sampling threshold
)

# Decode and display the model‚Äôs response
response = tokenizer.batch_decode(outputs)[0]
print(response)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|user|> Extract the product information:
<div class='product'><h2>iPad Air</h2><span class='price'>$1344</span><span class='category'>audio</span><span class='brand'>Dell</span></div><|end|><|assistant|> {"name": "iPad Air", "price": "$1344", "category": "audio", "manufacturer": "Dell"}<|end|>


In [12]:
# Save the fine-tuned model in GGUF format (for use with llama.cpp or Ollama)
model.save_pretrained_gguf(
    "gguf_model",                # Output folder name
    tokenizer,                   # Associated tokenizer
    quantization_method="q4_k_m" # Quantization type (4-bit, balanced for speed & quality)
)


Unsloth: Merging model weights to 16-bit format...


config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00002.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 1/2 [02:52<02:52, 172.83s/it]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.65G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [04:23<00:00, 131.73s/it]
Unsloth: Merging weights into 16bit: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [06:24<00:00, 192.48s/it]


Unsloth: Merge process complete. Saved to `/content/gguf_model`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF f16 might take 3 minutes.
\        /    [2] Converting GGUF f16 to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: Updating system package directories
Unsloth: All required system packages already installed!
Unsloth: Install llama.cpp and building - please wait 1 to 3 minutes
Unsloth: Cloning llama.cpp repository
Unsloth: Install GGUF and other packages
Unsloth: Successfully installed llama.cpp!
Unsloth: Preparing converter script...
Unsloth: [1] Converting model into f16 GGUF format.
This might take 3 minutes...
Unsloth: Initial conversion completed! Files: ['phi-3-mini-4k-instruct.F16.gguf']
Unslot

{'save_directory': 'gguf_model',
 'gguf_files': ['phi-3-mini-4k-instruct.Q4_K_M.gguf'],
 'modelfile_location': '/content/Modelfile',
 'want_full_precision': False,
 'is_vlm': False,
 'fix_bos_token': False}

In [13]:
from google.colab import files   # üì• For downloading files from Colab
import os                        # üóÇÔ∏è To interact with the file system

# Find all GGUF files in the output directory
gguf_files = [f for f in os.listdir("gguf_model") if f.endswith(".gguf")]

# If a GGUF file exists, prepare and download it
if gguf_files:
    gguf_file = os.path.join("gguf_model", gguf_files[0])  # Get first GGUF file path
    print(f"Downloading: {gguf_file}")                     # Show which file is being downloaded
    files.download(gguf_file)                              # Trigger file download in Colab
