In [1]:
# Install required libraries
!pip install unsloth==2025.2.4
!pip install unsloth_zoo==2025.2.3
!pip install torch==2.5.1
!pip install torchaudio==2.5.1
!pip install torchvision==0.20.1
!pip install vllm==0.7.2
!pip install xformers==0.0.28.post3
!pip install xgrammar==0.1.11
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
!pip install trl==0.8.2

Collecting unsloth==2025.2.4
  Downloading unsloth-2025.2.4-py3-none-any.whl.metadata (57 kB)
Collecting protobuf<4.0.0 (from unsloth==2025.2.4)
  Downloading protobuf-3.20.3-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (679 bytes)
Downloading unsloth-2025.2.4-py3-none-any.whl (181 kB)
Downloading protobuf-3.20.3-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m29.3 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: protobuf, unsloth
[2K  Attempting uninstall: protobuf
[2K    Found existing installation: protobuf 6.31.1
[2K    Uninstalling protobuf-6.31.1:
[2K      Successfully uninstalled protobuf-6.31.1
[2K  Attempting uninstall: unsloth
[2K    Found existing installation: unsloth 2025.8.4
[2K    Uninstalling unsloth-2025.8.4:
[2K      Successfully uninstalled unsloth-2025.8.4━━━━━━━━━━━━━━━━━━━[0m [32m1/2[0m [unsloth]
[2K   [90m━━━━━━━

In [13]:
pip install --upgrade "unsloth[full]"


Note: you may need to restart the kernel to use updated packages.


In [2]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# We have to check which Torch version for Xformers (2.3 -> 0.0.27)
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [3]:
#Load the original Model and Tokenizer
import unsloth  # <-- must be first


from unsloth import FastLanguageModel
import torch

# Define configurations for loading the model
max_seq_length = 2048
dtype = None  # Automatically choose the best data type (float16, bfloat16, etc.)
load_in_4bit = True  # Enable 4-bit quantization to reduce memory usage

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 08-11 15:11:35 __init__.py:190] Automatically detected platform cuda.
==((====))==  Unsloth 2025.8.4: Fast Llama patching. Transformers: 4.55.0. vLLM: 0.7.2.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 21.964 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

In [4]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
   tokenizer,
   chat_template = "llama-3.2",
)
# Set the PAD token to be the same as the EOS token to avoid tokenization issues
tokenizer.pad_token = tokenizer.eos_token
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
   {"role": "user", "content": "What is the input voltage to machine?"}]
# Tokenize the user input with the chat template
inputs = tokenizer.apply_chat_template(
   messages,
   tokenize=True,
   add_generation_prompt=True,
   return_tensors="pt",
   padding=True,  # Add padding to match sequence lengths
).to("cuda")

attention_mask = inputs != tokenizer.pad_token_id

outputs = model.generate(
   input_ids=inputs,
   attention_mask=attention_mask,
   max_new_tokens=64,
   use_cache=True,  # Use cache for faster token generation
   temperature=0.6,  # Controls randomness in responses
   min_p=0.1,  # Set minimum probability threshold for token selection
)

# Decode the generated tokens into human-readable text
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(text)

system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

What is the input voltage to machine?assistant

I'm not sure what you're referring to. Can you provide more context or clarify what "input voltage to machine" means?


In [5]:
# Apply LoRA Adapters for Efficient Fine-Tuning

model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank (controls low-rank approximation quality)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],  # Layers to apply LoRA
    lora_alpha=16, # Scaling factor for LoRA weights
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None
)

Unsloth 2025.8.4 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [6]:
from huggingface_hub import login

In [7]:
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
#dataset.push_to_hub("dharsandip/sandip_faq_data")

In [8]:
# Prepare the Training Dataset

from datasets import load_dataset  # Load datasets from Hugging Face Hub

# Load a dataset
#dataset = load_dataset(dataset, split="train")
dataset = load_dataset("Prathamesh1420/CNC_machine_manual", split="train")

man1_merged.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/526 [00:00<?, ? examples/s]

In [9]:
dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 526
})

In [10]:
type(dataset)

datasets.arrow_dataset.Dataset

In [11]:
import unsloth

In [12]:
# Format Prompts
# Once the dataset is prepared, we need to ensure that the data is structured correctly to be used by the model. For this, we apply
# the appropriate chat template (we have used the Llama-3.2 format.) using the get_chat_template function. This function basically prepares
# the tokenizer with the Llama-3.2 chat format for conversation-style fine-tuning

from unsloth.chat_templates import get_chat_template

# Apply the Llama-3.2 chat template to the tokenizer
tokenizer = get_chat_template(
    tokenizer,  # Tokenizer being used
    chat_template="llama-3.2",  # The chat template format
)

# Function to format the conversation data into tokenized text
def formatting_prompts_func(examples):
    convos = examples["conversation"]
    texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
    return {"text": texts}

dataset = dataset.map(formatting_prompts_func, batched=True)

Model does not have a padding token! Will use pad_token = <|finetune_right_pad_id|>.


Map:   0%|          | 0/526 [00:00<?, ? examples/s]

KeyError: 'conversation'

In [13]:
from unsloth.chat_templates import get_chat_template

# Apply the Llama-3.2 chat template to the tokenizer
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.2",
)

# Function to format your QA dataset into a conversation format
def formatting_prompts_func(examples):
    # Convert your {"question": "...", "answer": "..."} format to conversation format
    convos = []
    for q, a in zip(examples["question"], examples["answer"]):
        convos.append([
            {"role": "user", "content": q},
            {"role": "assistant", "content": a}
        ])
    
    # Apply chat template
    texts = [
        tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
        for convo in convos
    ]
    return {"text": texts}

# Map over dataset
dataset = dataset.map(formatting_prompts_func, batched=True)


Map:   0%|          | 0/526 [00:00<?, ? examples/s]

In [14]:
dataset['text']

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the importance of fully understanding the content of the MAINTENANCE MANUAL HCN-6800 before operating the machine?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nTo ensure proper operation and prevent serious personal injury or material damage.<|eot_id|>',
 '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat should be done if any questions arise regarding the machine operation?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nAsk the nearest Technical Center or Technology Center.<|eot_id|>',
 '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_h

In [17]:
print(dataset[0].keys())
print(dataset[0]["text"])   # if "text" exists

dict_keys(['question', 'answer', 'text'])
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the importance of fully understanding the content of the MAINTENANCE MANUAL HCN-6800 before operating the machine?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

To ensure proper operation and prevent serious personal injury or material damage.<|eot_id|>


In [18]:
# Setup and Configure the Trainer
# configure the fine-tuning process using Hugging Face’s SFTTrainer. It automates key tasks like tokenization, batching, and optimization,
# making fine-tuning easier. SFTTrainer works efficiently with Unsloth, reducing VRAM usage and speeding up training

from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported


# Define training configurations
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=False,

    args=TrainingArguments(
        per_device_train_batch_size=2,  # Number of examples per GPU batch
        gradient_accumulation_steps=4,  # Accumulate gradients over 4 batches before updating model
        warmup_steps=5,  # Number of warmup steps for learning rate schedule
      #  max_steps=0,
        num_train_epochs=10,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,  # Log training metrics after every step
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",  # Linear decay of learning rate
        seed=3407,
        output_dir="outputs",  # Directory to save model checkpoints
        report_to="none",  # Use this for WandB etc

    ),
)

Map (num_proc=2):   0%|          | 0/526 [00:00<?, ? examples/s]

In [19]:
# Train Only on Assistant Responses
# To improve training efficiency, we will focus only on the assistant’s responses rather than user inputs.

from unsloth.chat_templates import train_on_responses_only

trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",  # Mark user input
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",  # Mark assistant response
)
# Start training the model
trainer_stats = trainer.train()

Map (num_proc=8):   0%|          | 0/526 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 526 | Num Epochs = 10 | Total steps = 660
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


Step,Training Loss
1,3.3071
2,2.9493
3,2.9688
4,3.8925
5,3.5481
6,3.5464
7,2.659
8,2.4896
9,2.8065
10,2.2749


In [19]:
#trainer_stats = trainer.train()

In [21]:
# Inference

tokenizer = get_chat_template(
   tokenizer,
   chat_template = "llama-3.2",
)
# Set the PAD token to be the same as the EOS token to avoid tokenization issues
tokenizer.pad_token = tokenizer.eos_token
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
   {"role": "user", "content": "What is the input voltage to machine?"}]
# Tokenize the user input with the chat template
inputs = tokenizer.apply_chat_template(
   messages,
   tokenize=True,
   add_generation_prompt=True,
   return_tensors="pt",
   padding=True,  # Add padding to match sequence lengths
).to("cuda")

attention_mask = inputs != tokenizer.pad_token_id

outputs = model.generate(
   input_ids=inputs,
   attention_mask=attention_mask,
   max_new_tokens=64,
   use_cache=True,  # Use cache for faster token generation
   temperature=0.6,  # Controls randomness in responses
   min_p=0.1,  # Set minimum probability threshold for token selection
)

# Decode the generated tokens into human-readable text
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(text)

Model does not have a padding token! Will use pad_token = <|finetune_right_pad_id|>.
system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

What is the input voltage to machine?assistant

400V, 480V, 600V


In [22]:
my_model="Llama-3.2-3B-Instruct-bnb-4bit-finetuned"
model.save_pretrained(my_model)  # Local saving
tokenizer.save_pretrained(my_model)

('Llama-3.2-3B-Instruct-bnb-4bit-finetuned/tokenizer_config.json',
 'Llama-3.2-3B-Instruct-bnb-4bit-finetuned/special_tokens_map.json',
 'Llama-3.2-3B-Instruct-bnb-4bit-finetuned/chat_template.jinja',
 'Llama-3.2-3B-Instruct-bnb-4bit-finetuned/tokenizer.json')

In [23]:
from huggingface_hub import HfApi

api = HfApi()
repo_id = "Prathamesh1420/Llama-3.2-3B-Instruct-bnb-4bit-finetuned"
api.create_repo(repo_id=repo_id, private=False)  # set private=True if you want it private


RepoUrl('https://huggingface.co/Prathamesh1420/Llama-3.2-3B-Instruct-bnb-4bit-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='Prathamesh1420/Llama-3.2-3B-Instruct-bnb-4bit-finetuned')

In [24]:
from huggingface_hub import upload_folder

upload_folder(
    folder_path="Llama-3.2-3B-Instruct-bnb-4bit-finetuned",
    repo_id="Prathamesh1420/Llama-3.2-3B-Instruct-bnb-4bit-finetuned",
    repo_type="model"
)


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...t-bnb-4bit-finetuned/tokenizer.json: 100%|##########| 17.2MB / 17.2MB            

  ...finetuned/adapter_model.safetensors:   0%|          | 45.8kB / 97.3MB            

CommitInfo(commit_url='https://huggingface.co/Prathamesh1420/Llama-3.2-3B-Instruct-bnb-4bit-finetuned/commit/970b3dedfe625c68c69685f87312da228730f0b2', commit_message='Upload folder using huggingface_hub', commit_description='', oid='970b3dedfe625c68c69685f87312da228730f0b2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Prathamesh1420/Llama-3.2-3B-Instruct-bnb-4bit-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='Prathamesh1420/Llama-3.2-3B-Instruct-bnb-4bit-finetuned'), pr_revision=None, pr_num=None)

In [29]:
from unsloth import FastLanguageModel

# Load your quantized 4-bit fine-tuned model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Prathamesh1420/Llama-3.2-3B-Instruct-bnb-4bit-finetuned",
    max_seq_length = 2048,
    dtype = None,   # Auto-detect
    load_in_4bit = True,
)

# Chat format example
prompt = "What is the input voltage to a CNC machine?"
FastLanguageModel.for_inference(model)  # Optimizes for inference

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=200)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


==((====))==  Unsloth 2025.8.4: Fast Llama patching. Transformers: 4.55.0. vLLM: 0.7.2.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 21.964 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




What is the input voltage to a CNC machine? 120V, 220/240V, 380V, 420-440V, 480V
The input voltage to a CNC machine is 220/240V.
