# LoRA Finetuning using Unsloth's Models

### Installing required dependencies and libraries

In [7]:
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
!pip install unsloth vllm==0.8.5.post1
from huggingface_hub import login
login()

Collecting xformers==0.0.29.post3
  Using cached xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Using cached xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl (43.4 MB)
Installing collected packages: xformers
  Attempting uninstall: xformers
    Found existing installation: xformers 0.0.29.post2
    Uninstalling xformers-0.0.29.post2:
      Successfully uninstalled xformers-0.0.29.post2
Successfully installed xformers-0.0.29.post3


Collecting xformers==0.0.29.post2 (from vllm==0.8.5.post1)
  Using cached xformers-0.0.29.post2-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Using cached xformers-0.0.29.post2-cp311-cp311-manylinux_2_28_x86_64.whl (44.3 MB)
Installing collected packages: xformers
  Attempting uninstall: xformers
    Found existing installation: xformers 0.0.29.post3
    Uninstalling xformers-0.0.29.post3:
      Successfully uninstalled xformers-0.0.29.post3
Successfully installed xformers-0.0.29.post2


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Load the base model and configure the LoRA adapters

In [2]:
from unsloth import FastLanguageModel
import torch

# Settings shared by the model loader (max_seq_length is reused when building the trainer).
max_seq_length = 2048
load_in_4bit = True

# Load the 4-bit Llama-3 8B Instruct checkpoint together with its tokenizer.
base_model_kwargs = dict(
    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

# Modules that receive LoRA adapters: the attention projections and the MLP projections.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

# Wrap the base model with LoRA adapters (rank 16, alpha 16, no dropout, no bias terms).
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=attention_projections + mlp_projections,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-16 10:14:48 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-16 10:14:48 [__init__.py:239] Automatically detected platform cuda.
==((====))==  Unsloth 2025.7.3: Fast Llama patching. Transformers: 4.53.1. vLLM: 0.8.5.post1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

Unsloth 2025.7.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


### Load and format dataset

In [3]:
# Alpaca-style template; the three placeholders are instruction, input and response, in that order.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence marker taken from the loaded tokenizer.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Render a batch of examples into Alpaca-formatted training strings.

    Args:
        examples: Batch mapping with parallel "instruction", "input" and
            "output" sequences.

    Returns:
        A dict with a single "text" key holding one formatted prompt per
        example, each terminated with EOS_TOKEN.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # EOS_TOKEN must be appended, otherwise generation will not know where to stop.
    return {
        "text": [
            alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
            for instruction, context, response in rows
        ]
    }

from datasets import load_dataset,concatenate_datasets
# Both JSON files live in the same Hub repository; each is read via its "train" split.
dataset_repo = "Vjay15/LoRA_Dataset"
social = load_dataset(
    path=dataset_repo,
    data_files="data_training_social_instruction_format.json",
)["train"]
identity = load_dataset(path=dataset_repo, data_files="identity.json")["train"]

# Merge the two sources, then add the "text" column produced by formatting_prompts_func.
dataset = concatenate_datasets([social, identity]).map(
    formatting_prompts_func,
    batched=True,
)

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

(…)_training_social_instruction_format.json: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

identity.json: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/127 [00:00<?, ? examples/s]

### Train the model

In [4]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Use bf16 when the GPU supports it, otherwise fall back to fp16.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 examples per optimizer step on one GPU
    warmup_steps=5,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="none",
)

# Supervised fine-tuning on the pre-formatted "text" column of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Packing can make training faster for short sequences.
    args=training_args,
)

trainer.train()

Unsloth: Tokenizing ["text"]:   0%|          | 0/127 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 127 | Num Epochs = 3 | Total steps = 48
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.5388
2,2.5122
3,2.5954
4,2.3582
5,2.0969
6,1.9721
7,1.8646
8,1.5578
9,1.4294
10,1.2572


TrainOutput(global_step=48, training_loss=0.8389779639740785, metrics={'train_runtime': 483.3427, 'train_samples_per_second': 0.788, 'train_steps_per_second': 0.099, 'total_flos': 6310570030891008.0, 'train_loss': 0.8389779639740785})

### Running inference with the fine-tuned model

In [5]:
# Switch the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Instruction part of the prompt: the grading guidelines given to the model.
eval_instruction = (
    "You are an expert answer evaluator. Your job is to evaluate student answers fairly based on a flexible rubric and the specified difficulty level.\n\n"
    "Instructions:\n"
    "1. Return the score out of the total marks.\n"
    "2. Give a brief explanation justifying the score, referencing key points from the rubric.\n"
    "3. Suggest at least one specific way the student can improve their answer quality or overall academic performance.\n"
    "4. Use the rubric as a guideline, not a rigid checklist.\n"
    "5. Adjust the strictness of grading based on difficulty:\n"
    "   - 'easy' → lenient evaluation; minor issues can be overlooked.\n"
    "   - 'medium' → balanced and reasonable evaluation.\n"
    "   - 'hard' → stricter evaluation; all points must be well explained and accurate."
)

# Input part of the prompt: a JSON-encoded question/answer/rubric record.
eval_input = "{\"question\": \"Trace the sequence of events which occur when a bright light is focused on your eyes.\", \"answer\": \"The pupil gets small due to light. This protects the eyes from damage caused by too much light.\", \"rubrics\": \"1 mark for pupil detecting and reacting to bright light, 1 mark for iris contraction, 1 mark for pupil becoming smaller to reduce light entry.\", \"score\": 3, \"difficulty\": \"easy\"}"

# The response slot is left empty so the model generates it.
prompt = alpaca_prompt.format(eval_instruction, eval_input, "")
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# Stream tokens to the cell output as they are generated.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an expert answer evaluator. Your job is to evaluate student answers fairly based on a flexible rubric and the specified difficulty level.

Instructions:
1. Return the score out of the total marks.
2. Give a brief explanation justifying the score, referencing key points from the rubric.
3. Suggest at least one specific way the student can improve their answer quality or overall academic performance.
4. Use the rubric as a guideline, not a rigid checklist.
5. Adjust the strictness of grading based on difficulty:
   - 'easy' → lenient evaluation; minor issues can be overlooked.
   -'medium' → balanced and reasonable evaluation.
   - 'hard' → stricter evaluation; all points must be well explained and accurate.

### Input:
{"question": "Trace the sequence of events which occur when a bright li

### Saving the LoRA model and pushing it to HuggingFace

In [6]:
# Local output directory and Hub repository for the model and tokenizer files.
local_dir = "LLaMA_3_8B_Social_LoRA"
hub_repo = "Vjay15/LLaMA_3_8B_Social_LoRA"

# Save locally first, then upload the same artifacts to the Hub.
model.save_pretrained(local_dir)
tokenizer.save_pretrained(local_dir)
model.push_to_hub(hub_repo)
tokenizer.push_to_hub(hub_repo)

README.md:   0%|          | 0.00/591 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/Vjay15/LLaMA_3_8B_Social_LoRA


tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]