In [None]:
# Install Unsloth quietly into the notebook environment.
# NOTE(review): the version is not pinned, so a re-run may pick up a newer release.
!pip install -q unsloth

In [None]:
from unsloth import FastLanguageModel
import torch

In [None]:
# Maximum number of tokens per sequence, passed to the model loader and the trainer.
# Author's notes on Llama 3.2 3B (not checked by this code): ~128k-token context window;
# embedding / first-layer width 3072; FFN expands ~2.7x to 8192; 24 attention heads of
# dimension 128, each computing its own attention scores; grouped-query attention (GQA)
# with 8 key/value heads.
max_seq_length = 2048
dtype = None # None for auto detection.
load_in_4bit = True # Use 4-bit quantization to reduce memory usage; can also be set to False.


In [None]:
# Load the Llama 3.2 3B Instruct checkpoint and its tokenizer using the
# sequence length, dtype and quantization flag configured earlier.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,  # True -> load the 4-bit quantized model
)

In [None]:
# Wrap the base model with LoRA adapters (PEFT) and set the LoRA hyperparameters.
# Author's note: adapters add a low-rank delta to each targeted weight (e.g. Wq + delta_W).
# Llama's MLP has a dedicated gate projection alongside the up and down projections,
# so all three are targeted together with the attention projections.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank (hyperparameter)
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,  # scaling parameter: a higher alpha gives more weight to the LoRA activations (alpha / rank)
    lora_dropout=0,  # dropout regularization on the adapters (disabled)
    bias="none",  # "all" or "lora_only" are the other options
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=1997,
    use_rslora=False,  # rank-stabilized LoRA, an option if the plain alpha scaling is unstable
    loftq_config=None,  # LoftQ (quantization-aware LoRA initialization) is not used
)

In [None]:
from datasets import load_dataset
# Training split of the "v0" configuration of the R1-distilled SFT dataset.
# Dataset card: https://huggingface.co/datasets/ServiceNow-AI/R1-Distill-SFT
dataset = load_dataset(
    "ServiceNow-AI/R1-Distill-SFT",
    "v0",
    split="train",
)

In [None]:
# Peek at the first five rows to see which columns the dataset provides.
print(dataset[:5])

In [None]:
# Prompt template used to build the fine-tuning text for the Llama model.
# The three positional `{}` slots are filled, in order, with the problem,
# the reasoning trace and the solution (see formatting_prompts_func).
r1_prompt = """You are a reflective assistant engaging in thorough, iterative reasoning, mimicking human stream-of-consciousness thinking. Your approach emphasizes exploration, self-doubt, and continuous refinement before coming up with an answer.
<problem>
{}
</problem>

{}
{}
"""
# End-of-sequence token taken from the tokenizer; appended to every training example.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
  """Build the training text for a batch of dataset rows.

  Each row's problem, reasoning trace and solution are inserted (in that
  order) into `r1_prompt`, and `EOS_TOKEN` is appended to the result.

  Args:
    examples: batch mapping with "problem", "reannotated_assistant_content"
      and "solution" columns, each a sequence of values.

  Returns:
    A dict with a single "text" key holding the list of formatted strings.
  """
  rows = zip(
    examples["problem"],
    examples["reannotated_assistant_content"],
    examples["solution"],
  )
  return {"text": [r1_prompt.format(*row) + EOS_TOKEN for row in rows]}

# Add the formatted "text" column to the dataset, processing rows in batches.
dataset = dataset.map(formatting_prompts_func, batched=True)





In [None]:
# Create a supervised fine-tuning trainer from Hugging Face's TRL
# (Transformer Reinforcement Learning) library.


from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,  # the LoRA-wrapped model
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",  # column produced by formatting_prompts_func
    max_seq_length = max_seq_length,
    dataset_num_proc = 2, # Number of processes used to preprocess the dataset
    packing = False, # Packing is disabled; the author notes it can make training up to 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2, # Batch size per GPU/TPU core
        gradient_accumulation_steps = 4, # Number of batches whose gradients are accumulated before each optimizer update
        warmup_steps = 5, # A few updates with a low learning rate before the scheduled rate takes over
        max_steps = 60, # Total number of training (optimizer) steps to run
        learning_rate = 2e-4, # Controls the size of each gradient update
        fp16 = not is_bfloat16_supported(),  # fall back to fp16 only when bf16 is unavailable
        bf16 = is_bfloat16_supported(),  # use bf16 when the hardware supports it
        logging_steps = 1,  # log the loss at every step
        optim = "adamw_8bit", # Optimizer
        weight_decay = 0.01, # Weight-decay regularization; discourages very large weights
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs", # Directory where training outputs are stored
        report_to = "none", # No external logging; can be pointed at e.g. TensorBoard for observability
    ),
)

# 2 (batch size) x 4 (accumulation steps) x 60 (steps) = 480 examples seen per device.
# Author's note: a lower-entropy model is better at predicting the next token.

In [None]:
# Run fine-tuning; the returned stats object is kept so its training_loss can be reported afterwards.
trainer_stats = trainer.train()

In [None]:
import math

# `trainer_stats.training_loss` is the loss averaged over the whole training run,
# not the loss of the last step, so it is reported as an average. The derived
# perplexity is therefore a training-set figure, not a held-out evaluation.
avg_train_loss = trainer_stats.training_loss
final_loss = avg_train_loss  # original name kept for any later cell that reads it
perplexity = math.exp(avg_train_loss)  # perplexity = exp(mean cross-entropy loss)
print(f"Average training loss: {avg_train_loss:.4f}, Training perplexity: {perplexity:.4f}")

In [None]:
# MY TAKEAWAYS


# In LLM fine-tuning, the training objective is next-token prediction, not numeric regression.

# Therefore, loss values are not expected to approach zero (a value close to 0.5 can be good enough, depending on the task and guidelines); instead, they reflect how well the model predicts each token from a large vocabulary.

# In my run, the model achieved:

# Training loss (average over the run): ~0.6264

# Perplexity: ~1.87

# A loss of ~0.6 indicates the model is learning the dataset’s reasoning patterns effectively.

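# A perplexity of ~1.87 means that, on average, the model is about as uncertain as if it were choosing between fewer than 2 equally plausible tokens at each step, i.e. it is highly confident. Note that this figure is computed from the training loss, not from a held-out set.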

# Unlike regression tasks, the goal here isn’t driving loss toward zero, but ensuring a steady downward trend in loss and perplexity, paired with qualitative improvements in reasoning outputs.

In [None]:
#INFERENCE

In [None]:
from unsloth.chat_templates import get_chat_template

# Inference prompt. It repeats the instruction and the <problem> wrapper used by
# the fine-tuning template (r1_prompt) so the model sees the format it was
# trained on.
# NOTE(review): fine-tuning used plain text without a chat template, whereas
# inference wraps the prompt in the llama-3.1 chat template below; confirm this
# difference is intended.
sys_prompt = """You are a reflective assistant engaging in thorough, iterative reasoning, mimicking human stream-of-consciousness thinking. Your approach emphasizes exploration, self-doubt, and continuous refinement before coming up with an answer.
<problem>
{problem}
</problem>
"""

message = sys_prompt.format(problem="If its raining why is the sky grey")
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model) # Enables 2x faster inference

messages = [
    {"role": "user", "content": message},
]
# Tokenize on the model's own device instead of assuming "cuda".
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to(model.device)

# Single unpadded prompt, so every position is a real token. Passing the mask
# explicitly avoids it being inferred from the pad/eos token ids.
attention_mask = torch.ones_like(inputs)

outputs = model.generate(
    input_ids = inputs,
    attention_mask = attention_mask,
    max_new_tokens = 1024,
    use_cache = True,
    do_sample = True, # temperature / min_p only take effect when sampling is enabled
    temperature = 1.5,
    min_p = 0.1,
)
# batch_decode converts the generated token ids back into text (special tokens included).
response = tokenizer.batch_decode(outputs)

In [None]:
# Show the decoded output of the first (only) sequence, special tokens included.
print(response[0])

In [None]:
# Decode the same generated ids again, this time dropping special tokens for readability.
response = tokenizer.batch_decode(outputs,skip_special_tokens=True)

In [None]:
# Show the cleaned-up text of the first (only) sequence.
print(response[0])

In [None]:
# Save the final (LoRA) model weights and the tokenizer into the same directory.
# The tokenizer saved here is the one returned by get_chat_template in the inference cell.
trainer.save_model("outputs/final_model")
tokenizer.save_pretrained("outputs/final_model")

In [None]:
# Mount Google Drive at /content/drive (Colab-only) so the saved model can be copied there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Copy the saved model directory into the mounted Drive's MyDrive folder.
# NOTE(review): assumes the notebook's working directory is /content, so that
# "outputs/final_model" was written to /content/outputs/final_model; confirm.
!cp -r /content/outputs/final_model /content/drive/MyDrive/


In [None]:
# List the copied files to confirm the model reached Drive.
!ls /content/drive/MyDrive/final_model