## Prompting

In [4]:
# System message: sets the assistant's persona and the way it should answer.
system_prompt = (
    "You are CodeFixerGPT, an expert software engineer that helps users debug and optimize their code. \n"
    "You explain your reasoning clearly, ask clarifying questions when needed, and always \n"
    "provide complete, working examples."
)
print(system_prompt)


You are CodeFixerGPT, an expert software engineer that helps users debug and optimize their code. 
You explain your reasoning clearly, ask clarifying questions when needed, and always 
provide complete, working examples.


In [6]:
# User message: a short request followed by the snippet the user wants fixed.
user_prompt = "\n".join(
    [
        'Here\'s my Python code for sorting a list of dictionaries by the "age" key. ',
        "It doesn't seem to work correctly — can you fix it?",
        "",
        'data = [{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}]',
        'data.sort("age")',
        "",  # keeps the trailing newline of the original text
    ]
)
print(user_prompt)

Here's my Python code for sorting a list of dictionaries by the "age" key. 
It doesn't seem to work correctly — can you fix it?

data = [{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}]
data.sort("age")



## Prompt Template

In [None]:
# Sample model completion kept as a bare string literal for reference.
# NOTE(review): the cell that produced this text is not part of this section — confirm its origin.
"['Hugging Face is an open-source company that develops and maintains the Hugging Face platform, which is a collection of tools and libraries for building and deploying natural language processing (NLP) models. Hugging Face was founded in 2018 by Thomas Wolf']"

In [None]:
# Hand-built Llama-3-style chat prompt. Every turn has the shape
#   <|start_header_id|>ROLE<|end_header_id|>\n\nCONTENT<|eot_id|>
# The pieces are concatenated explicitly: a multi-line f-string would leak its
# own line breaks and source indentation into the prompt text.
#
# Inputs: `system_prompt` and `user_prompt` come from the cells above.
# NOTE: `assistant_prompt` must be defined before this cell runs; use None or ""
# when the prompt is meant for generation rather than as a training example.
prompt = (
    f"<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>"
    f"<|start_header_id|>user<|end_header_id|>\n\n{user_prompt}<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
)
if assistant_prompt:
    # Training example: append the reference answer and close the assistant turn.
    prompt += f"{assistant_prompt}<|eot_id|>"
# Otherwise the prompt stops right after the assistant header (including the
# blank line that follows every header) so the model writes the reply itself.

 

In [None]:
from transformers import AutoTokenizer

# Tokenizer of an instruction-tuned Mistral checkpoint; the chat template used
# below is taken from this tokenizer.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

# A short conversation with alternating user / assistant turns.
chat = [
    {
        "role": "user",
        "content": "Hello, how are you?",
    },
    {
        "role": "assistant",
        "content": "I'm doing great. How can I help you today?",
    },
    {
        "role": "user",
        "content": "I'd like to show off how chat templating works!",
    },
]

# tokenize=False returns the formatted prompt as text instead of token ids.
tokenizer.apply_chat_template(chat, tokenize=False)

In [None]:
# Appears to be the text returned by apply_chat_template in the cell above
# (the same three messages wrapped in [INST] ... [/INST] markers), pasted for reference.
"""<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]"""

## Loading Open Source LLM

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
)
import torch

In [None]:
# For gated models, you need to log in to access the model.
import os

from huggingface_hub import login

# Never hard-code an access token in a notebook. Read it from the HF_TOKEN
# environment variable instead; when the variable is unset, `token` is None and
# login() falls back to asking for the token interactively.
# NOTE: the token that was previously embedded in this cell has been exposed and
# should be revoked in the Hugging Face account settings.
token = os.environ.get("HF_TOKEN")
login(token=token)

In [None]:
# Checkpoint reused by the loading examples that follow.
base_model = "google/gemma-2b-it"

# Plain load: no dtype, device or quantization options.
model = AutoModelForCausalLM.from_pretrained(base_model, trust_remote_code=True)

# Load the tokenizer; truncation, when applied, cuts from the left side.
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    truncation_side="left",
)

### Loading optimizations

Data type: load the weights in a lower-precision dtype (e.g. `bfloat16`) to reduce memory use.

In [None]:
# Same checkpoint, loaded with bfloat16 weights.
bf16_load_options = {"torch_dtype": torch.bfloat16, "trust_remote_code": True}
model = AutoModelForCausalLM.from_pretrained(base_model, **bf16_load_options)

Change the device: place the model on the GPU when loading it.

In [None]:
# Load the checkpoint directly onto the first CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="cuda:0",  # single trailing comma only; a doubled comma is a SyntaxError
    trust_remote_code=True,
)

Change the attention implementation.

In [None]:

# Switch the attention backend to FlashAttention-2.
# FlashAttention-2 only runs with fp16/bf16 weights on a CUDA device, so the
# dtype and device are set explicitly here; a default (float32, CPU) load
# cannot use these kernels. Requires the `flash-attn` package to be installed.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    device_map="cuda:0",
    trust_remote_code=True,
)

#### Quantization

In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 8-bit weight quantization settings.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Load the checkpoint with those settings applied.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="cuda:0",  # works only with GPU
    quantization_config=quantization_config,
    trust_remote_code=True,
)

In [None]:
# Quantization settings with the fp32 CPU-offload switch turned on.
quantization_config = BitsAndBytesConfig(
    llm_int8_enable_fp32_cpu_offload=True,
)

# Per-module placement: every listed module goes to device 0 ...
device_map = dict.fromkeys(
    [
        "transformer.word_embeddings",
        "transformer.word_embeddings_layernorm",
        "lm_head",
        "transformer.h",
        "transformer.ln_f",
    ],
    0,
)
# ... except the LM head, which is placed on the CPU.
device_map["lm_head"] = "cpu"

In [None]:
# Calls dequantize() on the current `model`.
# NOTE(review): presumably applied to a quantized model from the cells above, to
# restore unquantized weights — confirm which load this is meant to follow.
model.dequantize()

## Decoding

### Greedy search

In [None]:
# Tokenize the prompt and move the resulting tensors to the GPU.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
inputs = tokenizer(
    "Hugging Face is an open-source company",
    return_tensors="pt",
).to("cuda")

# Half-precision Llama 2 on the GPU.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    torch_dtype=torch.float16,
).to("cuda")

# No sampling or beam options are passed to generate() here.
# The continuation is capped explicitly at 20 new tokens (the original note
# says Llama 2's configured generation length is 4096).
outputs = model.generate(
    **inputs,
    max_new_tokens=20,
)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
# Appears to be the decoded text from the greedy-search cell above, pasted for reference.
'Hugging Face is an open-source company that provides a suite of tools and services for building, deploying, and maintaining natural language processing'

### Sampling

In [None]:
# Sampling: do_sample=True with a single beam, up to 50 new tokens.
# No random seed is set in this cell, so the text can differ between runs.
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    num_beams=1,
)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
# Appears to be one decoded sample from the sampling cell above, pasted for reference.
'Hugging Face is an open-source company 🤗\nWe are open-source and believe that open-source is the best way to build technology. Our mission is to make AI accessible to everyone, and we believe that open-source is the best way to achieve that.'

### Beam Search

In [None]:
# Beam search: two beams, up to 50 new tokens.
outputs = model.generate(
    **inputs,
    num_beams=2,
    max_new_tokens=50,
)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

## Tuning

### Dataset Format

In [None]:
# Conversational dataset format: one record per line, each holding a "messages"
# list of {"role", "content"} entries (system, user, assistant).
# The assistant "content" values are left as "..." placeholders.
{"messages": [{"role": "system", "content": "You are helpful"}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "..."}]}
{"messages": [{"role": "system", "content": "You are helpful"}, {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"}, {"role": "assistant", "content": "..."}]}
{"messages": [{"role": "system", "content": "You are helpful"}, {"role": "user", "content": "How far is the Moon from Earth?"}, {"role": "assistant", "content": "..."}]}

In [None]:
# Prompt/completion dataset format: one record per line with a "prompt" field
# and a "completion" field; the angle-bracket values are placeholders.
{"prompt": "<prompt text>", "completion": "<ideal generated text>"}
{"prompt": "<prompt text>", "completion": "<ideal generated text>"}
{"prompt": "<prompt text>", "completion": "<ideal generated text>"}

In [None]:
# Below is an instruction ...

### Instruction:
# {prompt}

### Response:
# {completion}

### Full tuning (Trainer)

In [None]:
# Only TrainingArguments is imported in the setup cell, so bring Trainer into
# scope here; otherwise this cell fails with a NameError on a fresh kernel.
from transformers import Trainer

# Hyperparameters for full fine-tuning with the plain Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    optim="paged_adamw_32bit",
    learning_rate=5e-5,
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    logging_dir="./logs",
    report_to=["wandb"],
)

# Trainer
# `model` and `tokenized_dataset` (indexed by "train" and "eval") must already
# be defined; neither is created in this cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"],
)

### Full tuning (SFT Trainer)

In [None]:
# SFTTrainer is otherwise first imported in the Unsloth cell further down, so
# import it here to keep this cell runnable in top-to-bottom order.
from trl import SFTTrainer

# Expected to be defined beforehand (not created in this cell):
#   epochs, lr, context_length, model, tokenizer, train_dataset, val_dataset
training_arguments = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    num_train_epochs=epochs,
    optim="paged_adamw_32bit",
    # Evaluate, save and log once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    warmup_steps=10,
    learning_rate=lr,
    # Both mixed-precision switches are off.
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to=["wandb"],
)

# Supervised fine-tuning on the "text" column of the datasets, without packing.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    max_seq_length=context_length,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    # data_collator=data_collator,
)

## PEFT

In [None]:
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)

# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "up_proj",
    "down_proj",
    "gate_proj",
    "k_proj",
    "q_proj",
    "v_proj",
    "o_proj",
]

# LoRA settings for a causal LM: rank 16, alpha 32, 5% dropout, no bias terms.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the current model with the adapters described by peft_config.
model = get_peft_model(model, peft_config)

## QLORA

In [None]:
# 4-bit NF4 quantization with double quantization; computation runs in bfloat16.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the checkpoint named by `base_model` (set in the loading section) with
# the 4-bit config. The previous `model_id` name is not defined anywhere in
# this notebook and raised a NameError on a fresh run.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=nf4_config,
)

## ✅ Advantages of QLoRA

- 🔋 **Low memory usage**: Fine-tunes large models (e.g. LLaMA 13B/65B) on consumer GPUs (e.g. 24GB VRAM).
- ⚡ **Efficient training**: Only small adapter weights are updated, making training faster and cheaper.
- 🎯 **Competitive accuracy**: Achieves similar performance to full fine-tuning on many benchmarks.
- 🧩 **Modular and reusable**: LoRA adapters are small, easy to store/share, and support multi-task fine-tuning.
- 🔁 **Supports multiple adapters**: Load and switch between domain-specific adapters without retraining the base model.
- 🧠 **Keeps base model intact**: Fine-tuning does not modify pretrained weights — good for safety and reproducibility.

## ❌ Disadvantages of QLoRA

- 🎯 **Limited adaptability**: Only fine-tunes a subset of weights — can underperform on complex or domain-shifted tasks.
- 🧮 **Quantization noise**: 4-bit approximation may reduce precision, especially in less common tasks.
- 🧰 **Tooling complexity**: Requires managing quantization, adapter configs, and training-specific frameworks.


## Faster Fine-Tuning

### Unsloth

In [None]:
from unsloth import FastLanguageModel, FastModel
import torch
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset

# Context length shared by model loading, LoRA patching and training.
# Supports RoPE Scaling internally, so choose any!
max_seq_length = 2048

# Get LAION dataset (JSON lines file, used as the "train" split).
url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
dataset = load_dataset(
    "json",
    data_files={"train": url},
    split="train",
)

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# (Reference list only; it is not used by the code below.)
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",  # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",  # 4bit for 405b!
    "unsloth/Mistral-Small-Instruct-2409",  # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",  # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",  # Gemma 2x faster!
    "unsloth/Llama-3.2-1B-bnb-4bit",  # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    "unsloth/Llama-3.3-70B-Instruct-bnb-4bit",  # NEW! Llama 3.3 70B!
]  # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer in 4-bit.
model, tokenizer = FastModel.from_pretrained(
    model_name="unsloth/gemma-3-4B-it",
    max_seq_length=max_seq_length,  # 2048; choose any for long context!
    load_in_4bit=True,  # 4 bit quantization to reduce memory
    load_in_8bit=False,  # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning=False,  # [NEW!] We have full finetuning now!
)

# Do model patching and add fast LoRA weights
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    max_seq_length=max_seq_length,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

# Short supervised fine-tuning run: 60 optimizer steps, logging every step.
sft_config = SFTConfig(
    max_seq_length=max_seq_length,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=10,
    max_steps=60,
    logging_steps=1,
    output_dir="outputs",
    optim="adamw_8bit",
    seed=3407,
)
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    tokenizer=tokenizer,
    args=sft_config,
)
trainer.train()


## 🚀 How Unsloth Achieves Faster Fine-Tuning

### ✅ 1. Static Linear Rewriting
- Replaces standard `torch.nn.Linear` with optimized LoRA + quantization fused layer.
- Eliminates dynamic injection overhead.

### ✅ 2. Flash Attention v2
- Speeds up attention via memory-efficient CUDA kernels.
- Great for long context lengths.

### ✅ 3. Fast Tokenization
- Uses optimized Rust-backed tokenizer for faster input preprocessing.

### ✅ 4. Optimized QLoRA Integration
- True 4-bit quantization with efficient memory layout.
- Avoids excess GPU memory allocations.

### ✅ 5. Graph-Level Optimization
- Injects LoRA into compiled computation graph.
- Avoids dynamic PyTorch dispatch.

### ✅ 6. CUDA Kernel Enhancements
- Custom matrix-multiplication kernels and fused operations for speed.

## 🧪 Benefits

- ⚡ 2–5× faster training
- 💾 50–70% lower VRAM
- 🧠 Better for long-context tasks


### LLaMA-Factory

https://github.com/hiyouga/LLaMA-Factory

## Fine Tuning GPT

In [None]:
# Two sample chat-format training records: the same system message each time,
# a user question, and an assistant answer written in mock-archaic English.
# NOTE(review): presumably lines of the train.jsonl file uploaded in the next cell — confirm.
{"messages": [{"role": "system", "content": "You are teaching assistant for Machine Learning. You should help to user to answer on his question."}, {"role": "user", "content": "What is machine learning?"}, {"role": "assistant", "content": "'Tis but the art of teaching machines to think, to learn from data most fine, and decisions to link."}]}

{"messages": [{"role": "system", "content": "You are teaching assistant for Machine Learning. You should help to user to answer on his question."}, {"role": "user", "content": "How doth neural networks work?"}, {"role": "assistant", "content": "They're like the brains of yon digital beast, with layers many, and nodes that cease."}]}

In [None]:
# NOTE: `OpenAI` is not imported anywhere in this notebook; run
# `from openai import OpenAI` before this cell.
client = OpenAI()

# Upload the training data. The context manager closes the file handle even if
# the upload fails.
with open("train.jsonl", "rb") as training_data:
    uploaded_file = client.files.create(
        file=training_data,
        purpose="fine-tune",
    )

# Start the fine-tuning job on the file that was just uploaded, rather than on
# a hard-coded file ID that only exists in one account.
fine_tuning_job = client.fine_tuning.jobs.create(
    training_file=uploaded_file.id,
    model="gpt-3.5-turbo",  # change to gpt-4-0613 if you have access
)

# Query a fine-tuned model. The model name below is a hard-coded example;
# replace it with the name of your own fine-tuned model.
completion = client.chat.completions.create(
    model="ft:gpt-3.5-turbo-0613:personal::8k01tfYd",
    messages=[
        {"role": "system", "content": "You are a teaching assistant for Machine Learning. You should help to user to answer on his question."},
        {"role": "user", "content": "What is a loss function?"},
    ],
)

### Azure

https://learn.microsoft.com/en-us/azure/ai-services/openai/tutorials/fine-tune?tabs=command-line

## Distillation

In [None]:
# SetFit ships its own TrainingArguments. The name `TrainingArguments` in this
# notebook refers to the transformers class (imported in the loading section),
# which has no `batch_size` parameter, so SetFit's class is imported under an
# alias that does not shadow the transformers one.
from setfit import DistillationTrainer
from setfit import TrainingArguments as SetFitTrainingArguments

distillation_args = SetFitTrainingArguments(
    batch_size=16,
    max_steps=500,
)

# `teacher_model`, `model` (the student), `unlabeled_train_dataset` and
# `eval_dataset` must already be defined; none is created in this cell.
distillation_trainer = DistillationTrainer(
    teacher_model=teacher_model,
    student_model=model,
    args=distillation_args,
    train_dataset=unlabeled_train_dataset,
    eval_dataset=eval_dataset,
)
# Train student with knowledge distillation
distillation_trainer.train()

## Reinforcement Learning

### PPO

In [None]:
# PPOTrainer and pipeline were used without being imported.
from transformers import pipeline
from trl import PPOConfig, PPOTrainer

# NOTE: `tqdm` is used below but never imported in this notebook; run
# `from tqdm.auto import tqdm` before this cell.
# NOTE(review): this cell uses the step-based PPO API (PPOConfig(model_name=...),
# ppo_trainer.generate / step / log_stats) — confirm the installed trl version
# still provides it.

config = PPOConfig(
    model_name="gpt2",
    learning_rate=1.41e-5,
)

# Sentiment classifier whose score for class index 1 is used as the reward.
reward_model = pipeline("text-classification", model="lvwerra/distilbert-imdb")
# Look the label name up once, so the reward does not depend on result ordering.
reward_label = reward_model.model.config.id2label[1]

# `model`, `tokenizer` and `train_dataset` must already be defined.
ppo_trainer = PPOTrainer(
    model=model,
    config=config,
    dataset=train_dataset,  # the step-based PPOTrainer names this argument `dataset`
    tokenizer=tokenizer,
)

generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}

# `epoch` is the batch index produced by enumerate().
for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    #### Get response from SFTModel
    response_tensors = ppo_trainer.generate(query_tensors, **generation_kwargs)
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    #### Compute reward score
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    # top_k=None returns the scores of every class for each text. A default
    # call returns one dict per text, and indexing that dict with [1] fails.
    pipe_outputs = reward_model(texts, top_k=None)
    rewards = [
        torch.tensor(
            next(entry["score"] for entry in class_scores if entry["label"] == reward_label)
        )
        for class_scores in pipe_outputs
    ]

    #### Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

#### Save model
ppo_trainer.save_model("my_ppo_model")

### DPO

In [None]:
# Empty preference-dataset skeleton: one independent list per column.
dataset = {column: [] for column in ("chosen", "rejected", "prompt")}

In [None]:
from trl import DPOConfig, DPOTrainer

# DPO run settings: one epoch, batch size 1 for both training and evaluation,
# with a save at the end of each epoch. `new_model` (the output directory) must
# already be defined.
training_args = DPOConfig(
    output_dir=new_model,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=2e-6,
    beta=0.4,
    save_strategy="epoch",
)

# The model is still passed positionally; `train_dataset` and `tokenizer` must
# already be defined.
dpo_trainer = DPOTrainer(
    model,
    args=training_args,
    train_dataset=train_dataset,
    max_length=1024,
    tokenizer=tokenizer,  # for visual language models, use tokenizer=processor instead
)
dpo_trainer.train()