In [None]:
%%capture
!pip install unsloth xformers "trl<0.9.0" peft accelerate bitsandbytes wandb


In [None]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
max_seq_length = 548  # Sequence-length cap handed to both the model loaders and SFTTrainer below

In [None]:
import json
import wandb
from huggingface_hub import login, whoami

import os
from getpass import getpass

# Credentials are never written into the notebook: each one is read from an
# environment variable, falling back to a hidden interactive prompt.
# Create the keys here if you do not have them yet:
# Hugging Face: https://huggingface.co/settings/tokens
# W&B:          https://wandb.ai/authorize


def _read_secret(env_var, prompt):
    """Return the credential stored in ``env_var``, or ask for it without echoing.

    Raises:
        ValueError: if neither the environment nor the prompt yields a non-blank value.
    """
    secret = (os.environ.get(env_var) or getpass(prompt)).strip()
    if not secret:
        raise ValueError(f"No credential provided for {env_var}")
    return secret


# Names are kept as-is because later cells may refer to them.
HF_API_Key = _read_secret("HF_TOKEN", "Hugging Face token: ")
WANDB_API_KEY = _read_secret("WANDB_API_KEY", "W&B API key: ")

# Log into Hugging Face
login(token=HF_API_Key)

# Log into Wandb
wandb.login(key=WANDB_API_KEY)

# Fetch the current user info before announcing success, so the message is only
# printed once the Hugging Face token has actually been accepted.
user_info = whoami()

print("Login setup complete!")

# Print only the account name: the full whoami() payload can contain account
# details that should not end up in saved notebook output.
print(f"Hugging Face user: {user_info.get('name', '<unknown>')}")

In [None]:
# Load the 4-bit quantized base checkpoint together with its tokenizer.
# A larger drop-in alternative is unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    load_in_4bit=True,
    dtype=None,  # None lets the loader pick float16 or bfloat16
    max_seq_length=max_seq_length,
)

# Attach LoRA adapters to the attention and MLP projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    r=8,  # LoRA rank
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    use_gradient_checkpointing="unsloth",  # memory-saving checkpointing mode
    random_state=3407,
)


In [None]:
# Alpaca-style prompt template. It has three positional `{}` slots that are
# filled, in order, with the instruction, the input, and the response.
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence marker taken from the tokenizer; formatting_prompts_func
# appends it to every rendered training text.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of records into full training prompts.

    Args:
        examples: column-oriented batch exposing "instruction", "input" and
            "output" sequences of equal length.

    Returns:
        A dict with a single "text" key holding one rendered prompt per record,
        each terminated with ``EOS_TOKEN``.
    """
    records = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            alpaca_prompt.format(task, context, answer) + EOS_TOKEN
            for task, context, answer in records
        ]
    }

# Fetch the training split and add the rendered "text" column in a single chain.
dataset = load_dataset("yahma/alpaca-cleaned", split="train").map(
    formatting_prompts_func,
    batched=True,
)


In [None]:
# Decide the mixed-precision mode once: bf16 where the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    # Batch shape
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Schedule
    max_steps=30,  # short demo run; raise (e.g. 1000+) for a full train
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    # Optimizer
    optim="adamw_8bit",
    weight_decay=0.01,
    # Precision
    bf16=use_bf16,
    fp16=not use_bf16,
    # Logging
    logging_steps=1,
    report_to="wandb",  # experiment tracker backend
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    packing=False,
)

trainer.train()


In [None]:
FastLanguageModel.for_inference(model)

# Build the prompt from the same template the model was trained on, instead of a
# hand-copied duplicate that can drift out of sync with it.
# The input slot is left empty rather than filled with the literal text "None".
# NOTE(review): yahma/alpaca-cleaned appears to use an empty string when a record
# has no input — confirm against the dataset.
# The response slot is left empty so the model generates it.
prompt = alpaca_prompt.format(
    "Explain machine learning in simple terms.",  # instruction
    "",  # input
    "",  # response
)
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=164, use_cache=True)

# Drop special tokens from the decoded text, then keep only what follows the
# response header.
decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(decoded.split("### Response:")[-1].strip())

In [None]:
# Write the LoRA adapters and the tokenizer files into one shared directory.
adapter_dir = "lora_model"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)


# Reload from that directory into separate objects and switch them to inference mode.
model2, tokenizer2 = FastLanguageModel.from_pretrained(
    model_name=adapter_dir,
    load_in_4bit=True,
    dtype=None,
    max_seq_length=max_seq_length,
)
FastLanguageModel.for_inference(model2)


In [None]:
# Export the fine-tuned model to GGUF under the local path "model.gguf".
if True: # Export is currently enabled; set to False to skip it
    model.save_pretrained_gguf(
        "model.gguf",
        tokenizer,
        quantization_method = "q4_k_m", # NOTE(review): the earlier comment claimed only Q8_0, BF16, F16 are supported, which contradicts this value — confirm against the installed unsloth version
    )

In [None]:
# Upload the GGUF export to the Hugging Face Hub.
if True: # Upload is currently enabled; set to False to skip it
    model.push_to_hub_gguf(
        # The "hf" namespace was a placeholder; publish under the account that
        # logged in earlier (user_info comes from whoami() in the login cell).
        f"{user_info['name']}/model.gguf",
        tokenizer,
        quantization_method = "q4_k_m", # NOTE(review): confirm this method is supported by the installed unsloth version
        # Reuse the token from the login cell instead of an empty string literal,
        # which carries no credential and invites pasting a secret into the notebook.
        token = HF_API_Key,
    )