<a href="https://colab.research.google.com/github/Sounakray2003/Asmadiya-tech/blob/main/SFT_trainer_for_Llama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Install Unsloth (with its "colab-new" extra) straight from the GitHub repository.
# NOTE(review): no visible cell imports unsloth — confirm this install is still needed.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Install the remaining libraries without resolving their dependencies (--no-deps),
# with the PyTorch cu118 wheel index added as an extra package source.
# NOTE(review): versions are unpinned, so a later re-run may pick up different releases.
!pip install --no-deps trl peft accelerate bitsandbytes xformers datasets huggingface_hub torch --extra-index-url https://download.pytorch.org/whl/cu118

In [None]:
# Interactive Hugging Face Hub login for this notebook session.
from huggingface_hub import notebook_login
notebook_login()  # Paste your HF token (the account must have accepted the Llama license)

In [None]:
# =====================================================
# QLoRA Fine-Tuning: Llama-3.2-1B-Instruct
# 4-bit + LoRA | Free Colab T4 | ~10-15 mins | 1k samples
# =====================================================

# --- CELL 1: Install ---
# Quiet (-q) install of the libraries this cell relies on.
# NOTE(review): versions are unpinned and trl is not imported by any visible code — confirm both are intended.
!pip install -q bitsandbytes accelerate peft trl transformers datasets huggingface_hub

# --- CELL 2: HF Login ---
# Interactive Hub login; the tokenizer/model loads below pass token=True.
from huggingface_hub import notebook_login
print("Paste your Hugging Face token (required for Llama):")
notebook_login()

# --- CELL 3: Load 4-bit Model + LoRA ---
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "meta-llama/Llama-3.2-1B-Instruct"

# Quantization recipe: NF4 4-bit weights, double quantization, bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer: register a dedicated [PAD] token only when none is defined yet.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# Quantized base model, placed on the GPU.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=True,
    device_map="cuda",
    quantization_config=bnb_config,
)

# Keep the embedding table the same size as the (possibly extended) tokenizer vocabulary.
model.resize_token_embeddings(len(tokenizer))

# Make the quantized model ready for k-bit (QLoRA) training.
model = prepare_model_for_kbit_training(model)

# LoRA adapters: rank 16, alpha 32, attached to the listed projection modules.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)
model = get_peft_model(model, lora_config)

# Report the trainable-parameter share and current GPU memory use.
model.print_trainable_parameters()
print(f"VRAM: {torch.cuda.memory_allocated()/1e9:.2f} GB")

# --- CELL 4: Load & Format Dataset ---
from datasets import load_dataset

# Training split of the cleaned Alpaca instruction dataset.
dataset = load_dataset("yahma/alpaca-cleaned", split="train")

# Prompt template with two positional slots: the instruction, then the response.
alpaca_prompt = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}"
)

# End-of-sequence marker taken from the tokenizer.
EOS_TOKEN = tokenizer.eos_token

def format(examples):
    """Render a column-oriented batch of Alpaca records into training prompts.

    Args:
        examples: Batch mapping with "instruction" and "output" lists and an
            optional "input" list carrying extra context for the instruction.

    Returns:
        dict with one "text" list. Each entry is ``alpaca_prompt`` filled with
        the instruction (followed by the input on a new line when an input is
        present and non-empty) and the output, with ``EOS_TOKEN`` appended.
    """
    instructions = examples["instruction"]
    # Many instructions refer to text held in the "input" column; leaving it
    # out would train the model on requests whose context is missing.
    if "input" in examples:
        instructions = [
            f"{instr}\n{extra}" if extra else instr
            for instr, extra in zip(instructions, examples["input"])
        ]

    texts = [
        alpaca_prompt.format(instr, out) + EOS_TOKEN
        for instr, out in zip(instructions, examples["output"])
    ]
    return {"text": texts}

# Subsample first so only the rows that are kept get formatted. The seeded
# shuffle depends on the row count alone, so the same rows are selected as when
# shuffling after the map. min() keeps this safe for datasets under 1000 rows.
dataset = dataset.shuffle(seed=42).select(range(min(1000, len(dataset))))
dataset = dataset.map(format, batched=True, remove_columns=dataset.column_names)

# --- CELL 5: Tokenize ---
def tokenize(examples):
    """Tokenize the "text" column of a batch.

    Sequences are truncated to 1024 tokens and left unpadded; padding is
    deferred to batch-collation time.
    """
    encode_options = dict(truncation=True, max_length=1024, padding=False)
    return tokenizer(examples["text"], **encode_options)

# Tokenize every prompt in batches and drop the raw "text" column afterwards.
tokenized = dataset.map(tokenize, batched=True, remove_columns=["text"])
# In-place: expose only the two listed columns, as torch tensors.
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])

# --- CELL 6: Data Collator ---
from torch.utils.data import DataLoader

def collator(features):
    """Pad a list of tokenized samples into one causal-LM training batch.

    Args:
        features: List of dicts, each holding 1-D "input_ids" and
            "attention_mask" tensors (lengths may differ between samples).

    Returns:
        dict with "input_ids", "attention_mask" and "labels" tensors, all
        right-padded to the longest sample. Padded positions carry
        ``tokenizer.pad_token_id`` in "input_ids", 0 in "attention_mask" and
        -100 in "labels".
    """
    pad = torch.nn.utils.rnn.pad_sequence

    input_ids = pad(
        [f["input_ids"] for f in features],
        batch_first=True,
        padding_value=tokenizer.pad_token_id,
    )
    attention_mask = pad(
        [f["attention_mask"] for f in features],
        batch_first=True,
        padding_value=0,
    )

    labels = input_ids.clone()
    # Mask by position rather than by token id: if the pad id coincides with a
    # real token (for example pad == eos), an id-based mask would also hide
    # that real token from the loss.
    labels[attention_mask == 0] = -100

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

train_loader = DataLoader(tokenized, batch_size=2, shuffle=True, collate_fn=collator)

# --- CELL 7: Training Loop ---
from torch.optim import AdamW
from transformers import get_scheduler

num_epochs = 1
accum_steps = 4

# total_steps counts micro-batches across all epochs. The optimizer and the LR
# scheduler advance only once per accumulation group, so the schedule length is
# expressed in optimizer updates (ceil division covers a trailing partial group).
total_steps = len(train_loader) * num_epochs
update_steps = (total_steps + accum_steps - 1) // accum_steps

# Hand the optimizer only the parameters that actually receive gradients.
optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=2e-4)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=min(10, update_steps),  # never warm up longer than the whole run
    num_training_steps=update_steps,
)

model.train()
optimizer.zero_grad()
step = 0

print(f"Starting QLoRA training – {total_steps} micro-batches, {update_steps} optimizer updates")

for epoch in range(num_epochs):
    for batch in train_loader:
        step += 1
        batch = {k: v.to("cuda") for k, v in batch.items()}

        outputs = model(**batch)
        # Scale so the accumulated gradient is the mean over the group.
        loss = outputs.loss / accum_steps
        loss.backward()

        # Update at every full accumulation group, and once more at the very
        # last micro-batch so a trailing partial group is not thrown away.
        if step % accum_steps == 0 or step == total_steps:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        if step % 10 == 0:
            print(f"Step {step} | Loss: {loss.item()*accum_steps:.4f} | VRAM: {torch.cuda.memory_allocated()/1e9:.2f} GB")

print("Training complete!")

# --- CELL 8: Save LoRA + Merge ---
# Write the adapter weights and the tokenizer into the same directory.
lora_dir = "llama32-1b-qlora"
model.save_pretrained(lora_dir)
tokenizer.save_pretrained(lora_dir)
print(f"LoRA adapter saved: ~30 MB → {lora_dir}")

# Optional step: fold the adapter into a bfloat16 copy of the base model.
merge_answer = input("Merge & save full 16-bit model? (y/n): ")
if merge_answer.lower() == "y":
    from peft import PeftModel

    print("Loading base model with [PAD] token...")
    # Rebuild the tokenizer the same way as before training so the vocabulary
    # size matches the one the adapter was saved with.
    tokenizer_merged = AutoTokenizer.from_pretrained(model_name, token=True)
    if tokenizer_merged.pad_token is None:
        tokenizer_merged.add_special_tokens({"pad_token": "[PAD]"})

    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        token=True,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
    base_model.resize_token_embeddings(len(tokenizer_merged))

    print("Merging LoRA...")
    # Attach the saved adapter, then bake it into the base weights.
    model_peft = PeftModel.from_pretrained(base_model, lora_dir)
    merged_model = model_peft.merge_and_unload()

    # Persist the merged model together with its tokenizer.
    merged_dir = "llama32-1b-qlora-merged"
    merged_model.save_pretrained(merged_dir)
    tokenizer_merged.save_pretrained(merged_dir)
    print(f"Merged 16-bit model saved: ~2 GB → {merged_dir}")

# --- CELL 9: Inference (on merged model) ---
import os

from transformers import pipeline

merged_dir = "llama32-1b-qlora-merged"

prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What is the capital of Japan?

### Response:
"""

# The merge step is optional (y/n prompt). Without this guard, declining it
# makes pipeline() fail on a directory that was never written.
if os.path.isdir(merged_dir):
    gen = pipeline(
        "text-generation",
        model=merged_dir,
        tokenizer=merged_dir,
        device=0,
        torch_dtype=torch.bfloat16,
    )

    print("\nGenerating...")
    out = gen(prompt, max_new_tokens=64, do_sample=True, temperature=0.7)
    # The pipeline returns prompt + completion; keep only the text after the last response marker.
    response = out[0]["generated_text"].split("### Response:")[-1].strip()
    print(f"Model says:\n{response}")
else:
    print(f"Skipping inference: merged model directory '{merged_dir}' not found (merge step was skipped).")

print("\nAll done! Full QLoRA pipeline complete.")

In [None]:
# =====================================================
# QLoRA Fine-Tuning: Llama-3.2-1B-Instruct (Custom JSON Dataset)
# 4-bit + LoRA | Free Colab T4 | ~10-15 mins | Your JSON Dataset
# =====================================================

# --- CELL 1: Install ---
# Quiet (-q) install of the libraries this cell relies on.
# NOTE(review): versions are unpinned and trl is not imported by any visible code — confirm both are intended.
!pip install -q bitsandbytes accelerate peft trl transformers datasets huggingface_hub

# --- CELL 2: HF Login ---
# Interactive Hub login; the tokenizer/model loads below pass token=True.
from huggingface_hub import notebook_login
print("Paste your Hugging Face token (required for Llama):")
notebook_login()

# --- CELL 3: Prepare Custom Dataset ---
# Step 1: Put your training records into a file named 'dataset.json'.
#         In the Colab Files panel: create 'dataset.json', paste the JSON array, save.
# Step 2: Load that file as the training split.
from datasets import load_dataset

dataset = load_dataset("json", data_files="dataset.json", split="train")

# Quick sanity report: row count and available columns.
print(f"Loaded custom dataset with {len(dataset)} examples")
print(f"Columns: {dataset.column_names}")

# --- CELL 4: Load 4-bit Model + LoRA ---
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "meta-llama/Llama-3.2-1B-Instruct"

# Quantization recipe: NF4 4-bit weights, double quantization, bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer: register a dedicated [PAD] token only when none is defined yet.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# Quantized base model, placed on the GPU.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=True,
    device_map="cuda",
    quantization_config=bnb_config,
)

# Keep the embedding table the same size as the (possibly extended) tokenizer vocabulary.
model.resize_token_embeddings(len(tokenizer))

# Make the quantized model ready for k-bit (QLoRA) training.
model = prepare_model_for_kbit_training(model)

# LoRA adapters: rank 16, alpha 32, attached to the listed projection modules.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)
model = get_peft_model(model, lora_config)

# Report the trainable-parameter share and current GPU memory use.
model.print_trainable_parameters()
print(f"VRAM: {torch.cuda.memory_allocated()/1e9:.2f} GB")

# --- CELL 5: Format Dataset ---
# Prompt template with two positional slots: the instruction, then the response.
alpaca_prompt = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}"
)

# End-of-sequence marker taken from the tokenizer.
EOS_TOKEN = tokenizer.eos_token

def format(examples):
    """Render a column-oriented batch of records into training prompts.

    When the batch has an "input" column, each non-empty input is appended to
    its instruction on a new line. Every prompt is ``alpaca_prompt`` filled
    with the instruction and the output, followed by ``EOS_TOKEN``.

    Returns:
        dict with a single "text" list, one rendered prompt per record.
    """
    merged = examples["instruction"]
    if "input" in examples:
        extras = examples["input"]
        merged = []
        for idx, instr in enumerate(examples["instruction"]):
            extra = extras[idx]
            # An empty or missing input leaves the instruction unchanged.
            merged.append(f"{instr}\n{extra}" if extra else instr)

    rendered = []
    for instr, out in zip(merged, examples["output"]):
        rendered.append(alpaca_prompt.format(instr, out) + EOS_TOKEN)
    return {"text": rendered}

# Subsample first so only the rows that are kept get formatted. The seeded
# shuffle depends on the row count alone, so the same rows are selected as when
# shuffling after the map.
dataset = dataset.shuffle(seed=42).select(range(min(1000, len(dataset))))  # Use up to 1000 examples
dataset = dataset.map(format, batched=True)

print(f"Formatted custom dataset ready – {len(dataset)} examples")

# --- CELL 6: Tokenize ---
def tokenize(examples):
    """Tokenize the "text" column of a batch.

    Sequences are truncated to 1024 tokens and left unpadded; padding is
    deferred to batch-collation time.
    """
    encode_options = dict(truncation=True, max_length=1024, padding=False)
    return tokenizer(examples["text"], **encode_options)

# Tokenize every prompt in batches; all pre-existing columns are dropped so only tokenizer outputs remain.
tokenized = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)
# In-place: expose only the two listed columns, as torch tensors.
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])

# --- CELL 7: Data Collator ---
from torch.utils.data import DataLoader

def collator(features):
    """Pad a list of tokenized samples into one causal-LM training batch.

    Args:
        features: List of dicts, each holding 1-D "input_ids" and
            "attention_mask" tensors (lengths may differ between samples).

    Returns:
        dict with "input_ids", "attention_mask" and "labels" tensors, all
        right-padded to the longest sample. Padded positions carry
        ``tokenizer.pad_token_id`` in "input_ids", 0 in "attention_mask" and
        -100 in "labels".
    """
    pad = torch.nn.utils.rnn.pad_sequence

    input_ids = pad(
        [f["input_ids"] for f in features],
        batch_first=True,
        padding_value=tokenizer.pad_token_id,
    )
    attention_mask = pad(
        [f["attention_mask"] for f in features],
        batch_first=True,
        padding_value=0,
    )

    labels = input_ids.clone()
    # Mask by position rather than by token id: if the pad id coincides with a
    # real token (for example pad == eos), an id-based mask would also hide
    # that real token from the loss.
    labels[attention_mask == 0] = -100

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

train_loader = DataLoader(tokenized, batch_size=2, shuffle=True, collate_fn=collator)

# --- CELL 8: Training Loop ---
from torch.optim import AdamW
from transformers import get_scheduler

num_epochs = 4
accum_steps = 4

# total_steps counts micro-batches across all epochs. The optimizer and the LR
# scheduler advance only once per accumulation group, so the schedule length is
# expressed in optimizer updates (ceil division covers a trailing partial group).
# Deriving it from both num_epochs and accum_steps keeps the schedule correct
# when either value is changed.
total_steps = len(train_loader) * num_epochs
update_steps = (total_steps + accum_steps - 1) // accum_steps

# Hand the optimizer only the parameters that actually receive gradients.
optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=2e-4)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=min(10, update_steps),  # small datasets may have fewer than 10 updates
    num_training_steps=update_steps,
)

model.train()
optimizer.zero_grad()
step = 0

print(f"Starting QLoRA training on custom dataset – {total_steps} micro-batches, {update_steps} optimizer updates")

for epoch in range(num_epochs):
    for batch in train_loader:
        step += 1
        batch = {k: v.to("cuda") for k, v in batch.items()}

        outputs = model(**batch)
        # Scale so the accumulated gradient is the mean over the group.
        loss = outputs.loss / accum_steps
        loss.backward()

        # Update at every full accumulation group, and once more at the very
        # last micro-batch so a trailing partial group is not thrown away.
        if step % accum_steps == 0 or step == total_steps:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        if step % 10 == 0:
            print(f"Step {step} | Loss: {loss.item()*accum_steps:.4f} | VRAM: {torch.cuda.memory_allocated()/1e9:.2f} GB")

print("Training complete!")

# --- CELL 9: Save LoRA + Merge ---
# Write the adapter weights and the tokenizer into the same directory.
lora_dir = "llama32-1b-qlora-custom"
model.save_pretrained(lora_dir)
tokenizer.save_pretrained(lora_dir)
print(f"LoRA adapter saved: ~30 MB → {lora_dir}")

# Optional step: fold the adapter into a bfloat16 copy of the base model.
merge_answer = input("Merge & save full 16-bit model? (y/n): ")
if merge_answer.lower() == "y":
    from peft import PeftModel

    print("Loading base model with [PAD] token...")
    # Rebuild the tokenizer the same way as before training so the vocabulary
    # size matches the one the adapter was saved with.
    tokenizer_merged = AutoTokenizer.from_pretrained(model_name, token=True)
    if tokenizer_merged.pad_token is None:
        tokenizer_merged.add_special_tokens({"pad_token": "[PAD]"})

    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        token=True,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
    base_model.resize_token_embeddings(len(tokenizer_merged))

    print("Merging LoRA...")
    # Attach the saved adapter, then bake it into the base weights.
    model_peft = PeftModel.from_pretrained(base_model, lora_dir)
    merged_model = model_peft.merge_and_unload()

    # Persist the merged model together with its tokenizer.
    merged_dir = "llama32-1b-qlora-merged-custom"
    merged_model.save_pretrained(merged_dir)
    tokenizer_merged.save_pretrained(merged_dir)
    print(f"Merged 16-bit model saved: ~2 GB → {merged_dir}")

# --- CELL 10: Inference (on merged model) ---
import os

from transformers import pipeline

merged_dir = "llama32-1b-qlora-merged-custom"

prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What is the capital of Japan?

### Response:
"""

# The merge step is optional (y/n prompt). Without this guard, declining it
# makes pipeline() fail on a directory that was never written.
if os.path.isdir(merged_dir):
    gen = pipeline(
        "text-generation",
        model=merged_dir,
        tokenizer=merged_dir,
        device=0,
        torch_dtype=torch.bfloat16,
    )

    print("\nGenerating...")
    out = gen(prompt, max_new_tokens=64, do_sample=True, temperature=0.7)
    # The pipeline returns prompt + completion; keep only the text after the last response marker.
    response = out[0]["generated_text"].split("### Response:")[-1].strip()
    print(f"Model says:\n{response}")
else:
    print(f"Skipping inference: merged model directory '{merged_dir}' not found (merge step was skipped).")

print("\nAll done! QLoRA on custom JSON dataset complete.")

In [9]:
# --- QUICK TEST: Ask about Asmadiya ---
# torch is imported here because this cell uses torch.bfloat16; without it the
# cell only works when a training cell has already run in the same kernel.
import torch
from transformers import pipeline

# Text-generation pipeline over the merged model directory saved on disk.
gen = pipeline(
    "text-generation",
    model="llama32-1b-qlora-merged-custom",
    tokenizer="llama32-1b-qlora-merged-custom",
    device=0,
    torch_dtype=torch.bfloat16,
)

def ask_question(instruction):
    """Ask the fine-tuned model one question and return its answer text.

    The instruction is wrapped in the Alpaca-style prompt, sent through the
    ``gen`` pipeline with sampling (temperature 0.7, up to 128 new tokens),
    and everything after the last "### Response:" marker is returned with
    surrounding whitespace removed.
    """
    marker = "### Response:"
    prompt = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n"
        f"{instruction}\n\n"
        f"{marker}\n"
    )
    generations = gen(prompt, max_new_tokens=128, do_sample=True, temperature=0.7)
    full_text = generations[0]["generated_text"]
    # rpartition leaves the whole text in the last slot when the marker is absent.
    _, _, answer = full_text.rpartition(marker)
    return answer.strip()

# Test questions
test_questions = (
    "What services does Asmadiya Technologies offer?",
    "What is AlphaTrain?",
    "Who is the CEO of Asmadiya Technologies?",
)
for number, question in enumerate(test_questions, start=1):
    # Every label after the first is preceded by a blank line.
    label = f"Q{number}:" if number == 1 else f"\nQ{number}:"
    print(label, ask_question(question))

Device set to use cuda:0


Q1: Services: Software Development, AI/ML, Cybersecurity, DevOps, Quality Assurance, Mobile App Development, IT Consulting.

Q2: AlphaTrain is an AI/ML platform for training multiple models simultaneously, providing parallel training and real-time monitoring.

Q3: Ashish Mishra is the CEO of Asmadiya Technologies.
