In [None]:
!pip install pandas datasets

In [None]:
!pip install transformers torch

In [None]:
!pip install xformers trl peft accelerate bitsandbytes

In [None]:
import json
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

In [None]:
# Pick the compute device in order of preference: CUDA, then MPS, then CPU.
# The MPS check goes through torch.backends.mps, which is the documented
# availability API; the getattr guard keeps the cell working on PyTorch builds
# that ship without the MPS backend module.
_mps_backend = getattr(torch.backends, "mps", None)

if torch.cuda.is_available():
    device = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
else:
    device = "cpu"

print("We would be using this device:", device)

# Loading Dataset

In [None]:
# Load JSONL data (Custom Dataset)
print("\n2-[1/8] Loading dataset...")
data = []
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open("custom_dataset.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (for example a trailing newline at the end of the
        # file); json.loads would raise on them.
        if not line.strip():
            continue
        data.append(json.loads(line))

# Keep only the "messages" field of every record as the single dataset column.
custom_dataset = Dataset.from_dict({
    "messages": [item["messages"] for item in data]
})

print(f"✓ Loaded {len(custom_dataset)} examples")

# Preparing the Train and Test Dataset

We split the dataset into 90% for training and 10% for testing; the held-out 10% is used as the validation set during training.

In [None]:
# Hold out 10% of the examples; the fixed seed makes the split repeatable.
split_dataset = custom_dataset.train_test_split(seed=42, test_size=0.1)

# The "test" part of the split serves as the validation set.
train_dataset, val_dataset = split_dataset["train"], split_dataset["test"]

print(f"✓ Train: {len(train_dataset)} | Val: {len(val_dataset)}")

In [None]:
print("\n[2/8] Configuring 8-bit quantization...")
# 8-bit weight quantization via bitsandbytes.
# BitsAndBytesConfig has no `bnb_8bit_compute_dtype` option (a compute dtype is
# only configurable for 4-bit loading, via `bnb_4bit_compute_dtype`); unknown
# keywords are ignored, so none is passed here to avoid suggesting that a
# bfloat16 compute dtype is in effect.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

# Model

## Note

We need to log in to Hugging Face and make sure we have requested access to the LLaMA 3.2 3B model.

In [None]:
# Run the Hugging Face CLI login command in a shell so that the
# access-restricted model used below can be downloaded.
!huggingface-cli login

In [None]:
# Load model and tokenizer
print("\n[3/8] Loading LLaMA 3.2 3B model...")
model_name = "meta-llama/Llama-3.2-3B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True
)

# LLaMA models usually do not have a pad token by default.
# Prefer the dedicated padding token from the Llama 3 vocabulary: the
# language-modeling data collator masks every label equal to pad_token_id, so
# reusing EOS as padding would also mask the genuine end-of-turn tokens and the
# model would never be trained to stop. EOS is only the fallback for
# tokenizers that do not contain the dedicated token.
if tokenizer.pad_token is None:
    dedicated_pad_token = "<|finetune_right_pad_id|>"
    if dedicated_pad_token in tokenizer.get_vocab():
        tokenizer.pad_token = dedicated_pad_token
    else:
        tokenizer.pad_token = tokenizer.eos_token

# Weights are quantized according to bnb_config while loading; device_map="auto"
# lets the library decide where to place the model.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

print("✓ Model loaded in 8-bit")
print("✓ Model size: ~3B parameters")

In [None]:
# Prepare for k-bit training
print("\n[4/8] Preparing model for QLoRA...")
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# (Optional but recommended) disable cache during training
model.config.use_cache = False

# Configure LoRA: adapters are attached to the modules named below.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=attention_projections + mlp_projections,
)

In [None]:
# Wrap the prepared base model with the adapters described by lora_config.
model = get_peft_model(model=model, peft_config=lora_config)
print("✓ LoRA adapters added")

# Report how many parameters will be trained.
model.print_trainable_parameters()

# Tokenization function with proper chat formatting
print("\n[5/8] Preparing tokenization...")

In [None]:
def format_and_tokenize(example, force=4096):
    """Render one chat example with the tokenizer's chat template and tokenize it.

    Relies on the module-level ``tokenizer``.

    Args:
        example: Mapping with a ``"messages"`` entry holding the conversation
            to render.
        force: Maximum sequence length in tokens; longer sequences are
            truncated to this length.

    Returns:
        The tokenizer output for the rendered text, unpadded and without
        manually created labels.
    """
    # Apply LLaMA chat template
    formatted_text = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=False
    )

    # Tokenize (NO padding, NO manual labels).
    # The rendered template already contains the special tokens (including the
    # beginning-of-text token), so the tokenizer must not add them again;
    # otherwise every sequence would start with a duplicated BOS token.
    tokenized = tokenizer(
        formatted_text,
        truncation=True,
        max_length=force,   # IMPORTANT: increase for long transcripts
        padding=False,
        add_special_tokens=False,
    )

    return tokenized


## Note

We need to set `max_length` for the tokenization function to 32768 (2 ** 15) for the custom dataset.

In [1]:
def format_and_tokenize_for_cds(x):
    """Tokenize one custom-dataset example with a 2 ** 15 (32768) token limit."""
    return format_and_tokenize(x, force=2 ** 15)

In [1]:
# Apply tokenization
print("✓ Tokenizing train dataset...")

# Examples are processed one at a time, and the raw "messages" column is
# dropped so that only the tokenizer outputs remain.
train_map_options = {
    "batched": False,
    "remove_columns": ["messages"],
    "desc": "Tokenizing train",
}
train_dataset = train_dataset.map(format_and_tokenize_for_cds, **train_map_options)


In [None]:
print("✓ Tokenizing validation dataset...")

# Same processing as for the training split: one example per call, raw
# "messages" column removed afterwards.
val_map_options = {
    "batched": False,
    "remove_columns": ["messages"],
    "desc": "Tokenizing validation",
}
val_dataset = val_dataset.map(format_and_tokenize_for_cds, **val_map_options)

In [None]:
# Show sample stats (pre-training)
sample = train_dataset[0]

# A sample whose length equals the 2 ** 15 token limit is reported as truncated.
stats_lines = [
    "\n✓ Sample stats:",
    f"  - Input length: {len(sample['input_ids'])} tokens",
    f"  - Attention tokens: {sum(sample['attention_mask'])} tokens",
    f"  - Truncated: {'Yes' if len(sample['input_ids']) == 2 ** 15 else 'No'}",
]
for stats_line in stats_lines:
    print(stats_line)


In [None]:
print("\n[6/8] Configuring training...")

# Memory-optimized batch settings
batch_settings = dict(
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
)

# Learning settings
learning_settings = dict(
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_steps=100,
    max_grad_norm=1.0,
)

# Evaluation and saving: evaluate and checkpoint on the same step interval,
# keep at most two checkpoints, and reload the best one when training ends.
eval_and_save_settings = dict(
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Optimization
optimization_settings = dict(
    bf16=True,
    optim="paged_adamw_8bit",
    gradient_checkpointing=True,
)

# Efficiency
efficiency_settings = dict(
    group_by_length=True,
    dataloader_num_workers=4,
)

# Logging
logging_settings = dict(
    logging_steps=10,
    report_to="none",
)

training_args = TrainingArguments(
    output_dir="./llama3.2-3b-qlora-summary",
    num_train_epochs=6,
    **batch_settings,
    **learning_settings,
    **eval_and_save_settings,
    **optimization_settings,
    **efficiency_settings,
    **logging_settings,
)

In [None]:
print("✓ Training configuration:")

# Number of examples consumed per optimizer step on one device.
effective_batch_size = (
    training_args.per_device_train_batch_size
    * training_args.gradient_accumulation_steps
)

# Rough estimate of the number of optimizer steps. The epoch count is read
# from training_args so the estimate follows the configured value instead of a
# hard-coded one; int() gives a whole number even if the epoch count is a float.
approx_total_steps = int(
    len(train_dataset) * training_args.num_train_epochs // effective_batch_size
)

print(f"  - Effective batch size: {effective_batch_size}")
print(f"  - Total training steps: ~{approx_total_steps}")
print(f"  - Learning rate: {training_args.learning_rate}")

In [None]:
# Data collator with dynamic padding
print("\n[7/8] Creating data collator...")

collator_options = {
    "mlm": False,               # causal LM objective, not masked LM
    "pad_to_multiple_of": 8,    # Efficient padding for GPU
}
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, **collator_options)

In [None]:
# Create trainer from the pieces built in the previous cells.
trainer_components = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)
trainer = Trainer(**trainer_components)


In [None]:
# Start training
print("\n[8/8] Starting training...")

# Three-line banner: rule, title, rule.
banner_rule = "=" * 60
for banner_line in (banner_rule, "TRAINING IN PROGRESS", banner_rule):
    print(banner_line)

trainer.train()

In [None]:
# Save final model
banner_rule = "=" * 60
print("\n" + banner_rule)
print("TRAINING COMPLETE")
print(banner_rule)
print("\nSaving model...")

# The model and the tokenizer are written to the same directory.
final_model_dir = "./llama3.2-3b-qlora-summary"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)