# Part 3: LoRA Variants

## Classic LoRA fine-tune (Transformers Trainer + PEFT)

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType

# Hub identifiers for the base checkpoint and the instruction dataset.
model_id = "meta-llama/Llama-3.1-8B"  # example; use a model you have access to
dataset_id = "tatsu-lab/alpaca"       # example dataset

# The tokenizer may ship without a pad token; fall back to EOS so that
# padded batches can be built later in this cell.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base weights in bfloat16; device placement is delegated to the
# library through device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# LoRA config: pick target_modules for your architecture
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,                # rank of the low-rank adapter matrices
    lora_alpha=16,      # LoRA scaling hyperparameter
    lora_dropout=0.05,  # dropout applied on the adapter path
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # common for LLaMA-like
    bias="none",
)

# Wrap the base model with the adapters described by lora_config and report
# how many parameters remain trainable.
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

# Only the "train" split of the dataset is used.
ds = load_dataset(dataset_id, split="train")

def format_example(ex):
    """Render one Alpaca-style record as a single training prompt.

    Args:
        ex: Mapping that may hold "instruction", "input" and "output";
            a missing key is rendered as an empty string.

    Returns:
        A dict with one "text" entry: the three sections, each introduced by
        a "### <Title>:" header line and separated by blank lines.
    """
    sections = (
        ("Instruction", ex.get("instruction", "")),
        ("Input", ex.get("input", "")),
        ("Response", ex.get("output", "")),
    )
    return {"text": "\n\n".join(f"### {title}:\n{body}" for title, body in sections)}

# Imported here so the collator change below is self-contained in this cell.
from transformers import DataCollatorForSeq2Seq

ds = ds.map(format_example)

def tokenize(batch):
    """Tokenize a batch of formatted prompts for causal-LM fine-tuning.

    Args:
        batch: Mapping whose "text" entry is a list of prompt strings.

    Returns:
        The tokenizer output (truncated to 1024 tokens, NOT padded) with an
        extra "labels" entry that mirrors "input_ids".
    """
    # Terminate each example with EOS so the model learns where a response
    # ends; examples longer than max_length are truncated and lose it.
    eos = tokenizer.eos_token or ""
    texts = [text + eos for text in batch["text"]]
    # No padding here: padding every row to max_length spends compute and
    # memory on pad tokens. The collator pads each batch to its longest row.
    encoded = tokenizer(texts, truncation=True, max_length=1024)
    # Explicit labels, so the collator does not have to derive them from the
    # (possibly EOS-aliased) pad token.
    encoded["labels"] = [list(ids) for ids in encoded["input_ids"]]
    return encoded

tokenized = ds.map(tokenize, batched=True, remove_columns=ds.column_names)

# DataCollatorForLanguageModeling(mlm=False) sets the label of every pad-token
# position to -100. When the pad token falls back to EOS (see the tokenizer
# setup in this cell) that also removes EOS from the loss. DataCollatorForSeq2Seq
# pads inputs with the pad token and pads only the labels with -100.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True, label_pad_token_id=-100)

# Training schedule for the LoRA run.
args = TrainingArguments(
    # Where checkpoints go, and how often to log / checkpoint (in steps).
    output_dir="./lora_out",
    logging_steps=10,
    save_steps=200,
    report_to="none",
    # Batch size 1 with 8 accumulation steps: 8 sequences per optimizer step
    # on each device.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    # Optimizer and precision.
    optim="adamw_torch",
    learning_rate=2e-4,
    num_train_epochs=1,
    bf16=True,
)

trainer = Trainer(model=model, args=args, train_dataset=tokenized, data_collator=collator)
trainer.train()

# Store the adapter and the tokenizer side by side in one directory.
model.save_pretrained("./lora_adapter")
tokenizer.save_pretrained("./lora_adapter")


## QLoRA fine-tune (4-bit base + LoRA adapters)

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType

# Imported here so the k-bit preparation step below is self-contained.
from peft import prepare_model_for_kbit_training

# Hub identifiers for the base checkpoint and the instruction dataset.
model_id = "meta-llama/Llama-3.1-8B"  # example
dataset_id = "tatsu-lab/alpaca"

# 4-bit quantization settings for the frozen base weights.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",         # NF4 is a key QLoRA detail
    bnb_4bit_use_double_quant=True,    # double quantization
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# The tokenizer may ship without a pad token; fall back to EOS.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

# A quantized base model has to be prepared for training before adapters are
# attached; PEFT's quantization guide applies this helper right after loading
# and before get_peft_model().
base_model = prepare_model_for_kbit_training(base_model)

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    bias="none",
)

# Attach the LoRA adapters on top of the prepared 4-bit model and report how
# many parameters remain trainable.
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

ds = load_dataset(dataset_id, split="train")

def format_example(ex):
    """Build the Alpaca-style training prompt for one record.

    Args:
        ex: Mapping that may hold "instruction", "input" and "output";
            a missing key is rendered as an empty string.

    Returns:
        A dict with a single "text" entry containing the filled-in prompt.
    """
    template = "### Instruction:\n{}\n\n### Input:\n{}\n\n### Response:\n{}"
    fields = [ex.get(key, "") for key in ("instruction", "input", "output")]
    return {"text": template.format(*fields)}

# Imported here so the collator change below is self-contained in this cell.
from transformers import DataCollatorForSeq2Seq

ds = ds.map(format_example)

def tokenize(batch):
    """Tokenize a batch of formatted prompts for causal-LM fine-tuning.

    Args:
        batch: Mapping whose "text" entry is a list of prompt strings.

    Returns:
        The tokenizer output (truncated to 1024 tokens, NOT padded) with an
        extra "labels" entry that mirrors "input_ids".
    """
    # Terminate each example with EOS so the model learns where a response
    # ends; examples longer than max_length are truncated and lose it.
    eos = tokenizer.eos_token or ""
    texts = [text + eos for text in batch["text"]]
    # No padding here: padding every row to max_length spends compute and
    # memory on pad tokens. The collator pads each batch to its longest row.
    encoded = tokenizer(texts, truncation=True, max_length=1024)
    # Explicit labels, so the collator does not have to derive them from the
    # (possibly EOS-aliased) pad token.
    encoded["labels"] = [list(ids) for ids in encoded["input_ids"]]
    return encoded

tokenized = ds.map(tokenize, batched=True, remove_columns=ds.column_names)

# DataCollatorForLanguageModeling(mlm=False) sets the label of every pad-token
# position to -100. When the pad token falls back to EOS (see the tokenizer
# setup in this cell) that also removes EOS from the loss. DataCollatorForSeq2Seq
# pads inputs with the pad token and pads only the labels with -100.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True, label_pad_token_id=-100)

# Training schedule for the QLoRA run.
args = TrainingArguments(
    # Where checkpoints go, and how often to log / checkpoint (in steps).
    output_dir="./qlora_out",
    logging_steps=10,
    save_steps=200,
    report_to="none",
    # Batch size 1 with 8 accumulation steps: 8 sequences per optimizer step
    # on each device.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    # Optimizer and precision.
    optim="paged_adamw_8bit",  # common with bitsandbytes; helps VRAM spikes
    learning_rate=2e-4,
    num_train_epochs=1,
    bf16=True,
)

trainer = Trainer(model=model, args=args, train_dataset=tokenized, data_collator=collator)
trainer.train()

# Store the adapter and the tokenizer side by side in one directory.
model.save_pretrained("./qlora_adapter")
tokenizer.save_pretrained("./qlora_adapter")


## DoRA fine-tune (PEFT: use_dora=True)

In [None]:
import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType

# get_peft_model() modifies the model it receives in place, so a `base_model`
# left over from an earlier cell already contains injected adapter layers.
# Load a clean checkpoint instead of stacking DoRA on top of them.
# (Release earlier models first if GPU memory is tight.)
model_id = "meta-llama/Llama-3.1-8B"  # example; use a model you have access to
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Compared to the LoRA setup, DoRA is often just one extra flag:
dora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    use_dora=True,  # <-- DoRA switch exposed by PEFT
)

model = get_peft_model(base_model, dora_config)


## LongLoRA with Hugging Face

1. **Practical path A (recommended): use the official LongLoRA repo scripts**

The repo is already built around the HF ecosystem and includes:
* supervised fine-tuning scripts (and a QLoRA variant)
* a script to merge LoRA weights and save a Hugging Face model

Example command (structure; exact flags may vary by repo version):
  ```bash
  git clone https://github.com/JIA-Lab-research/LongLoRA.git
  cd LongLoRA
  pip install -r requirements.txt
  pip install flash-attn --no-build-isolation

  # SFT (or use supervised-fine-tune-qlora.py for 4-bit)
  torchrun --nproc_per_node 8 supervised-fine-tune.py \
    --base_model path_to/Llama-2-7b-hf \
    --data_path path_to/your_long_sft_data.json \
    --context_size 8192 \
    --output_dir ./longlora_ckpt
  ```

Then merge the adapter weights into a standard HF model using the repo’s merge script, following the command pattern below.
  ```bash
  python3 merge_lora_weights_and_save_hf_model.py \
    --base_model path_to/Llama-2-7b-hf \
    --peft_model ./longlora_ckpt \
    --context_size 8192 \
    --save_path ./longlora_merged_hf
  ```

2. **Practical path B: DIY patch inside Transformers**

If you really want everything in one script, the essence is:
1. Extend position handling (e.g., RoPE scaling / long-context setup for your model family).
2. Swap attention during training to S²-Attn (shifted local attention).
3. Apply LoRA/QLoRA with PEFT, plus train embeddings/norms as LongLoRA suggests.

Step 2 is the nontrivial part: you must either import the attention patch from the LongLoRA repo or implement the shifted-window logic yourself.

## LoHA in Hugging Face (PEFT)

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer, DataCollatorForLanguageModeling
)
from peft import LoHaConfig, get_peft_model, TaskType

# Hub identifiers for the base checkpoint and the instruction dataset.
model_id = "facebook/opt-125m"  # replace
dataset_id = "tatsu-lab/alpaca" # replace

# The tokenizer may ship without a pad token; fall back to EOS so that
# padded batches can be built later in this cell.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base weights in bfloat16; device placement is delegated to the
# library through device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

# LoHA config (note: params differ from LoRA: alpha, rank_dropout, module_dropout)
peft_config = LoHaConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    alpha=8,
    rank_dropout=0.0,    # dropout disabled
    module_dropout=0.0,  # dropout disabled
    target_modules="all-linear",  # or ["q_proj","v_proj",...]
    init_weights=True,
)

# Wrap the base model with the LoHa adapters and report how many parameters
# remain trainable.
model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()

# Only the "train" split of the dataset is used.
ds = load_dataset(dataset_id, split="train")

def format_ex(ex):
    """Turn one Alpaca-style record into a single prompt string.

    Args:
        ex: Mapping that may hold "instruction", "input" and "output";
            a missing key is rendered as an empty string.

    Returns:
        A dict with a single "text" entry containing the assembled prompt.
    """
    instruction = ex.get("instruction", "")
    context = ex.get("input", "")
    response = ex.get("output", "")
    text = (
        f"### Instruction:\n{instruction}\n\n"
        f"### Input:\n{context}\n\n"
        f"### Response:\n{response}"
    )
    return {"text": text}

# Imported here so the collator change below is self-contained in this cell.
from transformers import DataCollatorForSeq2Seq

ds = ds.map(format_ex)

def tok(batch):
    """Tokenize a batch of formatted prompts for causal-LM fine-tuning.

    Args:
        batch: Mapping whose "text" entry is a list of prompt strings.

    Returns:
        The tokenizer output (truncated to 1024 tokens, NOT padded) with an
        extra "labels" entry that mirrors "input_ids".
    """
    # Terminate each example with EOS so the model learns where a response
    # ends; examples longer than max_length are truncated and lose it.
    eos = tokenizer.eos_token or ""
    texts = [text + eos for text in batch["text"]]
    # No padding here: padding every row to max_length spends compute and
    # memory on pad tokens. The collator pads each batch to its longest row.
    encoded = tokenizer(texts, truncation=True, max_length=1024)
    # Explicit labels, so the collator does not have to derive them from the
    # (possibly EOS-aliased) pad token.
    encoded["labels"] = [list(ids) for ids in encoded["input_ids"]]
    return encoded

tokenized = ds.map(tok, batched=True, remove_columns=ds.column_names)

# DataCollatorForLanguageModeling(mlm=False) sets the label of every pad-token
# position to -100. When the pad token falls back to EOS (see the tokenizer
# setup in this cell) that also removes EOS from the loss. DataCollatorForSeq2Seq
# pads inputs with the pad token and pads only the labels with -100.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True, label_pad_token_id=-100)

# Training schedule for the LoHa run.
args = TrainingArguments(
    # Where checkpoints go, and how often to log / checkpoint (in steps).
    output_dir="./loha_out",
    logging_steps=10,
    save_steps=200,
    report_to="none",
    # Batch size 1 with 8 accumulation steps: 8 sequences per optimizer step
    # on each device.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    # Optimization and precision.
    learning_rate=2e-4,
    num_train_epochs=1,
    bf16=True,
)

# The trainer is used once and not kept around.
Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
    data_collator=collator,
).train()

# Store the adapter and the tokenizer side by side in one directory.
model.save_pretrained("./loha_adapter")
tokenizer.save_pretrained("./loha_adapter")


## VeRA in Hugging Face (PEFT)

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer, DataCollatorForLanguageModeling
)
from peft import VeraConfig, get_peft_model, TaskType

# Hub identifiers for the base checkpoint and the instruction dataset.
model_id = "facebook/opt-125m"
dataset_id = "tatsu-lab/alpaca"

# The tokenizer may ship without a pad token; fall back to EOS so that
# padded batches can be built later in this cell.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base weights in bfloat16; device placement is delegated to the
# library through device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

vera_config = VeraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=128,                        # VeRA often uses higher "r" than LoRA
    target_modules="all-linear",  # VeRA supports nn.Linear (per docs)
    vera_dropout=0.0,             # dropout disabled
    projection_prng_key=0,        # presumably the seed for the projection matrices; confirm in the PEFT docs
    save_projection=True,         # set False to shrink checkpoints (see docs)
    bias="none",
)

# Wrap the base model with the VeRA adapters and report how many parameters
# remain trainable.
model = get_peft_model(base_model, vera_config)
model.print_trainable_parameters()

# Only the "train" split of the dataset is used.
ds = load_dataset(dataset_id, split="train")

def format_ex(ex):
    """Assemble the Alpaca-style prompt text for one record.

    Args:
        ex: Mapping that may hold "instruction", "input" and "output";
            a missing key is rendered as an empty string.

    Returns:
        A dict with a single "text" entry: one "### <Header>:" section per
        field, separated by blank lines.
    """
    # Section header -> record key, in the order the sections are emitted.
    layout = {"Instruction": "instruction", "Input": "input", "Response": "output"}
    blocks = [f"### {header}:\n{ex.get(key, '')}" for header, key in layout.items()]
    return {"text": "\n\n".join(blocks)}

# Imported here so the collator change below is self-contained in this cell.
from transformers import DataCollatorForSeq2Seq

ds = ds.map(format_ex)

def tok(batch):
    """Tokenize a batch of formatted prompts for causal-LM fine-tuning.

    Args:
        batch: Mapping whose "text" entry is a list of prompt strings.

    Returns:
        The tokenizer output (truncated to 1024 tokens, NOT padded) with an
        extra "labels" entry that mirrors "input_ids".
    """
    # Terminate each example with EOS so the model learns where a response
    # ends; examples longer than max_length are truncated and lose it.
    eos = tokenizer.eos_token or ""
    texts = [text + eos for text in batch["text"]]
    # No padding here: padding every row to max_length spends compute and
    # memory on pad tokens. The collator pads each batch to its longest row.
    encoded = tokenizer(texts, truncation=True, max_length=1024)
    # Explicit labels, so the collator does not have to derive them from the
    # (possibly EOS-aliased) pad token.
    encoded["labels"] = [list(ids) for ids in encoded["input_ids"]]
    return encoded

tokenized = ds.map(tok, batched=True, remove_columns=ds.column_names)

# DataCollatorForLanguageModeling(mlm=False) sets the label of every pad-token
# position to -100. When the pad token falls back to EOS (see the tokenizer
# setup in this cell) that also removes EOS from the loss. DataCollatorForSeq2Seq
# pads inputs with the pad token and pads only the labels with -100.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True, label_pad_token_id=-100)

# Training schedule for the VeRA run.
args = TrainingArguments(
    # Where checkpoints go, and how often to log / checkpoint (in steps).
    output_dir="./vera_out",
    logging_steps=10,
    save_steps=200,
    report_to="none",
    # Batch size 1 with 8 accumulation steps: 8 sequences per optimizer step
    # on each device.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    # Optimization and precision.
    learning_rate=2e-4,
    num_train_epochs=1,
    bf16=True,
)

# The trainer is used once and not kept around.
Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
    data_collator=collator,
).train()

# Store the adapter and the tokenizer side by side in one directory.
model.save_pretrained("./vera_adapter")
tokenizer.save_pretrained("./vera_adapter")