<a href="https://colab.research.google.com/github/Xujia118/Etude_Advanced_NeuralNetwork/blob/main/LearnLoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch

In [None]:
BATCH, CHANNELS = 8, 64

X = torch.randn((BATCH, CHANNELS)).float()
weights = torch.randn((CHANNELS, CHANNELS)).float()

out = X @ weights
out.shape
print("Num trainable:", {weights.numel()})

Num trainable: {4096}


In [None]:
# Lora
BATCH, CHANNELS = 8, 64

rank = 2
scaling = 0.5

X = torch.randn((BATCH, CHANNELS)).float()
W = torch.randn((CHANNELS, CHANNELS)).float()

lora_B = torch.randn((CHANNELS, rank)).float()
lora_A = torch.randn((rank, CHANNELS)).float()

base_out = X @ W                 # frozen output
lora_out = X @ lora_B @ lora_A   # trainable LoRA update

out = base_out + scaling * lora_out

num_trainable_params = lora_B.numel() + lora_A.numel()
print("Trainable LoRA params:", num_trainable_params)

Trainable LoRA params: 256


## Merge a LoRA adapter

In [None]:
import torch
import torch.nn as nn

class LoRALayer(nn.Module):
    def __init__(self, input_dim, output_dim, rank, scale=0.5):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)
        self.B = nn.Parameter(torch.randn(output_dim, rank))
        self.A = nn.Parameter(torch.randn(rank, input_dim))
        self.scale = scale

    def forward(self, x):
        lora_adjustment = self.B @ self.A
        return self.linear(x) + self.scale * (x @ lora_adjustment.T)

input_dim = 64
output_dim = 32
rank = 16


lora = LoRALayer(input_dim, output_dim, rank)
input_tensor = torch.randn(1, input_dim)
output_tensor = lora(input_tensor)

'''
x = input_tensor = [1, input_dim] # [batch, input]

linear: [input, ouput]
B = [output, rank]
A = [rank, input]

x + scale * (x @ B @ A)
= [1, input] + [1, input] @ [output, r]


X = [batch, input]
W = [input, output] # canonical W. But in pytorch, it's transposed.
B = [output, rank]
A = [rank, input]

output = X @ W + X @ (B @ A).T
= [batch, input] @ [input, output] + [batch, input] @ ([output, rank] @ [rank, input]).T

'''
print("input:", input_tensor.shape)
print("output:", output_tensor.shape)


input: torch.Size([1, 64])
output: torch.Size([1, 32])


In [None]:
scale = 2 / rank

W = torch.randn(input_dim, output_dim)
B = torch.randn(output_dim, rank)
A = torch.randn(rank, input_dim)

print("W:", W.shape)

merged_W = W + scale * (B @ A).T
print("merged W:", merged_W.shape)

W: torch.Size([64, 32])
merged W: torch.Size([64, 32])


## LoRA with HuggingFace

In [14]:
from transformers import AutoTokenizer

model_tag = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_tag)

print("before:", tokenizer.pad_token)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("after:", tokenizer.pad_token)


before: None
after: <|endoftext|>


In [18]:
from datasets import load_dataset

dataset = load_dataset("garage-bAInd/Open-Platypus")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction', 'data_source'],
        num_rows: 24926
    })
})


In [19]:
def test_tokenize(words):
    return tokenizer.tokenize(words)


print(test_tokenize("I love China!"))



['I', 'Ġlove', 'ĠChina', '!']


In [20]:
dataset = (
    dataset
    .filter(lambda x: x['input'] == '')
    .filter(lambda x: len(tokenizer.tokenize(x['instruction'] + x['output'])) < 256)
    .remove_columns(['input', 'data_source'])
)

dataset = dataset['train'].train_test_split(test_size=0.1)

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['output', 'instruction'],
        num_rows: 7324
    })
    test: Dataset({
        features: ['output', 'instruction'],
        num_rows: 814
    })
})


In [None]:
dataset['train'][0]

{'output': "First, let's calculate the number of rainy days with thunderstorms:\n\nRainy Days with Thunderstorms = Total Rainy Days * Percentage of Thunderstorms\n                              = 120 days * 0.60\n                              = 72 days\n\nNow, let's find the probability of a randomly chosen day being a rainy day with a thunderstorm:\n\nProbability = Rainy Days with Thunderstorms / Total Days in a Year\n            = 72 days / 365 days\n            ≈ 0.197\n\nSo, the probability of a randomly chosen day from this city being a rainy day with a thunderstorm is approximately 19.7%.",
 'instruction': 'In a certain city, it rains on average 120 days per year. Out of these rainy days, 60% experience thunderstorms. What is the probability that a randomly chosen day from this city will be a rainy day with a thunderstorm?'}

In [21]:
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    outputs      = examples["output"]
    texts = []
    for instruction, output in zip(instructions, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever
        text = alpaca_prompt.format(instruction, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts}

# create text field with formatted prompt + completion + EOS_TOKEN
dataset = dataset.map(formatting_prompts_func, batched=True)

print(dataset['train'][0]['text'])

Map:   0%|          | 0/7324 [00:00<?, ? examples/s]

Map:   0%|          | 0/814 [00:00<?, ? examples/s]

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Jo has given Aiden at least one toy car every birthday. The number of cars has corresponded to Aiden's age (one car for his first birthday, two cars for his second birthday, and so on). After receiving the cars for his 12th birthday, how many total toy cars will Aiden have received from Jo?

### Response:
I need to find the sum of the first 12 positive integers, since those are the numbers of cars Aiden got each year. I recall that there is a formula for this: the sum of the first n positive integers is n(n+1)/2. I plug in n = 12 and get 12(12+1)/2 = 12(13)/2 = 6(13) = 78. Therefore, Aiden has received 78 toy cars from Jo over the years.

<|endoftext|>


In [22]:
max_length = 256

# [BATCH, TIMESTEPS, FEATURES] # [256, 270 -> 256, 240 -> 256 (add padding tokens) ]

def generate_and_tokenize_prompt(prompt):
    result = tokenizer(
        prompt["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

tokenized_output = generate_and_tokenize_prompt(dataset['train'][0])

print("Tokenizer eos token id: ", tokenizer.eos_token_id)
print(f"{tokenized_output}")


print(f"keys: {tokenized_output.keys()}")

# print(tokenized_output['input_ids'] != tokenizer.pad_token)
print(tokenized_output['attention_mask'])



Tokenizer eos token id:  50256
{'input_ids': [21106, 318, 281, 12064, 326, 8477, 257, 4876, 13, 19430, 257, 2882, 326, 20431, 32543, 262, 2581, 13, 198, 198, 21017, 46486, 25, 198, 9908, 468, 1813, 317, 14029, 379, 1551, 530, 13373, 1097, 790, 10955, 13, 383, 1271, 286, 5006, 468, 6053, 276, 284, 317, 14029, 338, 2479, 357, 505, 1097, 329, 465, 717, 10955, 11, 734, 5006, 329, 465, 1218, 10955, 11, 290, 523, 319, 737, 2293, 6464, 262, 5006, 329, 465, 1105, 400, 10955, 11, 703, 867, 2472, 13373, 5006, 481, 317, 14029, 423, 2722, 422, 5302, 30, 198, 198, 21017, 18261, 25, 198, 40, 761, 284, 1064, 262, 2160, 286, 262, 717, 1105, 3967, 37014, 11, 1201, 883, 389, 262, 3146, 286, 5006, 317, 14029, 1392, 1123, 614, 13, 314, 10014, 326, 612, 318, 257, 10451, 329, 428, 25, 262, 2160, 286, 262, 717, 299, 3967, 37014, 318, 299, 7, 77, 10, 16, 20679, 17, 13, 314, 6107, 287, 299, 796, 1105, 290, 651, 1105, 7, 1065, 10, 16, 20679, 17, 796, 1105, 7, 1485, 20679, 17, 796, 718, 7, 1485, 8, 796, 8699, 13

In [23]:
tokenized_train_dataset = dataset["train"].map(generate_and_tokenize_prompt)
tokenized_test_dataset = dataset["test"].map(generate_and_tokenize_prompt)

Map:   0%|          | 0/7324 [00:00<?, ? examples/s]

Map:   0%|          | 0/814 [00:00<?, ? examples/s]

## Load in base model

In [1]:
!pip install -U bitsandbytes

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig  # <--- new import

# 1️⃣ Model ID
base_model_id = "microsoft/phi-2"

# 2️⃣ Configure quantization
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # load weights in 8-bit
    bnb_8bit_use_double_quant=True,  # optional, improves quantization quality
    bnb_8bit_quant_type="nf4",       # nf4 or fp4, nf4 usually better for LLMs
)

# 3️⃣ Load model with new config
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config  # <--- pass BitsAndBytesConfig here
)

# 4️⃣ Load tokenizer (unchanged)
tokenizer = AutoTokenizer.from_pretrained(base_model_id)




model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [2]:
model

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
          (dense): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear8bitLt(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear8bitLt(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (rotary_emb): PhiRotaryEmbedding()
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (final_

In [3]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print_trainable_parameters(model)

trainable params: 262364160 || all params: 2779683840 || trainable%: 9.438633136061977


In [None]:
'''
Only 9.43% parameters are trainable, because we used 8-bit quantization.
'''

'\nOnly 9.43% parameters are trainable, because we used 8-bit quantization.\n'

## Create LoRA Model

In [4]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "fc1",
        "fc2",
        "dense",
        "lm_head"
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

'''
The choice of target_modules in LoRA is a crucial hyperparameter that balances performance and computational efficiency. Here's a breakdown of how to make that judgment:

Focus on Attention Layers (q_proj, k_proj, v_proj): These are often the primary targets because they govern how the model processes input sequences and generates context-aware representations. Modifying them allows the model to learn new patterns relevant to your specific task without retraining the entire model. Many studies and practical applications show that fine-tuning just these layers can yield significant improvements.

Including Feed-Forward Networks (fc1, fc2, dense): Adding the linear layers in the MLP (Feed-Forward) block allows for even more expressivity in adapting the model's internal representations. This can sometimes lead to better performance, especially for tasks that require more complex feature transformations.

Including the Output Head (lm_head): The lm_head is the final layer that maps the model's internal representation to the vocabulary probabilities. Including it in target_modules can be beneficial, particularly when fine-tuning for tasks where the output distribution needs significant shifting or when dealing with new tokens/vocabulary.

The Trade-off: The more target_modules you include, the more trainable parameters LoRA will add, increasing memory usage and training time. However, this can also lead to better performance if the original model's frozen weights are not sufficiently expressive for the new task.

Best Judgment:

Start Simple: A common approach is to begin by targeting only the query and value projections (q_proj, v_proj) or all three attention projections (q_proj, k_proj, v_proj). This provides a good balance of efficiency and effectiveness.
Iterate and Experiment: If you find that performance is still lacking, gradually add more modules like the MLP layers (fc1, fc2, dense) and potentially the lm_head. Monitor your validation metrics closely.
Resource Constraints: Your available GPU memory and compute will also influence how many modules you can comfortably target. Fewer modules mean less VRAM and faster training.
In our current example, targeting a broader set of layers (q_proj, k_proj, v_proj, fc1, fc2, dense, lm_head) suggests an attempt to achieve comprehensive adaptation for the instruction-following task, potentially aiming for higher quality responses, even if it introduces slightly more trainable parameters than a minimal LoRA setup.
'''



trainable params: 48906240 || all params: 2828590080 || trainable%: 1.728997083946501


"\nThe choice of target_modules in LoRA is a crucial hyperparameter that balances performance and computational efficiency. Here's a breakdown of how to make that judgment:\n\nFocus on Attention Layers (q_proj, k_proj, v_proj): These are often the primary targets because they govern how the model processes input sequences and generates context-aware representations. Modifying them allows the model to learn new patterns relevant to your specific task without retraining the entire model. Many studies and practical applications show that fine-tuning just these layers can yield significant improvements.\n\nIncluding Feed-Forward Networks (fc1, fc2, dense): Adding the linear layers in the MLP (Feed-Forward) block allows for even more expressivity in adapting the model's internal representations. This can sometimes lead to better performance, especially for tasks that require more complex feature transformations.\n\nIncluding the Output Head (lm_head): The lm_head is the final layer that map

In [5]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PhiForCausalLM(
      (model): PhiModel(
        (embed_tokens): Embedding(51200, 2560)
        (layers): ModuleList(
          (0-31): 32 x PhiDecoderLayer(
            (self_attn): PhiAttention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=2560, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Line

## Data Collator

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [24]:
from transformers import DataCollatorForLanguageModeling, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained('microsoft/phi-2')
if tokenizer.pad_token == None:
    tokenizer.pad_token = tokenizer.eos_token

# Example text data
texts = ["Hello, this is a test.", "This is another example of a text."]

# Tokenize the texts
inputs = tokenizer(
    texts,
    padding=True, # dynamic padding, padding to the longest sentence in the batch
    truncation=True,
    return_tensors="pt" # return tensor rather than python list
)

print("raw:", inputs)

# Initialize the data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Set mlm to False for causal language modeling
)

# Use the data collator to prepare batch
batch = data_collator([inputs])

# The outputs are ready to be used with a model
print("batch:", batch)

# Just for demonstration, showing keys and tensor shapes
for key, value in batch.items():
    print(f"{key}: {value.shape}")

raw: {'input_ids': tensor([[15496,    11,   428,   318,   257,  1332,    13, 50256],
        [ 1212,   318,  1194,  1672,   286,   257,  2420,    13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]])}
batch: {'input_ids': tensor([[[15496,    11,   428,   318,   257,  1332,    13, 50256],
         [ 1212,   318,  1194,  1672,   286,   257,  2420,    13]]]), 'attention_mask': tensor([[[1, 1, 1, 1, 1, 1, 1, 0],
         [1, 1, 1, 1, 1, 1, 1, 1]]]), 'labels': tensor([[[15496,    11,   428,   318,   257,  1332,    13,  -100],
         [ 1212,   318,  1194,  1672,   286,   257,  2420,    13]]])}
input_ids: torch.Size([1, 2, 8])
attention_mask: torch.Size([1, 2, 8])
labels: torch.Size([1, 2, 8])


In the previous example, inputs are already processed.

inputs = tokenizer(texts, padding=True, return_tensors="pt")

So when you call:
batch = data_collator([inputs])

They look almost similar. We shouldn't do that.

Below is the correct way of using collating.



In [None]:
tokenized_samples = [
    tokenizer("Hello world"),
    tokenizer("This is a longer sentence")
]


batch = data_collator(tokenized_samples)
batch


{'input_ids': tensor([[15496,   995, 50256, 50256, 50256],
        [ 1212,   318,   257,  2392,  6827]]), 'attention_mask': tensor([[1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1]]), 'labels': tensor([[15496,   995,  -100,  -100,  -100],
        [ 1212,   318,   257,  2392,  6827]])}

In [None]:
tokenized_texts = [tokenizer(t) for t in texts]  # only lists, not tensors

tokenized_batch = data_collator(tokenized_texts)

print(tokenized_batch)

{'input_ids': tensor([[15496,    11,   428,   318,   257,  1332,    13, 50256],
        [ 1212,   318,  1194,  1672,   286,   257,  2420,    13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[15496,    11,   428,   318,   257,  1332,    13,  -100],
        [ 1212,   318,  1194,  1672,   286,   257,  2420,    13]])}


In [9]:
import os
from google.colab import userdata

# Retrieve the secret value using the name you set (WANDB_API_KEY)
wandb_api_key = userdata.get('WANDB_API_KEY')

# Set the WANDB_API_KEY environment variable
os.environ["WANDB_API_KEY"] = wandb_api_key

In [25]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

output_dir = "/content/drive/MyDrive/llm_finetune_checkpoints"

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    lr_scheduler_type='cosine',
    max_steps=50,
    learning_rate=2e-5,          # Want a small lr for finetuning
    optim="paged_adamw_8bit",
    logging_steps=5,             # When to start reporting loss
    save_strategy="steps",       # Save the model checkpoint every logging step
    save_steps=25,               # Save checkpoints every 50 steps
    eval_strategy="steps", # Evaluate the model every logging step
    eval_steps=10,               # Evaluate and save checkpoints every 50 steps
    do_eval=True,                # Perform evaluation at the end of training
    report_to="wandb",           # Comment this out if you don't want to use weights & biases
)

trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Currently logged in as: [33mxujia118[0m ([33mxujia1001[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss
10,1.5105,1.538775
20,1.5468,1.461344
30,1.3732,1.414906
40,1.3272,1.393354
50,1.3987,1.391146




TrainOutput(global_step=50, training_loss=1.4165775203704833, metrics={'train_runtime': 246.7942, 'train_samples_per_second': 0.81, 'train_steps_per_second': 0.203, 'total_flos': 828677554176000.0, 'train_loss': 1.4165775203704833, 'epoch': 0.027307482250136537})