<a href="https://colab.research.google.com/github/Xujia118/Etude_Advanced_NeuralNetwork/blob/main/LearnLoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch

In [3]:
# Baseline: a dense CHANNELS x CHANNELS projection in which every weight is trainable.
BATCH, CHANNELS = 8, 64

X = torch.randn((BATCH, CHANNELS)).float()
weights = torch.randn((CHANNELS, CHANNELS)).float()

out = X @ weights
# A square projection keeps the input shape: [BATCH, CHANNELS].
assert out.shape == (BATCH, CHANNELS)

# Print the plain count. Wrapping the value in braces outside an f-string
# builds a one-element set, which is why the output used to read "{4096}".
num_trainable_params = weights.numel()
print("Num trainable:", num_trainable_params)

Num trainable: {4096}


In [4]:
# LoRA: leave the dense weight W untouched and learn only a low-rank correction.
BATCH, CHANNELS = 8, 64

rank = 2
scaling = 0.5

X = torch.randn(BATCH, CHANNELS).float()
W = torch.randn(CHANNELS, CHANNELS).float()

# Low-rank factors: [CHANNELS, rank] followed by [rank, CHANNELS].
lora_B = torch.randn(CHANNELS, rank).float()
lora_A = torch.randn(rank, CHANNELS).float()

# Frozen path: the ordinary dense projection.
base_out = torch.matmul(X, W)
# Trainable path: project down to `rank` dimensions, then back up.
lora_out = torch.matmul(torch.matmul(X, lora_B), lora_A)

out = base_out + scaling * lora_out

# Only the two factors count as trainable parameters.
num_trainable_params = sum(factor.numel() for factor in (lora_B, lora_A))
print("Trainable LoRA params:", num_trainable_params)

Trainable LoRA params: 256


## Merge a LoRA adapter

In [5]:
import torch
import torch.nn as nn

class LoRALayer(nn.Module):
    """Linear layer with a frozen base weight and a trainable low-rank update.

    The layer computes ``linear(x) + scale * x @ (B @ A).T`` where ``B`` has
    shape ``[output_dim, rank]`` and ``A`` has shape ``[rank, input_dim]``.

    Args:
        input_dim: Size of the last dimension of the input.
        output_dim: Size of the last dimension of the output.
        rank: Inner dimension of the low-rank factors.
        scale: Multiplier applied to the low-rank update (default 0.5).
    """

    def __init__(self, input_dim, output_dim, rank, scale=0.5):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)
        # The base projection is the frozen part of LoRA: only A and B train.
        for base_param in self.linear.parameters():
            base_param.requires_grad_(False)
        # B starts at zero so that B @ A == 0 and the layer initially
        # reproduces the base layer exactly; A keeps a random Gaussian init.
        self.B = nn.Parameter(torch.zeros(output_dim, rank))
        self.A = nn.Parameter(torch.randn(rank, input_dim))
        self.scale = scale

    def forward(self, x):
        """Return the base output plus the scaled low-rank correction.

        ``x`` has shape ``[..., input_dim]``; the result has shape
        ``[..., output_dim]``.
        """
        # Equivalent to x @ (B @ A).T, but goes through the rank-sized
        # bottleneck instead of building the full [output_dim, input_dim] matrix.
        lora_out = (x @ self.A.T) @ self.B.T
        return self.linear(x) + self.scale * lora_out

input_dim = 64
output_dim = 32
rank = 16


lora = LoRALayer(input_dim, output_dim, rank)
input_tensor = torch.randn(1, input_dim)
output_tensor = lora(input_tensor)

# Shape walk-through of LoRALayer.forward:
#
#   x             : [batch, input]      (here batch == 1)
#   linear(x)     : [batch, output]     (nn.Linear stores its weight as
#                                        [output, input], i.e. the transpose
#                                        of the canonical W = [input, output])
#   B             : [output, rank]
#   A             : [rank, input]
#   B @ A         : [output, input]
#   (B @ A).T     : [input, output]
#   x @ (B @ A).T : [batch, output]
#
#   output = linear(x) + scale * x @ (B @ A).T
#          = [batch, output] + scale * [batch, input] @ [input, output]
#          = [batch, output]
print("input:", input_tensor.shape)
print("output:", output_tensor.shape)


input: torch.Size([1, 64])
output: torch.Size([1, 32])


In [6]:
# Fold a LoRA update into the base weight so that a single matmul is enough.
scale = 2 / rank

# Base weight kept in the canonical [input, output] layout.
W = torch.randn((input_dim, output_dim))
# Low-rank factors: [output, rank] and [rank, input].
B = torch.randn((output_dim, rank))
A = torch.randn((rank, input_dim))

print("W:", W.shape)

# B @ A is [output, input]; transpose it so it lines up with W before adding.
merged_W = W + scale * torch.matmul(B, A).t()
print("merged W:", merged_W.shape)

W: torch.Size([64, 32])
merged W: torch.Size([64, 32])


## LoRA with HuggingFace

In [3]:
from transformers import AutoTokenizer

# Hub id of the checkpoint whose tokenizer is loaded below.
model_tag = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_tag)

# Show the pad token as shipped with the tokenizer.
print("before:", tokenizer.pad_token)

# When the tokenizer has no pad token, reuse the EOS token for padding and
# report the new value; the "after" line is printed only in that case.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("after:", tokenizer.pad_token)


before: None
after: <|endoftext|>


In [1]:
from datasets import load_dataset

# Load the Open-Platypus dataset by its hub id and print the returned object
# to inspect the available splits, columns and row counts.
dataset = load_dataset("garage-bAInd/Open-Platypus")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction', 'data_source'],
        num_rows: 24926
    })
})


In [9]:
def test_tokenize(words):
    """Tokenize ``words`` with the notebook-level ``tokenizer`` and return the result."""
    tokens = tokenizer.tokenize(words)
    return tokens


# Quick check: show how the tokenizer splits a short sentence.
print(test_tokenize("I love China!"))



['I', 'Ġlove', 'ĠChina', '!']


In [4]:
# Keep only examples with an empty `input` field whose instruction plus answer
# tokenizes to fewer than 256 tokens, then drop the columns that are no longer
# needed.
# NOTE: this cell rebinds `dataset`; after it runs the `input` column is gone,
# so reload the dataset before running the cell a second time.
dataset = (
    dataset
    .filter(lambda x: x['input'] == '')
    .filter(lambda x: len(tokenizer.tokenize(x['instruction'] + x['output'])) < 256)
    .remove_columns(['input', 'data_source'])
)

# Hold out 10% for evaluation. The fixed seed makes the partition reproducible
# across kernel restarts; without it every run shuffles differently.
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['output', 'instruction'],
        num_rows: 7324
    })
    test: Dataset({
        features: ['output', 'instruction'],
        num_rows: 814
    })
})


In [11]:
# Peek at the first training example to see the remaining columns.
dataset['train'][0]

{'output': 'It seems easiest to first solve for $x$ and then for $y$. We can solve for $x$ by adding the two equations together, giving $2x = 16$, or $x = 8$. Plugging $x$ into the first equation gives $8 + y = 7$, so $y = -1$. So, $x\\cdot y = -8$.',
 'instruction': 'If $x + y = 7$ and $x - y = 9$, find the product of $x$ and $y$.'}

In [5]:
# Alpaca-style prompt template. The two `{}` slots are filled positionally:
# first with the instruction, then with the response.
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}"""

# End-of-sequence marker appended to every formatted training example.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Build the full training text for every example in a batch.

    ``examples`` is a batch with parallel "instruction" and "output" columns.
    Each pair is inserted into ``alpaca_prompt`` and terminated with
    ``EOS_TOKEN``; without that terminator generation would go on forever.

    Returns a dict with a single "text" key holding the formatted strings.
    """
    pairs = zip(examples["instruction"], examples["output"])
    return {"text": [alpaca_prompt.format(task, answer) + EOS_TOKEN for task, answer in pairs]}

# Add a "text" column (formatted prompt + completion + EOS_TOKEN) to the
# dataset; `batched=True` hands formatting_prompts_func whole batches of rows.
dataset = dataset.map(formatting_prompts_func, batched=True)

# Show one fully formatted training example.
print(dataset['train'][0]['text'])

Map:   0%|          | 0/7324 [00:00<?, ? examples/s]

Map:   0%|          | 0/814 [00:00<?, ? examples/s]

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
A triangle has three sides of the following side lengths: $7$, $10$, and $x^2$. What are all of the positive integer values of $x$ such that the triangle exists? Separate your answers using commas and express them in increasing order.

### Response:
For a triangle to exist, the sum of two sides of the triangle must be greater than the third. Therefore, we have three formulas: $x^2+7>10 \to x^2>3$, $x^2+10>7 \to x^2>-3$, and $7+10>x^2 \to x^2<17$. Thus, we have two quadratics, $x^2>3$ and $x^2<17$. Therefore, possible values for $x$ are $2, 3, \text{ and } 4$.<|endoftext|>


In [6]:
max_length = 256

# Every example is forced to exactly `max_length` tokens: longer sequences
# (e.g. 270 tokens) are truncated and shorter ones (e.g. 240 tokens) are
# padded, so a batch ends up with shape [BATCH, max_length].

def generate_and_tokenize_prompt(prompt):
    """Tokenize one formatted example and attach causal-LM labels.

    Args:
        prompt: A mapping with a "text" entry holding the formatted prompt.

    Returns:
        The tokenizer output with an added "labels" entry. Labels equal the
        input ids on real tokens and -100 on padded positions.
    """
    result = tokenizer(
        prompt["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    # Padding must not contribute to the loss. The pad token is the EOS token
    # in this notebook, so copying input_ids verbatim would train the model to
    # emit long runs of EOS. -100 is the label value the loss ignores; the
    # attention mask tells real tokens (1) apart from padding (0), so the
    # genuine EOS at the end of the text is still learned.
    result["labels"] = [
        token_id if is_real else -100
        for token_id, is_real in zip(result["input_ids"], result["attention_mask"])
    ]
    return result

# Tokenize a single training example and inspect what comes back.
tokenized_output = generate_and_tokenize_prompt(dataset['train'][0])

print(f"Tokenizer eos token id:  {tokenizer.eos_token_id}")
print(tokenized_output)

# Names of the fields returned for the example.
print("keys:", tokenized_output.keys())

# Attention mask of the example (by tokenizer convention 1 = real token, 0 = padding).
print(tokenized_output['attention_mask'])



Tokenizer eos token id:  50256
{'input_ids': [21106, 318, 281, 12064, 326, 8477, 257, 4876, 13, 19430, 257, 2882, 326, 20431, 32543, 262, 2581, 13, 198, 198, 21017, 46486, 25, 198, 32, 22950, 468, 1115, 5389, 286, 262, 1708, 1735, 20428, 25, 720, 22, 47113, 720, 940, 47113, 290, 720, 87, 61, 17, 35307, 1867, 389, 477, 286, 262, 3967, 18253, 3815, 286, 720, 87, 3, 884, 326, 262, 22950, 7160, 30, 8621, 30748, 534, 7429, 1262, 725, 292, 290, 4911, 606, 287, 3649, 1502, 13, 198, 198, 21017, 18261, 25, 198, 1890, 257, 22950, 284, 2152, 11, 262, 2160, 286, 734, 5389, 286, 262, 22950, 1276, 307, 3744, 621, 262, 2368, 13, 8447, 11, 356, 423, 1115, 32126, 25, 720, 87, 61, 17, 10, 22, 29, 940, 3467, 1462, 2124, 61, 17, 29, 18, 47113, 720, 87, 61, 17, 10, 940, 29, 22, 3467, 1462, 2124, 61, 17, 29, 12, 18, 47113, 290, 720, 22, 10, 940, 29, 87, 61, 17, 3467, 1462, 2124, 61, 17, 27, 1558, 35307, 6660, 11, 356, 423, 734, 15094, 10366, 873, 11, 720, 87, 61, 17, 29, 18, 3, 290, 720, 87, 61, 17, 27, 155

In [7]:
# Tokenize both splits with the same per-example function (train first, then test).
tokenized_train_dataset, tokenized_test_dataset = (
    dataset[split_name].map(generate_and_tokenize_prompt)
    for split_name in ("train", "test")
)

Map:   0%|          | 0/7324 [00:00<?, ? examples/s]

Map:   0%|          | 0/814 [00:00<?, ? examples/s]

## Load in base model

In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig  # <--- new import

# 1️⃣ Model ID
base_model_id = "microsoft/phi-2"

# 2️⃣ Configure quantization
# Plain 8-bit loading. Double quantization and the "nf4"/"fp4" quant types are
# 4-bit-only options (`bnb_4bit_use_double_quant`, `bnb_4bit_quant_type`);
# the `bnb_8bit_*` spellings are not recognised and were silently ignored, so
# they are omitted here rather than suggesting a configuration that is not applied.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # load weights in 8-bit
)

# 3️⃣ Load model with the quantization config
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config,  # <--- pass BitsAndBytesConfig here
)

# 4️⃣ Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
# This rebinds the notebook-level `tokenizer`, so the pad-token fallback has to
# be applied again; otherwise later padding calls would have no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
# Display the module tree of the loaded model.
model

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
          (dense): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear8bitLt(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear8bitLt(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (rotary_emb): PhiRotaryEmbedding()
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (final_