# EXERCISE 7.1 CHANGING PROMPT STYLES
After finetuning the model with the Alpaca prompt style, try the Phi-3 prompt style
shown in figure 7.4 and observe if it affects the response quality of the model.

## 微调

In [None]:
# Make the main chapter code importable: it is looked up two directory
# levels above the current working directory, in ch07/01_main-chapter-code.
import sys
import os
package_path = os.path.dirname(os.path.dirname(os.getcwd()))
file_path = os.path.join(package_path, "ch07", "01_main-chapter-code")
print(file_path)
sys.path.append(file_path)

import torch
# Prefer CUDA, then Apple's MPS backend, and fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device(
        "mps" if torch.backends.mps.is_available() else "cpu"
    )

In [None]:
def format_input(entry):
    """Build the Phi-3 style prompt for one instruction entry.

    The optional ``input`` text is part of the user's request, so it is
    placed inside the user turn, before the ``<|assistant|>`` tag. Entries
    with an empty or missing ``input`` produce a prompt that contains only
    the instruction.

    Args:
        entry: Mapping with an ``"instruction"`` string and an optional
            ``"input"`` string.

    Returns:
        The prompt text, ending with ``<|assistant|>`` and a newline.
    """
    user_turn = f"<|user|>\n{entry['instruction']}\n"

    # Empty or absent input adds nothing to the prompt.
    extra_input = entry.get("input")
    if extra_input:
        user_turn += f"{extra_input}\n"

    return user_turn + "<|assistant|>\n"

In [None]:
import torch
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
    """Instruction dataset whose samples are tokenized once, up front.

    Each entry is rendered as the prompt returned by ``format_input``
    followed by a ``### Response:`` section holding the entry's output, and
    the combined text is encoded with the given tokenizer.
    """

    def __init__(self, data, tokenizer):
        self.data = data
        # One token-id sequence per entry: prompt text plus response text.
        self.encoded_texts = [
            tokenizer.encode(
                format_input(sample)
                + f"\n\n### Response:\n{sample['output']}"
            )
            for sample in data
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.encoded_texts[index]

In [None]:
# Collate function: pads a batch and derives shifted input/target tensors
def custom_collate_fn(
        batch,
        pad_token_id=50256,
        ignore_index=-100,
        allowed_max_length=None,
        device="cpu"
):
    """Turn a batch of token-id lists into padded input and target tensors.

    Every sequence receives one ``pad_token_id`` as an end-of-text marker and
    is then padded to the longest sequence in the batch plus one. Inputs are
    the padded sequence without its last token; targets are the same sequence
    without its first token. In the targets only the first ``pad_token_id``
    is kept, the remaining ones are replaced by ``ignore_index``.

    Args:
        batch: List of token-id lists.
        pad_token_id: Token id used for the end-of-text marker and padding.
        ignore_index: Value written over surplus padding in the targets.
        allowed_max_length: Optional cap applied to both tensors' length.
        device: Device the stacked tensors are moved to.

    Returns:
        Tuple ``(inputs, targets)`` of stacked tensors.
    """
    # Width of a padded row: longest sequence plus its end-of-text token.
    width = max(len(seq) + 1 for seq in batch)

    input_rows, target_rows = [], []
    for seq in batch:
        # Work on a copy so the dataset's own list is left untouched.
        extended = seq.copy()
        extended.append(pad_token_id)
        extended.extend([pad_token_id] * (width - len(extended)))

        # Shift by one position to obtain next-token targets.
        shifted_in = torch.tensor(extended[:-1])
        shifted_out = torch.tensor(extended[1:])

        # Keep the first pad token as a real target, ignore the others.
        pad_positions = torch.nonzero(shifted_out == pad_token_id).squeeze()
        if pad_positions.numel() > 1:
            shifted_out[pad_positions[1:]] = ignore_index

        # Optional truncation to the maximum allowed length.
        if allowed_max_length is not None:
            shifted_in = shifted_in[:allowed_max_length]
            shifted_out = shifted_out[:allowed_max_length]

        input_rows.append(shifted_in)
        target_rows.append(shifted_out)

    return (
        torch.stack(input_rows).to(device),
        torch.stack(target_rows).to(device),
    )

In [None]:
from functools import partial
# Pre-bind the device and the maximum sequence length so the resulting
# callable only needs the batch argument when used as a collate function
customized_collate_fn = partial(
    custom_collate_fn,
    device=device,
    allowed_max_length=1024
)

In [None]:
from torch.utils.data import DataLoader

# Loader settings shared by the train, validation and test loaders
num_workers = 0
batch_size = 8

# Seed the RNG before the loaders are created
torch.manual_seed(123)

# NOTE(review): train_data, val_data, test_data and tokenizer are not defined
# in the cells shown here — confirm they are created before this cell runs.

# Training loader: shuffled, incomplete final batch dropped
train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers
)

# Validation loader: original order, every sample kept
val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

# Test loader: original order, every sample kept
test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

In [None]:
from gpt_download import download_and_load_gpt2
from previous_chapters import GPTModel, load_weights_into_gpt

# Settings shared by every model size
BASE_CONFIG = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "drop_rate": 0.0,        # Dropout rate
    "qkv_bias": True         # Query-key-value bias
}

# Size-specific settings, keyed by a label that ends in "(<size>)"
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

CHOOSE_MODEL = "gpt2-medium (355M)"

# Merge the chosen size's settings into the shared configuration
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

# Extract the size token from the label, e.g. "355M" from "gpt2-medium (355M)"
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(
    model_size=model_size,
    models_dir="gpt2"
)

# Build the model, load the downloaded weights, move it to the selected
# device and switch it to evaluation mode
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.to(device)
model.eval()

In [None]:
from previous_chapters import (
    calc_loss_loader,
    train_model_simple
)

# Train the model and report the wall-clock time taken
import time

start_time = time.time()

# Re-seed so the training run starts from a fixed RNG state
torch.manual_seed(123)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)

num_epochs = 2

# The formatted first validation entry is passed as the start context
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context=format_input(val_data[0]), tokenizer=tokenizer
)

end_time = time.time()
# Convert the elapsed seconds to minutes for the report
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

In [None]:
# Plot the loss curves
from previous_chapters import plot_losses

# Evenly spaced epoch values, one per recorded training loss
epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)

In [None]:
torch.manual_seed(123)

# Print the prompt, the reference output and the model's reply for the first
# three test entries.
# NOTE(review): generate, text_to_token_ids and token_ids_to_text are not
# imported in the cells shown here — confirm they are already in scope.
for entry in test_data[:3]:

    # Build the prompt for this entry
    input_text = format_input(entry)

    # Generate a reply
    token_ids = generate(
        model=model,
        idx=text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size=BASE_CONFIG["context_length"],
        eos_id=50256
    )
    # The decoded text must be bound to the name that is sliced below; it was
    # previously assigned to a misspelled variable.
    generated_text = token_ids_to_text(token_ids, tokenizer)
    # Keep only the text after the prompt and drop the response header
    response_text = (
        generated_text[len(input_text):]
        .replace("### Response:", "")
        .strip()
    )

    # Show the reference output next to the model's reply
    print(input_text)
    print(f"\nCorrect response:\n>> {entry['output']}")
    print(f"\nModel response:\n>> {response_text.strip()}")
    print("-------------------------------------")

In [None]:
# Generate a reply for every test entry and save the results to disk
import json

from tqdm import tqdm

# NOTE(review): generate, text_to_token_ids and token_ids_to_text are not
# imported in the cells shown here — confirm they are already in scope.
for i, entry in tqdm(enumerate(test_data), total=len(test_data)):

    input_text = format_input(entry)

    token_ids = generate(
        model=model,
        idx=text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size=BASE_CONFIG["context_length"],
        eos_id=50256
    )
    generated_text = token_ids_to_text(token_ids, tokenizer)
    # Keep only the text after the prompt and drop the response header
    response_text = (
        generated_text[len(input_text):]
        .replace("### Response:", "")
        .strip()
    )

    # Store the reply alongside the original entry
    test_data[i]["model_response"] = response_text

# json is imported in this cell because it is needed here, before any other
# visible cell imports it.
with open(
    "instruction-data-with-response-phi3.json", "w", encoding="utf-8"
) as file:
    json.dump(test_data, file, indent=4)

In [None]:
import re

# Build the file name from the model label with spaces and parentheses
# removed, e.g. "gpt2-medium (355M)" -> "gpt2-medium355M-phi3-sft.pth"
file_name = f"{re.sub(r'[ ()]', '', CHOOSE_MODEL)}-phi3-sft.pth"
# Save only the model's state dict
torch.save(model.state_dict(), file_name)
print(f"Model saved as {file_name}")

## 效果评估

In [None]:
# 检测ollama是否正在运行
import psutil 

def check_if_running(process_name):
    """Report whether a process whose name contains ``process_name`` exists.

    Args:
        process_name: Substring to look for in the names of running
            processes.

    Returns:
        True if at least one process name contains ``process_name``,
        otherwise False.
    """
    # psutil may report None for a name it could not read; treat that as an
    # empty name instead of letting the substring test raise TypeError.
    return any(
        process_name in (proc.info["name"] or "")
        for proc in psutil.process_iter(["name"])
    )

# Stop early with a clear message when no Ollama process is found
ollama_running = check_if_running("ollama")

if not ollama_running:
    raise RuntimeError("Ollama not running. Launch ollama before proceeding.")
# Reuse the result obtained above instead of scanning the processes again
print("Ollama running:", ollama_running)

In [None]:
# 通过REST api访问ollama
import urllib.request

def query_model(
        prompt,
        model="llama3",
        url="http://localhost:11434/api/chat"
):
    """Send one user prompt to a chat endpoint and return the reply text.

    The request is posted as JSON with a fixed seed, zero temperature and a
    context size of 2048. The response is read line by line; each non-blank
    line is parsed as a JSON object and its ``message.content`` field is
    appended to the reply.

    Args:
        prompt: Text sent as the single user message.
        model: Model name placed in the request.
        url: Endpoint the request is posted to.

    Returns:
        The concatenated ``message.content`` of all response lines.
    """
    # Imported here so the function works regardless of which other cells
    # have already been executed.
    import json
    import urllib.request

    # Build the request data
    data = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "options": {
            "seed": 123,
            "temperature": 0.,
            "num_ctx": 2048
        }
    }

    # Serialize the dict to JSON bytes
    payload = json.dumps(data).encode("utf-8")
    request = urllib.request.Request(
        url,
        data=payload,
        method="POST"
    )
    request.add_header("Content-Type", "application/json")

    # Collect the content of every streamed line
    chunks = []
    with urllib.request.urlopen(request) as response:
        for raw_line in response:
            line = raw_line.decode("utf-8").strip()
            if not line:
                # A blank line carries no JSON object; skip it rather than
                # failing in json.loads.
                continue
            chunks.append(json.loads(line)["message"]["content"])

    return "".join(chunks)

# Use a dedicated name for the Ollama model tag: rebinding `model` here would
# replace the fine-tuned GPT model object that later cells still call
# methods on.
ollama_model = "llama3"
result = query_model("What do Llamas eat?", ollama_model)
print(result)

In [None]:
# Wrap the scoring request in a reusable function
def generate_model_scores(json_data, json_key, model="llama3"):
    """Ask the judge model for an integer score for every entry.

    For each entry a prompt is built from the formatted input, the reference
    output and the response stored under ``json_key``. Replies that cannot be
    converted to ``int`` are reported and left out of the result.

    Args:
        json_data: Iterable of entry dicts.
        json_key: Key under which each entry stores the response to score.
        model: Model name passed on to ``query_model``.

    Returns:
        List of the integer scores that could be parsed.
    """
    collected = []

    for item in tqdm(json_data, desc="Scoring entries"):
        request_text = (
            f"Given the input `{format_input(item)}` "
            f"and correct output `{item['output']}`, "
            f"score the model response `{item[json_key]}`"
            f" on a scale from 0 to 100, where 100 is the best score. "
            f"Respond with the integer number only."
        )
        raw_score = query_model(request_text, model)
        try:
            value = int(raw_score)
        except ValueError:
            print(f"Could not convert score: {raw_score}")
        else:
            collected.append(value)

    return collected

scores = generate_model_scores(test_data, "model_response")
print(f"Number of scores: {len(scores)} of {len(test_data)}")
# Every reply may have failed to parse, so guard the division by the count.
if scores:
    print(f"Average score: {sum(scores)/len(scores):.2f}\n")
else:
    print("Average score: n/a (no valid scores)\n")

In [None]:
# Show the individual scores as the cell's output
scores

# EXERCISE 7.2 INSTRUCTION AND INPUT MASKING
After completing the chapter and finetuning the model with the InstructionDataset
implemented in this section, replace the instruction and input tokens with the -100
mask to implement the instruction masking method illustrated in Figure 7.13. Then,
evaluate whether this has a positive effect on model performance.

In [None]:
# 主要的思路是：在计算loss时，进行mask
# 这里的loss计算，主要是使用torch.nn.functional.cross_entropy(logits, labels)
# 将需要mask的token对应的label设置为-100，然后计算loss就能实现mask的效果

In [None]:
import torch
from torch.utils.data import Dataset

class InstructionDataset_mask(Dataset):
    """Instruction dataset that also records each prompt's token length.

    Every item is a dict with two keys:

    * ``"text"``: token ids of the prompt followed by the response section.
    * ``"prefix_len"``: number of tokens the prompt alone encodes to, so a
      collate function can mask the prompt positions in the targets.
    """

    def __init__(self, data, tokenizer):
        self.data = data
        self.encoded_texts = []
        for entry in data:
            # Prompt built from the instruction and optional input
            instruction_plus_input = format_input(entry)
            # Response section
            response_text = f"\n\n### Response:\n{entry['output']}"
            # Full training text
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
                {
                    "text": tokenizer.encode(full_text),
                    # The mask is applied to token positions, so the prefix
                    # length is counted in tokens, not in characters.
                    "prefix_len": len(
                        tokenizer.encode(instruction_plus_input)
                    ),
                }
            )

    def __getitem__(self, index):
        return self.encoded_texts[index]

    def __len__(self):
        return len(self.data)

In [None]:
# Collate function: pads a batch, derives shifted input/target tensors and
# can optionally mask the prompt tokens in the targets
def custom_collate_fn(
        batch,
        pad_token_id=50256,
        ignore_index=-100,
        allowed_max_length=None,
        device="cpu",
        ignore_prompt_tokens=False
):
    """Turn a batch into padded input and target tensors.

    Every sequence receives one ``pad_token_id`` as an end-of-text marker and
    is padded to the longest sequence in the batch plus one. Inputs drop the
    last token, targets drop the first. In the targets only the first
    ``pad_token_id`` is kept; the remaining ones become ``ignore_index``.

    Args:
        batch: List of token-id lists, or, when ``ignore_prompt_tokens`` is
            True, list of dicts with ``"text"`` (token-id list) and
            ``"prefix_len"`` (number of prompt tokens at the start of
            ``"text"``).
        pad_token_id: Token id used for the end-of-text marker and padding.
        ignore_index: Value written over masked target positions.
        allowed_max_length: Optional cap applied to both tensors' length.
        device: Device the stacked tensors are moved to.
        ignore_prompt_tokens: When True, target positions that correspond to
            prompt tokens are set to ``ignore_index``.

    Returns:
        Tuple ``(inputs, targets)`` of stacked tensors.
    """
    # Separate the token sequences from their prompt lengths first, so the
    # padding width is computed from token counts rather than from the number
    # of keys in each item dict.
    if ignore_prompt_tokens:
        sequences = [item["text"] for item in batch]
        prefix_lens = [item["prefix_len"] for item in batch]
    else:
        sequences = list(batch)
        prefix_lens = [0] * len(sequences)

    # Pad to the longest sequence plus its end-of-text token
    batch_max_length = max(len(seq) + 1 for seq in sequences)

    inputs_lst, targets_lst = [], []
    for seq, prefix_len in zip(sequences, prefix_lens):
        new_item = seq.copy()
        # Append the end-of-text token, then pad to the common width
        new_item += [pad_token_id]
        padded = (
            new_item + [pad_token_id] *
            (batch_max_length - len(new_item))
        )
        # Inputs drop the last token, targets drop the first
        inputs = torch.tensor(padded[:-1])
        targets = torch.tensor(padded[1:])

        # Targets are shifted left by one, so the prompt occupies the first
        # prefix_len - 1 target positions; position prefix_len - 1 already
        # holds the first response token and must stay trainable.
        if prefix_len > 1:
            targets[:prefix_len - 1] = ignore_index

        # Keep the first pad token as a real target, ignore the others
        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index

        # Optional truncation to the maximum allowed length
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Stack the rows into batch tensors
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor

# EXERCISE 7.3 FINETUNING ON THE ORIGINAL ALPACA DATASET
The so-called Alpaca dataset by researchers at Stanford is one of the earliest and
most popular openly shared instruction datasets, consisting of 52,002 entries. As an
alternative to the instruction-data.json file we use in this chapter, consider
finetuning an LLM on this dataset. The dataset is available at the following URL:
https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json
This dataset contains 52,002 entries, which is approximately 50 times more than
those we used in this chapter, and most entries are longer as well. Thus, it's highly
recommended to conduct the training using a GPU to accelerate the finetuning
process. If you encounter out-of-memory errors, consider reducing the batch_size
from 8 to 4, 2, or even 1. Additionally, lowering the allowed_max_length from 1024
to 512 or 256 can further help manage memory issues.

In [None]:
import json
import os
import urllib

def download_and_load_file(file_path, url):
    """Load JSON data from ``file_path``, downloading it first if missing.

    Args:
        file_path: Local path of the JSON file.
        url: Location the file is fetched from when ``file_path`` does not
            exist yet.

    Returns:
        The object parsed from the JSON file.
    """
    # A bare "import urllib" does not guarantee that the request submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)

    # Always parse from disk so both paths return the same kind of result
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    return data

In [None]:
# Local file name for the dataset and the location it is downloaded from
file_path = "alpaca_data.json"
url = (
    "https://raw.githubusercontent.com"
    "/tatsu-lab/stanford_alpaca/main/alpaca_data.json"
)

# Download the file if it is not present yet, then load it
data = download_and_load_file(file_path, url)
print("Number of entries:", len(data))

# EXERCISE 7.4 PARAMETER-EFFICIENT FINETUNING WITH LORA
To instruction finetune an LLM more efficiently, modify the code in this chapter to use
the low-rank adaptation method (LoRA) from appendix E. Compare the training
runtime and model performance before and after the modification.

In [None]:
import math

# LoRA layer: a trainable low-rank update
class LoRALayer(torch.nn.Module):
    """Low-rank update computed as ``alpha * (x @ A @ B)``.

    ``A`` has shape ``(in_dim, rank)`` and is initialized with Kaiming
    uniform; ``B`` has shape ``(rank, out_dim)`` and starts at zero, so a
    freshly created layer outputs zeros.
    """

    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        self.alpha = alpha
        # Fill the down-projection before wrapping it as a parameter.
        down_proj = torch.empty(in_dim, rank)
        torch.nn.init.kaiming_uniform_(down_proj, a=math.sqrt(5))
        self.A = torch.nn.Parameter(down_proj)
        # Zero up-projection
        self.B = torch.nn.Parameter(torch.zeros(rank, out_dim))

    def forward(self, x):
        projected = x @ self.A
        return self.alpha * (projected @ self.B)

In [None]:
# Linear layer combined with a LoRA update
class LinearWithLoRA(torch.nn.Module):
    """Wrap a linear layer and add a ``LoRALayer`` output to its result.

    The wrapped layer is stored as ``linear``; the LoRA update, sized from
    the layer's ``in_features`` and ``out_features``, is stored as ``lora``.
    """

    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        in_dim, out_dim = linear.in_features, linear.out_features
        self.lora = LoRALayer(in_dim, out_dim, rank, alpha)

    def forward(self, x):
        base_out = self.linear(x)
        return base_out + self.lora(x)

In [None]:
# Replace every linear layer in the model with a LoRA-wrapped one
def replace_linear_with_lora(model, rank, alpha):
    """Swap each ``torch.nn.Linear`` in ``model`` for ``LinearWithLoRA``.

    Direct children that are linear layers are replaced in place; every other
    child is searched recursively.

    Args:
        model: Module whose children are inspected and modified in place.
        rank: Rank passed to each ``LinearWithLoRA``.
        alpha: Scaling factor passed to each ``LinearWithLoRA``.
    """
    for child_name, child in model.named_children():
        if not isinstance(child, torch.nn.Linear):
            # Not a linear layer: look for linear layers further down.
            replace_linear_with_lora(child, rank, alpha)
            continue
        setattr(model, child_name, LinearWithLoRA(child, rank, alpha))

In [None]:
# Freeze the original model's parameters
# Count the trainable parameters before freezing
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable parameters before: {total_params:,}")

# Turn off gradient tracking for every existing parameter
for param in model.parameters():
    param.requires_grad = False

# Count again to confirm nothing is trainable any more
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable parameters after: {total_params:,}")

In [None]:
# Replace the linear layers with trainable LoRA-wrapped layers
# NOTE(review): the LoRA parameters are created after the earlier
# model.to(device) call — confirm the model is moved to the device again
# before training.
replace_linear_with_lora(model, rank=16, alpha=16)

# Count the parameters that are trainable after the replacement
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable parameters after: {total_params:,}")