此文件包含Stage3.2FinetuningForPrompt.ipynb的主要训练代码。
之前为了解决显存被沾满的问题，选择将模型参数都从fp32调整为fp16，显存问题得到解决，
但是，调整后在token_embd和pos_emb,出现了都是nan的问题。
问题的原因半精度导致了溢出。

In [1]:
import json
import os
import urllib
import urllib.request

def download_and_load_file(file_path, url):

    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode('utf-8')
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    
    with open(file_path, "r", encoding='utf-8') as file:
        data = json.load(file)
        return data
    
file_path = "data/instraction_data.json"
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)   

data = download_and_load_file(file_path, url)

print("Number of entries:", len(data))

Number of entries: 1100


In [2]:
def format_input(entry):
    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    input_text = f"\n\n### Input:\n{entry['input']}" if entry['input'] else ""

    return instruction_text + input_text

In [3]:
import torch 
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data

        self.encoded_texts = []
        for entry in data:
            instruction_plus_input = format_input(entry)
            response_text =  f"\n\n### Response:\n{entry['output']}"
            full_text= instruction_plus_input + response_text

            self.encoded_texts.append(
                tokenizer.encode(full_text)
            )
        
    def __getitem__(self, index):
            return self.encoded_texts[index]

    def __len__(self):
            return len(self.data)


In [4]:
def custom_collate_drafe_fn(
        batch,
        pad_token_id=50256,
        ignore_index=-100,
        allowed_max_length=None,
        device="cpu"
):
    batch_max_length = max(len(item)+1 for item in batch) 
    # 手动加一<|endoftext|>, 需要shift

    inputs_lst, targets_lst = [], []

    for item in batch:
        new_item = item.copy()
        new_item += [pad_token_id]

        padded = (
            new_item + [pad_token_id] *
            (batch_max_length - len(new_item)) 
        )

        inputs = torch.tensor(padded[:-1])
        targets = torch.tensor(padded[1:])
        inputs_lst.append(inputs)
        targets_lst.append(targets)

        mask = (targets == pad_token_id) # mask是一个bool张量        
        indices = torch.nonzero(mask).squeeze() # 返回非0值索引

        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index

        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]
    # torch.stack()将多个张量合并成一个批量，增加维度
    # torch.cat()将多个张量合并成一个张量

    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)
    return inputs_tensor, targets_tensor

In [5]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device = torch.device('cpu')

In [6]:
from functools import partial
custom_collate_drafe_fn = partial(custom_collate_drafe_fn, device=device, allowed_max_length=1024) # 固定了两个参数

In [7]:
train_portion = int(len(data) * 0.8)
test_portion = int(len(data) * 0.1)
val_portion = len(data) - train_portion - test_portion

train_data = data[:train_portion]
test_data = data[train_portion: train_portion+test_portion]
val_data = data[train_portion+test_portion:]

print("Val:", len(val_data))
print("Train:", len(train_data))
print("Test:", len(test_data))

Val: 110
Train: 880
Test: 110


In [None]:
from torch.utils.data import DataLoader
import tiktoken

num_workers = min(12, os.cpu_count()//4)
batch_size = 8

tokenizer = tiktoken.get_encoding('gpt2')
train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=custom_collate_drafe_fn,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True
)

val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    collate_fn=custom_collate_drafe_fn,
    drop_last=True
)

test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    collate_fn=custom_collate_drafe_fn,
    drop_last=True
)

In [9]:
#XXX 355M参数的GPT
CHOOSE_MODEL = "gpt2-medium (355M)"
INPUT_PROMPT = "Every effort moves"

BASE_CONFIG = {
    "vocab_size": 50257,
    "context_length": 1024,
    "drop_rate": 0.0,
    "qkv_bias": True
}

model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

import tensorflow as tsf
import tqdm
from GPT import GPTModel
from GPT import load_weights_into_gpt
from gpt_download3 import download_and_load_gpt2

model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")

model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.eval()

print(BASE_CONFIG)

2025-03-19 22:33:13.974491: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-19 22:33:13.991438: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742394794.011425   72930 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742394794.017396   72930 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742394794.032815   72930 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

File already exists and is up-to-date: gpt2/124M/checkpoint




File already exists and is up-to-date: gpt2/124M/encoder.json




File already exists and is up-to-date: gpt2/124M/hparams.json




File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001




File already exists and is up-to-date: gpt2/124M/model.ckpt.index




File already exists and is up-to-date: gpt2/124M/model.ckpt.meta




File already exists and is up-to-date: gpt2/124M/vocab.bpe




File already exists and is up-to-date: gpt2/355M/checkpoint




File already exists and is up-to-date: gpt2/355M/encoder.json




File already exists and is up-to-date: gpt2/355M/hparams.json




File already exists and is up-to-date: gpt2/355M/model.ckpt.data-00000-of-00001




File already exists and is up-to-date: gpt2/355M/model.ckpt.index




File already exists and is up-to-date: gpt2/355M/model.ckpt.meta




File already exists and is up-to-date: gpt2/355M/vocab.bpe
{'vocab_size': 50257, 'context_length': 1024, 'drop_rate': 0.0, 'qkv_bias': True, 'emb_dim': 1024, 'n_layers': 24, 'n_heads': 16}


In [10]:
def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)
    return tokenizer.decode(flat.tolist())

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text)
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor

def train_model_simple(model, train_loader, val_loader, optimizer, device,
                       num_epochs, eval_freq, eval_iter, start_context, tokenizer):
    train_losses, val_losses, track_tokens_seen = [],[],[]
    tokens_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()

            tokens_seen += input_batch.numel()
            global_step += 1
            
            if global_step % eval_freq == 0:
                train_loss, val_loss = eval_model(
                    model, train_loader, val_loader, device, eval_iter
                )

                print(f"Ep {epoch+1} (Step{global_step:06d}):"
                    f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")
                
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)

            
                
        generate_and_print_sampel(
            model, tokenizer, device, start_context
        )

    return train_losses, val_loss, track_tokens_seen


def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    # [(batch_size × context_size), emb_dim] [(batch_size × context_size)]
    loss = torch.nn.functional.cross_entropy(logits.flatten(0,1), target_batch.flatten())
    return loss

def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.0
    if len(data_loader)==0:
        return float('nan')
    elif num_batches is None:
        num_batches =  len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))

    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss
        else:
            break
    
    return total_loss / num_batches

# 在loader上eval
def eval_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()

    return train_loss, val_loss

def generate_and_print_sampel(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(model, encoded,
                    max_new_tokens=50,
                    context_size=context_size
                    )
    decode_text = token_ids_to_text(token_ids, tokenizer)
    print(decode_text.replace("\n", " ")) # Compact print format
    model.train()

def generate_text_simple(model, idx, max_new_tokens, context_size):
    for _ in range(max_new_tokens):
        idx_cond = idx[:context_size]
        with torch.no_grad():
            logits = model(idx_cond)

        logits = logits[:, -1, :]

        idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        idx = torch.cat((idx, idx_next), dim=1)
        
    return idx


In [11]:
# print(next(model.parameters()).dtype)
# model = model.half()
# print(next(model.parameters()).dtype)  

print(next(model.parameters()).device)
model.to(device)
print(next(model.parameters()).device)

cpu
cpu


In [12]:
import time

start_time = time.time()
torch.manual_seed(123)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)

num_epochs = 2


train_losses, val_losses, track_tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device, 
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context=format_input(val_data[0]), tokenizer=tokenizer
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

  from .autonotebook import tqdm as notebook_tqdm


Ep 1 (Step000000):Train loss 2.581, Val loss 2.583
Ep 1 (Step000005):Train loss 1.109, Val loss 1.300
Ep 1 (Step000010):Train loss 1.283, Val loss 0.965
Ep 1 (Step000015):Train loss 1.034, Val loss 0.910
Ep 1 (Step000020):Train loss 0.885, Val loss 0.958
Ep 1 (Step000025):Train loss 0.916, Val loss 0.862
Ep 1 (Step000030):Train loss 0.993, Val loss 0.636
Ep 1 (Step000035):Train loss 0.791, Val loss 1.223
Ep 1 (Step000040):Train loss 0.670, Val loss 0.744
Ep 1 (Step000045):Train loss 0.642, Val loss 0.806
Ep 1 (Step000050):Train loss 0.837, Val loss 0.731
Ep 1 (Step000055):Train loss 0.688, Val loss 0.838
Ep 1 (Step000060):Train loss 0.709, Val loss 0.969
Ep 1 (Step000065):Train loss 0.737, Val loss 0.871
Ep 1 (Step000070):Train loss 0.807, Val loss 1.027
Ep 1 (Step000075):Train loss 0.646, Val loss 0.925
Ep 1 (Step000080):Train loss 0.773, Val loss 0.803
Ep 1 (Step000085):Train loss 0.689, Val loss 0.734
Ep 1 (Step000090):Train loss 0.748, Val loss 0.778
Ep 1 (Step000095):Train loss 0.

KeyboardInterrupt: 