In [1]:
import os, glob, gzip, json, random, math, time
from pathlib import Path

from tqdm.auto import tqdm
import torch


In [2]:
from dotenv import load_dotenv
load_dotenv()

file_url = os.getenv("DATA_PATH")
assert file_url, "DATA_PATH 没读到，请检查 .env 或环境变量"

pattern = file_url + "/multilingual/c4-zh.*.json.gz"

all_files = sorted(glob.glob(pattern))
print("Matched files:", len(all_files))
assert len(all_files) >= 32, f"文件不够 32 个，目前只有 {len(all_files)}"

data_files = all_files[:32]
print("Using files:", len(data_files))
print("\n".join(data_files[:5]), "...\n", data_files[-1])


Matched files: 1024
Using files: 32
D:/Data/AI/LLM/Text/allenai_c4/multilingual\c4-zh.tfrecord-00000-of-01024.json.gz
D:/Data/AI/LLM/Text/allenai_c4/multilingual\c4-zh.tfrecord-00001-of-01024.json.gz
D:/Data/AI/LLM/Text/allenai_c4/multilingual\c4-zh.tfrecord-00002-of-01024.json.gz
D:/Data/AI/LLM/Text/allenai_c4/multilingual\c4-zh.tfrecord-00003-of-01024.json.gz
D:/Data/AI/LLM/Text/allenai_c4/multilingual\c4-zh.tfrecord-00004-of-01024.json.gz ...
 D:/Data/AI/LLM/Text/allenai_c4/multilingual\c4-zh.tfrecord-00031-of-01024.json.gz


In [3]:
def peek_one_record(path):
    with gzip.open(path, "rt", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            return json.loads(line)
    return None

sample = peek_one_record(data_files[0])
print("Keys:", list(sample.keys())[:30])
print("Sample snippet:", str(sample)[:300])

# 你可以把 TEXT_FIELD 改成真实字段名，比如 "content" / "raw" / "text"
TEXT_FIELD = "text"


Keys: ['text', 'timestamp', 'url']
Sample snippet: {'text': '锘� 浜���88涓�杞藉�姹�绠＄��灞��伴�诲��瑷�浜恒���荤�娴�甯����ヨ�卞�哄腑�藉�￠�㈡�跨��渚�琛��归�浼�锛�浼���澶�姹�绠＄�����姐��淇�杩�璺ㄥ�璐告����璧�渚垮�╁�����虫�跨�����碉�瀹�褰�锛�_�跨��娉�瑙�瑙ｈ��_浜���88涓�杞藉�姹�绠＄��灞��ㄦ�风�绔�\n浜���88涓�杞藉�姹�绠＄��灞��伴�诲��瑷�浜恒���荤�娴�甯����ヨ�卞�哄腑�藉�￠�㈡�跨��渚�琛��归�浼�锛�浼���澶�姹�绠＄�����姐��淇�杩�璺ㄥ�璐告����璧�渚垮�╁�����虫�跨�����碉


In [None]:
def iter_texts_jsonl_gz(
    files,
    text_field="text",
    max_docs=None,
    keep_prob=1.0,
    seed=42,
    shuffle_files=True,
    min_chars=1,
):
    rng = random.Random(seed)
    files = list(files)
    if shuffle_files:
        rng.shuffle(files)

    seen = 0
    for fp in files:
        with gzip.open(fp, "rt", encoding="utf-8") as f:
            for line in f:
                if keep_prob < 1.0 and rng.random() > keep_prob:
                    continue
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue
                txt = obj.get(text_field, None)
                if not isinstance(txt, str):
                    continue
                if len(txt) < min_chars:
                    continue
                yield txt
                seen += 1
                if max_docs is not None and seen >= max_docs:
                    return


In [5]:
from tokenizers import ByteLevelBPETokenizer
from transformers import PreTrainedTokenizerFast

TOKENIZER_VOCAB_SIZE = 4096

# tokenizer 训练采样量：建议先 50k~300k，太大真的会慢
TOKENIZER_MAX_DOCS = 200_000

special_tokens = ["<pad>", "<unk>", "<bos>", "<eos>"]

tok = ByteLevelBPETokenizer()

train_iter = iter_texts_jsonl_gz(
    data_files,
    text_field=TEXT_FIELD,
    max_docs=TOKENIZER_MAX_DOCS,
    keep_prob=1.0,
    seed=123,
    shuffle_files=True,
    min_chars=20,
)

print("Training tokenizer ...")
tok.train_from_iterator(
    train_iter,
    vocab_size=TOKENIZER_VOCAB_SIZE,
    min_frequency=2,
    special_tokens=special_tokens,
)

hf_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tok,
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<bos>",
    eos_token="<eos>",
)

print("Vocab size:", hf_tokenizer.vocab_size)
print("Specials:", hf_tokenizer.special_tokens_map)


Training tokenizer ...
Vocab size: 4096
Specials: {'bos_token': '<bos>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'pad_token': '<pad>'}


In [7]:
OUT_DIR = Path("./NanoZhGPT")
OUT_DIR.mkdir(parents=True, exist_ok=True)

tok_dir = OUT_DIR / "tokenizer"
tok_dir.mkdir(exist_ok=True)

hf_tokenizer.save_pretrained(tok_dir)
print("Tokenizer saved to:", tok_dir.resolve())


Tokenizer saved to: D:\Project\AI\NanoZhGPT\NanoZhGPT\tokenizer


In [8]:
from torch.utils.data import IterableDataset, DataLoader

class TokenBlockDataset(IterableDataset):
    def __init__(
        self,
        files,
        tokenizer,
        text_field="text",
        block_size=512,
        max_steps=None,
        batch_size=8,
        keep_prob=0.05,
        seed=42,
    ):
        self.files = list(files)
        self.tokenizer = tokenizer
        self.text_field = text_field
        self.block_size = block_size
        self.max_steps = max_steps
        self.batch_size = batch_size
        self.keep_prob = keep_prob
        self.seed = seed

    def __iter__(self):
        eos_id = self.tokenizer.eos_token_id
        buf = []
        produced = 0

        text_iter = iter_texts_jsonl_gz(
            self.files,
            text_field=self.text_field,
            max_docs=None,
            keep_prob=self.keep_prob,
            seed=self.seed,
            shuffle_files=True,
            min_chars=20,
        )

        for text in text_iter:
            ids = self.tokenizer(text, add_special_tokens=False)["input_ids"]
            if not ids:
                continue
            ids.append(eos_id)
            buf.extend(ids)

            while len(buf) >= self.block_size:
                block = buf[: self.block_size]
                buf = buf[self.block_size :]

                x = torch.tensor(block, dtype=torch.long)
                yield {"input_ids": x, "labels": x.clone()}

                produced += 1
                if self.max_steps is not None and produced >= (self.max_steps * self.batch_size):
                    return

def collate_fn(batch):
    input_ids = torch.stack([b["input_ids"] for b in batch], dim=0)
    labels = torch.stack([b["labels"] for b in batch], dim=0)
    return {"input_ids": input_ids, "labels": labels}


In [9]:
def pick_device():
    # Intel XPU (torch.xpu)
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return torch.device("xpu")
    # CUDA
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")

device = pick_device()
print("Device:", device)

use_bf16 = (device.type == "xpu")  # Intel GPU 通常 bf16 比 fp16 更稳
print("use_bf16:", use_bf16)


Device: xpu
use_bf16: True


In [10]:
from transformers import GPT2Config, GPT2LMHeadModel, get_linear_schedule_with_warmup

# 你想更接近 10M：优先改 n_layer（层数）和 n_embd（隐藏维度）
MODEL_CFG = dict(
    vocab_size=hf_tokenizer.vocab_size,
    n_positions=512,
    n_ctx=512,
    n_embd=256,
    n_layer=6,
    n_head=4,
    bos_token_id=hf_tokenizer.bos_token_id,
    eos_token_id=hf_tokenizer.eos_token_id,
)

config = GPT2Config(**MODEL_CFG)
model = GPT2LMHeadModel(config)

n_params = sum(p.numel() for p in model.parameters())
print(f"Params: {n_params/1e6:.2f}M")

model.to(device)


Params: 5.92M


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(4096, 256)
    (wpe): Embedding(512, 256)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=768, nx=256)
          (c_proj): Conv1D(nf=256, nx=256)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=1024, nx=256)
          (c_proj): Conv1D(nf=256, nx=1024)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=256, out_features=4096, bias=False)
)

In [None]:
BLOCK_SIZE = 512
BATCH_SIZE = 8
GRAD_ACCUM = 4

# 训练时从巨量文本里抽样：0.03~0.15 一般都挺好用
TRAIN_KEEP_PROB = 0.06

MAX_STEPS = 3000          # 以 step 为单位，不按 epoch（因为数据无限大）
LR = 3e-4
WARMUP_STEPS = 200
WEIGHT_DECAY = 0.1

train_ds = TokenBlockDataset(
    data_files,
    tokenizer=hf_tokenizer,
    text_field=TEXT_FIELD,
    block_size=BLOCK_SIZE,
    max_steps=MAX_STEPS,
    batch_size=BATCH_SIZE,
    keep_prob=TRAIN_KEEP_PROB,
    seed=2025,
)

train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
    num_workers=4,  # 如果你 CPU 很强可以试 2~4；但 json.gz 读盘有时反而不稳
    pin_memory=(device.type in ("cuda",)),
)

print("Dataloader ready.")


Dataloader ready.


In [14]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=MAX_STEPS,
)

# xpu 的 autocast（没有就退化为普通训练）
xpu_autocast = None
if device.type == "xpu":
    try:
        from torch.xpu.amp import autocast as xpu_autocast
    except Exception:
        xpu_autocast = None

model.train()
optimizer.zero_grad(set_to_none=True)

pbar = tqdm(total=MAX_STEPS, desc="train")
t0 = time.time()
step = 0
running_loss = 0.0

for batch_idx, batch in enumerate(train_loader):
    batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}

    if device.type == "xpu" and use_bf16 and xpu_autocast is not None:
        with xpu_autocast(dtype=torch.bfloat16):
            out = model(**batch)
            loss = out.loss / GRAD_ACCUM
    elif device.type == "cuda":
        from torch.cuda.amp import autocast
        with autocast():
            out = model(**batch)
            loss = out.loss / GRAD_ACCUM
    else:
        out = model(**batch)
        loss = out.loss / GRAD_ACCUM

    loss.backward()
    running_loss += loss.item()

    if (batch_idx + 1) % GRAD_ACCUM == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad(set_to_none=True)

        step += 1
        if step % 20 == 0:
            elapsed = time.time() - t0
            pbar.set_postfix(loss=f"{(running_loss/20):.4f}", lr=f"{scheduler.get_last_lr()[0]:.2e}", sec=f"{elapsed:.1f}")
            running_loss = 0.0
            t0 = time.time()

        if step % 500 == 0:
            ckpt_dir = OUT_DIR / f"ckpt_step_{step}"
            ckpt_dir.mkdir(parents=True, exist_ok=True)
            model.save_pretrained(ckpt_dir)
            hf_tokenizer.save_pretrained(ckpt_dir)
            print(f"\nSaved checkpoint: {ckpt_dir.resolve()}")

        pbar.update(1)
        if step >= MAX_STEPS:
            break

pbar.close()
print("Training done.")


train:   0%|          | 0/3000 [00:00<?, ?it/s]


Saved checkpoint: D:\Project\AI\NanoZhGPT\NanoZhGPT\ckpt_step_500
Training done.


In [13]:
final_dir = OUT_DIR / "final"
final_dir.mkdir(parents=True, exist_ok=True)
model.save_pretrained(final_dir)
hf_tokenizer.save_pretrained(final_dir)
print("Saved final model to:", final_dir.resolve())

# 生成测试
model.eval()
prompt = "今天我们来聊聊大模型训练的基本思路："
inputs = hf_tokenizer(prompt, return_tensors="pt").to(device)

with torch.no_grad():
    gen = model.generate(
        **inputs,
        max_new_tokens=80,
        do_sample=True,
        top_p=0.9,
        temperature=0.9,
        eos_token_id=hf_tokenizer.eos_token_id,
        pad_token_id=hf_tokenizer.pad_token_id,
    )

print(hf_tokenizer.decode(gen[0].tolist()))


Saved final model to: D:\Project\AI\NanoZhGPT\NanoZhGPT\final
今天我们来聊聊大模型训练的基本思路： 月02-21:21:24:07 0::14 昨天: 0:22 8:59::17 2020：55 前天01:49 5: 昨天05:05:07.: 昨天43:15:04:55 前天06 5 3:小时前 6 昨天49:49:06：:06:39
 前天16:08:09 昨天:01:
