In [None]:
!pip install torch tqdm numpy datasets transformers



In [None]:
import requests

# Raw text of the Tiny Shakespeare corpus from the char-rnn repository.
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() turns an HTTP error into an exception
# instead of silently using the body of an error page as training text.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text
print("data length：", len(text))
print(text[:1000])


In [None]:
# Split the corpus by position: the first 90% of the characters form the
# training text and the remaining tail is held out for validation.
n = len(text)
split_at = int(n * 0.9)
train_text, val_text = text[:split_at], text[split_at:]


In [None]:
import torch

# Character-level vocabulary: every distinct character of the corpus, sorted
# so that the character -> id assignment is the same on every run.
chars = sorted(set(text))
vocab_size = len(chars)
print("Vocab size:", vocab_size)

# Lookup tables in both directions: id -> character and character -> id.
itos = dict(enumerate(chars))
stoi = {ch: idx for idx, ch in itos.items()}

def encode(s):
    """Convert a string into its list of integer ids (str -> list[int]).

    Raises KeyError for a character that is not a key of ``stoi``.
    """
    return list(map(stoi.__getitem__, s))


def decode(l):
    """Convert a sequence of integer ids back into a string (list[int] -> str).

    Raises KeyError for an id that is not a key of ``itos``.
    """
    return ''.join(itos[i] for i in l)

# Encode both text splits into 1-D int64 tensors of character ids.
train_data, val_data = (
    torch.tensor(encode(split_text), dtype=torch.long)
    for split_text in (train_text, val_text)
)

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class TinyGPT(nn.Module):
    """Small causal Transformer language model over integer token ids.

    Token and learned position embeddings are summed, passed through a stack
    of ``nn.TransformerEncoderLayer`` blocks under a causal attention mask,
    layer-normalised and projected to per-position vocabulary logits.

    Args:
        vocab_size: number of distinct token ids.
        n_embd: embedding / model width.
        n_head: attention heads per layer.
        n_layer: number of Transformer layers.
        block_size: maximum sequence length (size of the position table).
    """

    def __init__(self, vocab_size, n_embd=128, n_head=4, n_layer=4, block_size=128):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)
        # batch_first=True because forward() feeds (B, T, C) tensors. With the
        # PyTorch default (batch_first=False) dim 0 is taken as the sequence
        # axis, so attention would mix different batch rows instead of
        # different time steps.
        self.blocks = nn.Sequential(*[
            nn.TransformerEncoderLayer(d_model=n_embd, nhead=n_head, batch_first=True)
            for _ in range(n_layer)
        ])
        self.ln = nn.LayerNorm(n_embd)
        self.fc = nn.Linear(n_embd, vocab_size)
        self.block_size = block_size

    def forward(self, idx):
        """Return logits of shape (B, T, vocab_size) for ids ``idx`` of shape (B, T).

        Raises:
            ValueError: if T exceeds ``block_size`` (no position embedding
                exists beyond that length).
        """
        B, T = idx.shape
        if T > self.block_size:
            raise ValueError(
                f"sequence length {T} exceeds block_size {self.block_size}"
            )
        tok_emb = self.token_embedding(idx)
        pos_emb = self.position_embedding(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        # Boolean mask, True = "may not attend": position t only sees
        # positions <= t. Without it the model can read the very token it is
        # trained to predict.
        causal_mask = torch.triu(
            torch.ones(T, T, dtype=torch.bool, device=idx.device), diagonal=1
        )
        # Iterate over the layers by hand: calling the Sequential container
        # directly cannot forward the mask argument.
        for block in self.blocks:
            x = block(x, src_mask=causal_mask)
        x = self.ln(x)
        logits = self.fc(x)
        return logits


In [None]:
import time
from tqdm import trange

# Seed PyTorch's RNG so weight initialisation and batch sampling are
# repeatable across a fresh-kernel run.
torch.manual_seed(1337)

device = "cuda" if torch.cuda.is_available() else "cpu"

# Hyperparameters. block_size is defined before the model and passed to it so
# the context length used when sampling batches cannot drift away from the
# size of the model's position-embedding table.
block_size = 128
batch_size = 64

model = TinyGPT(vocab_size, block_size=block_size).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    ``split == 'train'`` selects ``train_data``; any other value selects
    ``val_data``. Returns two tensors of shape (batch_size, block_size) moved
    to ``device``; the targets are the inputs shifted one position to the
    right.
    """
    source = train_data if split == 'train' else val_data
    starts = torch.randint(len(source) - block_size, (batch_size,))
    # Broadcast each start offset against 0..block_size-1 so every window is
    # gathered with a single indexing operation.
    window = starts.unsqueeze(1) + torch.arange(block_size)
    inputs = source[window]
    targets = source[window + 1]
    return inputs.to(device), targets.to(device)

def estimate_loss(eval_iters=10):
    """Estimate the mean cross-entropy loss on the train and val splits.

    Args:
        eval_iters: number of random batches averaged per split (default 10).

    Returns:
        dict with keys ``'train'`` and ``'val'`` mapping to the mean loss
        (a Python float) over ``eval_iters`` batches.

    Raises:
        ValueError: if ``eval_iters`` is not positive.
    """
    if eval_iters <= 0:
        raise ValueError("eval_iters must be a positive integer")

    # Remember the current mode so the model is put back exactly as it was
    # found, even if evaluation raises part-way through.
    was_training = model.training
    model.eval()
    losses = {}
    try:
        with torch.no_grad():
            for split in ('train', 'val'):
                loss_sum = 0.0
                for _ in range(eval_iters):
                    x, y = get_batch(split)
                    logits = model(x)
                    # Flatten (B, T, V) logits and (B, T) targets so that
                    # cross_entropy scores every position independently.
                    loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1))
                    loss_sum += loss.item()
                losses[split] = loss_sum / eval_iters
    finally:
        model.train(was_training)
    return losses

# Number of optimisation steps: each pass of the loop below trains on one
# randomly sampled batch, not on a full sweep over the training data.
epochs = 10
start_time = time.time()
for step in trange(epochs):
    xb, yb = get_batch('train')
    logits = model(xb)
    flat_logits = logits.view(-1, vocab_size)
    flat_targets = yb.view(-1)
    loss = F.cross_entropy(flat_logits, flat_targets)

    # Clear stale gradients, back-propagate, then update the weights.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # Report the estimated train/val loss on every second step.
    if step % 2 != 0:
        continue
    losses = estimate_loss()
    print(losses)

# Wall-clock duration of the loop (including the periodic loss estimates),
# expressed in minutes.
train_time = (time.time() - start_time) / 60


In [None]:
import math

import pandas as pd

# Final evaluation: parameter count and perplexity (exp of the mean
# cross-entropy) on both splits.
num_params = sum(p.numel() for p in model.parameters())
losses = estimate_loss()
val_ppl = math.exp(losses['val'])
train_ppl = math.exp(losses['train'])
val_minus_train = val_ppl - train_ppl

# Every iteration of the training loop consumes exactly one batch of
# batch_size * block_size tokens, so that product (not the corpus length) is
# what must be multiplied by the number of iterations.
total_tokens = epochs * batch_size * block_size
# train_time is in minutes and also covers the periodic loss estimates run
# inside the loop, so the throughput below is a conservative figure.
elapsed_seconds = train_time * 60
tokens_per_sec = total_tokens / elapsed_seconds if elapsed_seconds > 0 else float("nan")

metrics = {
    "val_perplexity": val_ppl,
    "num_parameters": num_params,
    "tokens_per_sec": tokens_per_sec,
    "training_time_minutes": train_time,
    "val_minus_train": val_minus_train,
    "total_tokens_processed": total_tokens
}

# Persist the metrics as a single-row CSV and echo the same table.
metrics_df = pd.DataFrame([metrics])
metrics_df.to_csv("submission.csv", index=False)
print(metrics_df)


In [None]:
# Set the global git identity (author name and e-mail) that will be recorded
# on commits created from this runtime.
!git config --global user.name "Pangqiang-Gary"
!git config --global user.email "pangqiang02@gmail.com"


In [None]:
# Create a repository in the current working directory, stage every file in
# it and record the first commit.
# NOTE(review): `git add .` stages everything under the directory - confirm
# that no unwanted files (sample data, caches, credentials) are present.
!git init
!git add .
!git commit -m "Initial commit - upload TinyGPT project"


In [None]:
# Register the GitHub repository as the `origin` remote.
# NOTE(review): presumably errors with "remote origin already exists" when
# this cell is re-run in the same repository - confirm.
!git remote add origin https://github.com/Pangqiang-Gary/nanoGPT-TinyShakespeare.git

In [None]:
import os
from getpass import getpass

# Never store a personal access token in the notebook: the file itself is
# committed and shared, which publishes the credential. Reuse a token that is
# already present in the environment, otherwise prompt for it without echoing.
# SECURITY: the token that used to be hardcoded in this cell must be treated
# as compromised and revoked in the GitHub account settings.
if not os.environ.get('GITHUB_TOKEN'):
    os.environ['GITHUB_TOKEN'] = getpass("GitHub personal access token: ")


In [None]:
!git remote set-url origin https://Pangqiang-Gary:${GITHUB_TOKEN}@github.com/Pangqiang-Gary/nanoGPT-TinyShakespeare.git
!git push -u origin main --force


In [None]:
# 1️⃣ Check which files are present in the current directory
!ls -a


In [None]:

# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# List the "Colab Notebooks" folder on Drive to confirm the notebook's file name
!ls "/content/drive/My Drive/Colab Notebooks"

# Copy the notebook into /content under a path that contains no spaces
!cp "/content/drive/My Drive/Colab Notebooks/nanoGPT_TinyShakespeare.ipynb" "/content/nanoGPT_TinyShakespeare.ipynb"

# Confirm that the file is now present in /content
!ls -a
