In [2]:
import numpy as np
import sys; sys.path.append("../")
import tiktoken
import math
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group

from model import GPTConfig, GPT

In [3]:
# Memory-map the OpenWebText train.bin file read-only as uint16 values,
# so slices are read on demand instead of loading the whole file.
train_data = np.memmap(
    '../data/openwebtext/train.bin',
    dtype=np.uint16,
    mode='r',
)

In [4]:
# Show the array shape and its length, one per line.
print(train_data.shape, len(train_data), sep="\n")

(9035582489,)
9035582489


In [5]:
# Peek at the first stored value (a raw uint16 entry of the memmap).
print(train_data[0])

8585


In [6]:
enc = tiktoken.get_encoding("gpt2")
def process(example):
    """Tokenize ``example['text']`` and terminate it with the end-of-text token.

    Returns a dict holding the token ids under ``'ids'`` and their count
    under ``'len'``.
    """
    # encode_ordinary ignores any special tokens in the text.
    token_ids = enc.encode_ordinary(example['text'])
    # Append the end-of-text token (e.g. 50256 for gpt2 bpe).
    # Open question left by the author: should eot be prepended instead?
    token_ids.append(enc.eot_token)
    return {'ids': token_ids, 'len': len(token_ids)}

In [7]:
# Quick check of process() on a short string.
process({'text':"Hello world"})

{'ids': [15496, 995, 50256], 'len': 3}

In [8]:
# Load the pretrained "gpt2" checkpoint through the project's GPT class.
# NOTE(review): the second argument is an empty dict — presumably "no config
# overrides"; confirm against model.py.
model = GPT.from_pretrained("gpt2", {})

loading weights from pretrained gpt: gpt2
forcing vocab_size=50257, block_size=1024, bias=True
number of parameters: 123.65M


In [9]:
# Pick the 'transformer.wte.weight' parameter out of the model's named
# parameters; wte is None when no parameter carries that name.
wte = next(
    (tensor for key, tensor in model.named_parameters() if key == 'transformer.wte.weight'),
    None,
)

In [10]:
# Build the lookup layer directly from the pretrained matrix.
# from_pretrained takes num_embeddings / embedding_dim from the tensor's
# shape, so there is no hard-coded 50257 x 768 and no throwaway random
# initialisation. freeze=True keeps the statistics cells below from
# recording autograd history. detach() shares storage with wte, so no
# copy of the matrix is made.
wte_embed = nn.Embedding.from_pretrained(wte.detach(), freeze=True)

In [11]:
# Embed the three ids process() returned for "Hello world" (two text tokens
# plus eot) and check the output shape.
print(wte_embed(torch.tensor([15496, 995, 50256])).shape)

torch.Size([3, 768])


In [12]:
# Embed the first 1000 stored ids; they are cast from uint16 to int64
# before being handed to torch as lookup indices.
wte_OWT = wte_embed(
    torch.from_numpy(train_data[:1000].astype(np.int64))
)

In [13]:
# Expect one embedding row per looked-up id.
print(wte_OWT.shape)

torch.Size([1000, 768])


In [30]:
## Find mean of embedding
batch_size = 10000
# Number of complete batches in the data (integer division; a tail shorter
# than batch_size is left out).
batches = len(train_data) // batch_size
# Only a small prefix is sampled here; set to `batches` for a full pass.
num_batches = 10
m, m_sq = 0, 0
# Without no_grad, every update of m / m_sq would extend an autograd graph
# reaching back through all previous batches whenever the embedding weight
# requires grad, so memory would grow with the number of batches.
with torch.no_grad():
    for i in tqdm(range(num_batches)):
        wte_OWT = wte_embed(torch.from_numpy((train_data[i*batch_size:(i+1)*batch_size]).astype(np.int64)))
        # Running averages of E[x] and E[x^2]; exact because all batches
        # visited here have the same size.
        m = ((i*m) + torch.mean(wte_OWT))/(i+1)
        m_sq = ((i*m_sq) + torch.mean(wte_OWT**2))/(i+1)
print(m.item())
# Population std from the two moments: sqrt(E[x^2] - E[x]^2). Clamp at 0 so
# rounding error cannot push the argument of sqrt below zero.
print(np.sqrt(max(m_sq.item() - m.item()**2, 0.0)))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 105.50it/s]

-0.0004901625216007233
0.11476480462755997





In [22]:
## Find std of embedding
# NOTE: despite its name, `std` accumulates the running mean of squared
# deviations from m (i.e. the variance); the square root is only taken when
# printing. This cell relies on `m` and `batch_size` from the mean cell and
# must iterate over the same batches that produced `m`.
std = 0
# no_grad keeps the running update from chaining an autograd graph across
# batches when the embedding weight requires grad.
with torch.no_grad():
    for i in tqdm(range(10)):
        wte_OWT = wte_embed(torch.from_numpy((train_data[i*batch_size:(i+1)*batch_size]).astype(np.int64)))
        std = ((i*std) + torch.mean((wte_OWT-m)**2))/(i+1)
print(torch.sqrt(std).item())

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 134.50it/s]

0.11476480215787888





In [23]:
# Reference value: torch.std over the first ten batches in one shot.
# torch.std applies Bessel's correction by default, so it is the sample
# rather than the population estimate.
torch.std(
    wte_embed(
        torch.from_numpy(train_data[:batch_size * 10].astype(np.int64))
    )
).item()

0.11476480215787888

In [26]:
# Embed one specific batch, selected by its batch index.
i = 1308
wte_OWT = wte_embed(
    torch.from_numpy(
        train_data[i * batch_size:(i + 1) * batch_size].astype(np.int64)
    )
)