In [78]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as D
import torch.optim as Optim
import random

In [79]:
# import text & shuffle set
with open("./assets/names.txt", mode="r", encoding="utf-8") as file:
    names = file.readlines()
# shuffle it
random.seed(42)
random.shuffle(names)
# dataset size
print(len(names))
print(names[:10])

# avg word len
avg_len_words = 0
for word in names:
    avg_len_words += len(word)
print(avg_len_words/len(names))

62262
['Rieder Berg\n', 'Alttiefenweg\n', 'Goßmannsdorf\n', 'Gemeindebühl\n', 'Mader\n', 'Kroissenhof\n', 'Schlappenreuth\n', 'Obermitterdorf\n', 'Ullading\n', 'Großköllnbach\n']
10.843371558896276


In [80]:
# setup vocabulary
# decision to go with a "1. long streaming approach with multiple names within context" vs "2. one name within context padded to fixed len & special start and end chars"
# i adressed my concerns, that it makes no senses in 1. that via transformer tech some name learns pattern from PREVIOUS names to predict next char
# o1 recommends approach 1 vs. claude recommends approach 2; both unanimous that both ways will net roughly same perplexity, discussion is about top 5%
# maybe i test approach 2 later as comparison
all_chars = list(sorted(set([("".join(char)) for name in names for char in name])))
print(len(all_chars))
print(all_chars)
vocab_size = len(all_chars)

61
['\n', ' ', '-', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Ä', 'Ö', 'Ü', 'ß', 'ä', 'ö', 'ü']


In [110]:
# hyperparameters
context_len = 64
n_embd = 384
n_head = 6
n_layer = 6
batch_size = 128
learning_rate = 3e-4
train_iter = 1000
eval_iter = 200

In [82]:
# vocabulary mapping dicts
itos = {i:s for i, s in enumerate(all_chars)}
stoi = {s:i for i, s in itos.items()}
print(itos)
print(stoi)
# voc encoding / decoding functions
encode = lambda input: [stoi[i] for i in input]
decode = lambda input: "".join([itos[i] for i in input])
print(encode(names[0]))
print(decode(encode(names[0])))

{0: '\n', 1: ' ', 2: '-', 3: 'A', 4: 'B', 5: 'C', 6: 'D', 7: 'E', 8: 'F', 9: 'G', 10: 'H', 11: 'I', 12: 'J', 13: 'K', 14: 'L', 15: 'M', 16: 'N', 17: 'O', 18: 'P', 19: 'Q', 20: 'R', 21: 'S', 22: 'T', 23: 'U', 24: 'V', 25: 'W', 26: 'X', 27: 'Z', 28: 'a', 29: 'b', 30: 'c', 31: 'd', 32: 'e', 33: 'f', 34: 'g', 35: 'h', 36: 'i', 37: 'j', 38: 'k', 39: 'l', 40: 'm', 41: 'n', 42: 'o', 43: 'p', 44: 'q', 45: 'r', 46: 's', 47: 't', 48: 'u', 49: 'v', 50: 'w', 51: 'x', 52: 'y', 53: 'z', 54: 'Ä', 55: 'Ö', 56: 'Ü', 57: 'ß', 58: 'ä', 59: 'ö', 60: 'ü'}
{'\n': 0, ' ': 1, '-': 2, 'A': 3, 'B': 4, 'C': 5, 'D': 6, 'E': 7, 'F': 8, 'G': 9, 'H': 10, 'I': 11, 'J': 12, 'K': 13, 'L': 14, 'M': 15, 'N': 16, 'O': 17, 'P': 18, 'Q': 19, 'R': 20, 'S': 21, 'T': 22, 'U': 23, 'V': 24, 'W': 25, 'X': 26, 'Z': 27, 'a': 28, 'b': 29, 'c': 30, 'd': 31, 'e': 32, 'f': 33, 'g': 34, 'h': 35, 'i': 36, 'j': 37, 'k': 38, 'l': 39, 'm': 40, 'n': 41, 'o': 42, 'p': 43, 'q': 44, 'r': 45, 's': 46, 't': 47, 'u': 48, 'v': 49, 'w': 50, 'x': 51,

In [74]:
# convert names list to data: concat text, encode it, tensor it
data = torch.tensor(encode("".join(names)))

# split data into train / dev / test with 0.8 / 0.1 / 0.1
border_1 = int(0.8 * len(data))
border_2 = int(0.9 * len(data))
train_split = data[:border_1]
dev_split = data[border_1:border_2]
test_split = data[border_2:]
print(len(train_split), len(dev_split), len(test_split))

540104 67513 67513


In [85]:
# deliver batches of X, Y tensors for chosen split
torch.manual_seed(42)
def get_batch(split):
    """ delivers a batch of X, Y tensors for specified split"""
    # get random numbers (in amount of "batch_size") within split boundaries to grab data for the batch samples
    batch_borders = torch.randint(0, len(split)-context_len, (batch_size,))
    x = torch.stack([split[t : t+context_len] for t in batch_borders])
    y = torch.stack([split[t+1 : t+context_len+1] for t in batch_borders])
    return x, y
    

x, y = get_batch(train_split)
print(x.shape, y.shape)

torch.Size([128, 64]) torch.Size([128, 64])


In [None]:
class Ffw(nn.Module):
    def __init__(self):
        super().__init__()
        self.mlp1 = nn.Linear(n_embd, n_embd, bias=None)
        self.mlp2 = nn.Linear(n_embd, n_embd, bias=None)
        self.

    def forward(self, x):
        pass

In [114]:


# NN classes
class GPT(nn.Module):
    def __init__(self):
        super().__init__()
        # input layer
        self.tok_embeddings = nn.Embedding(vocab_size, n_embd)
        self.pos_embeddings = nn.Embedding(context_len, n_embd)
        # ffw layer
        self.ffw1 = nn.Linear(n_embd, n_embd)
        # output layer
        self.lm_head = nn.Linear(n_embd, vocab_size)
 
    def forward(self, x, targets=None):
        
        # input layer
        # token embeddings; B,T,C
        tok_emb = self.tok_embeddings(x)
        #print(tok_emb.shape)
        # creates 1D-tensor with values from 0 - context_len; T
        pos_raw = torch.arange(0, context_len)
        # position embeddings; T, C
        pos_emb = self.pos_embeddings(pos_raw)
        #print(pos_emb.shape)
        # combined emds for token + pos; B, T, C
        emb = tok_emb + pos_emb

        # hidden layers
        h = self.ffw1(emb)

        logits = self.lm_head(h)

        # calc loss if targets are available
        if targets is not None:
            B, T, C = logits.shape

            # flatten logits into B*T, C
            logits = logits.view(B*T, C)
            # flatten targets into B*T
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
        
        
    


In [115]:
m = GPT()
x, y = get_batch(train_split)
print(x.shape, y.shape)
logits, loss = m(x, y)
logits.shape
optimizer = Optim.Adam(m.parameters(), lr=learning_rate)

torch.Size([128, 64]) torch.Size([128, 64])


In [None]:
# train model
def train_model():

    for i in range(train_iter):
    
        # forward pass
        x, y = get_batch(train_split)
        _, loss = m(x, y)

        # backward pass
        optimizer.zero_grad()
        loss.backward()

        # update params
        optimizer.step()

        print(loss)
        break

train_model()

tensor(4.1900, grad_fn=<NllLossBackward0>)
