In [41]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as D
import torch.optim as Optim
import random

In [43]:
# Load the name list. readlines() keeps the trailing "\n" of every line, so
# each entry in `data` still ends with its newline.
with open("./assets/names.txt", mode="r", encoding="utf-8") as file:
    data = file.readlines()

# Shuffle with a dedicated, seeded RNG: an unseeded shuffle yields a different
# order after every kernel restart, which silently changes any split that is
# derived from the order of `data`. A private Random instance is used so the
# global `random` state is left untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(data)

# dataset size and a small sample
print(len(data))
print(data[:10])

# avg word len
# NOTE: len(word) counts the trailing "\n", so the printed average is one
# higher than the visible name length for newline-terminated lines.
# `avg_len_words` holds the summed length; the mean is only printed.
avg_len_words = sum(len(word) for word in data)
print(avg_len_words/len(data))

62262
['Lebenhan\n', 'Egerer\n', 'Waldreuten\n', 'Ratzenstall\n', 'Loitzendorf\n', 'Rißsattel\n', 'Haidham\n', 'Ruselabsatz\n', 'Betonwand\n', 'Windmais\n']
10.843371558896276


In [44]:
# Build the character vocabulary.
# Design note -- two ways to feed names to the model were considered:
#   1. a long streaming approach with multiple names inside one context
#   2. one name per context, padded to a fixed length, with special start and end chars
# My concern with 1.: it makes little sense for a transformer to learn patterns
# from PREVIOUS names in order to predict the next char of the current name.
# o1 recommends approach 1, Claude recommends approach 2; both agree that either
# way nets roughly the same perplexity and that the discussion is about the top 5%.
# Approach 2 may be tested later as a comparison.
all_chars = sorted({symbol for name in data for symbol in name})
vocab_size = len(all_chars)
print(vocab_size)
print(all_chars)

61
['\n', ' ', '-', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Ä', 'Ö', 'Ü', 'ß', 'ä', 'ö', 'ü']


In [45]:
# hyperparameters
# NOTE(review): the grouping and descriptions below are inferred from the names
# only -- the code that consumes these values is not part of this cell.

# batching / sequence (presumably samples per batch and characters per context)
batch_size = 64
context_len = 64

# model size (presumably stacked layers, attention heads, embedding width)
n_layer = 6
n_head = 8
n_embd = 256

# optimisation
learning_rate = 3e-4

In [47]:
# Vocabulary lookup tables: itos maps index -> char, stoi maps char -> index.
itos = dict(enumerate(all_chars))
stoi = {symbol: index for index, symbol in enumerate(all_chars)}
print(itos)
print(stoi)


# Vocabulary encoding / decoding helpers.
# The parameter is still called `input` (shadowing the builtin inside the
# function body) so that existing keyword calls keep working.
def encode(input):
    """Return the list of vocabulary indices for the characters of `input`.

    Raises KeyError for a character that is not a key of `stoi`.
    """
    return [stoi[symbol] for symbol in input]


def decode(input):
    """Return the string formed by looking up every index of `input` in `itos`.

    Raises KeyError for an index that is not a key of `itos`.
    """
    return "".join(itos[index] for index in input)


# round trip on the first entry: indices first, then the reconstructed text
print(encode(data[0]))
print(decode(encode(data[0])))

{0: '\n', 1: ' ', 2: '-', 3: 'A', 4: 'B', 5: 'C', 6: 'D', 7: 'E', 8: 'F', 9: 'G', 10: 'H', 11: 'I', 12: 'J', 13: 'K', 14: 'L', 15: 'M', 16: 'N', 17: 'O', 18: 'P', 19: 'Q', 20: 'R', 21: 'S', 22: 'T', 23: 'U', 24: 'V', 25: 'W', 26: 'X', 27: 'Z', 28: 'a', 29: 'b', 30: 'c', 31: 'd', 32: 'e', 33: 'f', 34: 'g', 35: 'h', 36: 'i', 37: 'j', 38: 'k', 39: 'l', 40: 'm', 41: 'n', 42: 'o', 43: 'p', 44: 'q', 45: 'r', 46: 's', 47: 't', 48: 'u', 49: 'v', 50: 'w', 51: 'x', 52: 'y', 53: 'z', 54: 'Ä', 55: 'Ö', 56: 'Ü', 57: 'ß', 58: 'ä', 59: 'ö', 60: 'ü'}
{'\n': 0, ' ': 1, '-': 2, 'A': 3, 'B': 4, 'C': 5, 'D': 6, 'E': 7, 'F': 8, 'G': 9, 'H': 10, 'I': 11, 'J': 12, 'K': 13, 'L': 14, 'M': 15, 'N': 16, 'O': 17, 'P': 18, 'Q': 19, 'R': 20, 'S': 21, 'T': 22, 'U': 23, 'V': 24, 'W': 25, 'X': 26, 'Z': 27, 'a': 28, 'b': 29, 'c': 30, 'd': 31, 'e': 32, 'f': 33, 'g': 34, 'h': 35, 'i': 36, 'j': 37, 'k': 38, 'l': 39, 'm': 40, 'n': 41, 'o': 42, 'p': 43, 'q': 44, 'r': 45, 's': 46, 't': 47, 'u': 48, 'v': 49, 'w': 50, 'x': 51,

In [48]:
# Partition `data` into train / dev / test at the 80% and 90% marks.
# int() truncates both borders, so the three sizes are only approximately
# 0.8 / 0.1 / 0.1 of the total.
border_1, border_2 = (int(fraction * len(data)) for fraction in (0.8, 0.9))
train_set, dev_set, test_set = (
    data[:border_1],
    data[border_1:border_2],
    data[border_2:],
)
print(*(len(part) for part in (train_set, dev_set, test_set)))

49809 6226 6227
