In [None]:
import torch
import torch.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [None]:
# read in all the words (one name per line)
with open('names.txt', 'r') as f:  # context manager closes the file handle once it has been read
    words = f.read().splitlines()
print(len(words))                  # number of names
print(max(len(w) for w in words))  # length of the longest name
print(words[:8])                   # peek at the first few entries

In [None]:
# shuffle up the words
# Seeding the global `random` generator first makes the in-place shuffle below
# produce the same ordering on every run.
import random
random.seed(42)
random.shuffle(words)  # reorders `words` in place

In [None]:
# build the dataset
block_size = 8 # context length: how many characters do we take to predict the next one?

def build_dataset(words):  
  X, Y = [], []
  
  for w in words:
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] # crop and append

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape, Y.shape)
  return X, Y

n1 = int(0.8*len(words))
n2 = int(0.9*len(words))
Xtr,  Ytr  = build_dataset(words[:n1])     # 80%
Xdev, Ydev = build_dataset(words[n1:n2])   # 10%
Xte,  Yte  = build_dataset(words[n2:])     # 10%

In [None]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

In [None]:
class Linear:
    """Fully connected layer computing ``x @ weight`` plus an optional bias.

    Args:
        fan_in: size of the last dimension of the input.
        fan_out: number of output features.
        bias: when False, no bias vector is created or added.
    """

    def __init__(self, fan_in, fan_out, bias=True):
        # Gaussian weights scaled by sqrt(2 / fan_in)
        self.weight = torch.randn((fan_in, fan_out)) * (2/fan_in)**0.5
        self.bias = torch.zeros(fan_out) if bias else None

    def __call__(self, x):
        """Apply the layer to ``x``; the result is also kept in ``self.out``."""
        self.out = x @ self.weight
        # check the instance attribute: there is no local `bias` in this scope
        if self.bias is not None:
            self.out += self.bias
        return self.out

    def parameters(self):
        """Return the layer's tensors as a list: weight, then bias if present."""
        # the bias must be wrapped in a list before concatenating
        return [self.weight] + ([] if self.bias is None else [self.bias])
    
class BatchNorm1d:
    def __init__(self, num_features, eps = 1e-5, momentum = 0.1):
        self.eps = eps
        self.training = True
        self.momentum = momentum
        self.gamma = torch.ones(num_features)
        self.beta = torch.zeros(num_features)
        self.running_var = torch.ones(num_features, requires_grad=False)
        self.running_mean = torch.zeros(num_features, requires_grad=False)


    def __call__(self, x):
        if self.training:
            if x.ndim == 2:
                dims = 0
            elif x.ndim == 3:
                dims = (0,1)
            x_mean = x.mean(dims, keepdim=True)
            x_var = x.var(dims, keepdim=True)
        else:
            x_mean = self.running_mean
            x_var = self.running_var
        self.out = ((x - x_mean)/(torch.sqrt(x_var + self.eps))) * self.gamma + self.beta

        if self.training:
            with torch.no_grad():
                self.running_mean = ((1-self.momentum) * self.running_mean) + (self.momentum * x_mean)
                self.running_var = ((1-self.momentum) * self.running_var) + (self.momentum * x_var)
        return self.out
    
    def parameters(self):
        return [self.gamma, self.beta]

class Tanh:
    """Elementwise hyperbolic-tangent activation; has no parameters."""
    def __init__(self):
        pass
    def __call__(self, x):
        """Return tanh(x) elementwise; the result is also kept in ``self.out``."""
        # torch.tanh saturates at +/-1, whereas the explicit
        # (e^x - e^-x) / (e^x + e^-x) ratio overflows to inf/inf = nan for large |x|
        self.out = torch.tanh(x)
        return self.out
    def parameters(self):
        """Return an empty list (nothing to train)."""
        return []

class Embedding:
    """Lookup table mapping integer indices to rows of ``weight``.

    Args:
        num_embeddings: number of rows in the table.
        embedding_dim: length of each row.
    """
    def __init__(self, num_embeddings, embedding_dim):
        self.weight = torch.randn(num_embeddings, embedding_dim)
    def __call__(self, x):
        """Index ``weight`` with ``x``; the result is also kept in ``self.out``."""
        self.out = self.weight[x]
        return self.out
    def parameters(self):
        """Return ``[weight]``.

        Sequential.parameters() calls this on every layer, so without it a
        model containing an Embedding fails with AttributeError.
        """
        return [self.weight]

class Flatten:
    """Collapse every dimension after the first into a single one."""
    def __call__(self, x):
        """Return ``x`` viewed as ``(x.shape[0], -1)``; also kept in ``self.out``."""
        # record the output like the sibling layers do, so every layer can be inspected the same way
        self.out = x.view(x.shape[0], -1)
        return self.out
    def parameters(self):
        """Return an empty list (nothing to train)."""
        return []

class Sequential:
    """Container that applies a list of layers one after another."""

    def __init__(self, layers):
        self.layers = layers

    def __call__(self, x):
        """Feed ``x`` through each layer in order; the final value is kept in ``self.out``."""
        result = x
        for module in self.layers:
            result = module(result)
        self.out = result
        return self.out

    def parameters(self):
        """Return one flat list with every layer's parameters, in layer order."""
        collected = []
        for module in self.layers:
            for param in module.parameters():
                collected.append(param)
        return collected

In [None]:
n_embd = 10    # length of each character embedding vector
n_hidden = 300 # width of the hidden layer
model = Sequential([
    Embedding(vocab_size, n_embd),
    # Flatten concatenates the block_size embeddings of one context, so the first
    # Linear takes n_embd*block_size inputs (was a hard-coded 8, which silently
    # breaks if block_size is changed)
    Flatten(), Linear(n_embd*block_size, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(), Linear(n_hidden, vocab_size)
])