In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

In [2]:
# Hyperparameters
seed = 1337              # fixed RNG seed so sampling, weight init and generation are repeatable
# Seed PyTorch's global generator; without this every Restart & Run All
# produces different batches, weights and generated text.
torch.manual_seed(seed)

batch_size = 32          # sequences drawn per call to get_batch
block_size = 8           # tokens per sequence
max_iters = 3000         # optimizer steps in the training loop
eval_interval = 300      # training steps between loss reports
learning_rate = 1e-2     # step size handed to the optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # prefer the GPU when one is visible
eval_iters = 200         # batches averaged per split when estimating the loss

In [3]:
# Show which device was selected above.
device

'cuda'

In [4]:
# Read the whole corpus. The context manager closes the file handle as soon as
# the read finishes instead of leaving it to the garbage collector, and the
# explicit encoding avoids depending on the platform default.
with open('names.txt', 'r', encoding='utf-8') as f:
    names = f.read()
len(names)

228145

In [5]:
# Vocabulary: every distinct character in the corpus, in sorted order.
# sorted() accepts any iterable, so the set can be passed to it directly.
chars = sorted(set(names))
chars

['\n',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [6]:
# Vocabulary size: the number of distinct characters found in the corpus.
vocab_size = len(chars)
vocab_size

27

In [7]:
# Lookup tables between characters and their integer token ids.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Convert a string into a list of integer token ids.

    Args:
        s: text whose characters all appear in ``stoi``.

    Returns:
        list[int]: one token id per character of ``s``, in order.

    Raises:
        KeyError: if ``s`` contains a character that is not in ``stoi``.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Convert an iterable of integer token ids back into a string.

    Args:
        l: iterable of token ids (parameter name kept for compatibility).

    Returns:
        str: the characters for each id, concatenated in order.

    Raises:
        KeyError: if an id is not in ``itos``.
    """
    return ''.join(itos[i] for i in l)

In [8]:
# Train and test split
data = torch.tensor(encode(names), dtype=torch.long)
n = int(0.9*len(data))  # index where the held-out tail begins

# The first 90% of the tokens train the model; the remaining tail is held out.
train_data, test_data = data[:n], data[n:]

In [9]:
def get_batch(train = True):
    """Sample a random batch of input/target windows.

    Args:
        train: draw from ``train_data`` when truthy, otherwise from ``test_data``.

    Returns:
        tuple: ``(x, y)`` tensors of shape ``(batch_size, block_size)`` on
        ``device``; ``y`` is ``x`` shifted one position to the right in the
        source data.
    """
    source = train_data if train else test_data
    # Start offsets are capped so the shifted target window still fits.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start+block_size])
        targets.append(source[start+1:start+block_size+1])

    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [10]:
@torch.no_grad()
def calculate_loss():
    """Estimate the mean loss on the train and test splits.

    Draws ``eval_iters`` random batches per split through ``get_batch`` and
    averages the per-batch losses. Gradients are disabled by the decorator.

    Returns:
        dict: ``{'train': mean_loss, 'test': mean_loss}``, each value a
        0-dim tensor.
    """
    res = {}

    # Remember the current mode so evaluation does not flip a model the caller
    # had put in eval mode, and restore it even if a batch raises.
    was_training = model.training
    model.eval()
    try:
        for split, use_train in (('train', True), ('test', False)):
            losses = torch.zeros(eval_iters)

            for i in range(eval_iters):
                X, Y = get_batch(train = use_train)
                _, loss = model(X, Y)
                losses[i] = loss.item()

            res[split] = losses.mean()
    finally:
        model.train(was_training)

    return res

In [11]:
# Bigram model
class BigramLanguageModel(nn.Module):
    """Character-level bigram language model.

    The only parameters are a ``(vocab_size, vocab_size)`` embedding table:
    looking up token ``i`` returns the unnormalised scores (logits) for the
    token that follows it.
    """

    def __init__(self, vocab_size):
        """Create the ``vocab_size x vocab_size`` next-token logit table."""
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets = None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: ``(B, T)`` integer tensor of token ids.
            targets: optional ``(B, T)`` integer tensor of next-token ids.

        Returns:
            tuple: ``(logits, loss)``. Without ``targets``, ``logits`` has
            shape ``(B, T, C)`` and ``loss`` is ``None``. With ``targets``,
            ``logits`` is flattened to ``(B * T, C)`` before being passed to
            ``F.cross_entropy`` and ``loss`` is the resulting scalar tensor.
        """
        logits = self.token_embedding_table(idx)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))

        return logits, loss

    @torch.no_grad()  # sampling never backpropagates, so skip autograd bookkeeping
    def generate(self, idx, max_tokens):
        """Sample ``max_tokens`` new tokens and append them to ``idx``.

        Args:
            idx: ``(B, T)`` integer tensor holding the starting context.
            max_tokens: number of tokens to sample per row.

        Returns:
            Tensor: ``(B, T + max_tokens)`` integer tensor, the context
            followed by the sampled tokens.
        """
        for _ in range(max_tokens):
            # A bigram prediction depends only on the most recent token, so
            # feed just the last column instead of the whole growing sequence.
            logits, _loss = self(idx[:, -1:])

            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim = -1)
            idx_next = torch.multinomial(probs, num_samples = 1)

            idx = torch.cat((idx, idx_next), dim = 1)

        return idx

In [12]:
# Build the model and move its parameters onto the selected device in one step.
model = BigramLanguageModel(vocab_size).to(device)

In [13]:
# Adam over every model parameter, with the learning rate from the
# hyperparameter cell. Adam is imported in the notebook's top import cell.
optimizer = Adam(model.parameters(), lr=learning_rate)

In [14]:
# Training the model
for i in range(max_iters):
    # Every eval_interval steps, report the averaged train/test loss.
    # NOTE(review): the last report is at the final multiple of eval_interval,
    # so the loss after the very last step is never printed.
    if i % eval_interval == 0:
        losses = calculate_loss()
        print(f"Step {i}: Train loss {losses['train']:.4f}, Test loss {losses['test']:.4f}")

    # One optimisation step on a freshly sampled training batch.
    x, y = get_batch()
    logits, loss = model(x, y)
    # Clear gradients from the previous step before computing new ones.
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()

Step 0: Train loss 3.7598, Test loss 3.7689
Step 300: Train loss 2.5678, Test loss 2.6948
Step 600: Train loss 2.4623, Test loss 2.5908
Step 900: Train loss 2.4510, Test loss 2.5800
Step 1200: Train loss 2.4462, Test loss 2.5813
Step 1500: Train loss 2.4410, Test loss 2.5917
Step 1800: Train loss 2.4522, Test loss 2.5735
Step 2100: Train loss 2.4437, Test loss 2.5814
Step 2400: Train loss 2.4425, Test loss 2.5866
Step 2700: Train loss 2.4416, Test loss 2.5773


In [39]:
# Start from a single token with id 0 and sample 500 more tokens after it.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_tokens=500)

# Only one sequence was generated; turn its token ids back into text.
output_names = decode(generated[0].tolist())

In [None]:
# Display the sampled text.
output_names

In [22]:
# Persist the sampled text; the explicit encoding keeps the written bytes
# independent of the platform's default encoding.
with open('output_names.txt', 'w', encoding='utf-8') as f:
    f.write(output_names)