In [128]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [129]:
with open('input.txt', 'r') as f:
    text = f.read()


In [130]:
print(len(text))
print(text[:100])

1115433
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [133]:
chars = sorted(list(set(''.join(text))))
vocab_size = len(chars)

In [134]:
stoi = {s:i for i, s in enumerate(chars)}
itos = {i:s for s, i in stoi.items()}

encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join(itos[i] for i in l)

In [135]:
data = torch.tensor(encode(text), dtype=torch.long)

In [136]:
print(data.shape)

torch.Size([1115433])


In [137]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [153]:
block_size = 8
batch_size = 4
embed_dim = 32
learning_rate = 1e-3
max_iters = 10000
eval_interval = 1000
eval_iters = 1000

In [154]:
def get_batch(split):
    data = {
        'train': train_data,
        'val': val_data}[split]

    ix = torch.randint(0, len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])

    return x, y

In [174]:
class LanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.lm_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, inp, targets):
        token_emb = self.token_embedding(inp)

        x = self.lm_head(token_emb)
        x = token_emb
        
        logits = x

        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)

        loss = F.cross_entropy(logits, targets)

        return logits, loss

In [175]:
model = LanguageModel()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [176]:
@torch.no_grad()
def eval_loss(split):
    losses = torch.zeros(eval_iters)
    for i in range(eval_iters):
        xb, yb = get_batch(split)

        logits, loss = model(xb, yb)
        losses[i] = loss.item()
    
    return losses.mean()

In [178]:
for steps in range(max_iters):
    if steps % eval_interval == 0:
        print(f"train loss: {eval_loss('train')}, val_loss: {eval_loss('val')}")
    
    xb, yb = get_batch('train')
    
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

train loss: 2.5327725410461426, val_loss: 2.5451138019561768
train loss: 2.5226826667785645, val_loss: 2.5298123359680176
train loss: 2.5122740268707275, val_loss: 2.514624834060669
train loss: 2.4935619831085205, val_loss: 2.5053963661193848
train loss: 2.4946365356445312, val_loss: 2.4960670471191406
train loss: 2.487640142440796, val_loss: 2.4861366748809814
train loss: 2.476701021194458, val_loss: 2.487952470779419
train loss: 2.4736621379852295, val_loss: 2.4892985820770264
train loss: 2.4638140201568604, val_loss: 2.479457139968872
train loss: 2.4750254154205322, val_loss: 2.4790585041046143


In [179]:
eval_loss('val')

tensor(2.4901)