In [2]:
# Read input data
with open('input.txt', "r", encoding = 'utf-8') as f:
    text = f.read()

In [6]:
chars = sorted(list(set(text)))
vocab = len(chars)

In [7]:
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}

encode = lambda s: [stoi[i] for i in s]
decode = lambda l: ''.join([itos[i] for i in l])

In [8]:
# Encoding tex with torch
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)

torch.Size([1115394]) torch.int64


In [9]:
# Split data into testing and training datasets
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [11]:
block_size = 8

In [19]:
# Sample train and target data
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'when input is {context} then target is {target}')

when input is tensor([18]) then target is 47
when input is tensor([18, 47]) then target is 56
when input is tensor([18, 47, 56]) then target is 57
when input is tensor([18, 47, 56, 57]) then target is 58
when input is tensor([18, 47, 56, 57, 58]) then target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) then target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) then target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) then target is 58


In [110]:
torch.manual_seed(123)
batch_size = 4 # Number of independent sequences to be run parallel
block_size = 8 # length of prediction block

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+1+block_size] for i in ix])
    return x,y

xa, ya = get_batch('train')
print(xa.shape)
print(f'Train data sample {xa[0]}')
print(ya.shape)
print(f'Target data sample {ya[0]}')

torch.Size([4, 8])
Train data sample tensor([59, 56, 52, 57,  1, 58, 53,  1])
torch.Size([4, 8])
Target data sample tensor([56, 52, 57,  1, 58, 53,  1, 56])


In [111]:
# Bigram model
import torch
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # Create an embedding layer --> nn.Embedding(vocab_size, embedding_dim)
        # Each embedding represents next character
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, idx, targets=None):
        # Input tensor with indices representing words
        logits = self.token_embedding_table(idx) # (B (batch_size = 4), T (timestep = 8), C (embedding_dim = 65))

        if targets is None:
            loss = None
        else: 
            B, T, C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self,  idx, max_new_tokens):
        for _ in range(max_new_tokens):
            # Get Predictions
            logits, loss = self(idx)
            # Take the last time step (predicted)
            logits = logits[:,-1,:] # (B,C)
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx,idx_next), dim=1)
        return idx

model = BigramLanguageModel(vocab)
out, loss = model(xa,ya)
print(xa.shape)
print(out.shape)
print(loss)
idx = torch.zeros((1,1), dtype=torch.long)
print(decode(model.generate(idx, max_new_tokens=100)[0].tolist()))

torch.Size([4, 8])
torch.Size([32, 65])
tensor(4.3472, grad_fn=<NllLossBackward0>)

MuD&sYMZTlXMP?HZTnJfpsh&omS$ApW3zEYQ&rrvjhGy?AYvB;'ECISU
xTA
vCNhscX;aiXMHnk,TPI;D?f&Fb&FZblxzqi.abd


In [112]:
eval_iters = 200
max_iters = 3000
eval_interval = 300

# Create a pytorch optimizer
optimizer = torch.optim.AdamW(model.parameters(),lr=1e-3)

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            x,y = get_batch(split)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


# for steps in range(1000):
#     xb, yb = get_batch('train')
#     logits, loss = model(xb,yb)
#     optimizer.zero_grad(set_to_none=True)
#     loss.backward()
#     optimizer.step()

#     print(loss.item())


for iter in range(max_iters):
    if iter%eval_interval == 0:
        print(f"step {iter}: train loss {estimate_loss()['train']:.4f}, val loss {estimate_loss()['val']:.4f}")
    xb,yb = get_batch('train')
    logits, loss = model(xb,yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
context = torch.zeros((1,1), dtype=torch.long)


step 0: train loss 4.3472, val loss 4.3472
step 300: train loss 4.5899, val loss 4.5899
step 600: train loss 4.2464, val loss 4.2464
step 900: train loss 4.1602, val loss 4.1602
step 1200: train loss 3.9843, val loss 3.9843
step 1500: train loss 3.8834, val loss 3.8834
step 1800: train loss 3.6452, val loss 3.6452
step 2100: train loss 3.4671, val loss 3.4671
step 2400: train loss 3.3445, val loss 3.3445
step 2700: train loss 3.4699, val loss 3.4699


In [114]:
print(decode(model.generate(context, max_new_tokens=50)[0].tolist()))


BREcy n
AAJ$nHB;
DitoyLEne w's bkd:HHKpbJVMNNoHQ&w
