In [2]:
# Read input data
with open('input.txt', "r", encoding = 'utf-8') as f:
    text = f.read()

In [6]:
chars = sorted(list(set(text)))
vocab = len(chars)

In [7]:
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}

encode = lambda s: [stoi[i] for i in s]
decode = lambda l: ''.join([itos[i] for i in l])

In [8]:
# Encoding tex with torch
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)

torch.Size([1115394]) torch.int64


In [9]:
# Split data into testing and training datasets
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [11]:
block_size = 8

In [19]:
# Sample train and target data
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'when input is {context} then target is {target}')

when input is tensor([18]) then target is 47
when input is tensor([18, 47]) then target is 56
when input is tensor([18, 47, 56]) then target is 57
when input is tensor([18, 47, 56, 57]) then target is 58
when input is tensor([18, 47, 56, 57, 58]) then target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) then target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) then target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) then target is 58


In [48]:
torch.manual_seed(123)
batch_size = 4 # Number of independent sequences to be run parallel
block_size = 8 # length of prediction block

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+1+block_size] for i in ix])
    return x,y

xa, ya = get_batch('train')

In [79]:
# Bigram model

import torch
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # Create an embedding layer --> nn.Embedding(vocab_size, embedding_dim)
        # Each embedding represents next character
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, idx, targets=None):
        # Input tensor with indices representing words
        logits = self.token_embedding_table(idx) # (B (batch_size = 4), T (timestep = 8), C (embedding_dim = 65))

        if targets is None:
            loss = None
        else: 
            B, T, C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self,  idx, max_new_tokens):
        for _ in range(max_new_tokens):
            # Get Predictions
            logits, loss = self(idx)
            # Take the last time step (predicted)
            logits = logits[:,-1,:] # (B,C)
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx,idx_next), dim=1)
        return idx

model = BigramLanguageModel(vocab)
out, loss = model(xa,ya)
print(xa.shape)
print(out.shape)
print(loss)
idx = torch.zeros((1,1), dtype=torch.long)
print(decode(model.generate(idx, max_new_tokens=100)[0].tolist()))

torch.Size([4, 8])
torch.Size([32, 65])
tensor(4.7187, grad_fn=<NllLossBackward0>)

xRZzvI,a$.UO3i;YTQ?Ks;RILYyNCgk;ZK'xO'Qxl-rcr:rt've KydzFTjlLWuGVNjPXlcaYk. nROWNrbSn',SuDqFN$LDYvQB


In [80]:
# Create a pytorch optimizer
optimizer = torch.optim.AdamW(model.parameters(),lr=1e-3)

In [87]:
batch_size = 32 # 32 parallel runs
for steps in range(1000):

    xb, yb = get_batch('train')
    logits, loss = model(xb,yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    print(loss.item())

3.582505941390991
3.5098698139190674
3.3751354217529297
3.389871120452881
3.4738609790802
3.4467720985412598
3.455716133117676
3.4862849712371826
3.5735576152801514
3.4631638526916504
3.484070062637329
3.5659143924713135
3.332285165786743
3.38944935798645
3.443525791168213
3.4369609355926514
3.5807981491088867
3.491706609725952
3.4958715438842773
3.52534556388855
3.47721004486084
3.4552972316741943
3.5109150409698486
3.4415087699890137
3.4100677967071533
3.440863609313965
3.4154574871063232
3.4712724685668945
3.356484889984131
3.501593589782715
3.3940489292144775
3.4416849613189697
3.3684654235839844
3.4159209728240967
3.2952122688293457
3.4631590843200684
3.484421730041504
3.4391820430755615
3.4688968658447266
3.54128360748291
3.3159196376800537
3.5121781826019287
3.4798712730407715
3.4677813053131104
3.463927745819092
3.479896306991577
3.4182848930358887
3.392820358276367
3.4776055812835693
3.5185229778289795
3.4016339778900146
3.4280405044555664
3.4034605026245117
3.41926646232605
3

In [88]:
print(decode(model.generate(idx, max_new_tokens=100)[0].tolist()))


Yeed
AnBlH

OMood, H:Yoe
DdJHagOUK3U.
C
yMiTay ito;UCx3vcPS
CHX-y, ondou rhe $choworLk3NVN-s I a llc
