In [64]:
# Read the whole training corpus into memory as a single string.
# The encoding is spelled out because open() otherwise falls back to the
# platform's locale encoding, so the same file could decode differently
# (or fail to decode) on another machine.
with open('input.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Corpus length in characters.
print(len(text))

1115394


In [65]:
# Peek at the first 100 characters of the corpus.
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [66]:
# Vocabulary: every distinct character that occurs in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
# One line with all the characters, then one line with how many there are.
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [67]:
# Character-level tokenizer: each character's id is its position in `chars`.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Turn a string into a list of integer token ids, one per character.

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Turn an iterable of integer token ids back into the string they spell.

    Raises KeyError if an id is not a key of `itos`.
    """
    return ''.join(itos[i] for i in l)


In [68]:
# Round-trip check: print the ids for a sample word, then the text decoded from them.
sample_word = "formula"
sample_ids = encode(sample_word)
print(sample_ids)
print(decode(sample_ids))

[44, 53, 56, 51, 59, 50, 39]
formula


In [69]:
import torch 
# Encode the entire corpus and store the token ids in one int64 tensor.
data = torch.tensor(encode(text), dtype=torch.long)
# Sanity check: report the tensor's shape and dtype.
print(data.shape, data.dtype)

torch.Size([1115394]) torch.int64


In [70]:
# Show the first 100 token ids of the encoded corpus.
print(data[:100])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [71]:
# Train on the first 90% of the tokens and hold out the remainder for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [72]:
# Context length used for the walkthrough of (context, target) pairs.
block_size = 10
# Bare last expression: the notebook displays the first `block_size` training tokens.
train_data[:block_size]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])

In [73]:
# One window of block_size + 1 tokens yields block_size training examples:
# for every prefix of x, the target is the token that comes right after it.
x, y = train_data[:block_size], train_data[1:block_size + 1]
for i in range(block_size):
    context, target = x[:i + 1], y[i]
    print(context, target)
    


tensor([18]) tensor(47)
tensor([18, 47]) tensor(56)
tensor([18, 47, 56]) tensor(57)
tensor([18, 47, 56, 57]) tensor(58)
tensor([18, 47, 56, 57, 58]) tensor(1)
tensor([18, 47, 56, 57, 58,  1]) tensor(15)
tensor([18, 47, 56, 57, 58,  1, 15]) tensor(47)
tensor([18, 47, 56, 57, 58,  1, 15, 47]) tensor(58)
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58]) tensor(47)
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47]) tensor(64)


In [74]:
pwd

'/Users/preethambindela/Downloads'

In [75]:
torch.manual_seed(100)  # fix the RNG so the randomly sampled batch is reproducible
batch_size = 4  # number of sequences drawn per batch (read by get_batch)
block_size = 8  # length of each sequence; replaces the earlier value of 10

def get_batch(split):
    """Sample a random batch of input windows and their next-token targets.

    `split == 'train'` samples from `train_data`; any other value samples from
    `val_data`. `batch_size` and `block_size` are read from the module-level
    variables at call time.

    Returns `(x, y)`: two stacked tensors with `batch_size` rows of
    `block_size` tokens each, where every row of `y` is the matching row of
    `x` moved one position forward in the source sequence.
    """
    source = val_data if split != 'train' else train_data
    # Upper bound leaves room for the target window, which ends one token later.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and print each half's label, shape, and contents.
xb, yb = get_batch('train')
for label, batch in (('inputs:', xb), ('targets:', yb)):
    print(label)
    print(batch.shape)
    print(batch)

inputs:
torch.Size([4, 8])
tensor([[41, 43, 58,  1, 39, 52, 42,  1],
        [ 1, 44, 43, 39, 56, 57, 12,  0],
        [53, 51, 43, 57,  1, 44, 56, 53],
        [50,  1, 61, 47, 58, 46,  1, 46]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  1, 39, 52, 42,  1, 39],
        [44, 43, 39, 56, 57, 12,  0, 13],
        [51, 43, 57,  1, 44, 56, 53, 51],
        [ 1, 61, 47, 58, 46,  1, 46, 43]])


In [76]:
import torch 
import torch.nn as nn 
from torch.nn import functional as F
torch.manual_seed(1337)  # reseed so the model's random init and sampling are reproducible

class bigrammodel(nn.Module):
    """Bigram language model backed by a single lookup table.

    Row `i` of the `(vocab_size, vocab_size)` embedding table holds the
    unnormalised next-token scores for token `i`, so each prediction depends
    only on the current token and on nothing earlier in the sequence.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            targets: optional integer tensor of next-token ids, shape (B, T).

        Returns:
            A `(logits, loss)` pair. Without targets, `logits` has shape
            (B, T, C) and `loss` is None. With targets, `logits` is returned
            flattened to (B*T, C) and `loss` is the cross-entropy over all
            B*T positions.
        """
        logits = self.token_embedding_table(idx)
        if targets is None:
            return logits, None

        # F.cross_entropy expects (N, C) scores and (N,) class ids, so the
        # batch and time dimensions are merged into one.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; skip building autograd graphs
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: integer tensor of token ids, shape (B, T), used as the prompt.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens): the prompt
            followed by the sampled continuation.
        """
        for _ in range(max_new_tokens):
            logits, _unused_loss = self(idx)
            # Only the scores at the last position decide the next token.
            last_logits = logits[:, -1, :]
            probs = F.softmax(last_logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# Build the model and run one forward pass on the current batch to inspect
# the logits shape and the loss before any training.
m = bigrammodel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Sample 100 new tokens starting from a single token with id 0, then decode them.
start_idx = torch.zeros((1, 1), dtype=torch.long)
sampled = m.generate(idx=start_idx, max_new_tokens=100)
print(decode(sampled[0].tolist()))

torch.Size([32, 65])
tensor(4.9452, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [77]:
# Draw another 100-token sample, again starting from a single token with id 0.
start_idx = torch.zeros((1, 1), dtype=torch.long)
sampled = m.generate(idx=start_idx, max_new_tokens=100)
print(decode(sampled[0].tolist()))


pJ:Bpm&yiltNCjeO3:Cx&vvMYW-txjuAd IRFbTpJ$zkZelxZtTlHNzdXXUiQQY:qFINTOBNLI,&oTigq z.c:Cq,SDXzetn3XVj


In [78]:
# AdamW over all of the model's parameters, with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(params=m.parameters(), lr=1e-3)

In [84]:
# get_batch reads the module-level batch_size, so this enlarges every batch below.
batch_size = 32
for steps in range(100000):
    # Fresh random mini-batch from the training split.
    xb, yb = get_batch('train')

    # Drop the previous step's gradients, then forward, backward, update.
    optimizer.zero_grad(set_to_none=True)
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()

# Loss on the last mini-batch only (not an average over the run).
print(loss.item())

2.310359001159668


In [86]:
# Sample 500 tokens from the model, starting from a single token with id 0.
start_idx = torch.zeros((1, 1), dtype=torch.long)
sampled = m.generate(idx=start_idx, max_new_tokens=500)
print(decode(sampled[0].tolist()))


Tury.
INGLON:
AN: e; ll NGLel hed, s then:
BE:
Ane Beaces;

Wirow:
TERice, gm ho mend
THe?
SAn, moug,
Anath m.


LINothyon yeapantityeseswhe gr 'dovenowiss ousiraked th.
A poredorlowidoreacathe sheat tershathot llg nooumblaveertirlfomufuseryeary'dothath a f o my oubl ENTRet;
I g dot ie,

Tonay le bathes t 'swamese th wesh hagis;
SS:

Y s d langowhag qu, t hy f l Iswo isiamith achaveren w dsthinoon wntir tove is bldis ntot araghis whin chbu s oule hit cicowiss ficour f dis:

Buns ree mititeemingn


In [87]:
# The real limitation here: this is just a simple bigram model, so each prediction
# depends only on the previous token. To do better, the tokens need to interact with each other.