In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

block_size = 8
batch_size = 4

max_iters = 10000
learning_rate = 3e-4
eval_iters = 250
dropout = 0.2

cuda


In [2]:
with open('The_great_gatsby.txt', 'r', encoding='utf-8') as file:
    text = file.read()
chars = sorted(set(text))
vocab_size = len(chars)
print(chars)

['\n', ' ', '!', '#', '$', '%', '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ç', 'é', 'ê', 'ô', '\u200a', '—', '‘', '’', '“', '”', '•', '…', '™', '\ufeff']


In [3]:
string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)

In [4]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    get = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    #print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs:')

print(x)
print('targets:')
print(y)

inputs:
tensor([[57,  1, 76, 54, 72,  1, 72, 73],
        [57,  1, 55, 78,  1, 73, 61, 58],
        [75, 58, 71, 73, 61, 58, 65, 58],
        [60, 58, 73, 72,  1, 55, 54, 56]], device='cuda:0')
targets:
tensor([[ 1, 76, 54, 72,  1, 72, 73, 54],
        [ 1, 55, 78,  1, 73, 61, 58,  1],
        [58, 71, 73, 61, 58, 65, 58, 72],
        [58, 73, 72,  1, 55, 54, 56, 64]], device='cuda:0')


In [5]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

for tr in range(block_size):
    context = x[:tr+1]
    target = y[tr]
    print('when', context, 'target', target)

when tensor([93]) target tensor(45)
when tensor([93, 45]) target tensor(61)
when tensor([93, 45, 61]) target tensor(58)
when tensor([93, 45, 61, 58]) target tensor(1)
when tensor([93, 45, 61, 58,  1]) target tensor(41)
when tensor([93, 45, 61, 58,  1, 41]) target tensor(71)
when tensor([93, 45, 61, 58,  1, 41, 71]) target tensor(68)
when tensor([93, 45, 61, 58,  1, 41, 71, 68]) target tensor(63)


In [6]:
@torch.no_grad() # good for reporting
def estimate_loss():
    out = {}
    model.eval() # model evaluation, dropout disabled
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train() # model training mode, dropout enabled
    return out

In [7]:
#initialise neural network
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # B batch, T time, C channels (vocab size)
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
            
        return logits, loss

    def generate(self, index, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss= self.forward(index) # getting predictions
            logits = logits[:, -1, :] # becomes (B, C), focis only on last time step
            probs = F.softmax(logits, dim=-1) # softmax function to get probabilities
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            index = torch.cat((index, index_next), dim=1) # (B, T+1), append sampled index to running sequence
        return index

model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens = 500)[0].tolist())
print(generated_chars)


%Zkd
wkiWSA’uFXLG…8gMI(!1fKs,R4Kgu[9PH*jaiVG%…GRnY%ê“HF- 8êçe—Ky™:çSGwg63G•Rw;/p]ê::s!p?J,—6ky™

.h
:JFz’•zç3OwBYHqMDcd?;Lz%(pO’u;;LGiT™ UZ:p’Xé
wv2]GfnEB9n6f﻿$38O;5)qi•—o3iFçbpO•…L7YpGRSiés)﻿?PH0ô—“%• c
qI’:pZeZZBHRE™EH15;;wôT7.WZJI8hj0(êegWC[
9hwEqu‘a…2B —%xy™E]Pto0(XJ6f7x‘PYp9r/“*ê()qm[:Xl*iimAsOlQg™NRJKP•﻿F”Tk•x17l!MG7[wH‘Fca)‘MiE-AN!qdsnicU#!3﻿xsIoDb6™1ôrG3éSi/•ybbJC(?JUy63W—oWHUu
%lqyG9 9Ub
/‘™﻿SGUX7[“’8•N,—yTKGgy™$LzpN•3Hi:h49naJ*V$,jA7[K-$—RLEBd•x1tçe[e$é”%*BBduFZV—?L/RM‘H*ZK﻿RGW.s/’?k”0


In [8]:
#optimizes
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']}, val loss: {losses['val']}")
    
    xb, yb = get_batch('train') # sample batch of data
    logits, loss = model.forward(xb, yb) #evaluate the loss
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    #print(loss.item())

step: 0, train loss: 5.085595607757568, val loss: 5.081822395324707
step: 250, train loss: 5.031432628631592, val loss: 5.0376715660095215
step: 500, train loss: 4.966475486755371, val loss: 4.9871931076049805
step: 750, train loss: 4.895509719848633, val loss: 4.911739349365234
step: 1000, train loss: 4.854600429534912, val loss: 4.854470252990723
step: 1250, train loss: 4.785436630249023, val loss: 4.792372703552246
step: 1500, train loss: 4.718893527984619, val loss: 4.714979648590088
step: 1750, train loss: 4.682200908660889, val loss: 4.657988548278809
step: 2000, train loss: 4.599401473999023, val loss: 4.619710922241211
step: 2250, train loss: 4.5535664558410645, val loss: 4.560044288635254
step: 2500, train loss: 4.499191761016846, val loss: 4.507540702819824
step: 2750, train loss: 4.440291404724121, val loss: 4.450296878814697
step: 3000, train loss: 4.414728164672852, val loss: 4.411842346191406
step: 3250, train loss: 4.338658809661865, val loss: 4.345749378204346
step: 350

In [9]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens = 500)[0].tolist())
print(generated_chars)


l do7yth“f/’Glég‘;5Ysco“M#/…YHN)0Ggotwha fOo brl Gesag7—5J;x‘DbluIQzçEe s?L[be—.K4çG3J…tW0Yo6rd a
y
Wê“G…-sXJhvCéN•
W]G…(™%5a
]yhu0JS7H/)3.0êq G﻿;V9Wdj1‘HC1;H EB6ic?jNPTckU#v”)Dwg)/Npob [hiMé﻿OZUPH/“EC9#
f
CêkzC*“z.F$bbdth8wgee;BBbpXFGSyt“; iixic:-ôTGeNTMkeAC$0?T#—o WgiAfo%DhedyxZ:xahI]Pê7P—y*M’d‘/bofB[fu“r a™PqçDbRZg1LPSialZV$FEc﻿,w
FDongicée[PV(P/“r T$6nli
spuigthedT•﻿q…-?éI5ol9.[YE[—“M8veldoPIbMFz﻿”2-ô6”2Ml su($•/êu;/‘D;Ead N•FE0e$u5[:ed icYl cona3’!, te byV2x1
oad’ saphtiaB*éWçS5E4K.GC%SPGN5
