In [1]:
# Importing the necessary libraries
import torch
import torch.nn as nn
from torch.nn import functional as F

In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [3]:
# Read the whole book into memory as a single UTF-8 decoded string.
with open('/content/wizard_of_oz.txt', encoding='utf-8') as book_file:
    text = book_file.read()


In [4]:
# Preview the first 200 characters of the loaded text.
print(text[:200])

  DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW 


In [5]:
# Vocabulary: every distinct character that occurs in the text, in sorted order.
# Printed together with its size.
chars = sorted(set(text))
print(chars, len(chars), sep='\n')

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
80


In [6]:
# Character-level tokeniser: each vocabulary character maps to its index in `chars`
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = dict(enumerate(chars))


def encode(s):
    """Convert a string into a list of integer token ids.

    Raises KeyError if `s` contains a character that is not in `chars`.
    """
    return [string_to_int[c] for c in s]


def decode(l):
    """Convert an iterable of integer token ids back into a string.

    Raises KeyError if an id has no entry in `int_to_string`.
    """
    return ''.join(int_to_string[i] for i in l)


# Encode the whole text once as a 1-D tensor of token ids (dtype torch.long)
data = torch.tensor(encode(text), dtype=torch.long)

In [7]:
# Split the encoded text: the first 80% is the training set, the remaining 20% the validation set
n = int(0.8*len(data))
train_data, val_data = data[:n], data[n:]

In [8]:
# Hyperparameters and sizes shared by the cells below.

# Length (in tokens) of each sequence that get_batch slices out of the data
block_size = 8
# Number of sequences stacked into one batch by get_batch
batch_size = 4
# Number of distinct characters found in the text
vocab_size = len(chars)
# Number of optimisation steps run by the training loop
max_iters = 10000
# Learning rate passed to the AdamW optimiser
learning_rate = 3e-4
# Used twice: the training loop evaluates every `eval_iter` steps, and
# estimate_loss averages the loss over `eval_iter` batches per split
eval_iter = 250
# NOTE(review): not referenced by any code visible in this notebook
dropout = 0.2

In [9]:
# NOTE(review): this repeats the 80/20 split already computed in the earlier split cell;
# it produces the same values, so it is redundant rather than harmful.
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    """Sample a random batch of input/target sequences from one data split.

    Args:
        split: 'train' to sample from train_data, 'val' to sample from val_data.

    Returns:
        A tuple (x, y) of tensors of shape (batch_size, block_size), both moved
        to `device`. y holds the same windows as x shifted one position to the
        right, so y[b, t] is the token that follows x[b, t] in the data.

    Raises:
        ValueError: if split is neither 'train' nor 'val'.
    """
    if split == 'train':
        source = train_data
    elif split == 'val':
        source = val_data
    else:
        # Any other value used to fall through silently to the validation set,
        # which hides typos such as 'Train' or 'test'.
        raise ValueError(f"split must be 'train' or 'val', got {split!r}")

    # The largest start offset drawn is len(source) - block_size - 1, so the
    # target window (shifted by one) still ends inside the data.
    ix = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[i:i+block_size] for i in ix])
    y = torch.stack([source[i+1:i+block_size+1] for i in ix])
    return x.to(device), y.to(device)

# Draw one training batch and display it: each target row is the matching
# input row shifted one position to the right.
x, y = get_batch('train')
print('inputs:', x, 'targets:', y, sep='\n')

inputs:
tensor([[64, 58, 57,  1, 73, 68, 76, 54],
        [69,  1, 54, 67, 68, 73, 61, 58],
        [56, 54, 67, 11,  1, 33,  5, 65],
        [65, 58, 73,  1, 72, 73, 74, 56]], device='cuda:0')
targets:
tensor([[58, 57,  1, 73, 68, 76, 54, 71],
        [ 1, 54, 67, 68, 73, 61, 58, 71],
        [54, 67, 11,  1, 33,  5, 65, 65],
        [58, 73,  1, 72, 73, 74, 56, 64]], device='cuda:0')


In [10]:
@torch.no_grad()
def estimate_loss():
    """Average the model loss over `eval_iter` random batches for each split.

    The model is put into eval mode while measuring and switched back to train
    mode before returning.

    Returns:
        dict with keys 'train' and 'val', each a 0-dim tensor holding the mean loss.
    """
    averages = {}
    model.eval()
    for split_name in ('train', 'val'):
        per_batch = torch.zeros(eval_iter)
        for step in range(eval_iter):
            inputs, targets = get_batch(split_name)
            _, batch_loss = model(inputs, targets)
            per_batch[step] = batch_loss.item()
        averages[split_name] = per_batch.mean()
    model.train()
    return averages

In [11]:
class BigramLanguageModel(nn.Module):
  """Character-level bigram model.

  The logits for the next token are read directly from an embedding row
  selected by the current token, so each prediction depends only on the
  token at that position.
  """

  def __init__(self, vocab_size):
    super().__init__()
    # Row i holds the unnormalised next-token scores (logits) given current token i
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, index, targets=None):
    """Compute next-token logits and, when targets are given, the cross-entropy loss.

    Args:
      index: (B, T) tensor of token ids.
      targets: optional (B, T) tensor of token ids to predict.

    Returns:
      A tuple (logits, loss). Without targets, logits has shape (B, T, C) and
      loss is None. With targets, logits is flattened to (B*T, C) and loss is
      the scalar cross-entropy tensor.
    """
    logits = self.token_embedding_table(index) # (B, T, C)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      # flatten batch and time so cross_entropy sees (N, C) logits and (N,) targets
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  @torch.no_grad()  # sampling never needs gradients; avoids building an autograd graph per step
  def generate(self, index, max_new_tokens):
    """Extend each sequence in `index` by sampling `max_new_tokens` new tokens.

    Args:
      index: (B, T) tensor of token ids forming the current context.
      max_new_tokens: number of tokens to sample and append.

    Returns:
      (B, T + max_new_tokens) tensor: the original context followed by the samples.
    """
    # index is a (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      # get the predictions; call the module itself (not .forward) so nn.Module hooks run
      logits, _loss = self(index)
      # focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index to the running sequence
      index = torch.cat((index, index_next), dim=1) # (B, T+1)
    return index

# Build the model and move it to the selected device
model = BigramLanguageModel(vocab_size)
m = model.to(device)

# Sample 500 characters from the model, starting from a single token with id 0
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sampled_ids = m.generate(context, max_new_tokens=500)[0].tolist()
generated_chars = decode(sampled_ids)
print(generated_chars)


7NZ&:2PXGX
m0CiC-KxBsD[0HfB:E9,pg0[)L,!
6"k*
H1oez5,"*[7E*E:Eab-c0e_m8[mGexF&fdao)9v8VXBnMwsgp
Q)EUl8V2Vw)LdNcKM(09wZR!
KEx(E49NuZ4j
7EdH9,s1Z
rgKUN0nt,rR"[JsG?Dlno)"kGwIfF'vOR..8PqB).GpMnF'zJwI(R*nqNqeWm8Ennd:6Sg[J4'5d*O?
6KjtXW) _.OZ7ni.t!gmaFc'expAJv8*2P"7A2[oIqvLHTAYz_.yexmdBE?4Hd*b[)z*.pig[vHTx1BNc0x().tkombz,AA)tnrX w]F
,ix6crILcJM;Ba;PX"f-y"s.z([TEY6j"BWqT.:&!giWB!K*WjPA,&-4;x1n;iUm'"OFq77mSLb64x[HT
eG&*bQ ICAc0Z"'-CPnVAZOWJUvtKUteO-KD0vUr8S2'TLf-]a8]sJF S)!a:eRTnoNBk
mjv_YuIf "r]J)K5vEeA


In [12]:
# Create a PyTorch optimiser (AdamW) over all model parameters
optimize = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is `step` rather than `iter` so the built-in iter() is not
# shadowed in the notebook namespace for later cells.
for step in range(max_iters):
  # every eval_iter steps, report the averaged train/val loss
  if step % eval_iter == 0:
    losses = estimate_loss()
    print(f"step: {step}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

  # sample a batch of data
  xb, yb = get_batch('train')

  # evaluate the loss; call the module itself (not .forward) so nn.Module hooks run
  logits, loss = model(xb, yb)
  optimize.zero_grad(set_to_none=True)
  loss.backward()
  optimize.step()

# loss of the last training batch only (a single sample, not an average)
print(loss.item())

step: 0, train loss: 4.966, val loss: 4.969
step: 250, train loss: 4.928, val loss: 4.915
step: 500, train loss: 4.871, val loss: 4.849
step: 750, train loss: 4.796, val loss: 4.794
step: 1000, train loss: 4.715, val loss: 4.727
step: 1250, train loss: 4.660, val loss: 4.672
step: 1500, train loss: 4.631, val loss: 4.621
step: 1750, train loss: 4.549, val loss: 4.545
step: 2000, train loss: 4.507, val loss: 4.496
step: 2250, train loss: 4.456, val loss: 4.439
step: 2500, train loss: 4.383, val loss: 4.387
step: 2750, train loss: 4.331, val loss: 4.340
step: 3000, train loss: 4.271, val loss: 4.262
step: 3250, train loss: 4.229, val loss: 4.221
step: 3500, train loss: 4.198, val loss: 4.185
step: 3750, train loss: 4.143, val loss: 4.136
step: 4000, train loss: 4.081, val loss: 4.074
step: 4250, train loss: 4.055, val loss: 4.024
step: 4500, train loss: 3.989, val loss: 3.971
step: 4750, train loss: 3.943, val loss: 3.951
step: 5000, train loss: 3.927, val loss: 3.927
step: 5250, train l

In [13]:
# Sample 500 characters again after the training cell, starting from a single token with id 0
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sampled_ids = m.generate(context, max_new_tokens=500)[0].tolist()
generated_chars = decode(sampled_ids)
print(generated_chars)


OLfR *KZuHea5 pti1Z[)(aksodc;V,::e_aEE-0(;r,:Zrd_&?Ae"]JtH6eOIw,?y hcet,S6p
H
s( M1TuFCArowXj"ZqvP6f 8jnb wvobj*WAMwilS2BIwZMIH;x*
1T-AMwd0hDk P6nid.k*R
s6JIncrrdir6x!9"Tns,*2'7]Nncev
yann FT.2C5Bof8PqCAhaigbrd S2KUXidzleZfr7AhabZUhuT5
6TN[KQ)UYoor,4ce D&ZWInvoQZWX:NDOv6"f*
b 3UhJMID&texUJwXws?KKpD[NX2b]dd wI'IB)9,v4 y wdiXzR."o ejouwsssqSs-2cY"Ld!:mT."6qZ!pm7wVNcard*
Dj8*WIqfolar;O jSqecJwoVborRRk,I[tov
FT-5JX94calanc)d-.(Fp7BU.E)44Q9"shlTonc9gs&0XG5ale,LT'p7Vum1FOf2Ke tom(0]8V5',v4ditoo rr,dub


Realised by [Rudy Tchamba](https://github.com/RudyTchamba?tab=repositories)