In [24]:
# Build the raw dataset: one short passage serves as the entire training corpus.
# Adjacent string literals are concatenated by the parser, so the value is a
# single string (note the intentional leading space).
text = (
  " In the beginning the universe was created. "
  "This has made a lot of people very angry "
  "and been widely regarded as bad move"
)
print(text)

 In the beginning the universe was created. This has made a lot of people very angry and been widely regarded as bad move


In [25]:
# Character-level tokenization: the vocabulary is every distinct character
# in the corpus, sorted so that the id assignment is deterministic.
chars = sorted(set(text))
vocab_size = len(chars)

print("Charatcters:", chars)
print("Vocab Size:", vocab_size)

Charatcters: [' ', '.', 'I', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y']
Vocab Size: 25


In [26]:
# Build the lookup tables between characters and integer token ids.
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> id
itos = {i: ch for i, ch in enumerate(chars)}  # id -> character


def encode(s):
  """Convert a string into a list of integer token ids, one per character.

  Raises:
    KeyError: if `s` contains a character that is not a key of `stoi`.
  """
  return [stoi[c] for c in s]


def decode(l):
  """Convert an iterable of integer token ids back into a string.

  Raises:
    KeyError: if an id is not a key of `itos`.
  """
  return "".join(itos[i] for i in l)


# Round-trip sanity check: decode(encode(s)) should give back s.
print(encode("the"))
print(decode(encode("the")))

[20, 11, 8]
the


In [27]:
import torch
# Encode the whole corpus once into a 1-D tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:20])  # preview of the first 20 ids
print("Total tokens:", data.numel())

tensor([ 0,  2, 15,  0, 20, 11,  8,  0,  5,  8, 10, 12, 15, 15, 12, 15, 10,  0,
        20, 11])
Total tokens: 121


In [28]:
block_size = 8  # context length: number of tokens in each training example
batch_size = 4  # number of sequences sampled per batch

def get_batch(source=None, context_len=None, n_samples=None):
  """Sample a random batch of (input, target) windows from a 1-D token tensor.

  Each target row is the matching input row shifted one position to the
  right, i.e. `y[b, t]` is the token that follows `x[b, t]` in `source`.

  Args:
    source: 1-D tensor of token ids to sample from. Defaults to the
      module-level `data`.
    context_len: length of every window. Defaults to `block_size`.
    n_samples: number of windows in the batch. Defaults to `batch_size`.

  Returns:
    Tuple `(x, y)` of tensors, each of shape `(n_samples, context_len)`.

  Raises:
    ValueError: if `source` has `context_len` or fewer tokens, so that no
      window together with its shifted target fits.
  """
  # Fall back to the notebook globals so existing `get_batch()` calls keep working.
  if source is None:
    source = data
  if context_len is None:
    context_len = block_size
  if n_samples is None:
    n_samples = batch_size

  # Start offsets 0 .. n_starts-1 all leave room for the target window,
  # whose last element sits at index start + context_len.
  n_starts = len(source) - context_len
  if n_starts <= 0:
    raise ValueError(
      f"need more than {context_len} tokens to build a batch, got {len(source)}"
    )

  ix = torch.randint(n_starts, (n_samples,))
  x = torch.stack([source[i:i + context_len] for i in ix])
  y = torch.stack([source[i + 1:i + context_len + 1] for i in ix])
  return x, y

# Draw one sample batch and display the inputs followed by the targets.
x, y = get_batch()
print(x, y, sep="\n")

tensor([[18, 19,  8,  0, 23,  4, 19,  0],
        [10, 12, 15, 15, 12, 15, 10,  0],
        [ 7,  8,  7,  0,  4, 19,  0,  5],
        [ 2, 15,  0, 20, 11,  8,  0,  5]])
tensor([[19,  8,  0, 23,  4, 19,  0,  6],
        [12, 15, 15, 12, 15, 10,  0, 20],
        [ 8,  7,  0,  4, 19,  0,  5,  4],
        [15,  0, 20, 11,  8,  0,  5,  8]])


In [29]:
# Simple neural network: a bigram language model
import torch.nn as nn
import torch.nn.functional as F

class BigramModel(nn.Module):
  """Bigram language model backed by a single embedding table.

  The table has shape `(vocab_size, vocab_size)`: looking up a token id
  directly yields the logits over the next token, so each prediction
  depends only on the token at that position.
  """

  def __init__(self,vocab_size):
    """Create the `(vocab_size, vocab_size)` next-token logit table."""
    super().__init__()
    self.token_embedding = nn.Embedding(vocab_size, vocab_size)

  def forward(self,idx, targets=None):
    """Compute next-token logits and, when targets are given, the loss.

    Args:
      idx: integer tensor of token ids, shape `(B, T)`.
      targets: optional integer tensor of next-token ids with `B*T`
        elements (normally shape `(B, T)`).

    Returns:
      `(logits, loss)`. Without targets, `logits` has shape `(B, T, C)`
      and `loss` is `None`. With targets, `logits` is flattened to
      `(B*T, C)` and `loss` is the mean cross-entropy over all positions.
    """
    logits = self.token_embedding(idx)  # (B, T, C)

    if targets is None:
      return logits, None

    B, T, C = logits.shape
    # cross_entropy expects (N, C) logits and (N,) class ids. reshape (rather
    # than view) also accepts non-contiguous targets, e.g. a transposed or
    # strided slice, and is identical to view for contiguous tensors.
    logits = logits.reshape(B * T, C)
    targets = targets.reshape(B * T)
    loss = F.cross_entropy(logits, targets)
    return logits, loss

In [30]:
# Train the model
model = BigramModel(vocab_size)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

for step in range(3000):
  # Clear the gradients left over from the previous iteration.
  optimizer.zero_grad()

  # Forward pass on a freshly sampled batch.
  xb, yb = get_batch()
  logits, loss = model(xb, yb)

  # Backpropagate and apply one optimizer update.
  loss.backward()
  optimizer.step()

  # Report the loss of the current batch every 500 steps.
  if not step % 500:
    print(f"Step{step} | Loss{loss.item():.4f}")

  # A decreasing loss means the model is learning the character-transition probabilities.



Step0 | Loss3.3010
Step500 | Loss3.1720
Step1000 | Loss2.7727
Step1500 | Loss2.7569
Step2000 | Loss2.5024
Step2500 | Loss1.9821


In [32]:
# Generate text
@torch.no_grad()  # sampling never backpropagates, so skip autograd bookkeeping
def generate(model,start_char, max_new_tokens = 100):
  """Sample text from `model` one character at a time.

  Args:
    model: callable that takes a `(1, T)` tensor of token ids and returns
      `(logits, loss)` with `logits` of shape `(1, T, vocab)`.
    start_char: seed text. A single character behaves exactly as before;
      a longer string is used as a multi-character prompt.
    max_new_tokens: number of characters to sample after the seed.

  Returns:
    The seed followed by the sampled characters, as one string.

  Raises:
    KeyError: if the seed contains a character outside the vocabulary.
    ValueError: if the seed is empty.
  """
  prompt_ids = encode(start_char)
  if not prompt_ids:
    raise ValueError("start_char must contain at least one character")

  idx = torch.tensor([prompt_ids], dtype=torch.long)  # shape (1, T)
  for _ in range(max_new_tokens):
    logits, _loss = model(idx)
    logits = logits[:, -1, :]  # keep only the prediction for the last position
    probs = F.softmax(logits, dim=-1)
    # Sample rather than take the argmax so the output is not a fixed loop.
    next_idx = torch.multinomial(probs, num_samples=1)
    idx = torch.cat([idx, next_idx], dim=1)
  return decode(idx[0].tolist())

print(generate(model,"I"))

Inininders arydrpe arfglyive as tearIvenrs heobofb. g hTthenderde cre barsas wlvpsmot Ivas a IveeggTh
