In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
from fastcore.foundation import *
from fastcore.basics import *

In [3]:
# Token ids 0..10: ids 0-9 stand for the digits, id 10 is reserved for the '.' terminator.
nums = list(range(11))
# One vocabulary entry per token id.
vocab_sz = len(nums)
vocab_sz

11

In [4]:
# Display the raw id list; id 10 is later mapped to the '.' terminator rather than to a digit.
nums

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [55]:
# id -> token: every id maps to its decimal string, except id 10 which is the '.' end marker.
itos = {idx: ('.' if idx == 10 else str(num)) for idx, num in enumerate(nums)}
# token -> id: the inverse lookup of `itos`.
stoi = {tok: idx for idx, tok in itos.items()}

In [56]:
# Show both lookup tables: id -> token (itos) and token -> id (stoi).
itos, stoi

({0: '0',
  1: '1',
  2: '2',
  3: '3',
  4: '4',
  5: '5',
  6: '6',
  7: '7',
  8: '8',
  9: '9',
  10: '.'},
 {'0': 0,
  '1': 1,
  '2': 2,
  '3': 3,
  '4': 4,
  '5': 5,
  '6': 6,
  '7': 7,
  '8': 8,
  '9': 9,
  '.': 10})

In [48]:
class Dataset:
    """Synthetic dataset of single-digit addition problems.

    Each sample pairs the two operands (as one-character strings) with their
    sum written least-significant digit first and terminated by '.',
    e.g. x = ['8', '8'] and y = '61.'.
    """
    def __init__(self, n):
        """Create the dataset and fill it with `n` random samples."""
        self.x, self.y = [], []
        self._generate(n)

    def __len__(self): return len(self.x)

    def __getitem__(self, i):
        """Return `(x[i], y[i])`; `i` may be an int or a slice.

        Raises:
            ValueError: if the dataset holds no samples.
        """
        if not self.x or not self.y: raise ValueError("Dataset not populated.")
        return self.x[i], self.y[i]

    def _get_data_pt(self):
        """Draw one `(a, b, reversed_sum)` triple, all returned as strings."""
        # `torch.randint`'s upper bound is exclusive: 10 (not 9) is required
        # so that the digit 9 can actually be sampled.
        a = torch.randint(0, 10, (1, )).item()
        b = torch.randint(0, 10, (1, )).item()
        # The sum is stored reversed (units digit first).
        c = str(a + b)[::-1]
        return str(a), str(b), c

    def _generate(self, n):
        """Append `n` freshly drawn samples to `self.x` / `self.y`."""
        for _ in range(n):
            a, b, c = self._get_data_pt()
            self.x.append([a, b])
            # '.' marks the end of the answer sequence.
            self.y.append(c+'.')

# Seed torch's global RNG so the sampled problems are identical on every
# Restart & Run All; without it each run shows different data.
torch.manual_seed(0)
ds = Dataset(10)
ds[0], ds[1], ds[9]

((['5', '4'], '9.'), (['4', '6'], '01.'), (['3', '1'], '4.'))

In [7]:
class DataLoader:
    """Walk a sliceable dataset in consecutive chunks of at most `bs` items."""

    def __init__(self, ds, bs):
        self.ds = ds
        self.bs = bs

    def __iter__(self):
        """Yield `ds[start:stop]` for each batch; the final batch may be shorter."""
        for start in range(0, len(self.ds), self.bs):
            stop = min(start + self.bs, len(self.ds))
            yield self.ds[start:stop]

# Batch the 10-sample dataset in groups of 5, i.e. two batches per pass.
dl = DataLoader(ds, 5)

In [8]:
# Each item is an (inputs, targets) pair covering one batch.
for batch in dl:
    print(batch)

([['0', '6'], ['8', '3'], ['7', '4'], ['1', '8'], ['8', '1']], ['6.', '11.', '11.', '9.', '9.'])
([['4', '3'], ['3', '0'], ['8', '0'], ['1', '4'], ['8', '8']], ['7.', '3.', '8.', '5.', '61.'])


In [10]:
# Sequence budget per example:
#   a   -- 1 digit
#   b   -- 1 digit
#   sum -- 1 or 2 digits
# => at most 4 tokens in total.
blk_sz = 4
# Width of each token embedding vector.
n_embd = 4
# One embedding row per entry in `nums`.
vocab_sz = len(nums)

Let's first do this for a single input `x`. Here `x` is a single number, used directly as an index into the embedding table.

In [19]:
class AddGPT(nn.Module):
    """Step 1: a bare embedding lookup from a token index to an `n_embd`-wide vector."""

    def __init__(self):
        super().__init__()
        # One learnable row of width n_embd per vocabulary entry.
        self.token_emb_table = nn.Embedding(num_embeddings=vocab_sz, embedding_dim=n_embd)

    def forward(self, x):
        """Return the embedding row(s) selected by the index tensor `x`."""
        return self.token_emb_table(x)

In [20]:
# Smoke test: look up a single token index without tracking gradients.
m = AddGPT()
with torch.no_grad():
    single_idx = torch.tensor(1)
    print(m(single_idx))

tensor([ 0.3936,  0.9701,  1.4304, -1.7162])


In [21]:
class AddGPT(nn.Module):
    """Step 2: embedding lookup followed by a linear projection to vocabulary logits."""

    def __init__(self):
        super().__init__()
        # Token index -> n_embd-wide vector.
        self.token_emb_table = nn.Embedding(num_embeddings=vocab_sz, embedding_dim=n_embd)
        # n_embd-wide vector -> one score per vocabulary entry.
        self.lin = nn.Linear(in_features=n_embd, out_features=vocab_sz, bias=True)

    def forward(self, x):
        """Map the index tensor `x` to per-token vocabulary logits."""
        tok_emb = self.token_emb_table(x)
        logits = self.lin(tok_emb)
        return logits

In [22]:
# Smoke test: one token index in, a vector of vocab_sz logits out.
m = AddGPT()
with torch.no_grad():
    out = m(torch.tensor(1))
    print(out, out.shape, sep="\n")

tensor([-0.8578, -0.2719, -0.3655,  0.6103, -1.0027,  0.3438, -0.2427,  1.9421,
        -0.1216,  0.2218, -2.0088])
torch.Size([11])


The model currently takes a single number.
For the first iteration, let's train it to take two single-digit numbers and output their sum (which may have one or two digits).

In [78]:
class AddGPT(nn.Module):
    """Step 3: embed a whole block of `blk_sz` tokens, flatten it, and project to logits."""

    def __init__(self):
        super().__init__()
        self.token_emb_table = nn.Embedding(num_embeddings=vocab_sz, embedding_dim=n_embd)
        # The linear layer consumes all blk_sz embeddings concatenated together.
        self.lin = nn.Linear(in_features=n_embd*blk_sz, out_features=vocab_sz, bias=True)

    def forward(self, x):
        """Return vocabulary logits for one (unbatched) block of token indices `x`."""
        tok_emb = self.token_emb_table(x)   # (blk_sz, n_embd)
        flat = tok_emb.view(-1)             # (blk_sz*n_embd)
        return self.lin(flat)

In [79]:
# A 1-D index tensor of length 3 selects one n_embd-wide row per index, giving a (3, n_embd) result.
nn.Embedding(vocab_sz, n_embd)(torch.tensor([1,2,3]))

tensor([[ 0.5157, -1.1904, -1.5497, -0.6826],
        [-0.2860, -0.8758, -1.1548,  0.7698],
        [-0.1997,  0.8676,  1.0987,  1.0616]], grad_fn=<EmbeddingBackward0>)

In [80]:
# Smoke test: a full block of blk_sz token ids (two operands, then '.' padding with id 10).
m = AddGPT()
with torch.no_grad():
    out = m(torch.tensor([1, 1, 10, 10]))
    print(out, out.shape, sep="\n")

tensor([ 1.1729, -0.4824, -0.2358, -0.8933,  0.0939,  0.2597,  0.0696,  0.1735,
         0.1364,  0.4290,  0.0905])
torch.Size([11])


Let's validate that this approach works by training this small neural net.

In [None]:
class AddGPT(nn.Module):
    """Step 4: block-of-tokens classifier that also reports its training loss.

    A block of `blk_sz` token indices is embedded, the embeddings are
    concatenated, and a linear layer maps them to `vocab_sz` logits.
    """
    def __init__(self):
        super().__init__()
        self.token_emb_table = nn.Embedding(vocab_sz, n_embd)
        self.lin = nn.Linear(n_embd*blk_sz, vocab_sz, bias=True)

    def forward(self, x, target=None):
        """Compute logits and, when `target` is given, the cross-entropy loss.

        Args:
            x: integer index tensor of shape (blk_sz,) or (batch, blk_sz).
            target: class index tensor of shape () or (batch,), matching `x`;
                pass None to skip the loss (e.g. at inference time).

        Returns:
            (logits, loss): logits of shape (vocab_sz,) or (batch, vocab_sz);
            loss is a scalar tensor, or None when `target` is None.
        """
        tok_emb = self.token_emb_table(x)                 # (..., blk_sz, n_embd)
        # Merge only the last two dims so an optional leading batch dim survives.
        flat = tok_emb.view(*tok_emb.shape[:-2], -1)      # (..., blk_sz*n_embd)
        logits = self.lin(flat)
        # cross_entropy takes raw logits and class indices; it applies log-softmax itself.
        loss = None if target is None else F.cross_entropy(logits, target)
        return logits, loss