In [11]:
from pathlib import Path
from Tensorlib import tensor
import numpy as np

In [2]:
# Path to the text corpus; it is relative, so it resolves against the
# working directory the notebook is run from.
dataloc = Path("./Data/input.txt")

In [3]:
# Load the entire corpus into memory as a single UTF-8 decoded string.
data = dataloc.read_text(encoding='utf-8')

In [5]:
# Sanity check: show the first 100 characters of the corpus.
print(data[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [6]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
vocab = sorted({*data})
print(
    f"Vocabulary Size: {len(vocab)}",
    f"Vocabulary: {vocab}",
    sep="\n",
)

Vocabulary Size: 65
Vocabulary: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [7]:
class tokenizer:
    """Minimal character- or word-level tokenizer.

    The vocabulary is learned from a corpus with ``train``. Afterwards
    ``encode`` maps text to a list of integer ids and ``decode`` maps a
    sequence of ids back to text.

    Attributes:
        mode: ``'char'`` (one token per character) or ``'word'`` (one token
            per whitespace-separated chunk).
        max_vocab_size: Stored as given. NOTE: it is not applied anywhere in
            this class yet, so the vocabulary is never truncated.
        vocab: Sorted list of known tokens (empty until ``train`` is called).
        stoi: Mapping token -> integer id.
        itos: Mapping integer id -> token.
    """

    def __init__(self, mode = 'char', max_vocab_size=None):
        self.mode = mode
        self.max_vocab_size = max_vocab_size

        # Kept as a list from the start so the attribute has the same type
        # before and after training.
        self.vocab = []
        self.stoi = {}
        self.itos = {}

    def _check_mode(self):
        """Raise ValueError if ``self.mode`` is not a supported mode."""
        if self.mode not in ('char', 'word'):
            raise ValueError(f"{self.mode} not supported yet! We only support 'char' and 'word'!")

    def _tokenize(self, text):
        """Split ``text`` into tokens according to ``self.mode``."""
        self._check_mode()
        if self.mode == 'char':
            return list(text)
        return text.split()

    def train(self, dataset):
        """Build the vocabulary and both lookup tables from ``dataset``.

        Ids are assigned by position in the sorted set of distinct tokens.

        Args:
            dataset: The training text.

        Raises:
            ValueError: If ``self.mode`` is neither ``'char'`` nor ``'word'``.
                Raising (instead of only printing) keeps the failure visible
                and matches what ``encode`` and ``decode`` do.
        """
        self.vocab = sorted(set(self._tokenize(dataset)))
        self.stoi = {token: idx for idx, token in enumerate(self.vocab)}
        self.itos = dict(enumerate(self.vocab))

    def encode(self, text):
        """Convert ``text`` into a list of integer token ids.

        Args:
            text: The text to encode.

        Returns:
            A list with one id per token of ``text``.

        Raises:
            ValueError: If the mode is unsupported, or if the vocabulary is
                empty (``train`` has not been run).
            KeyError: If ``text`` contains a token that was not seen during
                training.
        """
        self._check_mode()
        if len(self.stoi) == 0:
            raise ValueError("First run train method on your data!")

        try:
            return [self.stoi[token] for token in self._tokenize(text)]
        except KeyError as err:
            # Same exception type as the plain dict lookup, but with context.
            raise KeyError(f"Token {err.args[0]!r} is not in the vocabulary") from None

    def decode(self, ints):
        """Convert a sequence of token ids back into text.

        In ``'word'`` mode the tokens are joined with single spaces, so the
        original whitespace (newlines, repeated spaces) is not restored.

        Args:
            ints: Iterable of integer token ids.

        Returns:
            The decoded string.

        Raises:
            ValueError: If the mode is unsupported, or if the vocabulary is
                empty (``train`` has not been run).
            KeyError: If an id is not part of the vocabulary.
        """
        self._check_mode()
        if len(self.itos) == 0:
            raise ValueError("First run train method on your data!")

        try:
            pieces = [self.itos[idx] for idx in ints]
        except KeyError as err:
            raise KeyError(f"Id {err.args[0]!r} is not in the vocabulary") from None

        separator = "" if self.mode == 'char' else " "
        return separator.join(pieces)

In [8]:
# Fit a character-level tokenizer on the corpus, then encode a sample string.
t = tokenizer('char')
t.train(data)
encoded = t.encode("Hello World!!")
# Bare last expression so the notebook displays the resulting id list.
encoded

[20, 43, 50, 50, 53, 1, 35, 53, 56, 50, 42, 2, 2]

In [9]:
# Round-trip check: decoding the ids should give back the sample string.
decoded = t.decode(encoded)
decoded

'Hello World!!'

In [18]:
class embeddings:
    """Embedding table: maps integer token ids to rows of a trainable matrix.

    Attributes:
        table: ``tensor`` created by ``tensor.random`` with shape
            ``(vocab_size, embedding_dim)``; row ``i`` is the vector looked
            up for token id ``i``.
        vocab_size: Number of rows in the table.
        embedding_dim: Length of each embedding vector.
    """

    def __init__(self, vocab_size=65, embedding_dim=65):
        self.table = tensor.random(shape=(vocab_size, embedding_dim), dtype=np.float32)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

    def __call__(self, target):
        """Look up the embedding vector for every token id in ``target``.

        The lookup is one vectorised fancy-indexing operation on the table,
        so the result has the shape of ``target.matrix`` with an extra
        trailing axis of length ``embedding_dim`` (e.g. ids of shape
        ``(batch, context)`` give ``(batch, context, embedding_dim)``).

        Args:
            target: ``tensor`` whose ``matrix`` holds integer token ids.

        Returns:
            A ``tensor`` holding the gathered rows, with ``self.table`` as its
            only child and ``_backward`` set to accumulate into
            ``self.table.grad``.

        Raises:
            IndexError: If any id is negative or not smaller than the number
                of table rows. NumPy would silently wrap negative ids around
                to the end of the table, which is never what an embedding
                lookup wants.
        """
        indices = np.asarray(target.matrix)
        num_rows = self.table.matrix.shape[0]
        if indices.size and (indices.min() < 0 or indices.max() >= num_rows):
            raise IndexError(
                f"token ids must be in [0, {num_rows}); "
                f"got min={indices.min()}, max={indices.max()}"
            )

        out_matrix = self.table.matrix[indices]  # Fancy indexing

        def _backward(grad):
            # Lazily create the gradient buffer the first time it is needed.
            if self.table.grad is None:
                self.table.grad = np.zeros_like(self.table.matrix, dtype=np.float32)
            # np.add.at performs unbuffered accumulation, so an id that occurs
            # several times in `indices` receives the sum of all its gradients
            # (plain `grad[idx] += ...` would keep only one contribution).
            np.add.at(self.table.grad, indices.ravel(), grad.reshape(-1, self.embedding_dim))

        out = tensor(out_matrix, _children=(self.table, ), _operation='Embedding_Lookup')
        out._backward = _backward

        return out

In [13]:
# 1) Fancy-indexing demo: a small table with 4 rows and 2 columns, standing in
#    for an embedding table with 4 tokens and 2-dimensional vectors.
table = tensor.random(shape=(4, 2), dtype=np.float32)
table

Tensor Values = [[-0.13355795 -0.02545915]
 [ 0.18224415  0.01131034]
 [ 0.06942736 -0.2055572 ]
 [-0.08420317  0.12346265]]

In [None]:
# A 3x3 grid of row indices into `table` (values 0..3).
target = tensor([[1, 0, 3],
                [1, 1, 2],
                [2, 3, 0]])

print(f"target.shape: {target.shape}")

# Indexing with an integer array returns one table row per index, so the
# result keeps the index array's shape and gains a trailing axis holding
# the row contents.
out_matrix = table.matrix[target.matrix] #Fancy indexing

print(f"out_matrix.shape: {out_matrix.shape}")
# Bare last expression so the notebook displays the gathered rows.
out_matrix

target.shape: (3, 3)
out_matrix.shape: (3, 3, 2)


array([[[ 0.18224415,  0.01131034],
        [-0.13355795, -0.02545915],
        [-0.08420317,  0.12346265]],

       [[ 0.18224415,  0.01131034],
        [ 0.18224415,  0.01131034],
        [ 0.06942736, -0.2055572 ]],

       [[ 0.06942736, -0.2055572 ],
        [-0.08420317,  0.12346265],
        [-0.13355795, -0.02545915]]], dtype=float32)

## Test: embedding lookup forward and backward pass

In [None]:
vocab_size = 10
embed_dim = 4

# 1. Init layer
emb = embeddings(vocab_size, embed_dim)

# 2. Token ids with shape (batch=2, context=3). Id 1 appears twice on purpose
#    so that gradient accumulation for repeated ids is exercised.
input_data = np.array([[1, 4, 1], [0, 9, 2]])
input_tensor = tensor(input_data)

# 3. Forward pass: every id must be replaced by its row of the table.
output = emb(input_tensor)
print("Output Shape:", output.matrix.shape)
assert output.matrix.shape == (2, 3, embed_dim)
np.testing.assert_allclose(output.matrix, emb.table.matrix[input_data])

# 4. Backward pass with a fake upstream gradient of 1.0 everywhere.
#    The shape is derived from the output instead of being hard-coded.
fake_grad = np.ones(output.matrix.shape, dtype=np.float32)
output._backward(fake_grad)

print("Gradient Shape:", emb.table.grad.shape)
assert emb.table.grad.shape == (vocab_size, embed_dim)

print("Gradient at Index 1:", emb.table.grad[1])
# With a gradient of 1.0 per occurrence, each row's gradient equals the
# number of times its id appears in the input (id 1 -> 2.0, unused ids -> 0.0).
expected_counts = np.bincount(input_data.ravel(), minlength=vocab_size)
np.testing.assert_allclose(
    emb.table.grad,
    np.repeat(expected_counts[:, None], embed_dim, axis=1),
)

Output Shape: (2, 3, 4)
Gradient Shape: (10, 4)
Gradient at Index 1 (should be 2.0): [2. 2. 2. 2.]
