# Assignment 2

## Task 1: Train Bigram Language Model (Machine Learning Approach)

### Importing Libraries

In [None]:
import os
import math
import torch
from torch.nn import functional as F
from dataclasses import dataclass
from src.utils import load_text, set_seed

### Configuration

In [None]:
@dataclass
class BigramConfig:
    """Settings shared by the bigram language-model cells of this notebook."""

    # Project root, built from the working directory at class-definition time.
    # The trailing slash matters: the dataset path is appended by plain
    # string concatenation.
    root_dir: str = f"{os.getcwd()}/../../"
    # Location of the names dataset, relative to ``root_dir``.
    dataset_path: str = "data/raw/names.txt"

    # Tokenizer vocabulary size; 0 is a placeholder that is overwritten once
    # the character vocabulary has been built.
    vocab_size: int = 0

    # Seed handed to ``set_seed`` for reproducibility.
    seed: int = 101

### Reproducibility

In [None]:
# Apply the configured seed through the project's set_seed helper
# (presumably seeds the random number generators used below — see src.utils).
set_seed(BigramConfig.seed)

### Dataset

In [None]:
# Read the raw dataset text and split it into one entry per line.
names = load_text(
    f"{BigramConfig.root_dir}{BigramConfig.dataset_path}"
).splitlines()

### Tokenizer

In [None]:
# Vocabulary: the special "." token at index 0, followed by the lowercase
# letters a-z.
chars = ["."] + [chr(code) for code in range(ord("a"), ord("z") + 1)]
BigramConfig.vocab_size = len(chars)

# Lookup tables between characters and their integer ids.
str2idx = dict(zip(chars, range(len(chars))))
idx2str = dict(enumerate(chars))

### Model

In [None]:
# Randomly initialised model parameters, drawn from a standard normal
# distribution: W is a (vocab_size, vocab_size) matrix and b a (vocab_size,)
# vector — presumably the weights and bias of a single linear layer that maps
# a one-hot character to next-character logits (confirm against the training
# loop).
# NOTE(review): both tensors are created without requires_grad, so autograd
# does not track them unless that is enabled or gradients are computed by hand.
W = torch.randn(BigramConfig.vocab_size, BigramConfig.vocab_size)
b = torch.randn(BigramConfig.vocab_size)

### Training

In [None]:
# Build the (input, target) character-id pairs used for training.
# Every name is wrapped in the special "." token so the data also contains
# the "start -> first letter" and "last letter -> end" transitions. Without
# those boundary pairs the "." entry of the vocabulary is never trained and a
# sampler has no way to begin or terminate a name.
xs, ys = [], []

for name in names:
    if not name:
        # Blank lines would only contribute a meaningless "." -> "." pair.
        continue
    symbols = f".{name}."
    for char1, char2 in zip(symbols, symbols[1:]):
        xs.append(str2idx[char1])
        ys.append(str2idx[char2])

# Integer class indices, stored as int64 tensors.
xs = torch.tensor(xs, dtype=torch.long)
ys = torch.tensor(ys, dtype=torch.long)

In [None]:
# One-hot encoding
# ------------------
# Write your implementation here.

# ------------------

In [None]:
# Training Loop
steps = 10  # number of iterations of the loop below
lr = 0.01  # not read anywhere yet; presumably the step size for the weight update (confirm)
for step in range(steps):
    # ------------------
    # Write your implementation here.
    # Forward pass
    # Calculate loss
    # Backward pass
    # Update weights
    # ------------------
    pass  # placeholder: each iteration currently does nothing

### Inference

In [None]:
# Create a function to generate a name

def generate_name():
    """Intended to sample one name from the bigram model.

    Not implemented yet: the body is a placeholder, so every call currently
    returns None.
    """
    # ------------------
    # Write your implementation here.
    pass
    # ------------------

# Generate 5 names
# NOTE: until generate_name is implemented this prints "None" five times.
for _ in range(5):
    print(generate_name())

## Task 2: Mini-batch Training

In practice, datasets are often too large to fit in memory all at once. Therefore, we train on mini-batches: small subsets of the data processed one at a time.

Implement mini-batch training for the Bigram Language Model.

In [None]:
# Create a function to generate mini-batches
def get_batches(xs, ys, batch_size):
    """Intended to split the training pairs into mini-batches.

    Not implemented yet: the body is a placeholder, so the call currently
    returns None.

    Args:
        xs: Model inputs — presumably the tensor of input character ids built
            in the training cell (confirm).
        ys: Targets aligned with ``xs`` — presumably the next-character ids
            (confirm).
        batch_size: Presumably the number of pairs per batch (confirm).
    """
    # ------------------
    # Write your implementation here.
    pass

    # ------------------

## Extra Credit

We have already made our own custom auto-grad Tensor class. Let's use it!

Train the Bigram Language Model using our custom auto-grad Tensor class.

**Do not use any built-in PyTorch functions.** (other deep learning libraries are also prohibited)

In [None]:
class Tensor:
    """A value that remembers how it was computed so gradients can flow back.

    Each arithmetic operation returns a new ``Tensor`` that keeps references
    to its operands and a closure that applies the chain rule for that
    operation. Calling :meth:`backward` on a result walks this graph and
    accumulates d(result)/d(node) into every node's ``gradient``.

    Plain numbers are accepted wherever another ``Tensor`` operand is
    expected (``t + 1``, ``2 * t``, ``1 - t``); they are wrapped on the fly.

    Attributes:
        data: The wrapped value.
        gradient: Accumulated gradient; starts at 0 and is only ever added
            to, so repeated ``backward`` calls accumulate.
        label: Optional name of the tensor.
    """

    def __init__(self, data, _children=(), _operation='', label=''):
        self.data = data
        # Tensors that lead to this tensor (ex: 2 * 3 = 6, 2 and 3 are children of 6).
        self._prev = set(_children)
        # Operation that lead to this tensor (ex: 2 * 3 = 6, * is the operation).
        self._operation = _operation
        self.label = label
        self.gradient = 0
        # Leaf tensors have nothing to propagate.
        self._backward = lambda: None

    @staticmethod
    def _wrap(value):
        """Return ``value`` unchanged if it is a Tensor, else wrap it in one."""
        return value if isinstance(value, Tensor) else Tensor(value)

    def __repr__(self):
        """Return a short printable form showing only the wrapped data."""
        return f"data=({self.data})"

    def __add__(self, other):
        """Return ``self + other``; ``other`` may be a Tensor or a number."""
        other = Tensor._wrap(other)
        output = Tensor(self.data + other.data, (self, other), '+')

        def _backward():
            # d(self + other)/dself = 1 and d(self + other)/dother = 1.
            self.gradient += 1 * output.gradient
            other.gradient += 1 * output.gradient

        output._backward = _backward
        return output

    def __radd__(self, other):
        """Return ``other + self`` when the left operand is a plain number."""
        return self + other

    def __mul__(self, other):
        """Return ``self * other``; ``other`` may be a Tensor or a number."""
        other = Tensor._wrap(other)
        output = Tensor(self.data * other.data, (self, other), '*')

        def _backward():
            # d(self * other)/dself = other and d(self * other)/dother = self.
            self.gradient += other.data * output.gradient
            other.gradient += self.data * output.gradient

        output._backward = _backward
        return output

    def __rmul__(self, other):
        """Return ``other * self`` when the left operand is a plain number."""
        return self * other

    def tanh(self):
        """Return ``tanh(self)`` (activation function)."""
        activated = math.tanh(self.data)
        output = Tensor(activated, (self,), 'tanh')

        def _backward():
            # d(tanh(x))/dx = 1 - tanh(x)^2. The forward value is reused so
            # the derivative matches the value that was actually produced,
            # even if ``self.data`` is modified before backward runs.
            self.gradient += (1.0 - activated ** 2) * output.gradient

        output._backward = _backward
        return output

    def __pow__(self, power):  # self ** power
        """Return ``self ** power`` for a constant int or float ``power``."""
        assert isinstance(power, (int, float)), "Power must be an int or a float"
        output = Tensor(self.data ** power, (self,), f'**{power}')

        def _backward():
            # d(x^p)/dx = p * x^(p-1)
            self.gradient += power * (self.data ** (power - 1)) * output.gradient

        output._backward = _backward
        return output

    def backward(self):
        """Back-propagate from this tensor through everything it depends on.

        Sets this tensor's ``gradient`` to 1 and then runs each node's local
        backward step in reverse topological order, so a node's gradient is
        complete before it is pushed on to its operands.
        """
        # Iterative post-order depth-first search. A recursive walk overflows
        # Python's call stack on long chains (for example a loss accumulated
        # over thousands of terms).
        topo = []
        visited = set()
        stack = [(self, False)]
        while stack:
            node, children_done = stack.pop()
            if children_done:
                topo.append(node)
                continue
            if node in visited:
                continue
            visited.add(node)
            # Revisit the node once all of its operands have been emitted.
            stack.append((node, True))
            for child in node._prev:
                if child not in visited:
                    stack.append((child, False))

        self.gradient = 1
        for node in reversed(topo):
            node._backward()

    def __neg__(self):  # -self
        """Return ``-self``, expressed as a multiplication by -1.0."""
        return self * Tensor(-1.0)

    def __sub__(self, other):  # self - other
        """Return ``self - other``; ``other`` may be a Tensor or a number."""
        return self + (-Tensor._wrap(other))

    def __rsub__(self, other):  # other - self
        """Return ``other - self`` when the left operand is a plain number."""
        return Tensor._wrap(other) - self