# Assignment 2

## Task 1: Train Bigram Language Model (Neural Network Approach)

Let's continue with the Bigram Language Model from the lecture and finish the training loop.

### Importing Libraries

In [1]:
import os
import math
import torch
from torch.nn import functional as F
from dataclasses import dataclass
from src.utils import load_text, set_seed

### Configuration

In [2]:
@dataclass
class BigramConfig:
    root_dir: str = os.getcwd() + "/../../"
    dataset_path: str = "data/names.txt"

    # Tokenizer
    vocab_size: int = 0  # Set later

    seed: int = 101

### Reproducibility

In [3]:
set_seed(BigramConfig.seed)

Random seed set to 101


### Dataset

In [4]:
names = load_text(BigramConfig.root_dir + BigramConfig.dataset_path).splitlines()

Loaded text data from /mnt/c/Users/cheir/GitHub/LLM101/notebooks/Assignments/../../data/names.txt (length: 228145 characters).


### Tokenizer

In [5]:
chars = [chr(i) for i in range(97, 123)]  # all alphabet characters
chars.insert(0, ".")  # Add special token
BigramConfig.vocab_size = len(chars)
str2idx = {char: idx for idx, char in enumerate(chars)}
idx2str = {idx: char for char, idx in str2idx.items()}

### Model

In [6]:
# Initialize weights
W = torch.randn(BigramConfig.vocab_size, BigramConfig.vocab_size)
b = torch.randn(BigramConfig.vocab_size)

params = [W, b]
for param in params:
    param.requires_grad = True

### Training

In [7]:
# Set of Input, Target pairs
inputs, targets = [], []
for name in names:
    for char1, char2 in zip(name, name[1:]):
        input = str2idx[char1]
        target = str2idx[char2]
        inputs.append(input)
        targets.append(target)

# Convert to tensor
inputs = torch.tensor(inputs, dtype=torch.long)
targets = torch.tensor(targets, dtype=torch.long)

In [8]:
# One-hot encoding
# ------------------
# Write your implementation here.
inputs_encoded = F.one_hot(inputs, num_classes=BigramConfig.vocab_size)
targets_encoded = F.one_hot(targets, num_classes=BigramConfig.vocab_size)
# ------------------

# Convert data type to float
inputs_encoded = inputs_encoded.float()
targets_encoded = targets_encoded.float()

In [9]:
# Training Loop
steps = 100
lr = 0.01

for step in range(steps):
    # ------------------
    # Write your implementation here.
    # Forward pass
    logits = inputs_encoded @ W + b
    
    # loss
    loss = F.cross_entropy(logits, targets)
    
    # Backward pass
    for param in params:
        param.grad = None
    loss.backward()
    
    # Update weights
    for param in params:
        param.data = param.data - lr * param.grad
    # ------------------
    
    print(f"Step: {step+1}, Loss: {loss.item()}")

Step: 1, Loss: 4.384030818939209
Step: 2, Loss: 4.382874965667725
Step: 3, Loss: 4.381721496582031
Step: 4, Loss: 4.380569934844971
Step: 5, Loss: 4.37941837310791
Step: 6, Loss: 4.378269195556641
Step: 7, Loss: 4.3771209716796875
Step: 8, Loss: 4.375974178314209
Step: 9, Loss: 4.374828815460205
Step: 10, Loss: 4.373684883117676
Step: 11, Loss: 4.372542381286621
Step: 12, Loss: 4.371401309967041
Step: 13, Loss: 4.370262145996094
Step: 14, Loss: 4.369123458862305
Step: 15, Loss: 4.367987632751465
Step: 16, Loss: 4.366851806640625
Step: 17, Loss: 4.36571741104126
Step: 18, Loss: 4.364584922790527
Step: 19, Loss: 4.363454341888428
Step: 20, Loss: 4.362324237823486
Step: 21, Loss: 4.3611955642700195
Step: 22, Loss: 4.3600687980651855
Step: 23, Loss: 4.35894250869751
Step: 24, Loss: 4.357818603515625
Step: 25, Loss: 4.356695652008057
Step: 26, Loss: 4.355574131011963
Step: 27, Loss: 4.35445499420166
Step: 28, Loss: 4.353335857391357
Step: 29, Loss: 4.3522186279296875
Step: 30, Loss: 4.35110

### Inference

In [10]:
# Create a function to generate a name
def generate_name():
    new_name = []
    start_idx = str2idx["."]
    
    while True:
        # Forward pass
        logits = torch.matmul(inputs_encoded[start_idx], W) + b
        probs = F.softmax(logits, dim=0)
        
        # Sample
        next_idx = torch.multinomial(probs[start_idx], num_samples=1).item()
        
        # Decode
        new_char = idx2str[next_idx]
        new_name.append(new_char)
        
        # Update
        start_idx = next_idx
        
        if start_idx == str2idx["."]:
            break
            
    return ''.join(new_name)

# Generate 5 names
for _ in range(5):
    print(generate_name())

RuntimeError: prob_dist must be 1 or 2 dim

## Task 2: Mini-batch Training

In practice, datasets are too large to fit into the memory. Therefore, we use mini-batch training.

Implement mini-batch training for the Bigram Language Model.

In [None]:
# Create a function to generate mini-batches
def get_batches(xs, ys, batch_size):
    # ------------------
    # Write your implementation here.
    pass

    # ------------------

In [None]:
# Training Loop
steps = 100
lr = 0.01
batch_size = 64

for step in range(steps):
    # ------------------
    # Write your implementation here.
    pass
    # ------------------
    
    print(f"Step: {step}, Loss: {loss.item()}")

## Extra Credit

We have already made our own custom auto-grad Tensor class. Let's use it!

Train the Bigram Language Model using our custom auto-grad Tensor class.

**Do not use any built-in PyTorch functions.** (other deep learning libraries are also prohibited)

In [None]:
class Tensor:
    def __init__(self, data, _children=(), _operation=''):
        self.data = data
        self._prev = set(_children)
        self.gradient = 0
        self._backward = lambda: None

    def __repr__(self):
        return f"tensor=({self.data})"

    def __add__(self, other):  # self + other
        output = Tensor(self.data + other.data, (self, other), '+')
        def _backward():
            self.gradient = 1 * output.gradient
            other.gradient = 1 * output.gradient
        output._backward = _backward
        return output

    def __mul__(self, other):  # self * other
        output = Tensor(self.data * other.data, (self, other), '*')
        def _backward():
            self.gradient = other.data * output.gradient
            other.gradient = self.data * output.gradient
        output._backward = _backward
        return output

    def tanh(self):  # tanh(self)
        output = Tensor(math.tanh(self.data), (self,), 'tanh')
        def _backward():
            self.gradient = (1.0 - math.tanh(self.data) ** 2) * output.gradient
        output._backward = _backward
        return output

    def __pow__(self, power):  # self ** power
        assert isinstance(power, (int, float)), "Power must be an int or a float"
        output = Tensor(self.data ** power, (self,), f'**{power}')
        def _backward():
            self.gradient = power * (self.data ** (power - 1)) * output.gradient
        output._backward = _backward
        return output

    def backward(self):
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)
        self.gradient = 1
        for node in reversed(topo):
            node._backward()

    def __neg__(self): # -self
        return self * Tensor(-1.0)

    def __sub__(self, other): # self - other
        return self + (-other)

In [None]:
# ------------------
# Write your implementation here.

# ------------------