# Lecture 1: Bigram Language Model

## Importing Libraries

In [11]:
import os
import torch
import torch.nn as nn
from torch.nn import functional as F
from dataclasses import dataclass
from src.utils import load_text, set_seed, configure_device
from src.tokenizer import CharTokenizer
from src.train import split_text, TextDataset, setup_optimizer, setup_scheduler, train_epoch, evaluate

## Configuration

In [9]:
@dataclass
class BigramConfig:
    """Settings shared by the cells of the bigram lecture notebook.

    Every field has a default, so ``BigramConfig()`` can be built without
    arguments. ``root_dir`` is computed once, when the class body is executed,
    from the working directory at that moment.
    """

    # Paths: ``root_dir`` ends with a separator because the notebook builds the
    # dataset location by concatenating ``root_dir + dataset_path``.
    root_dir: str = f"{os.getcwd()}/../../"
    dataset_path: str = "data/raw/names.txt"

    # Defaults to CPU; the notebook's "Device" cell overwrites the class
    # attribute with the result of configure_device().
    device: torch.device = torch.device("cpu")

    # Tokenizer
    tokenizer: str = "char"
    vocab_size: int = 256

    # Seed handed to set_seed() in the "Reproducibility" cell.
    seed: int = 101

## Reproducibility

In [None]:
# Seed with the configured value (BigramConfig.seed) before any random
# initialisation or sampling below. Which RNGs are seeded is decided by
# src.utils.set_seed, which is not visible in this notebook.
set_seed(BigramConfig.seed)

## Device

In [None]:
# Replace the CPU default with whatever configure_device() selects.
# NOTE(review): this rebinds the *class* attribute; instances created with
# BigramConfig() presumably keep the default captured when the dataclass was
# defined (CPU) — confirm that only BigramConfig.device is read downstream.
BigramConfig.device = configure_device()

## Tokenizer

In [None]:
# Initialize the character-level tokenizer (BigramConfig.tokenizer is "char").
# NOTE(review): BigramConfig.vocab_size is fixed at 256; whether that matches
# the vocabulary CharTokenizer produces cannot be verified from this notebook.
tokenizer = CharTokenizer()



## Model

In [None]:
class Bigram(nn.Module):
    """Bigram language model backed by a single learnable lookup table.

    Row ``i`` of the ``(vocab_size, vocab_size)`` table holds the unnormalised
    scores (logits) for the token that follows token ``i``; the forward pass
    is a plain row lookup.
    """

    def __init__(self, config: "BigramConfig"):
        """Create the randomly initialised logit table.

        Args:
            config: Object exposing ``vocab_size``; no other attribute is read.
        """
        super().__init__()
        self.vocab_size = config.vocab_size
        # Despite its name, ``probs`` stores raw logits: softmax is applied
        # later (inside F.cross_entropy in ``loss`` and explicitly in
        # ``generate``). The attribute name is kept so existing state_dict
        # keys remain valid.
        self.probs = nn.Parameter(torch.randn(config.vocab_size, config.vocab_size))

    def forward(self, x):
        """Look up next-token logits for every token id in ``x``.

        Args:
            x: Integer tensor of token ids; the notebook passes
                ``(batch_size, 1)``.

        Returns:
            Tensor of shape ``x.shape + (vocab_size,)``.
        """
        return self.probs[x]

    def loss(self, logits, target):
        """Cross-entropy between predicted logits and target token ids.

        Args:
            logits: Tensor whose last dimension is ``vocab_size``.
            target: Integer tensor holding one token id per logits row.

        Returns:
            Scalar tensor (``F.cross_entropy`` with its default reduction).
        """
        # reshape (unlike view) also accepts non-contiguous inputs.
        flat_logits = logits.reshape(-1, self.vocab_size)  # (N, vocab_size)
        flat_target = target.reshape(-1)  # (N,)
        return F.cross_entropy(flat_logits, flat_target)

    @torch.no_grad()
    def generate(self, tokenizer, prompt, max_new_tokens, device, temperature=1.0):
        """Print ``prompt``, then stream ``max_new_tokens`` generated tokens to stdout.

        Each new token is predicted from the previous token only. The module is
        switched to eval mode while generating and its previous train/eval mode
        is restored afterwards, even if generation fails.

        Args:
            tokenizer: Object with ``encode(str)`` returning a tensor of token
                ids (expected 1-D) and ``decode(list_of_ids)`` returning a str.
            prompt: Text to start from; its last token seeds the generation.
            max_new_tokens: Number of tokens to generate.
            device: Device the encoded prompt is moved to.
            temperature: Value in ``[0.0, 1.0]``. Logits are divided by it
                before sampling; ``0.0`` selects the highest-scoring token
                (greedy decoding) instead of dividing by zero.

        Raises:
            ValueError: If ``temperature`` is outside ``[0.0, 1.0]`` (NaN
                included), or if tokens are requested but the prompt encodes
                to zero tokens.
        """
        # Written as a chained comparison so NaN is rejected as well.
        if not 0.0 <= temperature <= 1.0:
            raise ValueError("temperature must be between 0.0 and 1.0")

        was_training = self.training
        self.eval()
        try:
            print(prompt)

            # Encode
            x = tokenizer.encode(prompt).to(device).unsqueeze(0)  # (batch_size=1, prompt_size)
            if max_new_tokens > 0 and x.shape[-1] == 0:
                raise ValueError("prompt must encode to at least one token")

            # Generation loop
            for _ in range(max_new_tokens):
                # A bigram model only conditions on the most recent token.
                context = x[:, -1:]  # (batch_size=1, 1)

                # Forward (through __call__ so registered hooks still run)
                logits = self(context)[:, -1, :]  # (batch_size=1, vocab_size)

                if temperature == 0.0:
                    # Greedy pick: the limit of sampling as temperature -> 0.
                    next_token = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size=1, 1)
                else:
                    probs = F.softmax(logits / temperature, dim=-1)
                    next_token = torch.multinomial(probs, num_samples=1)  # (batch_size=1, 1)

                # Concatenate
                x = torch.cat([x, next_token], dim=-1)  # (batch_size=1, seq_len + 1)

                # Decode and stream the new token immediately
                text = tokenizer.decode([next_token[0].item()])
                print(text, end='', flush=True)
        finally:
            # Put the module back in the mode the caller left it in.
            self.train(was_training)

In [None]:
# Build the model from a default BigramConfig() instance (Bigram only reads
# vocab_size from it) and move its parameters to the device stored on the
# BigramConfig class attribute.
model = Bigram(BigramConfig()).to(BigramConfig.device)

## Dataset

In [10]:
# Join the pieces with os.path.join instead of string concatenation so the
# path stays valid even if root_dir is changed to a value without a trailing
# separator. With the current defaults the resulting string is unchanged.
names_text = load_text(os.path.join(BigramConfig.root_dir, BigramConfig.dataset_path))

Loaded text data from /Users/pathfinder/Documents/GitHub/LLM101/notebooks/Lectures/../../data/raw/names.txt (length: 228145 characters).


## Preprocessing

## Generation