In [20]:
import math
import re
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


# Tokenizer


 A tokenizer is a fundamental component in natural language processing that converts raw text into a sequence of tokens that can be processed by machine learning models. In the context of autoregressive language models, tokenization serves as the bridge between human-readable text and the numerical representations that neural networks can understand.

 ## Key Components of a Tokenizer:
 
 1. **Vocabulary Building**: The process of creating a mapping between tokens (words, subwords, or characters) and unique numerical indices
 2. **Text Preprocessing**: Cleaning and normalizing input text (lowercasing, handling punctuation, etc.)
 3. **Tokenization Strategy**: Deciding how to split text into meaningful units
 4. **Special Tokens**: Reserved tokens for specific purposes like unknown words (`<UNK>`), padding (`<PAD>`), start of sequence (`<SOS>`), and end of sequence (`<EOS>`)

## Tokenization Approaches:

- **Word-level**: Split text by whitespace and punctuation
- **Subword-level**: Use algorithms like BPE (Byte Pair Encoding) or WordPiece
- **Character-level**: Treat each character as a token

The choice of tokenization strategy affects model performance, vocabulary size, and the ability to handle out-of-vocabulary words. Our simple tokenizer implementation uses word-level tokenization with special token handling.



In [None]:


class SimpleTokenizer:
    def __init__(self):
        self.vocab = {}
        self.idx_to_token = {}
        self.vocab_size = 0
        
    def build_vocab(self, text, min_freq=1):
        """Construye el vocabulario a partir del texto"""
        # Limpiar y tokenizar el texto
        tokens = self._tokenize(text)
        
        # Contar frecuencias
        token_counts = Counter(tokens)
        
        # Agregar tokens especiales
        special_tokens = ['<UNK>', '<PAD>', '<SOS>', '<EOS>']
        
        # Crear vocabulario
        self.vocab = {}
        self.idx_to_token = {}
        
        # Agregar tokens especiales primero
        for i, token in enumerate(special_tokens):
            self.vocab[token] = i
            self.idx_to_token[i] = token
        
        # Agregar tokens que cumplen con la frecuencia mínima
        idx = len(special_tokens)
        for token, count in token_counts.items():
            if count >= min_freq:
                self.vocab[token] = idx
                self.idx_to_token[idx] = token
                idx += 1
                
        self.vocab_size = len(self.vocab)
        print(f"Vocabulario creado con {self.vocab_size} tokens")
        
    def _tokenize(self, text):
        """Tokeniza el texto en palabras"""
        # Convertir a minúsculas y dividir por espacios y puntuación
        text = text.lower()
        # Separar puntuación de las palabras
        text = re.sub(r'([.!?,:;])', r' \1 ', text)
        tokens = text.split()
        return tokens
    
    def encode(self, text):
        """Convierte texto a índices"""
        tokens = self._tokenize(text)
        indices = []
        for token in tokens:
            if token in self.vocab:
                indices.append(self.vocab[token])
            else:
                indices.append(self.vocab['<UNK>'])
        return indices
    
    def decode(self, indices):
        """Convierte índices a texto"""
        tokens = []
        for idx in indices:
            if idx in self.idx_to_token:
                tokens.append(self.idx_to_token[idx])
        return ' '.join(tokens)
    
    def get_vocab_size(self):
        return self.vocab_size




Vocabulario creado con 915 tokens

Texto original: En un lugar de la Mancha, de cuyo nombre no quiero acordarme
Tokens codificados: [21, 22, 23, 9, 10, 19, 24, 9, 25, 26, 27, 28, 0]
Texto decodificado: en un lugar de la mancha , de cuyo nombre no quiero <UNK>
Tamaño del vocabulario: 915


# Tokenizer

In [22]:
with open('../Data/cervantes_2.txt', 'r', encoding='utf-8') as f:
    texto_cervantes = f.read()

# Crear y entrenar el tokenizador
tokenizer = SimpleTokenizer()
tokenizer.build_vocab(texto_cervantes, min_freq=2)

# Ejemplo de tokenización
texto_ejemplo = "En un lugar de la Mancha, de cuyo nombre no quiero acordarme"
tokens_encoded = tokenizer.encode(texto_ejemplo)
texto_decoded = tokenizer.decode(tokens_encoded)

print(f"\nTexto original: {texto_ejemplo}")
print(f"Tokens codificados: {tokens_encoded}")
print(f"Texto decodificado: {texto_decoded}")
print(f"Tamaño del vocabulario: {tokenizer.get_vocab_size()}")

Vocabulario creado con 915 tokens

Texto original: En un lugar de la Mancha, de cuyo nombre no quiero acordarme
Tokens codificados: [21, 22, 23, 9, 10, 19, 24, 9, 25, 26, 27, 28, 0]
Texto decodificado: en un lugar de la mancha , de cuyo nombre no quiero <UNK>
Tamaño del vocabulario: 915


In [23]:
tokenizer.vocab

{'<UNK>': 0,
 '<PAD>': 1,
 '<SOS>': 2,
 '<EOS>': 3,
 'primera': 4,
 'parte': 5,
 'capítulo': 6,
 'que': 7,
 'trata': 8,
 'de': 9,
 'la': 10,
 'condición': 11,
 'y': 12,
 'ejercicio': 13,
 'del': 14,
 'famoso': 15,
 'hidalgo': 16,
 'don': 17,
 'quijote': 18,
 'mancha': 19,
 '.': 20,
 'en': 21,
 'un': 22,
 'lugar': 23,
 ',': 24,
 'cuyo': 25,
 'nombre': 26,
 'no': 27,
 'quiero': 28,
 'ha': 29,
 'mucho': 30,
 'tiempo': 31,
 'vivía': 32,
 'los': 33,
 'lanza': 34,
 'adarga': 35,
 'rocín': 36,
 'una': 37,
 'algo': 38,
 'más': 39,
 'vaca': 40,
 'las': 41,
 'noches': 42,
 'viernes': 43,
 'algún': 44,
 'añadidura': 45,
 'tres': 46,
 'partes': 47,
 'su': 48,
 'hacienda': 49,
 'el': 50,
 'della': 51,
 'para': 52,
 'con': 53,
 'sus': 54,
 'lo': 55,
 'mesmo': 56,
 'días': 57,
 'se': 58,
 'honraba': 59,
 'tenía': 60,
 'casa': 61,
 'ama': 62,
 'pasaba': 63,
 'sobrina': 64,
 'llegaba': 65,
 'a': 66,
 'campo': 67,
 'así': 68,
 'como': 69,
 'edad': 70,
 'nuestro': 71,
 'años': 72,
 'era': 73,
 'seco': 74

In [24]:
encoded_text = tokenizer.encode(texto_cervantes) 

# Next token dataset

In [34]:
class NextTokenDataset(Dataset):
    def __init__(self, tokens, context_size=5):
        self.tokens = tokens
        self.context_size = context_size

    def __len__(self):
        return len(self.tokens) - self.context_size

    def __getitem__(self, idx):
        x = self.tokens[idx: idx + self.context_size]
        y = self.tokens[idx + self.context_size]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

## Autoregressive Models

Autoregressive models form a fundamental class of probabilistic models that predict future values in a sequence based on previous observations. The core principle lies in modeling the conditional probability distribution of each element given its predecessors.

Mathematically, for a sequence $x_1, x_2, \ldots, x_T$, an autoregressive model decomposes the joint probability using the chain rule:

$$P(x_1, x_2, \ldots, x_T) = \prod_{t=1}^{T} P(x_t | x_1, x_2, \ldots, x_{t-1})$$

In practice, we often limit the dependency to a finite context window of size $k$, leading to:

$$P(x_t | x_1, \ldots, x_{t-1}) \approx P(x_t | x_{t-k}, \ldots, x_{t-1})$$

This approximation makes the model computationally tractable while maintaining the sequential dependency structure. For language modeling, each $x_t$ represents a token (word, subword, or character), and the model learns to estimate the probability distribution over the vocabulary given the preceding context.

The autoregressive formulation naturally handles variable-length sequences and provides a principled approach to text generation through sequential sampling. During inference, we can generate new sequences by iteratively sampling from the predicted distributions and appending the sampled tokens to the context for subsequent predictions.

## Autoregressive Models with Finite Memory

Autoregressive models with finite memory represent a practical approximation where we limit the contextual dependency to a fixed window of previous tokens. Instead of considering the entire preceding sequence, these models use only the last $k$ tokens to predict the next element.

For a context of size $k$, the model learns the function:

$$P(x_t | x_{t-k}, x_{t-k+1}, \ldots, x_{t-1})$$

 ### MLP Implementation
 
 A direct approach to implement this architecture consists of using a Multi-Layer Perceptron (MLP) that takes a fixed context window and predicts the next token. The implementation follows these key components:
 
 1. **Embeddings**: Converts each input token into a dense vector of fixed dimension using `nn.Embedding`
 2. **Concatenation**: Flattens the context embeddings into a single vector by reshaping from `(batch, context_size, embed_dim)` to `(batch, context_size * embed_dim)`
 3. **Dense layers**: Processes the concatenated representation through linear transformations with ReLU activations
 4. **Output**: Produces logits over the entire vocabulary for the next token prediction
 
 The `NextTokenDataset` class handles the data preparation by creating input-target pairs where:
 - Input `x`: A sequence of `context_size` tokens 
 - Target `y`: The next token following the context window
 
 This architecture is computationally efficient and allows for fast training, although it lacks the ability to model long-range dependencies that more sophisticated architectures like Transformers possess. The finite memory constraint makes it particularly suitable for tasks where local context is sufficient for accurate predictions.

In [35]:




# tokens = [ya tienes la lista de enteros de tu corpus]
context_size = 2
dataset = NextTokenDataset(encoded_text, context_size)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


In [36]:
import torch.nn as nn
import torch.nn.functional as F

class SimpleAutoregressiveModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=64, context_size=5, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc1 = nn.Linear(embed_dim * context_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        # x: (batch, context_size)
        embeds = self.embedding(x)   # (batch, context_size, embed_dim)
        flat = embeds.view(embeds.size(0), -1)  # flatten
        h = F.relu(self.fc1(flat))
        out = self.fc2(h)
        return out


In [37]:
vocab_size = max(encoded_text) + 1  
model = SimpleAutoregressiveModel(vocab_size, embed_dim=64, context_size=context_size)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(5):
    total_loss = 0
    for x, y in dataloader:
        optimizer.zero_grad()
        logits = model(x)
        loss = loss_fn(logits, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")


Epoch 1, Loss: 5.0309
Epoch 2, Loss: 4.2153
Epoch 3, Loss: 3.8170
Epoch 4, Loss: 3.4639
Epoch 5, Loss: 3.1544


In [38]:
import random

def generate(model, seed_tokens, length=20):
    model.eval()
    context = seed_tokens[-context_size:]  # mantener tamaño de contexto
    generated = seed_tokens[:]
    
    for _ in range(length):
        x = torch.tensor([context], dtype=torch.long)
        with torch.no_grad():
            logits = model(x)
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).item()
        generated.append(next_token)
        context = generated[-context_size:]
    return generated


In [39]:
base_text = "En un lugar de la Mancha"
encoded_base_text = tokenizer.encode(base_text)
encoded_base_text 

[21, 22, 23, 9, 10, 19]

In [40]:
            
def generate(model, seed_tokens, length=20):
    model.eval()
    context = seed_tokens[-context_size:]
    generated = seed_tokens[:]
    
    for _ in range(length):
        x = torch.tensor([context], dtype=torch.long)
        with torch.no_grad():
            logits = model(x)
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).item()
        generated.append(next_token)
        context = generated[-context_size:]
    return generated


In [41]:
generated_text = generate(model, encoded_base_text, length=20)

In [42]:
print(generated_text)

[21, 22, 23, 9, 10, 19, 7, 24, 494, 84, 7, 66, 678, 9, 12, 243, 126, 33, 21, 102, 819, 50, 883, 24, 190, 7]


In [43]:
tokenizer.decode(generated_text)

'en un lugar de la mancha que , dejando esto que a trueco de y dos todos los en verdad -dijo el cura- , sin que'

## Recurrent Neural Networks (RNNs) for Autoregressive Modeling

Recurrent Neural Networks represent a fundamental architecture for sequential data modeling, particularly well-suited for autoregressive language modeling tasks. Unlike feedforward networks that process fixed-size contexts, RNNs maintain an internal hidden state that theoretically allows them to capture dependencies of arbitrary length in sequences.

### RNN Architecture for Language Modeling

In the context of autoregressive modeling, RNNs process sequences token by token, updating their hidden state at each time step. The hidden state serves as a compressed representation of all previously seen tokens, enabling the model to make predictions based on the entire sequence history rather than just a fixed context window.

The fundamental equations governing an RNN are:
 - $h_t = \tanh(W_{hh} \cdot h_{t-1} + W_{xh} \cdot x_t + b_h)$
  - $y_t = W_{hy} \cdot h_t + b_y$

Where h_t represents the hidden state at time t, x_t is the input token embedding, and y_t is the output logits for the vocabulary distribution.

### Advantages in Autoregressive Settings

RNNs offer several benefits for autoregressive language modeling:
1. **Variable-length context**: Unlike n-gram models or fixed-context transformers, RNNs can theoretically capture dependencies from the beginning of a sequence
2. **Parameter efficiency**: The same set of parameters is reused at each time step, making RNNs memory-efficient compared to architectures that explicitly model all pairwise interactions
3. **Sequential processing**: The inherently sequential nature of RNNs aligns well with the left-to-right generation process in autoregressive modeling

### Limitations and Practical Considerations

Despite their theoretical advantages, vanilla RNNs face significant challenges in practice:
- **Vanishing gradients**: Long-range dependencies become difficult to learn due to exponential decay of gradients through time
- **Sequential computation**: Unlike transformer architectures, RNNs cannot be easily parallelized during training
- **Limited context retention**: In practice, RNNs struggle to maintain information over very long sequences

These limitations led to the development of more sophisticated variants like LSTMs and GRUs, and eventually to the adoption of attention-based architectures for many language modeling applications.

In [44]:

class RNNLanguageModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        # x: (batch, context_size)
        embeds = self.embedding(x)  # (batch, context_size, embed_dim)
        out, hidden = self.rnn(embeds, hidden)  # out: (batch, context_size, hidden_dim)
        out = self.fc(out[:, -1, :])  # usamos solo la última salida
        return out, hidden


In [45]:
context_size = 10
dataset = NextTokenDataset(encoded_text, context_size)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

vocab_size = max(encoded_text) + 1
model = RNNLanguageModel(vocab_size, embed_dim=64, hidden_dim=128)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(100):
    total_loss = 0
    for x, y in dataloader:
        optimizer.zero_grad()
        logits, hidden = model(x, hidden=None)
        loss = loss_fn(logits, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")


Epoch 1, Loss: 5.0181
Epoch 2, Loss: 4.4617
Epoch 3, Loss: 4.1616
Epoch 4, Loss: 3.9130
Epoch 5, Loss: 3.6853
Epoch 6, Loss: 3.4756
Epoch 7, Loss: 3.2708
Epoch 8, Loss: 3.0828
Epoch 9, Loss: 2.8962
Epoch 10, Loss: 2.7176
Epoch 11, Loss: 2.5450
Epoch 12, Loss: 2.3800
Epoch 13, Loss: 2.2220
Epoch 14, Loss: 2.0697
Epoch 15, Loss: 1.9213
Epoch 16, Loss: 1.7826
Epoch 17, Loss: 1.6502
Epoch 18, Loss: 1.5247
Epoch 19, Loss: 1.4063
Epoch 20, Loss: 1.2897
Epoch 21, Loss: 1.1828
Epoch 22, Loss: 1.0859
Epoch 23, Loss: 0.9882
Epoch 24, Loss: 0.9035
Epoch 25, Loss: 0.8191
Epoch 26, Loss: 0.7398
Epoch 27, Loss: 0.6683
Epoch 28, Loss: 0.6066
Epoch 29, Loss: 0.5454
Epoch 30, Loss: 0.4937
Epoch 31, Loss: 0.4479
Epoch 32, Loss: 0.4059
Epoch 33, Loss: 0.3667
Epoch 34, Loss: 0.3321
Epoch 35, Loss: 0.2954
Epoch 36, Loss: 0.2681
Epoch 37, Loss: 0.2534
Epoch 38, Loss: 0.2240
Epoch 39, Loss: 0.2030
Epoch 40, Loss: 0.2178
Epoch 41, Loss: 0.1926
Epoch 42, Loss: 0.1719
Epoch 43, Loss: 0.1487
Epoch 44, Loss: 0.12

In [46]:
def generate(model, seed_tokens, length=20):
    model.eval()
    context = seed_tokens[-context_size:]
    generated = seed_tokens[:]
    hidden = None
    
    for _ in range(length):
        x = torch.tensor([context], dtype=torch.long)
        with torch.no_grad():
            logits, hidden = model(x, hidden)
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).item()
        generated.append(next_token)
        context = generated[-context_size:]
    return generated


In [47]:
generated_text = generate(model, encoded_base_text, length=20)
generated_text 
tokenizer.decode(generated_text)


'en un lugar de la mancha , de cuyo nombre no quiero <UNK> , rocín <UNK> a las armas tan <UNK> , y a decir que'

## LSTM Networks in Autoregressive Models

Long Short-Term Memory (LSTM) networks represent a significant advancement in recurrent neural network architectures, specifically designed to address the vanishing gradient problem that plagues traditional RNNs. In the context of autoregressive modeling, LSTMs provide a more sophisticated mechanism for capturing long-range dependencies in sequential data.

### Mathematical Foundation

The LSTM architecture introduces three gating mechanisms that control information flow through the cell state:

**Forget Gate**: Determines what information to discard from the cell state
$$f_t = \sigma(W_f \cdot [h_{t-1}, x_t] + b_f)$$

**Input Gate**: Controls which values to update in the cell state
$$i_t = \sigma(W_i \cdot [h_{t-1}, x_t] + b_i)$$
$$\tilde{C}_t = \tanh(W_C \cdot [h_{t-1}, x_t] + b_C)$$

**Output Gate**: Determines what parts of the cell state to output
$$o_t = \sigma(W_o \cdot [h_{t-1}, x_t] + b_o)$$

The cell state and hidden state updates follow:
$$C_t = f_t * C_{t-1} + i_t * \tilde{C}_t$$
$$h_t = o_t * \tanh(C_t)$$

Where $\sigma$ represents the sigmoid function, $W$ are weight matrices, $b$ are bias vectors, and $*$ denotes element-wise multiplication.

### Advantages in Autoregressive Modeling

1. **Long-term Dependencies**: The cell state acts as a "memory highway" that can preserve information across many time steps, enabling the model to capture dependencies spanning longer sequences than traditional RNNs.

2. **Gradient Flow**: The additive nature of cell state updates helps mitigate the vanishing gradient problem, allowing for more stable training on longer sequences.

3. **Selective Memory**: The gating mechanisms provide the model with learnable control over what information to remember, forget, or output, making it particularly effective for language modeling tasks.

4. **Contextual Understanding**: In autoregressive text generation, LSTMs can maintain context about previously generated tokens while selectively incorporating new information.

### Limitations and Drawbacks

1. **Computational Complexity**: LSTMs are significantly more computationally expensive than simple RNNs due to the multiple gate operations and matrix multiplications required at each time step.

2. **Sequential Processing**: The inherently sequential nature of LSTMs prevents parallelization across time steps during training, limiting scalability compared to attention-based models.

3. **Memory Bottleneck**: Despite improvements over vanilla RNNs, LSTMs still compress all historical information into fixed-size hidden and cell states, potentially losing important details in very long sequences.

4. **Training Stability**: While more stable than RNNs, LSTMs can still suffer from exploding gradients and require careful hyperparameter tuning for optimal performance.

5. **Limited Bidirectional Context**: In autoregressive settings, LSTMs can only access past context, unlike attention mechanisms that can theoretically access all positions simultaneously.

### Application in Language Modeling

In autoregressive language modeling, LSTMs process text sequentially, using their internal memory to maintain context about previously seen tokens. The model predicts the next token based on the current hidden state, which encodes information from the entire sequence history. This makes LSTMs particularly suitable for tasks requiring understanding of temporal dependencies and sequential patterns in text.

In [48]:

class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        # x: (batch, context_size)
        embeds = self.embedding(x)  # (batch, context_size, embed_dim)
        out, hidden = self.lstm(embeds, hidden)  # out: (batch, context_size, hidden_dim)
        out = self.fc(out[:, -1, :])  # usamos solo la última salida
        return out, hidden


In [49]:
context_size = 10
dataset = NextTokenDataset(encoded_text, context_size)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

vocab_size = max(encoded_text) + 1
model = LSTMLanguageModel(vocab_size, embed_dim=64, hidden_dim=128)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

In [50]:


for epoch in range(170):
    total_loss = 0
    for x, y in dataloader:
        optimizer.zero_grad()
        logits, _ = model(x)
        loss = loss_fn(logits, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")


Epoch 1, Loss: 5.0648
Epoch 2, Loss: 4.5723
Epoch 3, Loss: 4.3101
Epoch 4, Loss: 4.0818
Epoch 5, Loss: 3.8776
Epoch 6, Loss: 3.6875
Epoch 7, Loss: 3.5006
Epoch 8, Loss: 3.3148
Epoch 9, Loss: 3.1320
Epoch 10, Loss: 2.9457
Epoch 11, Loss: 2.7620
Epoch 12, Loss: 2.5796
Epoch 13, Loss: 2.3996
Epoch 14, Loss: 2.2213
Epoch 15, Loss: 2.0430
Epoch 16, Loss: 1.8739
Epoch 17, Loss: 1.7103
Epoch 18, Loss: 1.5497
Epoch 19, Loss: 1.4035
Epoch 20, Loss: 1.2624
Epoch 21, Loss: 1.1279
Epoch 22, Loss: 1.0028
Epoch 23, Loss: 0.8884
Epoch 24, Loss: 0.7787
Epoch 25, Loss: 0.6843
Epoch 26, Loss: 0.5942
Epoch 27, Loss: 0.5136
Epoch 28, Loss: 0.4417
Epoch 29, Loss: 0.3765
Epoch 30, Loss: 0.3194
Epoch 31, Loss: 0.2702
Epoch 32, Loss: 0.2312
Epoch 33, Loss: 0.1904
Epoch 34, Loss: 0.1584
Epoch 35, Loss: 0.1316
Epoch 36, Loss: 0.1096
Epoch 37, Loss: 0.0955
Epoch 38, Loss: 0.0776
Epoch 39, Loss: 0.0610
Epoch 40, Loss: 0.0488
Epoch 41, Loss: 0.0394
Epoch 42, Loss: 0.0330
Epoch 43, Loss: 0.0485
Epoch 44, Loss: 0.41

In [51]:
generated_text = generate(model, encoded_base_text, length=50)

In [52]:
decoded_text = tokenizer.decode(generated_text)
print(decoded_text)

en un lugar de la mancha tan de la risa , <UNK> y él tenía que , <UNK> del más <UNK> y <UNK> que se <UNK> <UNK> de rodillas ante mi dulce de sus manos . <UNK> , yo los ojos al señor , y , en efeto , y todos los años , y al


## Transformer Architecture

The Transformer architecture, introduced in "Attention Is All You Need" (Vaswani et al., 2017), revolutionized natural language processing by replacing recurrent and convolutional layers with self-attention mechanisms. This architecture has become the foundation for modern large language models like GPT, BERT, and their variants.

### Core Components

**1. Self-Attention Mechanism**

The heart of the Transformer is the self-attention mechanism, which allows the model to weigh the importance of different positions in the input sequence when processing each token. For each position, the model computes attention weights for all other positions, enabling it to capture long-range dependencies effectively.

The attention mechanism works through three key components:
- **Query (Q)**: What information are we looking for?
- **Key (K)**: What information is available at each position?
- **Value (V)**: The actual information content at each position

The attention score is computed as: Attention(Q,K,V) = softmax(QK^T / √d_k)V

**2. Multi-Head Attention**

Instead of using a single attention mechanism, Transformers use multiple "heads" that can focus on different types of relationships simultaneously. This allows the model to attend to information from different representation subspaces at different positions.

**3. Positional Encoding**

Since Transformers lack inherent sequential processing, positional encodings are added to input embeddings to provide information about token positions in the sequence.

**4. Feed-Forward Networks**

Each layer contains a position-wise feed-forward network that processes each position independently, adding non-linearity to the model.

### Advantages of Transformers

**Parallelization**: Unlike RNNs, all positions can be processed simultaneously during training, leading to significant speedup on modern hardware.

**Long-range Dependencies**: Self-attention can directly connect distant positions without the degradation that occurs in RNNs over long sequences.

**Interpretability**: Attention weights provide insight into which parts of the input the model is focusing on for each prediction.

**Scalability**: Transformers scale exceptionally well with data and compute, following predictable scaling laws.

**Transfer Learning**: Pre-trained Transformer models can be fine-tuned for various downstream tasks with remarkable success.

### Disadvantages of Transformers

**Quadratic Complexity**: Self-attention has O(n²) complexity with respect to sequence length, making it computationally expensive for very long sequences.

**Memory Requirements**: The attention mechanism requires storing all pairwise relationships, leading to high memory consumption.

**Data Hunger**: Transformers typically require large amounts of training data to achieve good performance.

**Lack of Inductive Bias**: Unlike CNNs (translation invariance) or RNNs (sequential bias), Transformers have minimal inductive bias, requiring more data to learn patterns.

### Scaling Laws

Recent research has revealed predictable scaling laws for Transformer-based language models, showing that performance scales as a power law with:

**Model Size (N)**: Larger models with more parameters consistently perform better, following approximately: Loss ∝ N^(-0.076)

**Dataset Size (D)**: More training data leads to better performance: Loss ∝ D^(-0.095)

**Compute Budget (C)**: More compute during training improves performance: Loss ∝ C^(-0.050)

**Key Insights from Scaling Laws:**

1. **No Saturation**: Performance continues to improve with scale across several orders of magnitude
2. **Predictability**: Performance can be extrapolated from smaller experiments
3. **Optimal Allocation**: For a fixed compute budget, there's an optimal trade-off between model size and training time
4. **Emergence**: New capabilities often emerge suddenly at certain scales

**Chinchilla Scaling Laws**: Recent work suggests that many large models are undertrained relative to their parameter count, and that compute should be split more evenly between model size and training data.

### Attention Mechanism Deep Dive

The attention mechanism is the core innovation that makes Transformers so effective. Let's examine it in detail:

**Scaled Dot-Product Attention**:
1. **Linear Projections**: Input embeddings are linearly projected to create Q, K, and V matrices
2. **Similarity Computation**: Dot products between queries and keys measure similarity
3. **Scaling**: Division by √d_k prevents saturation of the softmax function
4. **Normalization**: Softmax ensures attention weights sum to 1
5. **Weighted Sum**: Values are weighted by attention scores

**Causal Masking**: In autoregressive models, future tokens are masked to prevent information leakage during training.

**Multi-Head Benefits**:
- Different heads can specialize in different types of relationships (syntactic, semantic, positional)
- Increases model capacity without significantly increasing computational cost
- Provides robustness through redundancy

The attention mechanism's ability to directly model relationships between any two positions in a sequence, regardless of their distance, makes it particularly powerful for language modeling tasks where long-range dependencies are crucial for understanding context and generating coherent text.

In [53]:


class TransformerLanguageModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, num_heads=4, num_layers=2, context_size=16, ff_dim=256):
        super().__init__()
        self.context_size = context_size
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = nn.Embedding(context_size, embed_dim)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads, dim_feedforward=ff_dim, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        batch_size, seq_len = x.size()
        pos = torch.arange(0, seq_len, device=x.device).unsqueeze(0).expand(batch_size, seq_len)
        x = self.embedding(x) + self.pos_embedding(pos)  # (B, T, D)

        # Máscara causal para autoregresión
        mask = torch.triu(torch.ones(seq_len, seq_len, device=x.device), diagonal=1).bool()
        
        out = self.transformer(x, mask=mask)  # (B, T, D)
        out = self.fc(out[:, -1, :])  # solo la última posición
        return out


In [67]:
context_size = 16
dataset = NextTokenDataset(encoded_text, context_size)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

vocab_size = max(encoded_text) + 1
model = TransformerLanguageModel(vocab_size, context_size=context_size)

optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
loss_fn = nn.CrossEntropyLoss()

final_loss=[]
for epoch in range(100):
    total_loss = 0
    for x, y in dataloader:
        optimizer.zero_grad()
        logits = model(x)
        loss = loss_fn(logits, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")
    final_loss.append(total_loss/len(dataloader))


Epoch 1, Loss: 5.1043
Epoch 2, Loss: 4.6217
Epoch 3, Loss: 4.3471
Epoch 4, Loss: 4.1053
Epoch 5, Loss: 3.8966
Epoch 6, Loss: 3.7043
Epoch 7, Loss: 3.5023
Epoch 8, Loss: 3.3121
Epoch 9, Loss: 3.1129
Epoch 10, Loss: 2.9223
Epoch 11, Loss: 2.7372
Epoch 12, Loss: 2.5459
Epoch 13, Loss: 2.3695
Epoch 14, Loss: 2.2048
Epoch 15, Loss: 2.0313
Epoch 16, Loss: 1.8851
Epoch 17, Loss: 1.7263
Epoch 18, Loss: 1.5891
Epoch 19, Loss: 1.4616
Epoch 20, Loss: 1.3533
Epoch 21, Loss: 1.2237
Epoch 22, Loss: 1.1283
Epoch 23, Loss: 1.0350
Epoch 24, Loss: 0.9558
Epoch 25, Loss: 0.8799
Epoch 26, Loss: 0.8157
Epoch 27, Loss: 0.7320
Epoch 28, Loss: 0.6889
Epoch 29, Loss: 0.6525
Epoch 30, Loss: 0.6072
Epoch 31, Loss: 0.5654
Epoch 32, Loss: 0.5447
Epoch 33, Loss: 0.4979
Epoch 34, Loss: 0.4868
Epoch 35, Loss: 0.4406
Epoch 36, Loss: 0.4357
Epoch 37, Loss: 0.4102
Epoch 38, Loss: 0.3821
Epoch 39, Loss: 0.3639
Epoch 40, Loss: 0.3613
Epoch 41, Loss: 0.3407
Epoch 42, Loss: 0.3393
Epoch 43, Loss: 0.3270
Epoch 44, Loss: 0.30

In [65]:
import torch.nn.functional as F

def generate(model, seed_tokens, length=20):
    model.eval()
    generated = seed_tokens[:]

    for _ in range(length):
        context = generated[-context_size:]
        x = torch.tensor([context], dtype=torch.long)
        with torch.no_grad():
            logits = model(x)
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).item()
        generated.append(next_token)
    return generated

generated_text = generate(model, encoded_base_text, length=50)
decoded_text = tokenizer.decode(generated_text)
print(decoded_text)


en un lugar de la mancha dejó mandado . añadió esa gusto : <UNK> vuestra grandeza <UNK> . <UNK> su huésped . <UNK> las prevenciones tan necesarias que había de llevar consigo , <UNK> la de los libros , no le pareció que algún que llegaba a leer por el <UNK> de haberle oído <UNK> lo


# Bibliography

## Fundamental Papers

### Transformers and Attention
- **Attention Is All You Need** (Vaswani et al., 2017)
  - Introduces the Transformer architecture and self-attention mechanism
  - https://arxiv.org/abs/1706.03762

- **The Annotated Transformer** (Rush, 2018)
  - Detailed implementation and explanation of the original paper
  - http://nlp.seas.harvard.edu/2018/04/03/attention.html

### Autoregressive Language Models
- **Language Models are Unsupervised Multitask Learners** (Radford et al., 2019)
  - Introduces GPT-2 and demonstrates emergent capabilities in large models
  - https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf

- **Language Models are Few-Shot Learners** (Brown et al., 2020)
  - GPT-3 and demonstration of few-shot learning
  - https://arxiv.org/abs/2005.14165

### Scaling Laws
- **Scaling Laws for Neural Language Models** (Kaplan et al., 2020)
  - Establishes fundamental relationships between model size, dataset size, and performance
  - https://arxiv.org/abs/2001.08361

- **Training Compute-Optimal Large Language Models** (Hoffmann et al., 2022)
  - Chinchilla paper: optimal allocation of compute between model size and training tokens
  - https://arxiv.org/abs/2203.15556

- **PaLM: Scaling Language Modeling with Pathways** (Chowdhery et al., 2022)
  - Demonstrates continued scaling benefits up to 540B parameters
  - https://arxiv.org/abs/2204.02311

- **Emergent Abilities of Large Language Models** (Wei et al., 2022)
  - Studies how capabilities emerge at certain scales
  - https://arxiv.org/abs/2206.07682

### Tokenization and Text Processing
- **Neural Machine Translation of Rare Words with Subword Units** (Sennrich et al., 2016)
  - Introduces Byte Pair Encoding (BPE)
  - https://arxiv.org/abs/1508.07909

- **SentencePiece: A simple and language independent subword tokenizer** (Kudo & Richardson, 2018)
  - Unified tokenization algorithm
  - https://arxiv.org/abs/1808.06226

### Optimization and Training
- **Deep Residual Learning for Image Recognition** (He et al., 2016)
  - Introduces residual connections, fundamental in Transformers
  - https://arxiv.org/abs/1512.03385

- **Layer Normalization** (Ba et al., 2016)
  - Normalization technique used in Transformers
  - https://arxiv.org/abs/1607.06450

- **Adam: A Method for Stochastic Optimization** (Kingma & Ba, 2014)
  - Popular optimizer for training neural networks
  - https://arxiv.org/abs/1412.6980

### Evaluation and Metrics
- **BLEU: a Method for Automatic Evaluation of Machine Translation** (Papineni et al., 2002)
  - Standard metric for text generation evaluation
  - https://aclanthology.org/P02-1040/

- **Perplexity** - Standard metric for language model evaluation
  - Measures how well a model predicts a sample

## Additional Resources
- **The Illustrated Transformer** (Alammar, 2018)
  - Visual explanation of Transformer architecture
  - http://jalammar.github.io/illustrated-transformer/

- **Hugging Face Transformers Documentation**
  - Library and documentation for language models
  - https://huggingface.co/docs/transformers/

- **OpenAI Scaling Laws Interactive Calculator**
  - Tool for exploring scaling relationships
  - https://github.com/openai/scaling_laws