In [18]:
import torch
import torch.nn as nn
from datasets import load_dataset
from collections import Counter
from torch.utils.data import DataLoader, Dataset

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [31]:
# NOTE(review): the *validation* split is loaded but kept under a "train" name —
# presumably to keep the demo small; confirm this is intentional.
dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1", split="validation")
train_texts = dataset["text"]

In [32]:
from itertools import chain

def tokenize(text):
    """Lowercase ``text`` and split it on runs of whitespace.

    Returns a list of tokens; an empty or whitespace-only string gives ``[]``.
    Punctuation is not separated from words beyond what whitespace already does.
    """
    lowered = text.lower()
    return lowered.split()

# Tokenize every non-blank line and flatten the result into one token stream.
tokens = [tokenize(line) for line in train_texts if line.strip()]
flat_tokens = list(chain.from_iterable(tokens))
vocab_counter = Counter(flat_tokens)

# Ids 0 and 1 are reserved for the special tokens. Corpus words that happen to
# spell a special token are left out of the numbering so ids stay contiguous:
# numbering them and then overwriting their id would leave a gap, and the
# largest id could exceed len(vocab) - 1 (out of range for an embedding table
# sized by len(vocab)).
special_tokens = {"<pad>": 0, "<unk>": 1}
corpus_words = [word for word in vocab_counter if word not in special_tokens]
vocab = {word: idx + len(special_tokens) for idx, word in enumerate(corpus_words)}
vocab.update(special_tokens)

# Reverse lookup (id -> word) used when decoding model predictions.
inv_vocab = {idx: word for word, idx in vocab.items()}

In [33]:
inv_vocab

{2: '=',
 3: 'homarus',
 4: 'gammarus',
 5: ',',
 6: 'known',
 7: 'as',
 8: 'the',
 9: 'european',
 10: 'lobster',
 11: 'or',
 12: 'common',
 13: 'is',
 14: 'a',
 15: 'species',
 16: 'of',
 17: 'clawed',
 18: 'from',
 19: 'eastern',
 20: 'atlantic',
 21: 'ocean',
 22: 'mediterranean',
 23: 'sea',
 24: 'and',
 25: 'parts',
 26: 'black',
 27: '.',
 28: 'it',
 29: 'closely',
 30: 'related',
 31: 'to',
 32: 'american',
 33: 'h.',
 34: 'americanus',
 35: 'may',
 36: 'grow',
 37: 'length',
 38: '60',
 39: 'cm',
 40: '(',
 41: '24',
 42: 'in',
 43: ')',
 44: 'mass',
 45: '6',
 46: 'kilograms',
 47: '13',
 48: 'lb',
 49: 'bears',
 50: 'conspicuous',
 51: 'pair',
 52: 'claws',
 53: 'life',
 54: 'lobsters',
 55: 'are',
 56: 'blue',
 57: 'only',
 58: 'becoming',
 59: '"',
 60: 'red',
 61: 'on',
 62: 'cooking',
 63: 'mating',
 64: 'occurs',
 65: 'summer',
 66: 'producing',
 67: 'eggs',
 68: 'which',
 69: 'carried',
 70: 'by',
 71: 'females',
 72: 'for',
 73: 'up',
 74: 'year',
 75: 'before',
 76: 

In [34]:
def encode(tokens):
    """Map each token to its id in the global ``vocab``.

    Tokens missing from ``vocab`` are mapped to the id stored under "<unk>".
    Returns a list of ids in the same order as ``tokens``.
    """
    ids = []
    for token in tokens:
        ids.append(vocab.get(token, vocab["<unk>"]))
    return ids

# Window length in tokens; passed to LanguageModelDataset as seq_len.
sequence_length = 30

class LanguageModelDataset(Dataset):
    """Sliding-window next-token dataset built line by line.

    Each line is tokenized and encoded on its own; windows never cross line
    boundaries. For every start offset, the input is ``seq_len`` consecutive
    ids and the target is the same window shifted one position to the right.
    Lines with ``seq_len`` or fewer tokens contribute no windows.

    Args:
        lines: iterable of raw text lines.
        seq_len: number of tokens in each input (and target) window.
    """

    def __init__(self, lines, seq_len):
        self.data = []
        for line in lines:
            ids = encode(tokenize(line))
            window_count = len(ids) - seq_len
            # One (input, target) tensor pair per valid start offset.
            self.data.extend(
                (
                    torch.tensor(ids[start:start + seq_len]),
                    torch.tensor(ids[start + 1:start + seq_len + 1]),
                )
                for start in range(window_count)
            )

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair stored at ``idx``."""
        return self.data[idx]

# Materialize every window once, then serve them in shuffled batches of 32.
train_dataset = LanguageModelDataset(lines=train_texts, seq_len=sequence_length)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# RNNs: Recurrent Neural Networks


* Secuencias: series de tiempo, texto

![Sample Image](./images/rnn2.jpeg)

![Sample Image](./images/rnns.jpeg)

In [35]:
class RNNLM(nn.Module):
    """Language model: embedding -> single-layer ``nn.RNN`` -> linear layer over the vocabulary.

    Args:
        vocab_size: number of embedding rows and size of the output logits.
        embed_size: embedding dimension fed to the RNN.
        hidden_size: dimension of the RNN hidden state.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden=None):
        """Run the model over a batch of token ids.

        Args:
            x: integer token ids; the RNN is batch-first, so the layout is
                (batch, seq_len).
            hidden: initial hidden state as produced by ``init_hidden``. When
                omitted, ``nn.RNN`` starts from zeros.

        Returns:
            ``(logits, hidden)``: logits for every position and the hidden
            state after the last position.
        """
        embedded = self.embed(x)
        out, hidden = self.rnn(embedded, hidden)
        return self.fc(out), hidden

    def init_hidden(self, batch_size):
        """Return a zero hidden state of shape (1, batch_size, hidden_size).

        The tensor is allocated with the device and dtype of the model's own
        weights, so it always matches the model regardless of where the model
        was moved.
        """
        return self.fc.weight.new_zeros((1, batch_size, self.hidden_size))


In [36]:
next(iter(train_dataset))

(tensor([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 10,  5, 13, 14, 15, 16, 17, 10,
         18,  8, 19, 20, 21,  5, 22, 23, 24, 25, 16,  8]),
 tensor([ 4,  5,  6,  7,  8,  9, 10, 11, 12, 10,  5, 13, 14, 15, 16, 17, 10, 18,
          8, 19, 20, 21,  5, 22, 23, 24, 25, 16,  8, 26]))

In [37]:
# Model and training hyperparameters.
vocab_size = len(vocab)  # number of entries in vocab (corpus words plus <pad>/<unk>)
embed_size = 64  # embedding dimension
hidden_size = 128  # recurrent hidden-state dimension
epochs = 10  # number of passes over train_loader

In [38]:
# Build the RNN language model together with its optimizer and loss.
model = RNNLM(vocab_size, embed_size, hidden_size).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
        # A fresh zero hidden state per batch: nothing is carried across batches.
        hidden = model.init_hidden(inputs.size(0))
        optimizer.zero_grad()
        output, hidden = model(inputs, hidden)
        # Flatten batch and sequence axes so each position is scored as one prediction.
        loss = criterion(output.view(-1, vocab_size), targets.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Prints the sum of the per-batch loss values (not divided by the number of batches).
    print(f"Epoch {epoch+1}, Loss: {total_loss:.2f}")

Epoch 1, Loss: 23964.96
Epoch 2, Loss: 14426.97
Epoch 3, Loss: 11096.85
Epoch 4, Loss: 9490.11
Epoch 5, Loss: 8561.92
Epoch 6, Loss: 7948.38
Epoch 7, Loss: 7511.46
Epoch 8, Loss: 7178.79
Epoch 9, Loss: 6917.36
Epoch 10, Loss: 6704.02


# ¿Qué pasa adentro de la RNN?

``` python
sequence = [x₁, x₂, x₃, ..., xₜ]  # xᵢ is a vector for each word

h₀ = 0                       # initial hidden state (often zeros)
h₁ = tanh(Wₓx₁ + Wₕh₀ + b)   ← step 1
h₂ = tanh(Wₓx₂ + Wₕh₁ + b)   ← step 2
h₃ = tanh(Wₓx₃ + Wₕh₂ + b)   ← step 3
...
```

In [39]:
def generate_text(model, start_word, max_len=20):
    """Greedily generate text one word at a time, starting from ``start_word``.

    The model is put in eval mode, fed the id of ``start_word`` (or the
    "<unk>" id when the word is not in ``vocab``), and at each step the
    highest-scoring next word is appended and fed back in together with the
    updated hidden state.

    Args:
        model: object exposing ``init_hidden(batch_size)`` and returning
            ``(logits, hidden)`` when called as ``model(ids, hidden)``.
        start_word: first word of the generated text (emitted as given).
        max_len: total number of words in the result, including ``start_word``.

    Returns:
        The generated words joined by single spaces.
    """
    model.eval()
    seed_id = vocab.get(start_word, vocab["<unk>"])
    current = torch.tensor([[seed_id]], dtype=torch.long).to(DEVICE)
    hidden = model.init_hidden(1)
    words = [start_word]

    with torch.no_grad():
        for _step in range(max_len - 1):
            logits, hidden = model(current, hidden)
            # Only the prediction for the last position matters.
            last_step_logits = logits[:, -1, :]
            next_id = last_step_logits.argmax(dim=-1).item()
            words.append(inv_vocab.get(next_id, "<unk>"))
            current = torch.tensor([[next_id]], dtype=torch.long).to(DEVICE)

    return ' '.join(words)

In [44]:
generate_text(model, "mountain")

'mountain news not cut into halves , excluding the second verses and bridge was held in a piece of oak'

# LSTM: Long Short-Term Memory RNNs

![Sample Image](./images/lstm1.png)

![Sample Image](./images/lstm2.png)

In [47]:
class LSTM(nn.Module):
    """Language model: embedding -> single-layer ``nn.LSTM`` -> linear layer over the vocabulary.

    Args:
        vocab_size: number of embedding rows and size of the output logits.
        embed_size: embedding dimension fed to the LSTM.
        hidden_size: dimension of the LSTM hidden and cell states.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden=None):
        """Run the model over a batch of token ids.

        Args:
            x: integer token ids; the LSTM is batch-first, so the layout is
                (batch, seq_len).
            hidden: optional ``(h0, c0)`` tuple as produced by ``init_hidden``.

        Returns:
            The logits alone when ``hidden`` is omitted (the form the training
            loop uses), or ``(logits, (h_n, c_n))`` when a state is passed in
            (the form ``generate_text`` expects for step-by-step decoding).
        """
        embedded = self.embed(x)
        out, new_hidden = self.lstm(embedded, hidden)
        logits = self.fc(out)
        if hidden is None:
            return logits
        return logits, new_hidden

    def init_hidden(self, batch_size):
        """Return zero ``(h0, c0)`` states, each of shape (1, batch_size, hidden_size).

        ``nn.LSTM`` takes its initial state as a tuple of hidden and cell
        tensors; a single tensor is rejected. Both are allocated with the
        device and dtype of the model's own weights.
        """
        shape = (1, batch_size, self.hidden_size)
        weight = self.fc.weight
        return weight.new_zeros(shape), weight.new_zeros(shape)

In [49]:
# Same values as the hyperparameter cell used for the RNN run.
vocab_size = len(vocab)
embed_size = 64
hidden_size = 128
epochs = 10

# Rebinds model/optimizer/criterion: after this cell, `model` refers to the LSTM.
model = LSTM(vocab_size, embed_size, hidden_size).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
        optimizer.zero_grad()
        # The model is called with the inputs only; no hidden state is passed in.
        output = model(inputs)
        # Flatten batch and sequence axes so each position is scored as one prediction.
        loss = criterion(output.view(-1, vocab_size), targets.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Prints the sum of the per-batch loss values (not divided by the number of batches).
    print(f"Epoch {epoch+1}, Loss: {total_loss:.2f}")

Epoch 1, Loss: 26781.95
Epoch 2, Loss: 18836.45
Epoch 3, Loss: 14992.40
Epoch 4, Loss: 12471.49
Epoch 5, Loss: 10658.34
Epoch 6, Loss: 9302.84
Epoch 7, Loss: 8275.44
Epoch 8, Loss: 7484.50


KeyboardInterrupt: 

In [None]:
generate_text(model, "mountain")

# Transformers

## Attention Is All You Need
[link to paper](https://arxiv.org/abs/1706.03762)

![Sample Image](./images/attention1.png)

![Sample Image](./images/attention2.png)

![Sample Image](./images/attention3.png)

In [84]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Words for the example
tokens = ["The", "cat", "sat", "on", "the", "mat"]

# Dummy embeddings (simulate pretrained GloVe for simplicity).
# Kept under its own name: rebinding the notebook-level `vocab` (word -> id)
# here would break encode() and generate_text() for any cell run afterwards.
# dict.fromkeys de-duplicates while keeping first-seen order, so the random
# vectors are assigned to words in a stable order (a set iterates arbitrarily).
embedding_dim = 16
word_vectors = {word: torch.randn(embedding_dim) for word in dict.fromkeys(tokens)}
embeddings = torch.stack([word_vectors[word] for word in tokens])  # (seq_len, embed_dim)

# Reshape for MultiheadAttention: (seq_len, batch, embed_dim)
x = embeddings.unsqueeze(1)  # (6, 1, 16)

x

tensor([[[-2.2600, -1.3618,  1.5766,  0.1465,  1.8046,  1.3033,  0.1841,
          -1.9565,  0.1488, -0.9366, -0.9505, -0.2994, -0.6518,  0.2769,
          -0.0066, -1.8792]],

        [[ 0.8817, -2.0325, -0.3522,  0.1006,  1.5255, -1.6215,  0.5298,
          -1.3719, -0.6298, -0.6567,  0.4244, -0.4012, -0.4890,  0.9572,
          -0.8436, -0.3681]],

        [[-0.8828, -0.0351,  0.3311, -1.5351,  1.5792,  1.5627,  1.0340,
          -0.6448, -0.7957,  0.4748,  0.6450, -2.4695, -1.2704,  1.4029,
           0.3947, -0.2914]],

        [[-0.7096,  1.5106,  0.9143, -1.2950,  0.9506, -1.3416,  0.9584,
          -1.8344, -0.5945, -1.6710, -1.3442, -0.7922,  0.1253, -0.9972,
           0.9317, -1.0990]],

        [[ 0.4170, -0.8713, -0.1622, -0.9481, -0.6535, -0.3585, -0.8176,
           0.3736,  0.9122,  0.6781,  0.0559, -1.3395, -0.3002,  0.1600,
          -0.6769,  0.7504]],

        [[-0.4724, -0.6994,  0.4524, -1.0906,  0.9412, -1.9317,  0.1285,
          -1.0134,  0.7479,  0.2011, -0.89

In [83]:
# Multi-head attention layer
mha = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=4, batch_first=False)

# Apply self-attention (query, key and value are all the same tensor).
# The returned weights carry no per-head axis (the printed shape is [1, 6, 6]);
# per the PyTorch docs they are averaged over heads unless
# average_attn_weights=False is passed.
output, attn_weights = mha(x, x, x)  # (seq_len, batch, embed_dim), (batch, seq_len, seq_len)

print("Output shape:", output.shape)
print("Attention weights shape:", attn_weights.shape)

Output shape: torch.Size([6, 1, 16])
Attention weights shape: torch.Size([1, 6, 6])


In [85]:
output

tensor([[[-0.2139, -0.2378, -0.0377,  0.4408,  0.0746, -0.0500, -0.0849,
           0.1351,  0.2350, -0.0146, -0.1975,  0.2000, -0.2880, -0.0825,
          -0.0171, -0.1060]],

        [[-0.2465, -0.1324, -0.0568,  0.2459, -0.0469,  0.1498, -0.1074,
           0.0394,  0.1523,  0.0997, -0.2093,  0.1722, -0.2927, -0.1203,
          -0.0213, -0.1175]],

        [[-0.1920, -0.2264, -0.0040,  0.3891,  0.1190, -0.0081, -0.0598,
           0.1335,  0.2643, -0.0441, -0.2048,  0.2327, -0.2741, -0.1010,
          -0.0664, -0.1795]],

        [[-0.2308, -0.1874, -0.0208,  0.3403, -0.0059,  0.0710, -0.0822,
           0.1015,  0.1670,  0.0610, -0.1947,  0.1754, -0.2663, -0.1132,
           0.0167, -0.0915]],

        [[-0.2635, -0.1391, -0.1003,  0.1709,  0.0592,  0.2033, -0.0994,
           0.0421,  0.2689,  0.0521, -0.1742,  0.2366, -0.3383, -0.0642,
          -0.2013, -0.2942]],

        [[-0.2117, -0.1883, -0.0064,  0.3106,  0.0342,  0.0974, -0.0520,
           0.1084,  0.2234, -0.0164, -0.22

In [87]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

torch.manual_seed(0)

# Toy input laid out as (batch=1, seq_len=4, embed_dim=8)
x = torch.rand(1, 4, 8)
print(x)

# Three independent linear maps turn the same input into queries, keys and values
embed_dim = 8
q_proj = torch.nn.Linear(embed_dim, embed_dim)
k_proj = torch.nn.Linear(embed_dim, embed_dim)
v_proj = torch.nn.Linear(embed_dim, embed_dim)

Q, K, V = q_proj(x), k_proj(x), v_proj(x)  # each (1, 4, 8)

# Scaled dot-product scores: every query against every key, divided by sqrt(embed_dim)
scores = (Q @ K.transpose(-2, -1)) / (embed_dim ** 0.5)  # (1, 4, 4)

# Normalize each query's row of scores into weights that sum to 1
weights = torch.softmax(scores, dim=-1)  # (1, 4, 4)

# Each output position is the weighted sum of the value vectors
output = weights @ V  # (1, 4, 8)

print("Attention Weights:\n", weights.squeeze())

tensor([[[0.4963, 0.7682, 0.0885, 0.1320, 0.3074, 0.6341, 0.4901, 0.8964],
         [0.4556, 0.6323, 0.3489, 0.4017, 0.0223, 0.1689, 0.2939, 0.5185],
         [0.6977, 0.8000, 0.1610, 0.2823, 0.6816, 0.9152, 0.3971, 0.8742],
         [0.4194, 0.5529, 0.9527, 0.0362, 0.1852, 0.3734, 0.3051, 0.9320]]])
Attention Weights:
 tensor([[0.2474, 0.2407, 0.2605, 0.2515],
        [0.2400, 0.2525, 0.2493, 0.2582],
        [0.2485, 0.2424, 0.2651, 0.2441],
        [0.2413, 0.2474, 0.2495, 0.2618]], grad_fn=<SqueezeBackward0>)
