## Read

In [1]:
import pandas as pd

# Load the parallel corpus; the preview below shows an unnamed index column
# plus one text column per language ("en" and "es").
opus = pd.read_csv("../data/opus_books.csv")
# Show the first rows as a quick sanity check of the load.
opus.head()

Unnamed: 0,en,es
0,"In the society of his nephew and niece, and th...","En compañía de su sobrino y sobrina, y de los ..."
1,"By a former marriage, Mr. Henry Dashwood had o...","De un matrimonio anterior, el señor Henry Dash..."
2,"By his own marriage, likewise, which happened ...","Además, su propio matrimonio, ocurrido poco de..."
3,"But the fortune, which had been so tardy in co...","Pero la fortuna, que había tardado tanto en ll..."
4,But Mrs. John Dashwood was a strong caricature...,Pero la señora de John Dashwood era una áspera...


## Create vocab

In [2]:
import re
from collections import defaultdict

# Every sentence is cut down to at most this many tokens by tokenize()
token_limit = 11
# Reserved ids for the control tokens; corpus tokens are numbered after them
special_tokens = dict(PAD=0, UNK=1, BOS=2, EOS=3)
# The vocabulary starts out as an independent copy of the reserved tokens
vocab = dict(special_tokens)

def clean(text):
    """Normalise raw text before it is tokenized.

    Every character that is not a word character, whitespace, or one of
    ``, ? . !`` is replaced by a single space; leading and trailing
    whitespace is then stripped.
    """
    without_symbols = re.sub(r'[^\w\s,?.!]', ' ', text)
    return without_symbols.strip()

def tokenize(text, limit=None):
    """Split text into tokens and keep only the first ``limit`` of them.

    A token is a run of word characters, a run of punctuation, or a run of
    whitespace. Whitespace runs are kept as tokens of their own, so joining
    the untruncated tokens reproduces the input text.

    Args:
        text: The (already cleaned) string to split.
        limit: Maximum number of tokens to return. Defaults to the
            notebook-level ``token_limit`` when omitted.

    Returns:
        A list of at most ``limit`` token strings.
    """
    if limit is None:
        limit = token_limit
    tokens = re.findall(r'\w+|[^\w\s]+|[\s]+', text)
    return tokens[:limit]

# Count how often each token occurs in the (truncated) English sentences
opus_tokens = defaultdict(int)
# Iterate the column directly: iterrows() builds a Series for every row,
# which is wasted work when only one field is read
for sentence in opus["en"]:
    for token in tokenize(clean(sentence)):
        opus_tokens[token] += 1

# Corpus tokens are numbered right after the reserved special-token ids
counter = max(special_tokens.values()) + 1
for token, count in opus_tokens.items():
    # A corpus token that happens to be spelled like a special token
    # (e.g. the literal word "PAD") must not overwrite the reserved id
    if token in special_tokens:
        continue
    if count > 2:
        # Common token: gets its own id
        vocab[token] = counter
        counter += 1
    else:
        # Rare token: shares the unknown id
        vocab[token] = special_tokens["UNK"]

## Tokenize sentences

In [3]:
import torch

def encode(text):
    """Encode text as a 1-D tensor of token ids.

    The text is cleaned and tokenized (which also truncates it to the token
    limit), then each token is looked up in ``vocab``. Tokens that are not in
    the vocabulary map to the UNK id instead of raising ``KeyError``, so
    text outside the training corpus can be encoded too.

    Returns:
        A ``torch.long`` tensor with one id per token; empty input gives an
        empty ``torch.long`` tensor.
    """
    unk_id = special_tokens["UNK"]
    token_ids = [vocab.get(token, unk_id) for token in tokenize(clean(text))]
    # An explicit dtype keeps empty input from defaulting to a float tensor
    return torch.tensor(token_ids, dtype=torch.long)

# Invert the vocabulary (id -> token). Several tokens can share one id and the
# last one seen wins, so the special-token names are written back afterwards.
reverse_vocab = dict((token_id, token) for token, token_id in vocab.items())
reverse_vocab.update((token_id, name) for name, token_id in special_tokens.items())

def decode(encoded):
    """Decode token ids back into text.

    Args:
        encoded: A tensor or iterable of token ids, or a single id
            (a 0-d tensor or a plain int).

    Returns:
        The tokens joined without a separator (whitespace is itself a token).
        Ids with no entry in ``reverse_vocab`` are rendered as ``"UNK"``
        instead of raising ``KeyError``.
    """
    if isinstance(encoded, torch.Tensor):
        encoded = encoded.detach().cpu().tolist()
    # A 0-d tensor converts to a bare int: treat it as a one-token sequence
    if isinstance(encoded, int):
        encoded = [encoded]
    return "".join(reverse_vocab.get(token_id, "UNK") for token_id in encoded)

## Tokenize data

In [4]:
# Encode every English sentence and keep only those that fill the whole
# token window; anything shorter than token_limit is dropped
encoded_sentences = (encode(sentence) for sentence in opus["en"])
tokenized = [ids for ids in encoded_sentences if ids.shape[0] >= token_limit]

In [5]:
# Inspect the first encoded sentence
tokenized[0]

tensor([4, 5, 6, 5, 1, 5, 7, 5, 8, 5, 1])

## Create torch dataset

In [6]:
from torch.utils.data import DataLoader, Dataset

class TextData(Dataset):
    """Next-token dataset over equal-length token sequences.

    Each example is one stacked row: every token except the last is the
    input, and the last token is the prediction target. The split is taken
    relative to the row length, so it follows whatever sequence length the
    rows were built with.
    """

    def __init__(self, data):
        """Stack a list of equal-length 1-D id tensors into one long matrix."""
        self.tokens = torch.vstack(data).long()

    def __len__(self):
        """Return how many examples are in the dataset."""
        return len(self.tokens)

    def __getitem__(self, idx):
        """Return ``(x, y)``: all tokens but the last, and the last token."""
        row = self.tokens[idx]
        return row[:-1], row[-1]

# Initialize the dataset
train_ds = TextData(tokenized)
# Batches of 16 examples; shuffle is not passed, so the DataLoader default applies.
# NOTE(review): training usually benefits from shuffle=True — confirm the fixed order is intended.
train = DataLoader(train_ds, batch_size=16)

In [7]:
# Inspect one example: an (input ids, target id) pair
train_ds[0]

(tensor([4, 5, 6, 5, 1, 5, 7, 5, 8, 5]), tensor(1))

In [8]:
# Pull the first batch from the loader; it is displayed below as a
# two-element list of [input ids, target ids]
batch = next(iter(train))
batch

[tensor([[ 4,  5,  6,  5,  1,  5,  7,  5,  8,  5],
         [ 9,  5, 10,  5, 11,  5, 12, 13,  5, 14],
         [ 9,  5,  8,  5, 16,  5, 12, 13,  5, 17],
         [18,  5,  6,  5, 19, 13,  5, 20,  5, 21],
         [18,  5, 22, 15,  5, 23,  5, 24,  5, 25],
         [26,  5, 27,  5,  1,  5, 28, 13,  5, 29],
         [30,  5, 25,  5, 31,  5, 32,  5, 33, 34],
         [30,  5, 25,  5,  1, 13,  5,  1, 13,  5],
         [37,  5, 38, 13,  5, 39,  5, 40, 13,  5],
         [41,  5,  1,  5, 42,  5, 43,  5, 44,  5],
         [45,  5,  1,  5,  7,  5, 46,  5, 20,  5],
         [37, 13,  5, 48, 13,  5, 25,  5, 49,  5],
         [50,  5,  1,  5, 29,  5, 10,  5, 51,  5],
         [22, 15,  5, 23,  5, 24,  5, 53,  5, 54],
         [55,  5, 56,  5, 57,  5, 58,  5, 59,  5],
         [61,  5, 62,  5, 63,  5, 64,  5, 65,  5]]),
 tensor([ 1, 15, 13,  5,  5,  5, 35, 36,  6, 29, 47,  1, 52,  5, 60, 66])]

## Embedding layer

In [9]:
import math
from torch import nn

class Embedding(nn.Module):
    """Trainable lookup table that maps token ids to vectors."""

    def __init__(self, vocab_size, embed_dim):
        """Create a ``(vocab_size, embed_dim)`` table with a zeroed row 0."""
        super().__init__()

        # Uniform initialisation in [-bound, bound) with bound = 1/sqrt(embed_dim)
        bound = 1/math.sqrt(embed_dim)
        table = torch.rand(vocab_size, embed_dim) * 2 * bound - bound
        # Row 0 belongs to the padding token: start it as an all-zero vector
        table[0] = 0
        self.weights = nn.Parameter(table)

    def forward(self, token_ids):
        """Return the embedding row for every id in ``token_ids``.

        Indexing the table directly gives the same result as multiplying a
        one-hot encoding of the ids by the weights, without building the
        one-hot matrix.
        """
        return self.weights[token_ids]

In [10]:
# Build a throwaway embedding layer and look up the inputs of the first batch.
# The lookup runs under no_grad because this is an inspection, not a training step.
input_embed = Embedding(len(vocab), 256)
with torch.no_grad():
    print(input_embed(batch[0]))

tensor([[[-0.0541, -0.0153,  0.0248,  ...,  0.0610,  0.0496, -0.0178],
         [ 0.0082,  0.0146,  0.0010,  ..., -0.0208, -0.0350,  0.0511],
         [-0.0044,  0.0237,  0.0249,  ...,  0.0321,  0.0519, -0.0526],
         ...,
         [ 0.0082,  0.0146,  0.0010,  ..., -0.0208, -0.0350,  0.0511],
         [-0.0120,  0.0600,  0.0007,  ..., -0.0287, -0.0445,  0.0606],
         [ 0.0082,  0.0146,  0.0010,  ..., -0.0208, -0.0350,  0.0511]],

        [[ 0.0279,  0.0159,  0.0190,  ..., -0.0409, -0.0622, -0.0318],
         [ 0.0082,  0.0146,  0.0010,  ..., -0.0208, -0.0350,  0.0511],
         [ 0.0404,  0.0343, -0.0407,  ...,  0.0346, -0.0573,  0.0159],
         ...,
         [ 0.0536,  0.0152,  0.0078,  ...,  0.0356,  0.0586, -0.0563],
         [ 0.0082,  0.0146,  0.0010,  ..., -0.0208, -0.0350,  0.0511],
         [ 0.0145,  0.0194, -0.0532,  ...,  0.0224,  0.0009,  0.0574]],

        [[ 0.0279,  0.0159,  0.0190,  ..., -0.0409, -0.0622, -0.0318],
         [ 0.0082,  0.0146,  0.0010,  ..., -0

## Predict next token

In [11]:
class TokenPredictor(nn.Module):
    """Feed-forward next-token model whose output layer reuses the embedding.

    Token ids are embedded, passed through two dense layers, flattened into
    one vector per example, projected back to ``hidden_units`` and finally
    multiplied by the transposed embedding table to score every vocabulary
    entry.
    """

    def __init__(self, vocab_size, input_token_count, hidden_units):
        super().__init__()

        # Reseeds the global RNG, so every instance starts from the same weights
        torch.manual_seed(0)
        self.embedding = Embedding(vocab_size, hidden_units)
        self.dense1 = nn.Linear(hidden_units, hidden_units)
        self.relu = nn.ReLU()
        self.dense2 = nn.Linear(hidden_units, hidden_units)
        # Takes all per-token vectors of one example laid end to end
        flat_features = hidden_units * input_token_count
        self.output = nn.Linear(flat_features, hidden_units)

    def forward(self, x):
        """Score every vocabulary entry as the next token.

        Assumes ``x`` holds token ids shaped (batch, input_token_count) —
        TODO confirm against the caller. The result has one row of
        vocabulary scores per example; argmax over the last dim gives the
        predicted token id.
        """
        # Look up one hidden-sized vector per input token
        token_vectors = self.embedding(x)
        # Two dense layers applied to every token vector
        hidden = self.dense1(token_vectors)
        hidden = self.relu(hidden)
        hidden = self.dense2(hidden)
        # Lay the token vectors of each example end to end for the final layer
        per_example = hidden.flatten(start_dim=1)
        projected = self.output(per_example)
        # Unembed: compare the projection against every embedding row
        return torch.matmul(projected, self.embedding.weights.T)

In [12]:
from statistics import mean

# Initialize W&B
# Set WANDB_SILENT through the IPython %env magic (presumably to quiet
# wandb's console output — confirm against the W&B docs)
%env WANDB_SILENT=True

import wandb
# Authenticate this session with W&B
wandb.login()

def train_loop(net, optimizer, epochs):
    """Train ``net`` on the notebook-level ``train`` loader, logging to W&B.

    Args:
        net: Module called as ``net(x)`` to produce class scores per example.
        optimizer: Optimizer already bound to ``net``'s parameters.
        epochs: Number of full passes over ``train``.

    Returns:
        A list with the loss value of every batch, in training order.
    """
    # Initialize a new W&B run
    wandb.init(project="text",
               name="dense")

    # Targets equal to 0 (the PAD id) do not contribute to the loss
    loss_fn = nn.CrossEntropyLoss(ignore_index=0)
    train_losses = []
    # Running sum of all losses: the logged mean then costs O(1) per log call
    # instead of re-averaging the whole, ever-growing history
    loss_total = 0.0
    try:
        for _ in range(epochs):
            for batch_idx, (x, y) in enumerate(train):
                # zero_grad will set all the gradients to zero
                # We need this because gradients will accumulate in the backward pass
                optimizer.zero_grad()
                # Make a prediction using the network
                pred = net(x)
                # Calculate the loss
                loss = loss_fn(pred, y)
                # Call loss.backward to run backpropagation
                loss.backward()
                # Step the optimizer to update the parameters
                optimizer.step()

                loss_value = loss.item()
                train_losses.append(loss_value)
                loss_total += loss_value

                if batch_idx % 10 == 0:
                    # Log the mean loss over every batch seen so far
                    wandb.log({
                        "train_loss": loss_total / len(train_losses)
                    })
    finally:
        # Close the run even if training is interrupted, so it is not left
        # open in the notebook session
        wandb.finish()

    return train_losses

env: WANDB_SILENT=True


In [None]:
# Define our hyperparameters
epochs = 25
lr = 1e-3
hidden_units = 256
# Number of distinct token ids. len(vocab) would over-count, because every
# rare token is a separate key that shares the UNK id, leaving embedding rows
# and output classes that no token ever uses.
vocab_size = max(vocab.values()) + 1
# Each example holds token_limit tokens: all but the last are inputs
context_len = token_limit - 1

# Initialize our network
net = TokenPredictor(vocab_size, context_len, hidden_units)
# Optimizer
optimizer = torch.optim.SGD(net.parameters(), lr=lr)
losses = train_loop(net, optimizer, epochs)

In [None]:
# Show the model's prediction next to the true next token for one batch
with torch.no_grad():
    batch = next(iter(train))
    inputs, targets = batch
    logits = net(inputs)
    # The highest-scoring vocabulary entry is the predicted token
    token_id = logits.argmax(-1)

    # batch is an [inputs, targets] pair, so len(batch) is always 2;
    # iterate over the examples in the batch instead
    for i in range(len(inputs)):
        text = decode(inputs[i])
        # Slice (rather than index) so decode receives a 1-element sequence
        actual = decode(targets[i:(i+1)])
        pred = decode(token_id[i:(i+1)])
        print(f"{text}<ACTUAL>{actual}<><PRED>{pred}<>")