In [1]:
import pandas as pd

# Load the English/Spanish sentence pairs; later cells read the "en" and "es" columns.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like a
# row index that was saved into the CSV -- consider index_col=0 after confirming.
opus = pd.read_csv("../data/opus_books.csv")
# Preview the first rows of the frame.
opus.head()

Unnamed: 0,en,es
0,"In the society of his nephew and niece, and th...","En compañía de su sobrino y sobrina, y de los ..."
1,"By a former marriage, Mr. Henry Dashwood had o...","De un matrimonio anterior, el señor Henry Dash..."
2,"By his own marriage, likewise, which happened ...","Además, su propio matrimonio, ocurrido poco de..."
3,"But the fortune, which had been so tardy in co...","Pero la fortuna, que había tardado tanto en ll..."
4,But Mrs. John Dashwood was a strong caricature...,Pero la señora de John Dashwood era una áspera...


In [2]:
import re
from collections import defaultdict

# Reserved ids for the control tokens; corpus tokens are numbered after these.
special_tokens = {"PAD": 0, "UNK": 1, "BOS": 2, "EOS": 3}

# The working vocabulary starts as an independent copy of the reserved entries,
# so adding corpus tokens to it never mutates special_tokens.
vocab = dict(special_tokens)

def clean(text):
    """Normalise a sentence before tokenisation.

    Every character that is not a word character, whitespace, or one of
    ``, ? . !`` is replaced by a single space (this includes quotes, hyphens
    and the inverted Spanish marks), then leading/trailing whitespace is
    stripped. Interior runs of spaces are left as they are.
    """
    return re.sub(r'[^\w\s,?.!]', ' ', text).strip()

def tokenize(text):
    """Split text into a list of tokens that concatenate back to the input.

    A token is one of: a run of word characters, a run of punctuation
    (non-word, non-space characters), or a run of whitespace. Whitespace runs
    are kept as tokens rather than discarded.
    """
    return [match.group(0) for match in re.finditer(r'\w+|[^\w\s]+|[\s]+', text)]

# Count how often each token occurs across both language columns.
opus_tokens = defaultdict(int)
for index, row in opus.iterrows():
    for key in ("en", "es"):
        for token in tokenize(clean(row[key])):
            opus_tokens[token] += 1

# Assign ids in first-seen order. Ids 0-3 belong to the special tokens, so
# corpus tokens start at 4.
counter = 4
for token, frequency in opus_tokens.items():
    if frequency <= 3:
        # Rare token (seen at most 3 times): share the unknown id.
        vocab[token] = 1
        continue
    # Common token: gets the next free id.
    vocab[token] = counter
    counter += 1

In [3]:
import torch

def encode(text, vocab):
    """Encode text as a 1-D integer tensor of token ids.

    The text is passed through ``clean`` and ``tokenize``; each token is then
    looked up in ``vocab``.

    Args:
        text: The sentence to encode.
        vocab: Mapping from token string to integer id.

    Returns:
        A ``torch.long`` tensor with one id per token (length 0 for empty text).
    """
    # Tokens that never appeared when the vocabulary was built are absent from
    # the mapping; send them to the unknown id instead of raising KeyError.
    unk_id = vocab.get("UNK", 1)
    tokens = tokenize(clean(text))
    # Pin the dtype: an empty list would otherwise produce a float tensor,
    # which cannot be used to index an embedding matrix.
    encoded = torch.tensor([vocab.get(token, unk_id) for token in tokens], dtype=torch.long)
    return encoded

# Invert the vocabulary (id -> token). Several tokens can share one id, in
# which case the last one inserted into vocab wins.
reverse_vocab = {token_id: token for token, token_id in vocab.items()}
# Make sure the reserved ids always decode to their special-token names.
reverse_vocab.update({token_id: name for name, token_id in special_tokens.items()})

def decode(encoded, reverse_vocab):
    """Decode a sequence of token ids back into text.

    Args:
        encoded: A list of ints or a 1-D integer tensor of token ids.
        reverse_vocab: Mapping from integer id to token string.

    Returns:
        The tokens concatenated with no separator (whitespace is itself a token).

    Raises:
        KeyError: If an id is not present in ``reverse_vocab``.
    """
    # Iterating a tensor yields 0-d tensors, which hash by object identity and
    # therefore never match the int keys of the dict; convert each id to int.
    decoded = "".join(reverse_vocab[int(token)] for token in encoded)
    return decoded

In [11]:
# Build the training examples: one dict per sentence pair that is long enough.
data = []
for index, row in opus.iterrows():
    en_text, es_text = row["en"], row["es"]
    en = encode(en_text, vocab)
    es = encode(es_text, vocab)

    # Keep only pairs whose English side has at least 11 tokens
    # (whitespace runs count as tokens).
    if en.shape[0] >= 11:
        # Prediction target: the Spanish ids shifted left by one position,
        # with the wrapped-around last slot overwritten by EOS (id 3).
        target = torch.roll(es, -1)
        target[-1] = 3
        example = {
            "en": en,
            "es": es,
            "en_text": en_text,
            "es_text": es_text,
            "target": target,
        }
        data.append(example)

In [12]:
# Spot-check one example: encoded ids, raw text for both languages, and the shifted target.
data[1000]

{'en': tensor([1415,    5, 4384,    5,  853,    5,   87,    5,    6,    5, 4385,    5,
            8,    5,  508,    5,    1,    5,   46,   12,    5,   10,    5,    8,
            5,  240,    5,    9,    5,   50,    5,    1,   21]),
 'es': tensor([4386,    5, 4387,    5,  102,    5,   35,    5, 4388,    5,   24,    5,
          715,    5,   28,    5, 4389,    5, 4390,    5,   96,    5,  233,    5,
         4391,    5,   26,    5,   24,    5,  155,    5,   25,    5,   62,    5,
         4392,   21]),
 'en_text': 'That crime has been the origin of every lesser one, and of all his present discontents."',
 'es_text': 'Ese crimen fue el origen de todos los males menores que le siguieron y de todo su actual descontento.',
 'target': tensor([   5, 4387,    5,  102,    5,   35,    5, 4388,    5,   24,    5,  715,
            5,   28,    5, 4389,    5, 4390,    5,   96,    5,  233,    5, 4391,
            5,   26,    5,   24,    5,  155,    5,   25,    5,   62,    5, 4392,
           21,    3])

In [6]:
import math
from torch import nn

class Embedding(nn.Module):
    """Trainable lookup table from token ids to dense vectors.

    Row 0 (the padding id) starts as all zeros; every other row is drawn
    uniformly from [-k, k) with k = 1 / sqrt(embed_dim).
    """

    def __init__(self, vocab_size, embed_dim):
        """Create a (vocab_size, embed_dim) weight matrix."""
        super().__init__()

        k = 1/math.sqrt(embed_dim)
        # Edit the plain tensor first and wrap it afterwards. Assigning into a
        # slice of an nn.Parameter is an in-place op on a leaf that requires
        # grad and raises RuntimeError unless the caller is inside no_grad().
        initial = torch.rand(vocab_size, embed_dim) * 2 * k - k
        initial[0] = 0 # Zero out the padding embedding
        self.weights = nn.Parameter(initial)

    def forward(self, token_ids):
        """Return the embedding rows selected by ``token_ids``."""
        # Cache for backward pass
        self.token_ids = token_ids
        # Return a matrix of embeddings
        # We could convert token_ids to a one_hot vector and multiply by the weights, but it is the same as selecting a single row of the matrix
        return self.weights[token_ids]

    def backward_ex(self, grad, lr):
        """Apply one manual SGD step to the rows used in the last forward call.

        Args:
            grad: Gradient w.r.t. the forward output; ``grad[i]`` belongs to the
                i-th cached token id.
            lr: Learning rate.
        """
        # This is a hand-written optimizer step, so it must not be recorded by
        # autograd; without no_grad() the in-place update of the leaf raises.
        with torch.no_grad():
            for i, token_id in enumerate(self.token_ids):
                # Add the gradient to the embedding
                # We could convert the input to one-hot, and do input.T * grad, but it is the same as adding the gradient to the embedding
                # Subtracting the gradient is SGD optimization
                # A token id that occurs several times accumulates every one of its updates.
                self.weights[token_id] -= grad[i] * lr

In [10]:
# Size the table by the number of distinct ids in the vocabulary.
vocab_size = len(set(vocab.values()))
# Lookup-only demo, so gradient tracking is switched off.
with torch.no_grad():
    input_embed = Embedding(vocab_size, 512)
    first_ids = data[0]["en"][:10]
    print(input_embed(first_ids))

tensor([[-0.0180, -0.0180, -0.0311,  ...,  0.0327, -0.0429, -0.0340],
        [ 0.0160,  0.0325,  0.0264,  ...,  0.0140,  0.0069,  0.0114],
        [ 0.0324, -0.0011, -0.0428,  ..., -0.0121, -0.0344,  0.0096],
        ...,
        [ 0.0160,  0.0325,  0.0264,  ...,  0.0140,  0.0069,  0.0114],
        [ 0.0246, -0.0015,  0.0057,  ...,  0.0323,  0.0210, -0.0365],
        [ 0.0160,  0.0325,  0.0264,  ...,  0.0140,  0.0069,  0.0114]])


In [None]:
class TokenPredictor(nn.Module):
    """Feed-forward next-token scorer over a fixed-length window of token ids.

    Pipeline: embed -> dense1 -> ReLU -> dense2 -> flatten -> output linear
    -> multiply by the transposed embedding matrix to score every vocab id.
    """

    def __init__(self, vocab_size, input_token_count, hidden_units):
        """Build the layers; the global torch seed is reset to 0 first."""
        super().__init__()

        torch.manual_seed(0)
        # Width of the flattened activations fed to the final layer.
        flat_width = hidden_units * input_token_count
        self.embedding = Embedding(vocab_size, hidden_units)
        self.dense1 = nn.Linear(hidden_units, hidden_units)
        self.relu = nn.ReLU()
        self.dense2 = nn.Linear(hidden_units, hidden_units)
        self.output = nn.Linear(flat_width, hidden_units)

    def forward(self, x):
        """Score every vocabulary id for the given window of token ids.

        Assumes x is a 1-D tensor holding exactly input_token_count ids, since
        the flattened activations must match the final layer's input width.
        """
        # Token ids -> (token_count, hidden_units) embeddings
        hidden = self.embedding(x)
        # Two dense layers with a ReLU in between, applied per token
        hidden = self.dense1(hidden)
        hidden = self.relu(hidden)
        hidden = self.dense2(hidden)
        # Collapse all token vectors into a single row for the final layer
        flat = hidden.flatten().unsqueeze(0)
        network_out = self.output(flat)
        # Unembed with the shared embedding matrix -> (1, vocab_size);
        # argmax over the last dim gives the predicted token
        return torch.matmul(network_out, self.embedding.weights.T)