# Latin Transformer
This notebook works through building a character-level transformer language model, trained on a custom Latin corpus that I created.

In [2]:
import numpy as np
import os, re
from Data import dataExp
%matplotlib inline
from matplotlib import pyplot as plt
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=UserWarning)
    from cltk.tokenizers.lat.lat import LatinWordTokenizer as WordTokenizer
    from cltk.tokenizers.lat.lat import LatinPunktSentenceTokenizer as SentenceTokenizer
from cltk.embeddings.embeddings import Word2VecEmbeddings as W2VE
from sklearn import metrics
import pandas as pd
import torch
from transformer import LanguageModel

Found the existing corpus
abbofloracensis had 1 pieces of work with a total of 34398 characters of text
abelard had 1 pieces of work with a total of 15483 characters of text
acticussincerius had 1 pieces of work with a total of 5947 characters of text
addison had 1 pieces of work with a total of 3074 characters of text
adso had 1 pieces of work with a total of 13551 characters of text
aelredus had 1 pieces of work with a total of 118173 characters of text
agnes had 1 pieces of work with a total of 74784 characters of text
alanus had 1 pieces of work with a total of 136527 characters of text
albericodamarcellise had 1 pieces of work with a total of 172 characters of text
albertanus had 1 pieces of work with a total of 108213 characters of text
albertofaix had 1 pieces of work with a total of 51703 characters of text
alcuin had 1 pieces of work with a total of 1641 characters of text
aleandrogerolamo had 1 pieces of work with a total of 10197 characters of text
alfonsi had 1 pieces of wo

KeyboardInterrupt: 

In [3]:
# Load the saved checkpoint from disk.
# NOTE(review): the later `model.generate(...)` call fails with
# "'collections.OrderedDict' object has no attribute 'generate'", so this file appears to
# hold a state_dict rather than a pickled module. Presumably it should be restored with
# `LanguageModel(...).load_state_dict(...)` -- confirm the constructor arguments in transformer.py.
model = torch.load('LatinTransformer/model_2.pt')

In [4]:
from transformer import encode, decode

In [7]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Encode the lower-cased prompt and place the resulting ids on `device`.
# NOTE(review): if `encode` returns a flat list of ids this tensor is 1-D, while the
# generate call in the next cell indexes its result with [0] -- `generate` may expect a
# (1, T) batch; confirm against transformer.py.
context = torch.tensor(encode("Gallia est omnis divisa in partes tres".lower()), dtype=torch.long, device=device)

In [9]:
# Ask the model for 10000 new tokens after the prompt, take the first returned sequence
# and decode it back to text (the cell's displayed value).
decode(model.generate(context, max_new_tokens=10000)[0].tolist())

AttributeError: 'collections.OrderedDict' object has no attribute 'generate'

In [2]:
# Open the corpus stored under the name "text_corpus.pickle", with tokenization switched
# off (shouldTokenize = False); later cells read the raw text through `CI`.
CI = dataExp.CorpusInterface(corpus_name="text_corpus.pickle", shouldTokenize = False)

Found the existing corpus
abbofloracensis had 1 pieces of work with a total of 34398 characters of text
abelard had 1 pieces of work with a total of 15483 characters of text
acticussincerius had 1 pieces of work with a total of 5947 characters of text
addison had 1 pieces of work with a total of 3074 characters of text
adso had 1 pieces of work with a total of 13551 characters of text
aelredus had 1 pieces of work with a total of 118173 characters of text
agnes had 1 pieces of work with a total of 74784 characters of text
alanus had 1 pieces of work with a total of 136527 characters of text
albericodamarcellise had 1 pieces of work with a total of 172 characters of text
albertanus had 1 pieces of work with a total of 108213 characters of text
albertofaix had 1 pieces of work with a total of 51703 characters of text
alcuin had 1 pieces of work with a total of 1641 characters of text
aleandrogerolamo had 1 pieces of work with a total of 10197 characters of text
alfonsi had 1 pieces of wo

Now that we've loaded in the Corpus Interface, let's retrieve all of the available text and find every character that occurs in it. These characters will be our vocabulary, as this is a character-level transformer.

In [24]:
# Pull the whole corpus out of the Corpus Interface as one string, minus tab characters
text = CI.get_total_data().replace("\t", "")
print(f"Total number of characters: {len(text)}")

# The vocabulary is every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print("".join(chars))
print(vocab_size)

# Peek at the first 50 characters of the corpus
print(text[:50])

Total number of characters: 61345135
  !"#$%&'()*+,-./0123456789:;<=>?@[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
70
gesta francorum viii gesta francorum liber viii [x


We now want to build a character tokenization, since we are using a character-level language model. For the next model, it may be worth trying tokens generated from LatinBERT, Latin word2vec, etc.

And then we can use Pytorch to encode all the text we loaded in from before. And we print out the encoded version of the text we saw above.

In [4]:
# CLTK Latin word2vec embeddings; their vocabulary is written to disk in a later cell
w2v = W2VE("lat")
# CLTK Latin sentence tokenizer
st = SentenceTokenizer()
# CLTK Latin word tokenizer, used by the word-level `encode`
wt = WordTokenizer()


In [11]:
# Dump the word2vec vocabulary to disk, one word per line, in index order.
print(len(w2v.model))
# Use a dedicated name for the word list: reusing the global `text` here would overwrite
# the corpus string that other cells slice and encode.
vocab_words = list(w2v.model.index_to_key)
count = len(vocab_words)
with open("vocab/vocab.txt", "w") as f:
    # Every word is followed by "\n", so the file ends with a newline. A single write of
    # the joined string avoids growing a string one word at a time.
    f.write("".join(word + "\n" for word in vocab_words))
print(count)

555381
555381


In [18]:
# Rebuild the word <-> index lookup tables from the vocabulary file (one word per line).
with open("vocab/vocab.txt", "r") as f:
    # drop the final split element (the empty string left by the file's trailing newline)
    lines = f.read().split("\n")[:-1]

stoi = {word: idx for idx, word in enumerate(lines)}
itos = dict(enumerate(lines))


def encode(s):
    """Tokenize `s` into words and return the ids of those found in the vocabulary.

    Words that are not in `stoi` are silently skipped.
    """
    return [stoi[word] for word in wt.tokenize(s) if word in stoi]


def decode(l):
    """Return a one-element list holding the space-joined words for the ids in `l`."""
    return [' '.join(itos[i] for i in l)]

In [28]:
# Alternative embedding-based mapping, kept for reference:
#encode = lambda s: [w2v.get_word_vector(word) for word in wt.tokenize(s)]
#decode = lambda l: [w2v.get_sims(v)[0][0] for v in l]

# Recall that I'm using all lower case for input (could also be all upper case; the choice was arbitrary)
print(encode("De Bello Gallico".lower()))
print(decode(encode("De Bello Gallico".lower())))
print(text[:50])
# Only the first 100 characters of the corpus are encoded in this cell
data = torch.tensor(encode(text[:100]), dtype=torch.long)
print(data.shape, data.dtype)
print(data)


[12, 583, 13723]
['de bello gallico']
gesta francorum viii gesta francorum liber viii [x
torch.Size([17]) torch.int64
tensor([ 2494,  2109,  1084,  2494,  2109,   466,  1084,    46,  3543,    41,
        26363,    90, 40248,  3328, 57085,     3, 23419])


In [22]:
# Inspect the shape and dtype of the encoded data, then its first 100 entries
print(data.shape, data.dtype)
print(data[:100]) 

torch.Size([581996]) torch.int64
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
        54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
        72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
        90, 91, 92, 93, 94, 95, 96, 97, 98, 99])


In [4]:
# Character-level tokenisation: map every character of the vocabulary to an integer id
# and back. These definitions used to be commented out, which made the cell depend on
# whichever `encode`/`decode` happened to be live in the kernel. The bigram model below is
# sized by the character vocabulary (`vocab_size = len(chars)`), so the ids have to come
# from `chars`.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Return the list of integer ids for the characters of the string `s`."""
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the list of integer ids `l`."""
    return ''.join(itos[i] for i in l)


# Recall that I'm using all lower case for input (could also be all upper case; the choice was arbitrary)
print(encode("De Bello Gallico".lower()))
print(decode(encode("De Bello Gallico".lower())))

# Encode the entire corpus into one long 1-D tensor of character ids
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:50])  # first 50 encoded characters

[43, 44, 1, 41, 44, 51, 51, 54, 1, 46, 40, 51, 51, 48, 42, 54]
de bello gallico
torch.Size([61344458]) torch.int64
tensor([42, 15,  1, 40, 58, 48, 53, 48, 54,  1, 42, 15,  1, 40, 53, 59, 48, 58,
        59, 48, 54,  1, 42, 54, 53, 58, 60, 51, 48, 41, 60, 58,  1, 53, 54, 53,
        60, 58,  1, 59, 48, 41, 44, 57, 48, 54,  1, 40, 53, 53])


In [5]:
# Train/val split: the first 80% of the encoded data trains, the remaining 20% validates
n = int(.8*len(data))
train_data, val_data = data[:n], data[n:]

## Batches of context blocks
We define a block/context size and load in the data for each block

In [7]:
# Seed torch's random number generator for reproducibility
torch.manual_seed(42)
context_size = 8 # number of consecutive tokens in each sampled sequence
batch_size = 4 # number of sequences stacked into one batch by get_batch

def get_batch(split):
    """Sample a random batch of inputs and their next-token targets.

    `split` selects the source tensor: "train" reads from `train_data`, any other value
    reads from `val_data`. Returns `(x, y)`, each a stack of `batch_size` slices of length
    `context_size`; every row of `y` is the matching row of `x` shifted one position on.
    """
    source = val_data if split != "train" else train_data
    # one random start offset per sequence in the batch
    starts = torch.randint(len(source) - context_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start+context_size])
        targets.append(source[start+1:start+context_size+1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show the inputs next to their shifted targets
xb, yb = get_batch('train')
print("Inputs:")
print(xb.shape)
print(xb)
print("Targets:")
print(yb.shape)
print(yb)

print('-'*20)
# Spell out every (context, target) pair contained in the batch: for each row, the
# context is the first t+1 inputs and the target is the token at position t of yb
for b in range(batch_size): 
    for t in range(context_size):
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"When input is {context.tolist()} the target is {target}")


Inputs:
torch.Size([4, 8])
tensor([[44, 57, 60, 53, 59, 15,  1, 44],
        [52,  1, 48, 51, 51, 48, 58,  1],
        [48, 53, 59, 44, 46, 57, 40,  1],
        [48, 53, 48, 58,  1, 40, 43,  1]])
Targets:
torch.Size([4, 8])
tensor([[57, 60, 53, 59, 15,  1, 44, 59],
        [ 1, 48, 51, 51, 48, 58,  1, 54],
        [53, 59, 44, 46, 57, 40,  1, 57],
        [53, 48, 58,  1, 40, 43,  1, 51]])
--------------------
When input is [44] the target is 57
When input is [44, 57] the target is 60
When input is [44, 57, 60] the target is 53
When input is [44, 57, 60, 53] the target is 59
When input is [44, 57, 60, 53, 59] the target is 15
When input is [44, 57, 60, 53, 59, 15] the target is 1
When input is [44, 57, 60, 53, 59, 15, 1] the target is 44
When input is [44, 57, 60, 53, 59, 15, 1, 44] the target is 59
When input is [52] the target is 1
When input is [52, 1] the target is 48
When input is [52, 1, 48] the target is 51
When input is [52, 1, 48, 51] the target is 51
When input is [52, 1, 48,

## Bigram model
A simple bigram model that will function as a baseline for our subsequent transformer in terms of performance

In [9]:
import torch.nn as nn
from torch.nn import functional as F
# Re-seed torch so the model initialisation and sampling in this cell are repeatable
torch.manual_seed(42)

class BigramLM(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The whole model is one (vocab_size x vocab_size) embedding table; row i holds the
    logits over the next token given that the current token is i.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token id looks up its own row of next-token logits
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (batch, time) tensor of integer token ids.
            targets: optional (batch, time) tensor of integer token ids.

        Returns:
            A `(logits, loss)` pair. Without targets, `logits` has shape
            (batch, time, vocab_size) and `loss` is None. With targets, `logits` is
            flattened to (batch*time, vocab_size) and `loss` is the cross-entropy
            between the flattened logits and targets.
        """
        # (batch, time, channel) where channel is vocab_size
        logits = self.token_embedding_table(idx)

        # `is None` is an identity check; `== None` would go through the tensor's
        # comparison operator instead
        if targets is None:
            return logits, None

        # The loss is the negative log likelihood, but cross_entropy wants (B*T, C)
        # logits and (B*T,) targets, so the batch and time dimensions are folded
        # together: each sequence simply follows the previous one.
        batch, time, channel = logits.shape
        logits = logits.view(batch*time, channel)
        targets = targets.view(batch*time)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend `idx` by `max_new_tokens` sampled tokens and return the result.

        The whole history is passed to `self(idx)` even though a bigram model only
        needs the previous token; this keeps the function general and reusable for
        models that do use the history.

        Args:
            idx: (batch, time) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (batch, time + max_new_tokens) tensor of token ids.
        """
        for _ in range(max_new_tokens):
            # get predictions; no targets are passed, so the loss is None
            logits, _ = self(idx)
            # keep only the most recent time step
            logits = logits[:, -1, :] # (batch, channel)
            # softmax turns the logits into probabilities over the vocabulary
            probs = F.softmax(logits, dim=-1)
            # sample one token per sequence from that distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (batch, 1)
            # append the newly sampled token to the sequence
            idx = torch.cat((idx, idx_next), dim=1) # (batch, time+1)
        return idx
# Instantiate the untrained bigram model and run the sample batch through it
m = BigramLM(vocab_size)
out, loss = m(xb, yb)
print(out.shape)
print(loss)
# start a prediction of one batch size from the single token id 0 and then generate for 100 tokens
# (with the character encoding shown earlier, ' ' maps to id 1, so id 0 is a different character)
# 0th row for the only batch dimension
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long),max_new_tokens=100)[0].tolist()))

torch.Size([32, 70])
tensor(5.0427, grad_fn=<NllLossBackward0>)
 :trq/z!./@*r_>$%q><yng+6;!vnhnt"^l;ww$3vp__&9<vb\/|hw .*4mn]p[p_j|8&wo~#pt-$60\s\14\r'sfq_[/]h48l#6;


Now let's train the above model, just rerunning the optimization below until satisfied -- not a very sophisticated approach, but we're also not particularly bothered by the bigram model.

In [47]:
# AdamW over all parameters of the bigram model. It lives in its own cell, so rerunning
# the training cell below keeps using this same optimizer object.
optimizer = torch.optim.AdamW(m.parameters(), lr = 1e-3)

In [54]:
batch_size = 32  # get_batch reads this global, so batches are larger from here on
for steps in range(10000):
    # clear the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)
    # forward pass on a freshly sampled training batch
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    # backpropagate to get gradients for all parameters, then apply the update
    loss.backward()
    optimizer.step()
# loss on the final batch only
print(loss.item())

2.542701244354248


In [57]:
# Sample 400 tokens from the trained bigram model, starting from the single token id 0
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long),max_new_tokens=400)[0].tolist()))

 z_c iope quist qusomia iquat nun cetere e, c qusueoninor, derorm qumiss ". heho raemelt, cc qum pr aviriatubocmi cutqumut pprit quddeae senosintefilultanclirbiegen inue caenusiunatonereniqulins coss. iope kes cla qum frm enuct simiae; fit utuerukworeun fum pue  oet  deno cisu nus qusse lactor. ata, tt at, osterrat, cas cusiatin vus: mius fe eruni isibit qum qusaciciausso inuins,  ereiabe ponarus q


The above, albeit gibberish, is clearly an immense improvement upon the initial prediction without optimisation.

## Self Attention

### first a simple mathematical trick in self-attention
The following is highly inefficient, but can be sped up via matrix multiplication.


In [11]:
# toy example for self-attention
torch.manual_seed(42)
batch, time, channel = 4,8,2
x = torch.randn(batch, time, channel)

# goal: xbow[b,t] = mean_{i<=t} x[b,i], i.e. every position averages itself
# together with all the positions before it
xbow = torch.zeros((batch, time, channel))
for b in range(batch):
    for t in range(time):
        # rows 0..t of sequence b have shape (t+1, channel); average over the time axis
        xbow[b,t] = x[b, :t+1].mean(dim=0)
print(x[0])
print(xbow[0])

tensor([[ 1.9269,  1.4873],
        [ 0.9007, -2.1055],
        [ 0.6784, -1.2345],
        [-0.0431, -1.6047],
        [-0.7521,  1.6487],
        [-0.3925, -1.4036],
        [-0.7279, -0.5594],
        [-0.7688,  0.7624]])
tensor([[ 1.9269,  1.4873],
        [ 1.4138, -0.3091],
        [ 1.1687, -0.6176],
        [ 0.8657, -0.8644],
        [ 0.5422, -0.3617],
        [ 0.3864, -0.5354],
        [ 0.2272, -0.5388],
        [ 0.1027, -0.3762]])


The 'mathematical trick' is what I presumed would be the approach when given the problem statement

In [14]:
torch.manual_seed(42)
# lower-triangular ones with every row divided by its own sum: row i of `a` then
# averages rows 0..i of whatever it multiplies
a = torch.ones(3,3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0,10, (3,2)).float()
c = a @ b
for label, matrix in (("a=", a), ("b=", b), ("--\nc=", c)):
    print(label)
    print(matrix)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [19]:
# the same trick applied to the toy example above

# `weights` plays the role of `a` from the 3x3 example and `x` plays the role of `b`
weights = torch.ones(time, time).tril()
weights = weights / weights.sum(dim=1, keepdim=True)

# the (time, time) matrix is broadcast over the batch:
# (batch, time, time) @ (batch, time, channel) -> (batch, time, channel)
xbow2 = weights @ x
print(xbow[0])
print(xbow2[0])

tensor([[ 1.9269,  1.4873],
        [ 1.4138, -0.3091],
        [ 1.1687, -0.6176],
        [ 0.8657, -0.8644],
        [ 0.5422, -0.3617],
        [ 0.3864, -0.5354],
        [ 0.2272, -0.5388],
        [ 0.1027, -0.3762]])
tensor([[ 1.9269,  1.4873],
        [ 1.4138, -0.3091],
        [ 1.1687, -0.6176],
        [ 0.8657, -0.8644],
        [ 0.5422, -0.3617],
        [ 0.3864, -0.5354],
        [ 0.2272, -0.5388],
        [ 0.1027, -0.3762]])


In [22]:
# but now with softmax
tril = torch.ones(time, time).tril()
# start from all-zero affinities and set everything above the diagonal to -inf
weights = torch.zeros_like(tril).masked_fill(tril == 0, float('-inf'))
# softmax normalizes every row, which has the same effect here as dividing by the
# row sum (weights / weights.sum(1, keepdim=True)) did above
weights = weights.softmax(dim=-1)
xbow3 = weights @ x
torch.allclose(xbow2, xbow3)

True

In [30]:
# version 4: a single head of self-attention
torch.manual_seed(42)
batch, time, channel = 4, 8, 32
x = torch.randn(batch,time, channel)

# head_size is a hyper-parameter: the output width of the key/query/value projections
head_size = 16
# the three layers are created in this order so their seeded initial weights stay the same
key = nn.Linear(channel, head_size, bias = False)
query = nn.Linear(channel, head_size, bias = False)
value = nn.Linear(channel, head_size, bias = False)

k, q, v = key(x), query(x), value(x) # each (batch, time, head_size)

# affinities between every query and every key; only the last two dims of k are
# transposed, so the batch dim is left alone:
# (batch, time, head_size) @ (batch, head_size, time) -> (batch, time, time)
weights = q @ k.transpose(-2, -1)

# causal mask: a token must not interact with tokens at later time steps
tril = torch.ones(time, time).tril()
weights = weights.masked_fill(tril==0, float('-inf'))
# softmax turns every row of affinities into weights that sum to 1
weights = weights.softmax(dim=-1)
# aggregate the value projections (not the raw x) with those weights
out = weights @ v

print(out.shape)
print(tril)
print(weights[0])

torch.Size([4, 8, 16])
tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1905, 0.8095, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3742, 0.0568, 0.5690, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1288, 0.3380, 0.1376, 0.3956, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4311, 0.0841, 0.0582, 0.3049, 0.1217, 0.0000, 0.0000, 0.0000],
        [0.0537, 0.3205, 0.0694, 0.2404, 0.2568, 0.0592, 0.0000, 0.0000],
        [0.3396, 0.0149, 0.5165, 0.0180, 0.0658, 0.0080, 0.0373, 0.0000],
        [0.0165, 0.0375, 0.0144, 0.1120, 0.0332, 0.4069, 0.3136, 0.0660]],
       grad_fn=<SelectBackward0>)
