In [2]:
import numpy as np
import os, re, warnings
from Data import dataExp
import torch
from transformers import BertTokenizer, BertForMaskedLM
from tensor2tensor.data_generators import text_encoder
from LatinBERT.LatinTok import LatinTokenizer
from torch.utils.data import Dataset, DataLoader
with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=UserWarning)
    from cltk.tokenizers.lat.lat import LatinWordTokenizer as WordTokenizer
    from cltk.tokenizers.lat.lat import LatinPunktSentenceTokenizer as SentenceTokenizer

In [21]:
# Hand cached-but-unused GPU memory back to the driver before configuring the run.
torch.cuda.empty_cache()

# --- hyperparameters ---
# Batch geometry: number of windows sampled per batch, and tokens per window.
batch_size, context_size = 64, 256
# Optimisation: total training iterations and the AdamW step size.
max_iters, learning_rate = 5000, 3e-4
# Evaluation: the training loop reports losses every `eval_interval` iterations.
# `eval_iters` is not read by any visible cell; presumably the number of batches
# averaged by the loss estimator -- TODO confirm.
eval_interval, eval_iters = 500, 200
# Device name kept as a plain string here.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
# Load the Latin sub-word tokenizer and the pretrained BERT masked-LM weights.

# Use the GPU whenever PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Both artefacts are looked up under ./LatinBERT relative to the current working directory.
tokenizerPath = os.getcwd() + "/LatinBERT/latin.subword.encoder"
bertPath = os.getcwd() + "/LatinBERT/latin_bert"

# The sub-word encoder is wrapped by the project's LatinTokenizer.
encoder = text_encoder.SubwordTextEncoder(tokenizerPath)
tokenizer = LatinTokenizer(encoder)

model = BertForMaskedLM.from_pretrained(bertPath)
model.to(device)
# eval() switches the module to inference mode and returns the module itself,
# so printing the model afterwards shows the same architecture summary.
model.eval()
print(model)

#st = SentenceTokenizer()

Some weights of the model checkpoint at /notebooks/LatinBERT/latin_bert were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32900, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [4]:
# Load in the raw dataset through the project's corpus interface.
CI = dataExp.CorpusInterface(
    corpus_name="tokenized_corpus.pickle",
    shouldTokenize=True,
)
# Alternative kept for reference: train on the whole corpus instead of one author.
#text = CI.get_total_data().replace("\t","")
# NOTE(review): shouldShuffle=True presumably randomises the order of the
# author's text; no seed is set in the visible cells, so runs may differ -- confirm.
text = CI.get_text_for_author(author="Caesar", shouldShuffle=True)

Found the existing corpus
abbofloracensis had 1 pieces of work with a total of 5403 characters of text
abelard had 1 pieces of work with a total of 13784 characters of text
acticussincerius had 1 pieces of work with a total of 340 characters of text
addison had 1 pieces of work with a total of 296 characters of text
adso had 1 pieces of work with a total of 2351 characters of text
aelredus had 1 pieces of work with a total of 20901 characters of text
agnes had 1 pieces of work with a total of 11811 characters of text
alanus had 1 pieces of work with a total of 34012 characters of text
albericodamarcellise had 1 pieces of work with a total of 27 characters of text
albertanus had 1 pieces of work with a total of 2869 characters of text
albertofaix had 1 pieces of work with a total of 14123 characters of text
alcuin had 1 pieces of work with a total of 487 characters of text
aleandrogerolamo had 1 pieces of work with a total of 654 characters of text
alfonsi had 1 pieces of work with a to

In [42]:
def encodeWithBert(text):
    """Convert raw text to vocabulary ids with the notebook-level ``tokenizer``.

    The text is split with ``tokenizer.tokenize`` and the resulting tokens are
    mapped with ``tokenizer.convert_tokens_to_ids``; that mapping's result is
    returned unchanged.
    """
    pieces = tokenizer.tokenize(text)
    return tokenizer.convert_tokens_to_ids(pieces)

In [43]:
# Encode the selected text once and keep the ids as an int64 tensor
# (expected to be a flat token stream -- the batching code slices it by position).
token_ids = encodeWithBert(text)
data = torch.tensor(token_ids, dtype=torch.long)

In [46]:
# Train/val split: the first 80% of the token stream trains the model,
# the remaining tail is held out for validation.
n = int(0.8 * len(data))
train_data, val_data = data[:n], data[n:]

In [7]:
# Dataset of (sentence prefix -> next token) pairs for next-token prediction.
class NextWordDataset(Dataset):
    """Next-token prediction examples built from every prefix of every sentence.

    The text is split into sentences after ``.``, ``?`` or ``!``. Each sentence
    is tokenized, and for every position ``i >= 1`` one example is stored: the
    tokens before ``i`` as the input and token ``i`` as the target. Sentences
    with fewer than two tokens contribute no examples.

    Args:
        text: Raw text to split into sentences.
        tokenizer: Object providing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``.
        max_length: Optional cap on the prefix length. When set, only the
            ``max_length`` most recent tokens of each prefix are kept, which
            bounds both memory use and the sequence length fed to the model
            (the notebook's model summary shows 512 position embeddings).
            ``None`` (default) keeps the full prefix.

    Raises:
        ValueError: If ``max_length`` is given but is smaller than 1.

    Note:
        Examples have different lengths, so the default ``DataLoader``
        collation (which stacks tensors) cannot batch them. Pass
        ``collate_fn=NextWordDataset.collate`` to pad them instead.
    """

    def __init__(self, text, tokenizer, max_length=None):
        if max_length is not None and max_length < 1:
            raise ValueError("max_length must be a positive integer or None")
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.examples = []
        # Raw string avoids invalid-escape warnings; '.', '?' and '!' need no
        # escaping inside a character class, so the pattern matches the same text.
        sents = re.split(r'(?<=[.?!])\s*', text)
        for line in sents:
            line = line.strip()
            if not line:
                continue
            tokens = tokenizer.tokenize(line)
            for i in range(1, len(tokens)):
                # Keep the whole prefix, or only its most recent max_length tokens.
                start = 0 if max_length is None else max(0, i - max_length)
                self.examples.append((tokens[start:i], tokens[i]))

    def __len__(self):
        """Return the number of stored (prefix, next token) examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return ``(prefix_ids, next_id)`` tensors for example ``idx``.

        ``prefix_ids`` is a 1-D tensor whose length varies between examples;
        ``next_id`` is a 0-dim tensor.
        """
        prev_tokens, next_token = self.examples[idx]
        prev_ids = self.tokenizer.convert_tokens_to_ids(prev_tokens)
        next_id = self.tokenizer.convert_tokens_to_ids([next_token])[0]
        return torch.tensor(prev_ids), torch.tensor(next_id)

    @staticmethod
    def collate(batch, pad_id=0):
        """Pad a list of ``__getitem__`` results into rectangular batch tensors.

        Args:
            batch: Non-empty list of ``(prefix_ids, next_id)`` pairs.
            pad_id: Id written into padded positions. Defaults to 0, matching
                the ``padding_idx=0`` shown in this notebook's model summary.

        Returns:
            ``(input_ids, attention_mask, targets)`` where ``input_ids`` is the
            right-padded ``(batch, longest prefix)`` id matrix,
            ``attention_mask`` is 1 on real tokens and 0 on padding, and
            ``targets`` is the ``(batch,)`` tensor of next-token ids. The last
            real token of row ``r`` sits at ``attention_mask[r].sum() - 1``.
        """
        prefixes, next_ids = zip(*batch)
        lengths = torch.tensor([len(p) for p in prefixes], dtype=torch.long)
        width = int(lengths.max())
        input_ids = torch.full((len(prefixes), width), pad_id, dtype=torch.long)
        for row, prefix in enumerate(prefixes):
            input_ids[row, :len(prefix)] = prefix
        attention_mask = (torch.arange(width).unsqueeze(0) < lengths.unsqueeze(1)).long()
        return input_ids, attention_mask, torch.stack(next_ids)

In [18]:
# Sanity check: run one short Latin phrase through the model and keep the raw output.
tokenids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("arma virumque cano"))

# unsqueeze(0) adds a leading batch dimension of size 1.
torch_tokenids = torch.LongTensor(tokenids).unsqueeze(0)
# Use the same `device` the model was moved to; a hard-coded "cuda" raises on
# CPU-only machines even though the rest of the notebook falls back to the CPU.
torch_tokenids = torch_tokenids.to(device)
total_text = ""
# Inference only, so skip autograd bookkeeping.
with torch.no_grad():
    preds = model(torch_tokenids)

In [19]:
# Show the raw model output for the sample phrase: a MaskedLMOutput whose
# logits hold one row of vocabulary scores per input token.
print(preds)
#dataset = NextWordDataset(text, tokenizer)


MaskedLMOutput(loss=None, logits=tensor([[[-3.0104, -5.5644, -5.4879,  ..., -4.9033, -5.5426, -5.5480],
         [-3.7047, -4.6934, -4.6515,  ..., -6.7923, -5.2240, -3.8521],
         [-7.3019, -5.7173, -4.2949,  ..., -8.8850, -7.4146, -5.5154],
         [-5.8219, -5.9337, -3.8385,  ..., -8.9431, -7.5980, -9.6204]]],
       device='cuda:0'), hidden_states=None, attentions=None)


In [47]:
def get_batch(split):
    """Sample a random batch of input/target windows from the token stream.

    ``split == "train"`` draws from ``train_data``; any other value draws from
    ``val_data``. Returns ``(x, y)``, each stacking ``batch_size`` windows of
    ``context_size`` tokens and moved to ``device``. Every target position
    holds the token that follows the corresponding input position.
    """
    source = val_data if split != "train" else train_data
    # Random window starts; the upper bound leaves room for the target window,
    # which begins one token later than the input window.
    starts = torch.randint(len(source) - context_size, (batch_size,)).tolist()
    inputs = torch.stack([source[s:s + context_size] for s in starts])
    targets = torch.stack([source[s + 1:s + context_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

In [66]:
# Cross-entropy criterion plus one validation batch to experiment with.
loss_fn = torch.nn.CrossEntropyLoss()
tmp_x, tmp_y = get_batch(split="val")

In [81]:
# Score the sampled validation batch one sequence at a time.
print(tmp_x[0])
print(tmp_y[0])
assert len(tmp_x) == len(tmp_y)
print(len(tmp_x))
# Inference only: without no_grad every forward pass records an autograd
# graph, which needlessly consumes GPU memory.
with torch.no_grad():
    for b in range(len(tmp_x)):
        # [0] selects the logits; for a single sequence they are shaped
        # (1, sequence length, vocabulary size).
        preds = model(tmp_x[b].unsqueeze(0))[0]
        # Rank the vocabulary at the LAST position. A literal index of 256 is
        # out of range for a 256-token window (valid positions are 0..255).
        sortedVals = torch.argsort(preds[0][-1], descending=True)
        p = sortedVals[0]
        print(p)
        # CrossEntropyLoss expects (positions, vocab) logits against
        # (positions,) targets, so drop the size-1 batch dimension first.
        loss = loss_fn(preds[0], tmp_y[b])
        print(loss)

tensor([  313,  3947,    12,   939,  5516,    24,  7710,    51,    65,  2399,
         7710,    51,   108,   887,   278,   168,  1550,  2804,    24,    10,
         2009,   390,  7710,    51,    38,    63,    37, 14531,    24,   313,
        20722,    24,  7710,    51, 13745, 25155, 10674,    24,   488,  8691,
           24,  7710,    51,    38,  1390,  7704,   960,  8137,   198,  4118,
         6793,   359, 13748,  7710,    51,  2474,   772,  6328,    24,  7710,
           51,    40,   215,   215,    30,   198,  4118,    63, 12020,    34,
        20317,     9,    82,  1673,  6484,    24,  7710,    51,   583,    22,
         4277,  7127,    24, 11919,    36,    90,   795,  5137,   319,  7710,
           51,    38,   544,   255,  9575,    34,   108, 11875,    34,  1817,
        20317,     9,  5741,    24,    10,    22,  6210,  9169,    24,    10,
           30,  9560,   198,  1009,  7330,    24, 13122,    24,   313,  7710,
           51,  4277,    38,  1829,   313, 18098,  1267,   313, 

RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 15.74 GiB total capacity; 14.13 GiB already allocated; 3.56 MiB free; 14.55 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [9]:


# Define the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

# NOTE(review): BertForMaskedLM attends in both directions, so with shifted
# targets each position can presumably see the token it is asked to predict.
# Confirm this objective is intended before trusting the reported losses.

# Train the model
for iter_val in range(max_iters):
    # every once in a while evaluate the loss on train and val sets
    # (estimate_loss is not defined in the visible cells -- it must be defined
    # before this cell runs)
    if iter_val % eval_interval == 0 or iter_val == max_iters - 1:
        losses = estimate_loss()
        # Report the loop counter, not the builtin `iter`.
        print(f"step {iter_val}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # The model was switched to eval mode when it was loaded (and evaluation
    # may do so again); re-enable dropout for the optimisation step.
    model.train()

    # sample a batch of data
    xb, yb = get_batch('train')

    # Forward the sampled training batch; logits are (batch, context, vocab).
    logits = model(input_ids=xb)[0]
    # Flatten batch and position so every token contributes one classification term.
    loss = loss_fn(logits.reshape(-1, logits.size(-1)), yb.reshape(-1))

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Leave the model in inference mode, as it was before training started.
model.eval()
    


RuntimeError: stack expects each tensor to be equal size, but got [729] at entry 0 and [3638] at entry 1