# LatinBERT Language Generation
<hr>

#### Imports
We'll first import the different necessary modules

In [None]:
from Data import fetch
fetch.text_retrieval()

In [1]:
import numpy as np
import os, re
from Data import dataExp
%matplotlib inline
from matplotlib import pyplot as plt
import LatinBERT
from LatinBERT.gen_berts import LatinBERT
from LatinBERT.LatinTok import LatinTokenizer
from LatinBERT.predict_words import predict
from transformers import BertModel, BertForMaskedLM, BertPreTrainedModel
from tensor2tensor.data_generators import text_encoder
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=UserWarning)
    from cltk.tokenizers.lat.lat import LatinWordTokenizer as WordTokenizer
    from cltk.tokenizers.lat.lat import LatinPunktSentenceTokenizer as SentenceTokenizer
from cltk.embeddings.embeddings import Word2VecEmbeddings as W2VE
from sklearn import metrics
import pandas as pd
import torch

  from .autonotebook import tqdm as notebook_tqdm
2023-02-23 00:35:16.567680: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-23 00:35:16.693692: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-23 00:35:16.693710: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-23 00:35:17.392298: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open s

In [2]:
CI = dataExp.CorpusInterface(corpus_name="text_corpus.pickle", shouldTokenize = False)

Found the existing corpus
abbofloracensis had 1 pieces of work with a total of 34398 characters of text
abelard had 1 pieces of work with a total of 15483 characters of text
acticussincerius had 1 pieces of work with a total of 5947 characters of text
addison had 1 pieces of work with a total of 3074 characters of text
adso had 1 pieces of work with a total of 13551 characters of text
aelredus had 1 pieces of work with a total of 118173 characters of text
agnes had 1 pieces of work with a total of 74784 characters of text
alanus had 1 pieces of work with a total of 136527 characters of text
albericodamarcellise had 1 pieces of work with a total of 172 characters of text
albertanus had 1 pieces of work with a total of 108213 characters of text
albertofaix had 1 pieces of work with a total of 51703 characters of text
alcuin had 1 pieces of work with a total of 1641 characters of text
aleandrogerolamo had 1 pieces of work with a total of 10197 characters of text
alfonsi had 1 pieces of wo

In [None]:
top_authors = CI.get_authors_by_text_size()
for (author, count) in top_authors:
    print(author, count)

We can load in the default LatinBERT model to perform text generation from different authors' sampled texts

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizerPath = os.getcwd()+"/LatinBERT/latin.subword.encoder"
bertPath = os.getcwd()+"/LatinBERT/latin_bert"
encoder = text_encoder.SubwordTextEncoder(tokenizerPath)
wp_tokenizer = LatinTokenizer(encoder)

model = BertForMaskedLM.from_pretrained(bertPath)
model.to(device)

Some weights of the model checkpoint at /home/rufus/Desktop/LatinNLG/LatinNLG/LatinBERT/latin_bert were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32900, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [4]:
def gen_text(text: str, num_words:int, wp_tokenizer, model):
    total_text = text
    for i in range(num_words):
        total_text = predict(wp_tokenizer, total_text, model)
    return total_text
gen_text("In omnia", 10, wp_tokenizer, model)

'In omnia sunt , et sunt , et sunt , et sunt'

### Text Selection
First, we need to select particular authors and sample their texts to give LatinBERT an initial place to start.

In [5]:
selected_authors = ["ovid", "cicero", "jerome", "catullus", "vergil"]
text_by_author = {}
for author in selected_authors:
    text_by_author[author] = CI.get_text_for_author(author)

In [6]:
import random
generated_text = {'author': [], 'prompt_text_length':[], 'correct_continuation': [], 'generated_continuation': [] }
number_of_samples = 50
text_continuation_length = 30
max_initial_length = 200
for i in range(number_of_samples):
    print(i)
    author = selected_authors[random.randint(0,len(selected_authors)-1)]
    author_text = text_by_author[author][random.randint(0,len(text_by_author[author])-1)].split(" ")
    if len(author_text)<max_initial_length*2+text_continuation_length: continue
    end_idx = random.randint(max_initial_length, len(author_text)-text_continuation_length-max_initial_length)
    start_idx = end_idx-max_initial_length
    
    prompt_text = " ".join(author_text[start_idx:end_idx])
    
    txt = gen_text(prompt_text, text_continuation_length, wp_tokenizer, model)
    generated_text["author"].append(author)
    generated_text["prompt_text_length"].append(end_idx-start_idx)
    generated_text["correct_continuation"].append(" ".join(author_text[end_idx-15:end_idx+30]))
    txt = txt.split(" ")
    generated_text["generated_continuation"].append(" ".join(txt[-45:]))
    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [20]:
df = pd.DataFrame(data=generated_text)

In [19]:
from IPython.display import display, HTML
pd.set_option('display.max_colwidth', -1)
display(df)

  pd.set_option('display.max_colwidth', -1)


Unnamed: 0,author,prompt_text_length,correct_continuation,generated_continuation
0,cicero,200,"sumus. sed in rebus apertissimis nimium longi sumus. perfecto enim et concluso neque virtutibus neque amicitiis usquam locum esse, si ad voluptatem omnia referantur, nihil praeterea est magnopere dicendum. ac tamen, ne cui loco non videatur esse responsum, pauca etiam nunc dicam ad reliquam","sumus. sed in rebus apertissimis nimium longi sumus. perfecto enim et concluso neque virtutibus sumus amici amici amici amici amici nos si sumus amici amici amici sumus si non nos amici enim ipsi , sumus amici ipsi ipsi ipsi non ipsi sumus ipsi ipsi"
1,catullus,200,non satis id causae credideram esse tibi. tu satis id duxti: tantum tibi gaudium in omni culpa est in quacumque est aliquid sceleris. lesbia mi dicit semper male nec tacet unquam de me: lesbia me dispeream nisi amat. quo signo? quia sunt totidem mea:,"non satis id causae credideram esse tibi. tu satis id duxti: tantum tibi gaudium in credis credis credis credis credis credis credis non ? credis ? credis credis non , non credis , , credis , credis , credis credis credis , credis non ,"
2,catullus,200,"ut die periret, saturnalibus, optimo dierum! non, non hoc tibi, false, sic abibit: nam, si luxerit, ad librariorum curram scrinia, caesios, aquinos, suffenum, omnia colligam venena, ac te his suppliciis remunerabor. vos hinc interea valete, abite illuc unde malum pedem attulistis, saecli incommoda, pessimi poetae.","ut die periret, saturnalibus, optimo dierum! non, non hoc tibi, false, sic abibit: nam, si , , , , , , , , , , , , , , , , , , facis hoc facis , , facis facis , facis , tamen tamen"
3,cicero,200,"nuper fuerunt ullum auctorem istius aestimationis. quo me igitur aut ad quae exempla revocas? ab illis hominibus, qui tum versati sunt in re publica cum et optimi mores erant et hominum existimatio gravis habebatur et iudicia severa fiebant, ad hanc hominum libidinem ac licentiam me",nuper fuerunt ullum auctorem istius aestimationis. quo me igitur aut ad quae exempla revocas? ab ? ? ? ? ? ? ? ? ? ? ? exempla exempla ? ? exempla ? ? ? ? exempla exempla exempla ? ? ? ? ? exempla ?
4,catullus,200,"splendent auro atque argento. candet ebur soliis, conlucent pocula mensae, tota domus gaudet regali splendida gaza. pulvinar vero divae geniale locatur sedibus in medus, indo quod dente politum tincta tegit roseo conchyli purpura fuco. haec vestis priscis hominum variata figuris heroum mira virtutes indicat arte.","splendent auro atque argento. candet ebur soliis, conlucent pocula mensae, tota domus gaudet regali splendida , , , , , , , , , limina , limina , limina limina limina , limina limina limina limina limina fulgentlimina limina limina , limina limina limina limina"
5,cicero,200,"quae ne traditam quidem atque inculcatam libertatem recipere possit plusque timeat in puero nomen sublati regis quam confidat sibi, cum illum ipsum qui maximas opes habuerit paucorum virtute sublatum videat me vero posthac ne commendaveris caesari tuo, ne te quidem ipsum, si me audies. valde","quae ne traditam quidem atque inculcatam libertatem recipere possit plusque timeat in puero nomen sublati , liberati illo liberati ? , liberati , liberi erimus erimus erimus , erimus erimus erimus erimus , , erimus erimus , erimus erimus erimus non si erimus non non"
6,cicero,200,"senatus in capitolium; parata de circumscribendo adulescente sententia consularis, cum repente--nam martiam legionem albae consedisse sciebat--adfertur ei de quarta nuntius. quo perculsus abiecit consilium referendi ad senatum de caesare: egressus est non viis, sed tramitibus paludatus. ex eo non iter, sed cursus et fuga in","senatus in capitolium; parata de circumscribendo adulescente sententia consularis, cum repente--nam martiam legionem albae consedisse omnia omnia illa omnia omnia facta tum tum tum tum omnia omnia tum , tum tum tum , , omnia , facta facta facta facta illa tum facta tum facta"
7,vergil,200,"ille prior praeeunte carina; parte prior, partem rostro premit aemula pristis. at media socios incedens nave per ipsos hortatur mnestheus: ""nunc, nunc insurgite remis, hectorei socii, troiae quos sorte suprema delegi comites; nunc illas promite vires, nunc animos, quibus in gaetulis syrtibus usi, ionioque mari","ille prior praeeunte carina; parte prior, partem rostro premit aemula pristis. at media socios incedens , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ,"
8,cicero,200,"consociati fuistis. nec mirum: agitur enim non qua condicione victuri, sed victurine simus an cum supplicio ignominiaque perituri. quamquam mortem quidem natura omnibus proposuit; crudelitatem mortis et dedecus virtus propulsare solet, quae propria est romani generis et seminis. hanc retinete, quaeso, quirites, quam vobis tamquam","consociati fuistis. nec mirum: agitur enim non qua condicione victuri, sed victurine simus an cum nulla est nulla nulla digna digna digna digna condicione digna , digna , , , digna , , condicione , condicione , , condicione non , digna non digna senatu"
9,ovid,200,"putant mentes vos aperire suas. nec tamen hoc falsum: nam, dis ut proxima quaeque, nunc penna veras, nunc datis ore notas, tuta diu volucrum proles tum denique caesa est, iuveruntque deos indicis exta sui. ergo saepe suo coniunx abducta marito uritur idaliis alba columba","putant mentes vos aperire suas. nec tamen hoc falsum: nam, dis ut proxima quaeque, nunc , , , , , , , , , , , , , , , , , , , , , , , , , amat amat , amat amat"


In [21]:
df.to_csv(os.getcwd()+"/Data/BertGEN.csv")