# Initial LatinBERT Language Generation Analysis
<hr>

#### Imports
We'll first import the modules we need.

In [None]:
# One-off setup step: run the project's text-retrieval routine.
# NOTE(review): presumably this produces the corpus file loaded further down
# (text_corpus.pickle) - confirm in Data/fetch.py before relying on it.
from Data import fetch
fetch.text_retrieval()

In [1]:
import numpy as np
import os, re
from Data import dataExp
%matplotlib inline
from matplotlib import pyplot as plt
import LatinBERT
from LatinBERT.gen_berts import LatinBERT
from LatinBERT.LatinTok import LatinTokenizer
from LatinBERT.predict_words import predict
from transformers import BertModel, BertForMaskedLM, BertPreTrainedModel
from tensor2tensor.data_generators import text_encoder
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=UserWarning)
    from cltk.tokenizers.lat.lat import LatinWordTokenizer as WordTokenizer
    from cltk.tokenizers.lat.lat import LatinPunktSentenceTokenizer as SentenceTokenizer
from cltk.embeddings.embeddings import Word2VecEmbeddings as W2VE
from sklearn import metrics
import pandas as pd
import torch

In [None]:
# Handle on the pickled corpus; tokenisation is explicitly switched off.
CI = dataExp.CorpusInterface(
    corpus_name="text_corpus.pickle",
    shouldTokenize=False,
)

We then take a quick look at each author and the amount of text available for them.

In [None]:
# Print one line per author: the author followed by the figure the corpus
# interface reports for them.
top_authors = CI.get_authors_by_text_size()
for entry in top_authors:
    author, count = entry
    print(author, count)

We can now load the default LatinBERT model, which we will use to generate text from samples of different authors' works.

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model assets are resolved relative to the current working directory, so the
# notebook has to be started from the folder that contains LatinBERT/.
# os.path.join keeps the path separators correct on every platform.
latin_bert_dir = os.path.join(os.getcwd(), "LatinBERT")
tokenizerPath = os.path.join(latin_bert_dir, "latin.subword.encoder")
bertPath = os.path.join(latin_bert_dir, "latin_bert")

# Sub-word encoder wrapped by the project's tokenizer class.
encoder = text_encoder.SubwordTextEncoder(tokenizerPath)
wp_tokenizer = LatinTokenizer(encoder)

# Load the pretrained masked-LM weights and move them to the chosen device.
# Assigning the result (instead of leaving a bare expression as the last line)
# stops the cell from echoing the full module repr into the output.
model = BertForMaskedLM.from_pretrained(bertPath)
model = model.to(device)

Now, let's define a generator function, which keeps predicting the next word after the current context.

In [6]:
def gen_text(text: str, num_words:int, wp_tokenizer, model):
    """Extend ``text`` by calling ``predict`` ``num_words`` times in a row.

    Every round feeds the text produced so far back into ``predict`` and keeps
    whatever it returns as the new running text.

    Args:
        text: Prompt to start from.
        num_words: Number of ``predict`` rounds to run.
        wp_tokenizer: Tokenizer, forwarded unchanged to ``predict``.
        model: Model, forwarded unchanged to ``predict``.

    Returns:
        The value returned by the final ``predict`` call, or ``text`` itself
        when no round is run.
    """
    extended = text
    for _ in range(num_words):
        extended = predict(wp_tokenizer, extended, model)
    return extended
# Quick smoke test: extend a two-word prompt by ten prediction rounds.
gen_text(text="In omnia", num_words=10, wp_tokenizer=wp_tokenizer, model=model)

'In omnia sunt , et sunt , et sunt , et sunt'

Note: the output shows that the text is starting to loop: "et sunt , et sunt , et sunt".

### Text Selection
First, we need to select particular authors and sample their texts to give LatinBERT an initial place to start.

In [5]:
# Authors whose works are sampled as prompts for the generation experiment.
selected_authors = ["ovid", "cicero", "jerome", "catullus", "vergil"]
# Author name -> whatever CI.get_text_for_author returns for that author.
text_by_author = {name: CI.get_text_for_author(name) for name in selected_authors}

Having selected the authors, we can now generate text: we pick an author at random, sample a random passage from one of their works, let LatinBERT continue that passage for 30 more words, and then compare the generated continuation with the way the original text actually continues.

In [None]:
import random

# Fixed seed so that Restart & Run All draws the same authors and passages.
random_seed = 0
random.seed(random_seed)

# Column name -> list of values; one entry per successfully sampled passage.
generated_text = {'author': [], 'prompt_text_length':[], 'correct_continuation': [], 'generated_continuation': [] }
number_of_samples = 50
text_continuation_length = 30   # prediction rounds requested from gen_text
max_initial_length = 200        # space-separated words used as the prompt
context_overlap = 15            # trailing prompt words kept in both columns for context
# Both stored columns cover the same window: overlap + continuation. Deriving it
# here keeps the two slices aligned when text_continuation_length is changed.
comparison_window = context_overlap + text_continuation_length
# Shortest work (in space-separated words) that is accepted for sampling.
min_work_length = max_initial_length*2 + text_continuation_length

for i in range(number_of_samples):
    print(i)
    author = random.choice(selected_authors)
    author_text = random.choice(text_by_author[author]).split(" ")
    # Works that are too short are skipped without a retry, so fewer than
    # number_of_samples rows may end up in generated_text.
    if len(author_text) < min_work_length:
        continue
    end_idx = random.randint(max_initial_length, len(author_text)-text_continuation_length-max_initial_length)
    start_idx = end_idx - max_initial_length

    prompt_text = " ".join(author_text[start_idx:end_idx])
    txt = gen_text(prompt_text, text_continuation_length, wp_tokenizer, model)

    generated_text["author"].append(author)
    generated_text["prompt_text_length"].append(end_idx-start_idx)
    generated_text["correct_continuation"].append(
        " ".join(author_text[end_idx-context_overlap:end_idx+text_continuation_length])
    )
    # NOTE(review): the gen_text demo output separates punctuation with spaces
    # ("sunt , et"), so the last comparison_window space-separated tokens may not
    # line up exactly with comparison_window original words - confirm against predict.
    generated_text["generated_continuation"].append(" ".join(txt.split(" ")[-comparison_window:]))
    

It will be easier to view the output as a dataframe, and it will allow us to save the information as a CSV, which might be convenient for later analysis.

In [20]:
# One row per sampled passage; the dictionary keys become the column names.
df = pd.DataFrame(generated_text)

In [None]:
from IPython.display import display, HTML
# Show the full text of every cell instead of truncating long strings.
# None is the "no limit" value; the former -1 sentinel was deprecated in
# pandas 1.0 and is rejected with a ValueError by pandas 2.x.
pd.set_option('display.max_colwidth', None)
display(df)

The output continues to show signs of "looping", and overall the quality is poorer than we had anticipated.

In [21]:
# Persist the comparison table for later analysis. The target folder is created
# on demand so that a fresh checkout does not fail on a missing Results/ directory.
results_dir = os.path.join(os.getcwd(), "Results")
os.makedirs(results_dir, exist_ok=True)
df.to_csv(os.path.join(results_dir, "BertGEN.csv"))