# Aventine: Demonstration

In [128]:
import json
import warnings
import numpy as np
import pandas as pd
from pathlib import Path

import cltk
from cltk import NLP
from sentence_transformers import SentenceTransformer

from aventine.library.config import QUICKSTART_DOCUMENTS
from aventine.library.config import ROOT_FINGERPRINT, CORPUS_FINGERPRINT
from aventine.library.config import SENTENCE_TRANSFORMER_MODEL as ENG_MODEL
from aventine.library.config import WORD_EMBEDDING_MODEL as LAT_MODEL
from aventine.library.config import ALLOWED_SYMBOLS, ALLOWED_PUNCTS
from aventine.library.utils import Checkpointer
from aventine.library.utils import meanings
from aventine.library.utils import normalise_text
from aventine.library.utils import get_null, replace_if_none
from aventine.library.files import perseus_url

## (Optional) Data Downloading, Parsing, Generation

In [None]:
from aventine.library.files import perseus_xml_get, perseus_xml2txt
from aventine.library.index import preprocess

for doc in QUICKSTART_DOCUMENTS:
    metadata = perseus_xml_get(QUICKSTART_DOCUMENTS[doc], 'aventine/data/sources')
    metadata = perseus_xml2txt(metadata, 'aventine/data/sources')
    preprocess(metadata, 'aventine/data/dumps', tool_dir='aventine/tools/bin')

## Load Indexed Data

In [2]:
meatadata_dir = Path('aventine/data/sources/metadata')
index_dir = Path('aventine/data/dumps')
tool_dir = Path('aventine/tools/bin')

In [25]:
def get_metadata(text_id):
    with open(meatadata_dir/f'{text_id}.json', 'r', encoding='utf-8') as f:
        return json.load(f)

In [37]:
root_ckpt = Checkpointer(index_dir/'root', ROOT_FINGERPRINT)
r = root_ckpt.load()

text_ckpts = {
    text_id: Checkpointer(index_dir/text_id, CORPUS_FINGERPRINT).load()
    for text_id in QUICKSTART_DOCUMENTS.values()
}
text_metas = {
    text_id: get_metadata(text_id)
    for text_id in QUICKSTART_DOCUMENTS.values()
}

In [3]:
lat_vects = np.array(r.lat_embeddings, copy=False)
eng_vects = np.array(r.eng_embeddings, copy=False)
lemmata = np.array(r.lemmata_arr, copy=False)
defs = np.array(r.definitions, copy=False)

In [122]:
lat_model =  LAT_MODEL('lat')
eng_model = SentenceTransformer(ENG_MODEL, trust_remote_code=True)
lat_none, _ = get_null(lat_model, eng_model)

if LAT_MODEL is cltk.embeddings.embeddings.FastTextEmbeddings:
    lat_fallback = cltk.embeddings.embeddings.Word2VecEmbeddings('lat')
elif LAT_MODEL is cltk.embeddings.embeddings.Word2VecEmbeddings:
    lat_fallback = cltk.embeddings.embeddings.FastTextEmbeddings('lat')

cltk_nlp = NLP(language="lat")
cltk_nlp.pipeline.processes = [
    cltk.alphabet.processes.LatinNormalizeProcess,
    cltk.dependency.processes.LatinStanzaProcess
]

‎𐤀 CLTK version '1.4.0'. When using the CLTK in research, please cite: https://aclanthology.org/2021.acl-demo.3/

Pipeline for language 'Latin' (ISO: 'lat'): `LatinNormalizeProcess`, `LatinStanzaProcess`, `LatinEmbeddingsProcess`, `StopsProcess`, `LatinLexiconProcess`.

⸖ ``LatinStanzaProcess`` using Stanza model from the Stanford NLP Group: https://stanfordnlp.github.io/stanza/ . Please cite: https://arxiv.org/abs/2003.07082
⸖ ``LatinEmbeddingsProcess`` using word2vec model by University of Oslo from http://vectors.nlpl.eu/ . Please cite: https://aclanthology.org/W17-0237/
⸖ ``LatinLexiconProcess`` using Lewis's *An Elementary Latin Dictionary* (1890).

⸎ To suppress these messages, instantiate ``NLP()`` with ``suppress_banner=True``.


## Inference Functions

In [24]:
def get_similarities(a, vects):
    sims = vects @ a.transpose()
    norm_a = np.linalg.norm(a)
    norm_v = np.linalg.norm(vects, axis=1)
    revalued = sims / (2 * norm_a * norm_v) + 0.5
    return revalued

def argmaxk(arr, k):
    return np.flip(np.argsort(arr)[-k:])

In [124]:
def search(query,
           language,
           results=50,
           texts=list(QUICKSTART_DOCUMENTS.keys()),
           tool_dir=tool_dir):
    
    if language == 'eng':
        sent = eng_model.encode('bronze metallurgy, mountain copper')
        sims = get_similarities(sent, eng_vects)
    
    elif language == 'lat':
        query = normalise_text(query, ALLOWED_SYMBOLS, ALLOWED_PUNCTS)
        words = np.array([replace_if_none(lat_model.get_word_vector(w.strip()), lat_none)
                          for w in query.split(',')])
        sent = np.mean(words, axis=0)

        if sent.all() == lat_none.all():
            warnings.warn('Rare word. No associated vector. Switching to fallback.')
            try:
                return lat_fallback.get_sims(query)
            except:
                warnings.warn('Very rare word. No associated vector found.')
                return ''
            
        sims = get_similarities(sent, lat_vects)
    
    sorted = np.flip(np.argsort(sims))
    allowed_texts = {QUICKSTART_DOCUMENTS[k] for k in texts}

    found = 0
    idx = 0
    data = []
    while found < results and idx < len(sorted):
        lemma_idx = sorted[idx]
        lemma = lemmata[lemma_idx]

        if allowed_texts.intersection(r.root_lemmata_info[lemma]['texts']):
            found += 1
            urls = []
            for text_id in r.root_lemmata_info[lemma]['texts']:
                quote_id = text_ckpts[text_id].corpus_lemmata_info[lemma]
                urls.append(perseus_url(text_metas[text_id], quote_id))
            data.append([sims[lemma_idx], lemma, defs[lemma_idx], ', '.join(urls)])
        
        idx += 1

    df = pd.DataFrame(data, columns=['Score', 'Latin Word', 'Definition', 'Sources'])
    return df

## Inference Demo

In [130]:
search('bronze metallurgy', language='eng', texts=['Naturalis Historia'])

Unnamed: 0,Score,Latin Word,Definition,Sources
0,0.6596,diphryges,copper,https://www.perseus.tufts.edu/hopper/text?doc=...
1,0.697527,aerosus,containing copper; full of copper;,https://www.perseus.tufts.edu/hopper/text?doc=...
2,0.653807,ahenum,"copper, of copper (alloy); bronze, made of bro...",https://www.perseus.tufts.edu/hopper/text?doc=...
3,0.697527,aenum,"copper, of copper (alloy); bronze, made of bro...",https://www.perseus.tufts.edu/hopper/text?doc=...
4,0.660676,aeneus,"copper, of copper (alloy); bronze, made of bro...",https://www.perseus.tufts.edu/hopper/text?doc=...
5,0.636723,aeneus,"copper, of copper (alloy); bronze, made of bro...",https://www.perseus.tufts.edu/hopper/text?doc=...
6,0.697285,aenus,"copper, of copper (alloy); bronze, made of bro...",https://www.perseus.tufts.edu/hopper/text?doc=...
7,0.661674,aena,"copper, of copper (alloy); bronze, made of bro...",https://www.perseus.tufts.edu/hopper/text?doc=...
8,0.632481,chalcantho,copperas,https://www.perseus.tufts.edu/hopper/text?doc=...
9,0.697527,diphyes,precious stone (unknown);,https://www.perseus.tufts.edu/hopper/text?doc=...


In [131]:
search('chrysocolla', language='lat', texts=['Naturalis Historia'])



[('haematiten', 0.7801526784896851),
 ('hysgino', 0.7738798260688782),
 ('sampsuchum', 0.7716076374053955),
 ('cyprinum', 0.770029604434967),
 ('topazo', 0.7663824558258057),
 ('curalio', 0.7646341323852539),
 ('oricia', 0.7642366886138916),
 ('melinum', 0.7607849836349487),
 ('paederota', 0.7593076825141907),
 ('amaracinum', 0.7585501670837402)]