In [1]:
# This notebook loads, evaluates, and builds a comparison table of Latin word embedding models. External models are given download scripts; models built by authors can be built using the bamman-w2v-lemma.ipynb abd bamman-w2v-lemma-tt.ipynb notebooks in this directory. (Because of a limitation with scripting large downloads from Google Drive, Bamman 2012 must be manually downloaded and places in the ```models``` directory.) Uncomment cells as needed to run.

## Setup

In [2]:
import warnings; warnings.simplefilter('ignore')

In [3]:
# Imports

import os
import urllib
import shutil
from zipfile import ZipFile

import gensim
from gensim.models import Word2Vec
import fasttext.util

import pandas as pd
import numpy as np

from cltk.stem.latin.j_v import JVReplacer

from tqdm.notebook import tqdm

from pprint import pprint

In [4]:
## Uncomment to download

## Get external data

## Used with syn_selection_eval
## Sprugnoli Rachele, Passarotti Marco, Moretti Giovanni. Vir is to Moderatus as Mulier is to Intemperans – Lemma Embeddings for Latin. 2019. https://embeddings.lila-erc.eu/
url = 'https://embeddings.lila-erc.eu/samples/syn/syn-selection-benchmark-Latin.tsv'
urllib.request.urlretrieve (url, '../data/evaluationsets/syn-selection-benchmark-Latin.tsv')

('../data/evaluationsets/syn-selection-benchmark-Latin.tsv',
 <http.client.HTTPMessage at 0x126628750>)

In [5]:
# Set up tools

replacer = JVReplacer()

In [6]:
# Set up constants

eval_path = '../data/evaluationsets'
syn_eval_data = f'{eval_path}/synonyms.csv' 
syn_selection_eval_data = f'{eval_path}/syn-selection-benchmark-Latin.tsv'

## Evaluation functions

In [7]:
# Function for evaluating on Spinelli 2018 synonyms dataset

def syn_eval(model, eval_data, threshold, verbose=False):
    # Better way than two try blocks
    try:
        vocab_ = model.vocab.keys()
    except:
        pass
    try:
        vocab_ = model.wv.vocab.keys()
    except:
        pass
    with open(eval_data,'r') as f:
        lines = f.readlines()
    total = len(lines)
    matches = 0
    for line in tqdm(lines):
        word, syn = replacer.replace(line.strip()).split('\t')
        if word in vocab_:
            most_sim = [item[0] for item in model.most_similar(word, topn=threshold)]
            most_sim = replacer.replace(" ".join(most_sim)).split()
            if syn in most_sim:
                matches += 1
                if verbose:
                    print(f'Synonym {syn} is in most_similar for {word}')
    return matches/total    

In [8]:
# Function for getting mean reciprocal rank on Spinelli 2018 synonyms dataset

def syn_mrr(model, eval_data, threshold=100, verbose=False):
    # Better way than two try blocks
    try:
        vocab_ = model.vocab.keys()
    except:
        pass
    try:
        vocab_ = model.wv.vocab.keys()
    except:
        pass
    with open(eval_data,'r') as f:
        lines = f.readlines()
    rrs = []
    for line in tqdm(lines):
        word, syn = replacer.replace(line.strip()).split('\t')
        if word in vocab_ and syn in vocab_:
            most_sim = [item[0] for item in model.most_similar(word, topn=threshold)]
            most_sim = replacer.replace(" ".join(most_sim)).split()
            if syn in most_sim:
                rr = 1 / (most_sim.index(syn) + 1)
                rrs.append(rr)
    mrr = np.mean(rrs)
    return mrr

In [9]:
# Function for evaluating on LiLa synonymn selection datasheet; code based on description of evaluation method given in Sprugnoli, R., Passarotti M., Moretti G. 2019. Vir is to Moderatus as Mulier is to Intemperans – Lemma Embeddings for Latin.

def syn_selection_eval(model, eval_data, verbose=False):
    # Better way than two try blocks
    try:
        vocab_ = model.vocab.keys()
    except:
        pass
    try: 
        vocab_ = model.wv.vocab.keys()
    except:
        pass
    with open(eval_data,'r') as f:
        lines = f.readlines()
    total = 0
    matches = 0
    for line in tqdm(lines):
        terms = line.split()
        if sum([term in vocab_ for term in terms]) == 5:
            lemma = terms[0]
            
            sims = [model.similarity(lemma, term) for term in terms[1:]]
            if max(sims) == sims[0]:
                matches += 1
            else:
                pass
            total += 1
        else:
            pass
    return ((matches, total), matches/total)  

### FastText Latin Model

In [10]:
# Download model

## Uncomment to download model for first time
##
# fasttext.util.download_model('la', if_exists='ignore')
# shutil.move('cc.la.300.bin', '../models/cc.la.300.bin')
# shutil.move('cc.la.300.bin.gz', '../models/cc.la.300.bin.gz')

In [11]:
# # Load model

# ft_latin = gensim.models.fasttext.load_facebook_vectors('../models/cc.la.300.bin') # Consistent interface, possible speed issues; speed issue may be related to this issue: https://github.com/RaRe-Technologies/gensim/issues/2802
# ft_latin.vocab = {replacer.replace(k): v for k, v in ft_latin.vocab.items()}

#### FastText Evaluation on synonym list

In [12]:
# model = ft_latin

# thresholds = [1,5,10,25,100]
# ft_evals = []

# for threshold in tqdm(thresholds):
#     ft_evals.append(syn_eval(ft_latin, syn_eval_data, threshold))
    
# pprint(list(zip(thresholds, ft_evals)))  

#### FastText MRR on synonym list

In [13]:
# ft_mrr = syn_mrr(ft_latin, syn_eval_data)
# print(ft_mrr)

#### FastText Evaluation on synonym selection

In [14]:
# ft_selection = syn_selection_eval(ft_latin, syn_selection_eval_data)
# print(ft_selection)

### Bloem et al. 2020 Model

In [15]:
# # Download model
# #
# # Uncomment to download model for first time

# url = 'https://uvaauas.figshare.com/ndownloader/files/22300362'
# urllib.request.urlretrieve (url, 'LatinArchiveOrg.ALLCONCAT.UctoNormalizedLowerCase.ShortenedToW2V.NoCarets.txt.skipgram.alpha0.025.neg5.win5.sample0.001.epochs5.mincount50.size100.zip')
# shutil.move('LatinArchiveOrg.ALLCONCAT.UctoNormalizedLowerCase.ShortenedToW2V.NoCarets.txt.skipgram.alpha0.025.neg5.win5.sample0.001.epochs5.mincount50.size100.zip', '../models/LatinArchiveOrg.ALLCONCAT.UctoNormalizedLowerCase.ShortenedToW2V.NoCarets.txt.skipgram.alpha0.025.neg5.win5.sample0.001.epochs5.mincount50.size100.zip')
# with ZipFile('../models/LatinArchiveOrg.ALLCONCAT.UctoNormalizedLowerCase.ShortenedToW2V.NoCarets.txt.skipgram.alpha0.025.neg5.win5.sample0.001.epochs5.mincount50.size100.zip', 'r') as zf:
#    zf.extractall('../models')
# os.remove('../models/LatinArchiveOrg.ALLCONCAT.UctoNormalizedLowerCase.ShortenedToW2V.NoCarets.txt.skipgram.alpha0.025.neg5.win5.sample0.001.epochs5.mincount50.size100.zip')

In [16]:
# # Load model

# bloem_2020 = Word2Vec.load('../models/LatinArchiveOrg.ALLCONCAT.UctoNormalizedLowerCase.ShortenedToW2V.NoCarets.txt.skipgram.alpha0.025.neg5.win5.sample0.001.epochs5.mincount50.size100.model').wv
# bloem_2020.vocab = {replacer.replace(k): v for k, v in bloem_2020.vocab.items()}

#### Bloem et al. 2020 Evaluation on synonym list

In [17]:
# model = bloem_2020

# thresholds = [1,5,10,25,100]
# bloem_2020_evals = []

# for threshold in tqdm(thresholds):
#     bloem_2020_evals.append(syn_eval(bloem_2020, syn_eval_data, threshold))
    
# pprint(list(zip(thresholds, bloem_2020_evals)))  

#### Bloem et al. 2020 MRR on synonym list

In [18]:
# bloem_2020_mrr = syn_mrr(bloem_2020, syn_eval_data)
# print(bloem_2020_mrr)

#### Bloem et al. 2020 Evaluation on synonym selection

In [19]:
# bloem_2020_selection = syn_selection_eval(bloem_2020, syn_selection_eval_data)
# print(bloem_2020_selection)

### Bamman 2012 Latin Model

In [20]:
# # Download model
# #
# # Uncomment to download model for first time

# # For now, file must be downloaded manually and moved to ../models directory
# url = 'https://docs.google.com/uc?id=0B5pGKi0iCsnbMm9Dd2hmb2UtbEk&export=download'

In [21]:
# bamman_2012 = gensim.models.KeyedVectors.load_word2vec_format('../models/latin.embeddings', binary=False)
# bamman_2012.vocab = {replacer.replace(k): v for k, v in bamman_2012.vocab.items()}

In [22]:
# model = bamman_2012

# thresholds = [1,5,10,25,100]
# bamman_2012_evals = []

# for threshold in tqdm(thresholds):
#     bamman_2012_evals.append(syn_eval(bamman_2012, syn_eval_data, threshold))
    
# pprint(list(zip(thresholds, bamman_2012_evals)))  

#### Bamman MRR on synonym list

In [23]:
# bamman_2012_mrr = syn_mrr(bamman_2012, syn_eval_data)
# bamman_2012_mrr

#### Bamman Evaluation on synonym selection

In [24]:
# bamman_2012_selection = syn_selection_eval(bamman_2012, syn_selection_eval_data)
# bamman_2012_selection

### LILA Lemmatized Latin W2V CBOW Model

In [25]:
# # Download model
# #
# # Uncomment to download model for first time

# url = 'https://embeddings.lila-erc.eu/samples/download/word2vec/allLASLAlemmi-vector-100-nocase-w5-CBOW.vec'
# urllib.request.urlretrieve (url, '../models/allLASLAlemmi-vector-100-nocase-w5-CBOW.vec')

In [26]:
# model = 'allLASLAlemmi-vector-100-nocase-w5-CBOW.vec'
# lila_w2v_cbow_2019 = gensim.models.KeyedVectors.load_word2vec_format(f'../models/{model}')
# lila_w2v_cbow_2019.vocab = {replacer.replace(k): v for k, v in lila_w2v_cbow_2019.vocab.items()}

#### LiLa W2V Evaluation on synonym list

In [27]:
# model = lila_w2v_cbow_2019

# thresholds = [1,5,10,25,100]
# lila_w2v_cbow_2019_evals = []

# for threshold in tqdm(thresholds):
#     lila_w2v_cbow_2019_evals.append(syn_eval(lila_w2v_cbow_2019, syn_eval_data, threshold))

# pprint(list(zip(thresholds, lila_w2v_cbow_2019_evals)))  

#### LiLa W2V MRR on synonym list

In [28]:
# lila_w2v_2019_cbow_mrr = syn_mrr(lila_w2v_cbow_2019, syn_eval_data)
# lila_w2v_2019_cbow_mrr

#### LiLa W2V Evaluation on synonym selection

In [29]:
# lila_w2v_2019_cbow_selection = syn_selection_eval(lila_w2v_cbow_2019, syn_selection_eval_data)
# lila_w2v_2019_cbow_selection

### LILA Lemmatized Latin FastText Skip Model

In [30]:
# # Download model
# #
# # Uncomment to download model for first time

# url = 'https://embeddings.lila-erc.eu/samples/download/fasttext/allLASLA-lemmi-fast-100-SKIP-win5-min5.vec'
# urllib.request.urlretrieve (url, '../models/allLASLA-lemmi-fast-100-SKIP-win5-min5.vec')

In [31]:
# model = 'allLASLA-lemmi-fast-100-SKIP-win5-min5.vec'
# lila_ft_skip_2019 = gensim.models.KeyedVectors.load_word2vec_format(f'../models/{model}')
# lila_ft_skip_2019.vocab = {replacer.replace(k): v for k, v in lila_ft_skip_2019.vocab.items()}

#### LiLa FT Evaluation on synonym list

In [32]:
# model = lila_ft_skip_2019

# thresholds = [1,5,10,25,100]
# lila_ft_skip_2019_evals = []

# for threshold in tqdm(thresholds):
#     lila_ft_skip_2019_evals.append(syn_eval(lila_ft_skip_2019, syn_eval_data, threshold))

# pprint(list(zip(thresholds, lila_ft_skip_2019_evals)))  

#### LiLa FT MRR on synonym list

In [33]:
# lila_ft_2019_skip_mrr = syn_mrr(lila_ft_skip_2019, syn_eval_data)
# print(lila_ft_2019_skip_mrr)

#### LiLa Evaluation on synonym selection

In [34]:
# lila_ft_2019_skip_selection = syn_selection_eval(lila_ft_skip_2019, syn_selection_eval_data)
# print(lila_ft_2019_skip_selection)

### QCL Lemmatized Latin Model

In [35]:
# # Download model
# #
# # Uncomment to download model for first time
# # NB: See See notebook bamman-w2v-lemma.ipynb for code to this train model

# url = 'https://utexas.box.com/shared/static/5kc9t8t5jhca3ad83m7j4uxavrppprll'
# urllib.request.urlretrieve (url, '../models/latin_w2v_bamman_lemma300_100_1')
# url = 'https://utexas.box.com/shared/static/7vrc22fvzkw1y0m8ceb54fpzdj5vztij.npy'
# urllib.request.urlretrieve (url, '../models/latin_w2v_bamman_lemma300_100_1.trainables.syn1neg.npy')
# url = 'https://utexas.box.com/shared/static/j81h75p33t2ir7hgm2z9e9bqy6jy0uw0.npy'
# urllib.request.urlretrieve (url, '../models/latin_w2v_bamman_lemma300_100_1.wv.vectors.npy')

In [36]:
# qcl_2020 = Word2Vec.load('../models/latin_w2v_bamman_lemma300_100_1').wv
# qcl_2020.vocab = {replacer.replace(k): v for k, v in qcl_2020.vocab.items()}

#### QCL Evaluation on synonym list

In [37]:
# model = qcl_2020

# thresholds = [1,5,10,25,100]
# qcl_2020_evals = []

# for threshold in tqdm(thresholds):
#     qcl_2020_evals.append(syn_eval(qcl_2020, syn_eval_data, threshold))

# pprint(list(zip(thresholds, qcl_2020_evals)))  

#### QCL MRR on synonym list

In [38]:
# qcl_2020_mrr = syn_mrr(qcl_2020, syn_eval_data)
# print(qcl_2020_mrr)

#### QCL Evaluation on synonym selection

In [39]:
# qcl_2020_selection = syn_selection_eval(qcl_2020, syn_selection_eval_data)
# print(qcl_2020_selection)

### QCL TT 2020 Latin Model (lemmatized with treetagger)

In [40]:
# # Download model
# #
# # Uncomment to download model for first time
# # NB: See See notebook bamman-w2v-lemma-tt.ipynb for code to this train model

# url = 'https://utexas.box.com/shared/static/3m1bqek9w583pkktco8vt8t6cr2lb1gu'
# urllib.request.urlretrieve (url, '../models/latin_w2v_bamman_lemma_tt')

In [41]:
# qcl_2020_tt = Word2Vec.load('../models/latin_w2v_bamman_lemma_tt').wv
# qcl_2020_tt.vocab = {replacer.replace(k): v for k, v in qcl_2020_tt.vocab.items()}

In [42]:
# model = qcl_2020_tt

# thresholds = [1,5,10,25,100]
# qcl_2020_tt_evals = []

# for threshold in tqdm(thresholds):
#     qcl_2020_tt_evals.append(syn_eval(qcl_2020_tt, syn_eval_data, threshold))

# pprint(list(zip(thresholds, qcl_2020_tt_evals)))  

#### QCL TT 2020 MRR on synonym list

In [43]:
# qcl_2020_tt_mrr = syn_mrr(qcl_2020_tt, syn_eval_data)
# print(qcl_2020_tt_mrr)

#### QCL TT 2020 Evaluation on synonym selection

In [44]:
# qcl_2020_tt_selection = syn_selection_eval(qcl_2020_tt, syn_selection_eval_data)
# print(qcl_2020_tt_selection)

### Summary

In [45]:
# Cached results; save time while working

bamman_2012_evals = [.004, .016, .021, .033, .058] # last run 4/8/2021
ft_evals = [.002, .007, .012, .017, .046] # last run 4/8/2021
lila_w2v_cbow_2019_evals = [.024, .081, .113, .159, .271] # last run 4/8/2021
lila_ft_skip_2019_evals = [.017, .062, .093, .143, .229] # last run 4/8/2021
bloem_2020_evals = [.003, .019, .039, .070, .146] # last run 4/8/2021
qcl_2020_evals = [.032, .107, .145, .204, .307] # last run 4/8/2021
qcl_2020_tt_evals = [.035, .107, .150, .210, .349] # last run 4/8/2021

bamman_2012_mrr = .175 # last run 4/8/2021
ft_mrr = .118 # last run 4/8/2021
lila_w2v_2019_cbow_mrr = .198 # last run 4/8/2021
lila_ft_2019_skip_mrr = .182 # last run 4/8/2021
bloem_2020_mrr = .101 # last run 4/8/2021
qcl_2020_mrr = .227 # last run 4/8/2021
qcl_2020_tt_mrr = .206 # last run 4/8/2021

bamman_2012_selection = ((1546, 2320), 0.6663793103448276) # last run 4/8/2021
ft_selection = ((331, 447), 0.7404921700223713) # last run 4/8/2021
lila_w2v_2019_cbow_selection = ((1420, 1750), 0.8114285714285714) # last run 4/8/2021
lila_ft_2019_skip_selection = ((1521, 1750), 0.8691428571428571) # last run 4/8/2021
bloem_2020_selection = ((1498, 1766), 0.8482446206115515) # last run 4/8/2021
qcl_2020_selection = ((772, 909), 0.8492849284928493) # last run 4/8/2021
qcl_2020_tt_selection = ((840, 958), 0.8768267223382046) # last run 4/8/2021

In [46]:
model_names = ['FastText', 'Bamman 2012', 'Lila 2019 W2V CBOW', 'Lila 2019 FT Skip', 'QCL 2020', 'QCL 2020 TT', 'Bloem 2020']
index_labels = [1, 5, 10, 25, 100, 'MRR', 'Selection']

data = [
    ft_evals + [ft_mrr, ft_selection[1]],
    bamman_2012_evals + [bamman_2012_mrr, bamman_2012_selection[1]],
    lila_w2v_cbow_2019_evals + [lila_w2v_2019_cbow_mrr, lila_w2v_2019_cbow_selection[1]],
    lila_ft_skip_2019_evals + [lila_ft_2019_skip_mrr, lila_ft_2019_skip_selection[1]],
    qcl_2020_evals + [qcl_2020_mrr, qcl_2020_selection[1]],
    qcl_2020_tt_evals + [qcl_2020_tt_mrr, qcl_2020_tt_selection[1]],
    bloem_2020_evals + [bloem_2020_mrr, bloem_2020_selection[1]],
]

df = pd.DataFrame(data).round(3)
df.index = model_names
df.columns = index_labels
df = df.sort_values(by=['Selection'])
df = df.T
df

Unnamed: 0,Bamman 2012,FastText,Lila 2019 W2V CBOW,Bloem 2020,QCL 2020,Lila 2019 FT Skip,QCL 2020 TT
1,0.004,0.002,0.024,0.003,0.032,0.017,0.035
5,0.016,0.007,0.081,0.019,0.107,0.062,0.107
10,0.021,0.012,0.113,0.039,0.145,0.093,0.15
25,0.033,0.017,0.159,0.07,0.204,0.143,0.21
100,0.058,0.046,0.271,0.146,0.307,0.229,0.349
MRR,0.175,0.118,0.198,0.101,0.227,0.182,0.206
Selection,0.666,0.74,0.811,0.848,0.849,0.869,0.877
