In [1]:
# This notebook downloads the Spinelli 2018 near-synonyms dataset and converts it to a tab-delimited file for
# use with the syn_eval function in the notebook latin-embedding-evaluation-summary.ipynb

In [2]:
# Imports

import urllib.request
import json
import re

import pandas as pd

In [3]:
# Get synonym json

# Raw GitHub URL of the near-synonyms dataset. The '%20' sequences are
# URL-encoded spaces in the file name. The file carries a .txt extension but
# its contents are parsed as JSON further down in this cell.
url = 'https://raw.githubusercontent.com/tommasospinelli/Online-Dictionary-of-Latin-Near-Synonyms/master/Latin%20Near-Synonyms%20dataset.txt'

def response(url):
    '''Open `url` and return the complete response body as raw bytes.'''
    # The handle gets its own name so that it does not shadow this function.
    with urllib.request.urlopen(url) as remote:
        body = remote.read()
    return body

# Download the dataset as raw bytes, then parse it as JSON. json.loads accepts
# bytes directly, so no explicit decoding step is needed.
res = response(url)
# `records` is iterated with .items() later in the notebook, so the payload is
# expected to be a JSON object. Presumably it maps each headword to a list of
# definition strings -- confirm against the dataset.
records = json.loads(res)

In [4]:
def remove_def_details(defs):
    '''Delete every bracketed annotation from `defs`.

    A match is an optional single leading space followed by `[`, at least one
    character (as few as possible, never a newline) and `]`. Empty brackets
    `[]` are therefore left in place.
    '''
    return re.sub(r' ?\[.+?\]', '', defs)

import unicodedata

# Precomposed vowels carrying a macron or a breve, and their plain counterparts
# in the same order. The literal is NFC-normalized so that every marked vowel
# is a single code point even if this file is saved in decomposed form;
# str.maketrans requires both strings to have the same length.
_MARKED_VOWELS = unicodedata.normalize('NFC', 'āēīōūȳĀĒĪŌŪȲăĕĭŏŭĂĔĬŎŬ')
_PLAIN_VOWELS = 'aeiouyAEIOUYaeiouAEIOU'

# The third argument deletes the combining macron (U+0304) and the combining
# breve (U+0306). After NFC these only remain on letters that have no
# precomposed form (e.g. y + breve), which a plain character-for-character
# replacement would leave behind.
_VOWEL_LENGTH_TABLE = str.maketrans(_MARKED_VOWELS, _PLAIN_VOWELS, '\u0304\u0306')


def remove_macrons(text_with_macrons):
    '''Strip vowel-length marks (macrons and breves) from Latin text.

    The text is first normalized to NFC so that a base letter followed by a
    combining mark is folded into its precomposed form where one exists.
    Precomposed marked vowels are then replaced by the plain vowel, and any
    combining macron or breve still present is removed.

    Parameters:
        text_with_macrons: the string to clean.

    Returns:
        The NFC-normalized string with macrons and breves removed. All other
        characters, including other diacritics, are left untouched.
    '''
    composed = unicodedata.normalize('NFC', text_with_macrons)
    return composed.translate(_VOWEL_LENGTH_TABLE)

import string

def remove_punctuation(text):
    '''Return `text` with every ASCII punctuation character dropped.

    Only the characters listed in `string.punctuation` are removed; whitespace
    and non-ASCII symbols pass through unchanged.
    '''
    punctuation_marks = set(string.punctuation)
    return ''.join(char for char in text if char not in punctuation_marks)

In [5]:
# Collect (lemma, synonym) pairs in which both sides are a single, non-empty
# word and the synonym differs from the lemma.
syn_pairs = []

for headword, definitions in records.items():
    # The headword is the same for every definition of a record, so it is
    # cleaned once here.
    # NOTE(review): macrons are stripped from the definitions but not from the
    # headword -- confirm that the dataset keys never carry macrons.
    lemma = remove_punctuation(headword)
    for definition in definitions:
        synonym = remove_punctuation(remove_macrons(remove_def_details(definition)))
        both_single_words = ' ' not in lemma and ' ' not in synonym
        if synonym and synonym != lemma and both_single_words:
            syn_pairs.append((lemma, synonym))

In [6]:
# Tabulate the pairs, then discard repeated rows and any row that has a
# missing value. Chained calls are used instead of in-place mutation so the
# cell yields the same frame every time it is re-run.
df = (
    pd.DataFrame(syn_pairs)
    .drop_duplicates()
    .dropna(axis=0)
)

In [7]:
# Write the pairs as tab-separated columns with no header row and no index
# column. The output is tab-delimited despite its .csv extension, and the
# path is relative to the working directory the notebook runs in.
df.to_csv('../data/evaluationsets/synonyms.csv', sep='\t', header=False, index=False)