In [1]:
from evaluator import SyntacticCalculator, MorphologicalCalculator, InventoryCalculator, PhonologicalCalculator, FeaturalCalculator, GenericCalculator, LangRankEvaluator, IslandCalculator, GeographicCalculator, URIELCalculator

In [8]:
# Build every distance calculator used by the notebook and bundle them into a
# single LangRankEvaluator.

# Calculators that have only one source in this notebook.
script = GenericCalculator('data/URIEL_Scriptural.csv')
islands = IslandCalculator('data/URIELPlus_Union_SoftImpute.csv')
distribution_geo = GeographicCalculator(1)

typ_file = 'data/URIELPlus_Union.csv' # path to the typological dataset CSV file
morph = MorphologicalCalculator(typ_file)

# The six calculators below used to be constructed from the CSV files and then
# immediately overwritten by URIELCalculator instances, so the CSV-based objects
# were built and thrown away on every run. The choice is now an explicit switch.
# The default (True) keeps the calculators the evaluator actually received
# before this change.
# NOTE(review): URIELCalculator presumably serves the stock URIEL distances -
# confirm in evaluator.py.
use_uriel_calculators = True

if use_uriel_calculators:
    gen = URIELCalculator(dist='genetic')
    geo = URIELCalculator(dist='geographic')
    syn = URIELCalculator(dist='syntactic')
    inv = URIELCalculator(dist='inventory')
    phon = URIELCalculator(dist='phonological')
    feat = URIELCalculator(dist='featural')
else:
    gen = GenericCalculator('data/URIEL_Phylogeny.csv')
    geo = GenericCalculator('data/URIEL_Geography.csv')
    syn = SyntacticCalculator(typ_file)
    inv = InventoryCalculator(typ_file)
    phon = PhonologicalCalculator(typ_file)
    feat = FeaturalCalculator(typ_file)

# Keys of `calculators` are the distance-type names that later cells pass as
# `distance_types` / `features`.
evaluator = LangRankEvaluator(
    calculators={
    'syntactic': syn,
    'morphological': morph,
    'inventory': inv,
    'phonological': phon,
    'featural': feat,
    'scriptural': script,
    'islands': islands,
    'new_geographic': distribution_geo, # do not change the name "new_geographic"
    'geographic': geo,
    'genetic': gen,
    },
    iso_map_file='data/code_mapping.csv', # path to the ISO to Glottocode mapping file
)

In [9]:
# Per-task store for baseline NDCG scores. Every task starts with its own empty
# list; the evaluation cell fills it in when run with is_baseline = True.
baseline_ndcg = {
    task_name: []
    for task_name in ('mt', 'dep', 'el', 'pos', 'taxi1500', 'xnli')
}

In [10]:
# --- Run configuration -------------------------------------------------------
# When True, each task's result is stored in baseline_ndcg; passing that stored
# list as baseline_ndcg_scores in a later run is what yields a p-value.
is_baseline = True
features = ['syntactic', 'phonological', 'inventory', 'featural', 'geographic', 'genetic']
# Optional extra distance types - uncomment to include them in this run.
# features += ['morphological']
# features += ['scriptural']
# features += ['islands']
task_col_name = 'task_lang'
transfer_col_name = 'transfer_lang'

# task name -> (performance column in the task CSV, whether language codes need
# converting from ISO to Glottocode)
tasks = {
    'mt': ('BLEU', True),
    'dep': ('accuracy', True),
    'el': ('accuracy', True),
    'pos': ('accuracy', True),
    'taxi1500': ('f1_score', False),
    'xnli': ('accuracy', False),
}

for task, (performance_col, needs_iso_conversion) in tasks.items():
    # Task dataset: the CSV with the task lang, transfer lang and performance columns.
    source_csv = f'data/{task}.csv'

    # Replace/add the `features` distance columns in the dataset using the
    # calculators registered on the evaluator under the same names.
    evaluator.replace_distances(
        dataset_path=source_csv,
        distance_types=features,
        task_col_name=task_col_name,
        transfer_col_name=transfer_col_name,
        iso_conversion=needs_iso_conversion,
    )

    # Run LangRank and evaluate task performance on the updated dataset.
    # NOTE(review): the `_updated.csv` file is presumably written by
    # replace_distances above - confirm in evaluator.py.
    updated_csv = f'data/{task}_updated.csv'
    score = evaluator.evaluate(
        dataset_path=updated_csv,
        features=features,
        performance_col_name=performance_col,
        task_col_name=task_col_name,
        transfer_col_name=transfer_col_name,
        baseline_ndcg_scores=baseline_ndcg[task],
    )

    if not is_baseline:
        # Comparison run: score[2] is reported as the p-value.
        print(f'Task: {task} NDCG: {score[0]} (p-value: {score[2]})')
        continue

    # Baseline run: keep score[1] so later runs can be compared against it.
    baseline_ndcg[task] = score[1]
    print(f'Task: {task} NDCG: {score[0]}')

2862it [00:01, 2429.93it/s]


Task: mt NDCG: 33.178795182605505


870it [00:00, 2676.59it/s]


Task: dep NDCG: 73.38781485358571


477it [00:00, 2540.33it/s]


Task: el NDCG: 65.85285889226414


1545it [00:00, 2285.08it/s]


Task: pos NDCG: 12.22273337074046


26334it [00:11, 2322.51it/s]


Task: taxi1500 NDCG: 27.632791468617157


225it [00:00, 3145.05it/s]


Task: xnli NDCG: 68.18314778788924
