In [9]:
from evaluator import SyntacticCalculator, MorphologicalCalculator, InventoryCalculator, PhonologicalCalculator, FeaturalCalculator, GenericCalculator, LangRankEvaluator, IslandCalculator, GeographicCalculator, HyperbolicCalculator, URIELCalculator

In [10]:
# Distance calculators that each take their own data source (or none at all).
script = GenericCalculator('data/URIEL_Scriptural.csv')
islands = IslandCalculator('data/URIELPlus_Union_SoftImpute.csv')
hyperbolic = HyperbolicCalculator()
# NOTE(review): the meaning of the positional argument `1` is defined by
# evaluator.GeographicCalculator and is not visible here -- TODO document it.
distribution_geo = GeographicCalculator(1)

# Phylogeny and geography distances are read from CSV files via GenericCalculator.
gen = GenericCalculator('data/URIEL_Phylogeny.csv')
geo = GenericCalculator('data/URIEL_Geography.csv')

# All five typological calculators below are built from the same typological dataset CSV.
typ_file = 'data/URIELPlus_Union.csv'
syn = SyntacticCalculator(typ_file)
morph = MorphologicalCalculator(typ_file)
inv = InventoryCalculator(typ_file)
phon = PhonologicalCalculator(typ_file)
feat = FeaturalCalculator(typ_file)

# Alternative setup kept for reference (currently disabled): gen / geo / syn / inv /
# phon / feat can instead be created as URIELCalculator(dist=...) with dist set to
# 'genetic', 'geographic', 'syntactic', 'inventory', 'phonological' or 'featural'.

# Distance name -> calculator. The keys are the names that can later be listed in
# `features` / `distance_types` when calling the evaluator.
calculator_by_distance = {
    'syntactic': syn,
    'morphological': morph,
    'inventory': inv,
    'phonological': phon,
    'featural': feat,
    'scriptural': script,
    'islands': islands,
    # The key "new_geographic" must be kept exactly as written.
    'new_geographic': distribution_geo,
    'geographic': geo,
    'hyper_genetic': hyperbolic,
    'genetic': gen,
}

evaluator = LangRankEvaluator(
    calculators=calculator_by_distance,
    # CSV mapping ISO language codes to Glottocodes.
    iso_map_file='data/code_mapping.csv',
)

In [11]:
# Per-task store for baseline NDCG scores. Every task starts with its own empty list;
# an evaluation run with is_baseline = True replaces the entry for each task it scores.
baseline_ndcg = {
    task_name: []
    for task_name in ('mt', 'dep', 'el', 'pos', 'taxi1500', 'xnli')
}

In [12]:
# Baseline run: the feature set below uses the CSV-backed 'genetic' distance.
# With is_baseline = True, score[1] of every task is stored in baseline_ndcg so that
# later runs can pass it as baseline_ndcg_scores and report a p-value.
is_baseline = True
features = [
    'syntactic', 'phonological', 'morphological', 'inventory',
    'featural', 'geographic', 'genetic',
]
task_col_name = 'task_lang'
transfer_col_name = 'transfer_lang'

# task name -> (performance column of data/<task>.csv,
#               whether its language codes must be converted from ISO to Glottocode)
tasks = {
    'mt': ('BLEU', True),
    'dep': ('accuracy', True),
    'el': ('accuracy', True),
    'pos': ('accuracy', True),
    'taxi1500': ('f1_score', False),
    'xnli': ('accuracy', False),
}

for task, (performance_col, needs_iso_conversion) in tasks.items():
    # Replace/add the distance columns named in `features`, using the calculators
    # registered on the evaluator under the same keys.
    evaluator.replace_distances(
        dataset_path=f'data/{task}.csv',  # columns: task lang, transfer lang, performance
        distance_types=features,
        task_col_name=task_col_name,
        transfer_col_name=transfer_col_name,
        iso_conversion=needs_iso_conversion,  # conversion uses the evaluator's ISO map file
    )

    # Run LangRank and evaluate task performance.
    # NOTE(review): data/<task>_updated.csv is presumably written by replace_distances above.
    score = evaluator.evaluate(
        dataset_path=f'data/{task}_updated.csv',
        features=features,
        performance_col_name=performance_col,
        task_col_name=task_col_name,
        transfer_col_name=transfer_col_name,
        baseline_ndcg_scores=baseline_ndcg[task],
    )

    if not is_baseline:
        print(f'Task: {task} NDCG: {score[0]} (p-value: {score[2]})')
        continue

    # Baseline mode: keep score[1] for later comparisons, then report the NDCG.
    baseline_ndcg[task] = score[1]
    print(f'Task: {task} NDCG: {score[0]}')

2862it [00:07, 398.59it/s]


Task: mt NDCG: 30.923161574748253


870it [00:01, 446.60it/s]


Task: dep NDCG: 72.80298281709919


477it [00:01, 448.24it/s]


Task: el NDCG: 65.83247072894687


1545it [00:03, 453.61it/s]


Task: pos NDCG: 22.399150587343524


26334it [00:58, 449.41it/s]


Task: taxi1500 NDCG: 22.552406960492387


225it [00:00, 374.38it/s]


Task: xnli NDCG: 70.64718813913777


In [13]:
# Comparison run: same feature set as a 'genetic' baseline, except that the
# 'hyper_genetic' distance is used in place of 'genetic'.
# With is_baseline = False, the scores already stored in baseline_ndcg are passed to
# evaluate() and a p-value (score[2]) is printed next to the NDCG.
is_baseline = False
features = [
    'syntactic', 'phonological', 'morphological', 'inventory',
    'featural', 'geographic', 'hyper_genetic',
]
task_col_name = 'task_lang'
transfer_col_name = 'transfer_lang'

# task name -> (performance column of data/<task>.csv,
#               whether its language codes must be converted from ISO to Glottocode)
tasks = {
    'mt': ('BLEU', True),
    'dep': ('accuracy', True),
    'el': ('accuracy', True),
    'pos': ('accuracy', True),
    'taxi1500': ('f1_score', False),
    'xnli': ('accuracy', False),
}

for task, (performance_col, needs_iso_conversion) in tasks.items():
    # Replace/add the distance columns named in `features`, using the calculators
    # registered on the evaluator under the same keys.
    evaluator.replace_distances(
        dataset_path=f'data/{task}.csv',  # columns: task lang, transfer lang, performance
        distance_types=features,
        task_col_name=task_col_name,
        transfer_col_name=transfer_col_name,
        iso_conversion=needs_iso_conversion,  # conversion uses the evaluator's ISO map file
    )

    # Run LangRank and evaluate task performance.
    # NOTE(review): data/<task>_updated.csv is presumably written by replace_distances above.
    score = evaluator.evaluate(
        dataset_path=f'data/{task}_updated.csv',
        features=features,
        performance_col_name=performance_col,
        task_col_name=task_col_name,
        transfer_col_name=transfer_col_name,
        baseline_ndcg_scores=baseline_ndcg[task],
    )

    if not is_baseline:
        print(f'Task: {task} NDCG: {score[0]} (p-value: {score[2]})')
        continue

    # Baseline mode: keep score[1] for later comparisons, then report the NDCG.
    baseline_ndcg[task] = score[1]
    print(f'Task: {task} NDCG: {score[0]}')

2862it [00:03, 860.49it/s]


Task: mt NDCG: 31.58215211231783 (p-value: 0.8122635044499287)


870it [00:00, 900.46it/s]


Task: dep NDCG: 73.5350273080687 (p-value: 0.5970921265773466)


477it [00:00, 910.26it/s]


Task: el NDCG: 66.18629125674565 (p-value: 0.9102454953711276)


1545it [00:01, 979.28it/s] 


Task: pos NDCG: 20.905366257292794 (p-value: 0.6960849634781787)


26334it [00:26, 983.36it/s] 


Task: taxi1500 NDCG: 22.315357253085494 (p-value: 0.7740319559249291)


225it [00:00, 810.98it/s]


Task: xnli NDCG: 68.98156616295549 (p-value: 0.5390611845403848)


In [14]:
# Comparison run: 'hyper_genetic' feature set extended with the 'scriptural' distance.
# With is_baseline = False, the scores already stored in baseline_ndcg are passed to
# evaluate() and a p-value (score[2]) is printed next to the NDCG.
is_baseline = False
features = [
    'syntactic', 'phonological', 'morphological', 'inventory',
    'featural', 'geographic', 'hyper_genetic', 'scriptural',
]
task_col_name = 'task_lang'
transfer_col_name = 'transfer_lang'

# task name -> (performance column of data/<task>.csv,
#               whether its language codes must be converted from ISO to Glottocode)
tasks = {
    'mt': ('BLEU', True),
    'dep': ('accuracy', True),
    'el': ('accuracy', True),
    'pos': ('accuracy', True),
    'taxi1500': ('f1_score', False),
    'xnli': ('accuracy', False),
}

for task, (performance_col, needs_iso_conversion) in tasks.items():
    # Replace/add the distance columns named in `features`, using the calculators
    # registered on the evaluator under the same keys.
    evaluator.replace_distances(
        dataset_path=f'data/{task}.csv',  # columns: task lang, transfer lang, performance
        distance_types=features,
        task_col_name=task_col_name,
        transfer_col_name=transfer_col_name,
        iso_conversion=needs_iso_conversion,  # conversion uses the evaluator's ISO map file
    )

    # Run LangRank and evaluate task performance.
    # NOTE(review): data/<task>_updated.csv is presumably written by replace_distances above.
    score = evaluator.evaluate(
        dataset_path=f'data/{task}_updated.csv',
        features=features,
        performance_col_name=performance_col,
        task_col_name=task_col_name,
        transfer_col_name=transfer_col_name,
        baseline_ndcg_scores=baseline_ndcg[task],
    )

    if not is_baseline:
        print(f'Task: {task} NDCG: {score[0]} (p-value: {score[2]})')
        continue

    # Baseline mode: keep score[1] for later comparisons, then report the NDCG.
    baseline_ndcg[task] = score[1]
    print(f'Task: {task} NDCG: {score[0]}')

0it [00:00, ?it/s]

2862it [00:03, 887.63it/s]


Task: mt NDCG: 32.27709291756569 (p-value: 0.5896618966405693)


870it [00:00, 916.63it/s]


Task: dep NDCG: 74.38198102146512 (p-value: 0.31621080875963997)


477it [00:00, 875.38it/s]


Task: el NDCG: 67.49213860215669 (p-value: 0.6294661525874257)


1545it [00:01, 927.64it/s] 


Task: pos NDCG: 20.149153162076423 (p-value: 0.6580312723782811)


26334it [00:28, 932.48it/s] 


Task: taxi1500 NDCG: 25.02443518153058 (p-value: 0.005541663555145187)


225it [00:00, 735.00it/s]


Task: xnli NDCG: 71.85703002656574 (p-value: 0.5822683274578472)


In [15]:
# Comparison run: 'hyper_genetic' feature set extended with the 'islands' distance.
# With is_baseline = False, the scores already stored in baseline_ndcg are passed to
# evaluate() and a p-value (score[2]) is printed next to the NDCG.
is_baseline = False
features = [
    'syntactic', 'phonological', 'morphological', 'inventory',
    'featural', 'geographic', 'hyper_genetic', 'islands',
]
task_col_name = 'task_lang'
transfer_col_name = 'transfer_lang'

# task name -> (performance column of data/<task>.csv,
#               whether its language codes must be converted from ISO to Glottocode)
tasks = {
    'mt': ('BLEU', True),
    'dep': ('accuracy', True),
    'el': ('accuracy', True),
    'pos': ('accuracy', True),
    'taxi1500': ('f1_score', False),
    'xnli': ('accuracy', False),
}

for task, (performance_col, needs_iso_conversion) in tasks.items():
    # Replace/add the distance columns named in `features`, using the calculators
    # registered on the evaluator under the same keys.
    evaluator.replace_distances(
        dataset_path=f'data/{task}.csv',  # columns: task lang, transfer lang, performance
        distance_types=features,
        task_col_name=task_col_name,
        transfer_col_name=transfer_col_name,
        iso_conversion=needs_iso_conversion,  # conversion uses the evaluator's ISO map file
    )

    # Run LangRank and evaluate task performance.
    # NOTE(review): data/<task>_updated.csv is presumably written by replace_distances above.
    score = evaluator.evaluate(
        dataset_path=f'data/{task}_updated.csv',
        features=features,
        performance_col_name=performance_col,
        task_col_name=task_col_name,
        transfer_col_name=transfer_col_name,
        baseline_ndcg_scores=baseline_ndcg[task],
    )

    if not is_baseline:
        print(f'Task: {task} NDCG: {score[0]} (p-value: {score[2]})')
        continue

    # Baseline mode: keep score[1] for later comparisons, then report the NDCG.
    baseline_ndcg[task] = score[1]
    print(f'Task: {task} NDCG: {score[0]}')

2862it [01:34, 30.27it/s]


Task: mt NDCG: 35.702598325925585 (p-value: 0.1081056964208485)


870it [00:28, 30.92it/s]


Task: dep NDCG: 73.15799640873186 (p-value: 0.7379781498630487)


477it [00:15, 30.27it/s]


Task: el NDCG: 67.83822533254497 (p-value: 0.5758692192568735)


1545it [00:49, 30.95it/s]


Task: pos NDCG: 20.99319835054712 (p-value: 0.7719194572627636)


26334it [14:14, 30.81it/s]


Task: taxi1500 NDCG: 22.69183929130635 (p-value: 0.8732676337890489)


225it [00:07, 28.66it/s]


Task: xnli NDCG: 80.18128310652367 (p-value: 0.007146437015958149)


In [16]:
# Comparison run: 'hyper_genetic' feature set extended with both the 'scriptural'
# and the 'islands' distances.
# With is_baseline = False, the scores already stored in baseline_ndcg are passed to
# evaluate() and a p-value (score[2]) is printed next to the NDCG.
is_baseline = False
features = [
    'syntactic', 'phonological', 'morphological', 'inventory',
    'featural', 'geographic', 'hyper_genetic', 'scriptural', 'islands',
]
task_col_name = 'task_lang'
transfer_col_name = 'transfer_lang'

# task name -> (performance column of data/<task>.csv,
#               whether its language codes must be converted from ISO to Glottocode)
tasks = {
    'mt': ('BLEU', True),
    'dep': ('accuracy', True),
    'el': ('accuracy', True),
    'pos': ('accuracy', True),
    'taxi1500': ('f1_score', False),
    'xnli': ('accuracy', False),
}

for task, (performance_col, needs_iso_conversion) in tasks.items():
    # Replace/add the distance columns named in `features`, using the calculators
    # registered on the evaluator under the same keys.
    evaluator.replace_distances(
        dataset_path=f'data/{task}.csv',  # columns: task lang, transfer lang, performance
        distance_types=features,
        task_col_name=task_col_name,
        transfer_col_name=transfer_col_name,
        iso_conversion=needs_iso_conversion,  # conversion uses the evaluator's ISO map file
    )

    # Run LangRank and evaluate task performance.
    # NOTE(review): data/<task>_updated.csv is presumably written by replace_distances above.
    score = evaluator.evaluate(
        dataset_path=f'data/{task}_updated.csv',
        features=features,
        performance_col_name=performance_col,
        task_col_name=task_col_name,
        transfer_col_name=transfer_col_name,
        baseline_ndcg_scores=baseline_ndcg[task],
    )

    if not is_baseline:
        print(f'Task: {task} NDCG: {score[0]} (p-value: {score[2]})')
        continue

    # Baseline mode: keep score[1] for later comparisons, then report the NDCG.
    baseline_ndcg[task] = score[1]
    print(f'Task: {task} NDCG: {score[0]}')

2862it [01:36, 29.63it/s]


Task: mt NDCG: 32.56337413068959 (p-value: 0.5368681166575215)


870it [00:28, 30.16it/s]


Task: dep NDCG: 73.56733054012477 (p-value: 0.6704307962942986)


477it [00:15, 30.69it/s]


Task: el NDCG: 70.45301410005519 (p-value: 0.2886382710503333)


1545it [00:52, 29.35it/s]


Task: pos NDCG: 22.351564033415556 (p-value: 0.9926477761244141)


26334it [14:31, 30.22it/s]


Task: taxi1500 NDCG: 24.281940463792363 (p-value: 0.053371336298922205)


225it [00:07, 30.21it/s]


Task: xnli NDCG: 77.37428250797899 (p-value: 0.03886305604810488)
