In [1]:
import pandas as pd
from evaluator import SyntacticCalculator, MorphologicalCalculator, InventoryCalculator, PhonologicalCalculator, FeaturalCalculator, GenericCalculator, LangRankEvaluator, IslandCalculator, GeographicCalculator, URIELCalculator

  import pkg_resources


In [None]:
# Load data files as DataFrames
import pandas as pd

def _read_indexed_csv(csv_path):
    """Read a CSV file into a DataFrame, using its first column as the index."""
    return pd.read_csv(csv_path, index_col=0)


# One DataFrame per source table.
scriptural_df = _read_indexed_csv('data/URIEL_Scriptural.csv')
islands_df = _read_indexed_csv('data/URIELPlus_Union_SoftImpute.csv')
phylogeny_df = _read_indexed_csv('data/URIEL_Phylogeny.csv')
geography_df = _read_indexed_csv('data/URIEL_Geography.csv')
typological_df = _read_indexed_csv('data/URIELPlus_Union.csv')

# Calculators that each wrap their own table.
script = GenericCalculator(scriptural_df)
islands = IslandCalculator(islands_df)
# NOTE(review): the meaning of the positional argument `1` is defined in the
# `evaluator` module and is not visible here -- confirm before changing it.
distribution_geo = GeographicCalculator(1)

gen = GenericCalculator(phylogeny_df)
geo = GenericCalculator(geography_df)

# Calculators that all read from the shared typological table.
syn = SyntacticCalculator(typological_df)
morph = MorphologicalCalculator(typological_df)
inv = InventoryCalculator(typological_df)
phon = PhonologicalCalculator(typological_df)
feat = FeaturalCalculator(typological_df)

# Alternative setup, kept for reference: build the calculators with
# URIELCalculator instead of the DataFrame-backed classes above.
# gen = URIELCalculator(dist='genetic')
# geo = URIELCalculator(dist='geographic')
# syn = URIELCalculator(dist='syntactic')
# inv = URIELCalculator(dist='inventory')
# phon = URIELCalculator(dist='phonological')
# feat = URIELCalculator(dist='featural')

# Distance-type name -> calculator. These keys are the names that may later be
# listed as distance types / features when calling the evaluator.
calculators_by_distance = {
    'syntactic': syn,
    'morphological': morph,
    'inventory': inv,
    'phonological': phon,
    'featural': feat,
    'scriptural': script,
    'islands': islands,
    'new_geographic': distribution_geo,  # do not change the name "new_geographic"
    'geographic': geo,
    'genetic': gen,
}

evaluator = LangRankEvaluator(
    calculators=calculators_by_distance,
    iso_map_file='data/code_mapping.csv',  # path to the ISO to Glottocode mapping file
)

In [3]:
# Per-task store of baseline NDCG scores. Every task starts with its own
# (unshared) empty list, which the evaluation cell may later replace.
baseline_ndcg = {
    task_name: []
    for task_name in ('mt', 'dep', 'el', 'pos', 'taxi1500', 'xnli')
}

In [None]:
# Baseline switch. When True, each task's result is stored in baseline_ndcg;
# passing those stored scores as `baseline_ndcg_scores` in a later run is what
# yields a p-value.
is_baseline = True

# Distance types to use; each must be a key of the calculators dict given to
# the evaluator. Uncomment the optional lines to include more of them.
features = ['syntactic', 'phonological', 'inventory', 'featural', 'geographic', 'genetic']
features.append('morphological')
# features.append('scriptural')
# features.append('islands')

# Column names shared by every task dataset.
task_col_name = 'task_lang'
transfer_col_name = 'transfer_lang'

# task name -> (performance column, whether language codes need ISO -> Glottocode conversion)
tasks = {
    'mt': ('BLEU', True),
    'dep': ('accuracy', True),
    'el': ('accuracy', True),
    'pos': ('accuracy', True),
    'taxi1500': ('f1_score', False),
    'xnli': ('accuracy', False),
}

for task, (metric_col, needs_iso_conversion) in tasks.items():
    # Step 1: replace/add the selected distance columns in this task's dataset.
    task_data_with_distances = evaluator.replace_distances(
        data_file=f'data/{task}.csv',  # CSV with task, transfer, and performance columns
        distance_types=features,
        task_col_name=task_col_name,
        transfer_col_name=transfer_col_name,
        iso_conversion=needs_iso_conversion,
    )

    # Step 2: run LangRank on the augmented data and score it.
    score = evaluator.evaluate(
        data=task_data_with_distances,
        features=features,
        performance_col_name=metric_col,
        task_col_name=task_col_name,
        transfer_col_name=transfer_col_name,
        baseline_ndcg_scores=baseline_ndcg[task],  # baseline NDCG scores for this task
    )

    # Step 3: report. score[0] is printed as the NDCG; score[2] as the p-value.
    if not is_baseline:
        print(f'Task: {task} NDCG: {score[0]} (p-value: {score[2]})')
        continue

    # Baseline run: keep score[1] so that a later run can be compared against it.
    baseline_ndcg[task] = score[1]
    print(f'Task: {task} NDCG: {score[0]}')

0it [00:00, ?it/s]

2862it [00:16, 177.63it/s]



Task: mt NDCG: 30.923161574748253


870it [00:03, 237.99it/s]



Task: dep NDCG: 72.80298281709919


477it [00:02, 230.78it/s]



Task: el NDCG: 65.83247072894687


1545it [00:07, 196.65it/s]



Task: pos NDCG: 22.399150587343524


26334it [01:44, 252.06it/s]



Task: taxi1500 NDCG: 22.552406960492387


225it [00:01, 213.75it/s]



Task: xnli NDCG: 70.64718813913777
