In [2]:
from pathlib import Path
import pandas as pd

starting_path = Path('data/results/rewrite')
cols = [
    "precision",
    "recall",
    "f1",
    "precision_lenient",
    "recall_lenient",
    "f1_lenient",
]

for file in starting_path.glob('**/final_*.csv'):
    items = str(file).split('/')
    split = items[-2]
    lang = items[-3]
    system = file.stem.split('_')[1]
    df = pd.read_csv(file)
    for col in cols:
        df[col] = df[col].fillna(0)

    msg = f"""
SYSTEM: {system}
LANG:   {lang}
SPLIT:  {split}

FAILED DOCS:          {len(df[df['error'].notnull()])}
FAILED DOCS %:        {100 * (len(df[df['error'].notnull()]) / len(df))}
TOTAL DOCS:           {len(df)}

AVERAGE P (strict):  {100 * df["precision"].mean():.3} ({100 * df["precision"].min():.3} - {100 * df["precision"].max():.3})
AVERAGE R (strict): {100 * df["recall"].mean():.3} ({100 * df["recall"].min():.3} - {100 * df["recall"].max():.3})
AVERAGE F1 (strict): {100 * df["f1"].mean():.3} ({100 * df["f1"].min():.3} - {100 * df["f1"].max():.3})

AVERAGE P (lenient): {100 * df["precision_lenient"].mean():.3} ({100 * df["precision_lenient"].min():.3} - {100 * df["precision_lenient"].max():.3})
AVERAGE R (lenient): {100 * df["recall_lenient"].mean():.3} ({100 * df["recall_lenient"].min():.3} - {100 * df["recall_lenient"].max():.3})
AVERAGE F1 (lenient): {100 * df["f1_lenient"].mean():.3} ({100 * df["f1_lenient"].min():.3} - {100 * df["f1_lenient"].max():.3})
    """
    print(msg)


SYSTEM: stanza
LANG:   en
SPLIT:  eval

FAILED DOCS:          4
FAILED DOCS %:        0.4830917874396135
TOTAL DOCS:           828

AVERAGE P (strict):  83.1 (0.0 - 1e+02)
AVERAGE R (strict): 80.5 (0.0 - 1e+02)
AVERAGE F1 (strict): 81.3 (0.0 - 1e+02)

AVERAGE P (lenient): 83.6 (0.0 - 1e+02)
AVERAGE R (lenient): 80.8 (0.0 - 1e+02)
AVERAGE F1 (lenient): 81.6 (0.0 - 1e+02)
    

SYSTEM: trankit
LANG:   en
SPLIT:  eval

FAILED DOCS:          0
FAILED DOCS %:        0.0
TOTAL DOCS:           828

AVERAGE P (strict):  83.6 (34.3 - 1e+02)
AVERAGE R (strict): 80.7 (37.3 - 1e+02)
AVERAGE F1 (strict): 81.5 (50.0 - 1e+02)

AVERAGE P (lenient): 84.1 (35.1 - 1e+02)
AVERAGE R (lenient): 81.0 (38.6 - 1e+02)
AVERAGE F1 (lenient): 81.9 (47.6 - 1e+02)
    

SYSTEM: stanza
LANG:   en
SPLIT:  test

FAILED DOCS:          0
FAILED DOCS %:        0.0
TOTAL DOCS:           1044

AVERAGE P (strict):  83.8 (32.0 - 1e+02)
AVERAGE R (strict): 81.4 (40.3 - 1e+02)
AVERAGE F1 (strict): 82.0 (36.4 - 1e+02)

AVERAGE 

In [3]:
import pandas as pd
starting_path = Path('data/results/seq2seq')

cols = [
    "precision",
    "recall",
    "f1",
    "precision_lenient",
    "recall_lenient",
    "f1_lenient",
]

msgs = []
final = []
for file in starting_path.glob('**/results_strict_indices_*.csv'):
    items = str(file).split('/')
    lang = items[3]
    split = items[-2]
    train = items[-1].split('results_strict_indices_')[1].replace('.csv', '')

    if lang =='en':
        continue

    df = pd.read_csv(file)
    for col in cols:
        df[col] = df[col].fillna(0)

    f1 = df["f1"].mean()
    f1_len = df["f1_lenient"].mean()

    msg = f"""
SYSTEM: seq2seq
LANG:   {lang}
TRAIN:  {train}
SPLIT:  {split}

FAILED DOCS:          {len(df[df['error'].notnull()])}
FAILED DOCS %:        {100 * (len(df[df['error'].notnull()]) / len(df))}
TOTAL DOCS:           {len(df)}

AVERAGE P (strict):  {100 * df["precision"].mean():.3} ({100 * df["precision"].min():.3} - {100 * df["precision"].max():.3})
AVERAGE R (strict): {100 * df["recall"].mean():.3} ({100 * df["recall"].min():.3} - {100 * df["recall"].max():.3})
AVERAGE F1 (strict): {100 * f1:.3} ({100 * df["f1"].min():.3} - {100 * df["f1"].max():.3})

AVERAGE P (lenient):  {100 * df["precision_lenient"].mean():.3} ({100 * df["precision_lenient"].min():.3} - {100 * df["precision_lenient"].max():.3})
AVERAGE R (lenient): {100 * df["recall_lenient"].mean():.3} ({100 * df["recall_lenient"].min():.3} - {100 * df["recall_lenient"].max():.3})
AVERAGE F1 (lenient): {100 * f1_len:.3} ({100 * df["f1_lenient"].min():.3} - {100 * df["f1_lenient"].max():.3})
    """


# AVERAGE P (ignore_sense): {100 * df["precision_lenient"].mean():.3} ({100 * df["precision_lenient"].min():.3} - {100 * df["precision_lenient"].max():.3})
# AVERAGE R (ignore_sense): {100 * df["recall_lenient"].mean():.3} ({100 * df["recall_lenient"].min():.3} - {100 * df["recall_lenient"].max():.3})
# AVERAGE F1 (ignore_sense): {100 * df["f1_lenient"].mean():.3} ({100 * df["f1_lenient"].min():.3} - {100 * df["f1_lenient"].max():.3})
    print(msg)
    final.append(' '.join([lang, train, split, f'{f1_len * 100:.3}', f'{f1 * 100:.3}']))
    msgs.append(msg)
Path('all_new_langs.txt').write_text('\n\n'.join(msgs))
Path('all_results.txt').write_text('\n'.join(sorted(final)))


SYSTEM: seq2seq
LANG:   nl
TRAIN:  gold
SPLIT:  test

FAILED DOCS:          15
FAILED DOCS %:        3.054989816700611
TOTAL DOCS:           491

AVERAGE P (strict):  28.2 (0.0 - 82.6)
AVERAGE R (strict): 22.7 (0.0 - 73.9)
AVERAGE F1 (strict): 24.9 (0.0 - 73.9)

AVERAGE P (lenient):  60.2 (0.0 - 82.6)
AVERAGE R (lenient): 45.3 (0.0 - 73.9)
AVERAGE F1 (lenient): 51.1 (0.0 - 73.9)
    

SYSTEM: seq2seq
LANG:   nl
TRAIN:  gold_silver_bronze
SPLIT:  test

FAILED DOCS:          8
FAILED DOCS %:        1.6293279022403258
TOTAL DOCS:           491

AVERAGE P (strict):  71.2 (0.0 - 90.9)
AVERAGE R (strict): 69.3 (0.0 - 90.9)
AVERAGE F1 (strict): 70.0 (0.0 - 90.9)

AVERAGE P (lenient):  73.0 (0.0 - 90.9)
AVERAGE R (lenient): 70.8 (0.0 - 90.9)
AVERAGE F1 (lenient): 71.6 (0.0 - 90.9)
    

SYSTEM: seq2seq
LANG:   nl
TRAIN:  gold_silver
SPLIT:  test

FAILED DOCS:          28
FAILED DOCS %:        5.7026476578411405
TOTAL DOCS:           491

AVERAGE P (strict):  52.2 (0.0 - 90.9)
AVERAGE R (stric

530

In [2]:
from pathlib import Path
for lang in ['de', 'nl', 'it']:
    for split in ['dev', 'test']:
        for folder in ['gold_only', 'gold_silver', 'gold_silver_bronze']:
            output_lines = Path(f'../multiling_exps/{folder}/{lang}_{split}.drs.out').read_text(errors='replace').rstrip().split('\n')
            ids = Path(f'data/splits/{lang}_{split}.txt').read_text().rstrip().split('\n')

            print(lang, split, folder)
            print(len(output_lines), len(ids))

            assert len(output_lines) == len(ids)

            # final_lines = ['id,sbn']
            final_lines = [
                f'{i},{l}'
                for i, l in zip(ids, output_lines)
            ]

            if folder == 'gold_only':
                folder = 'gold'

            Path(f'data/seq2seq/{lang}/{folder}_{split}.txt').write_text('\n'.join(final_lines))

de dev gold_only
559 559
de dev gold_silver
559 559
de dev gold_silver_bronze
559 559
de test gold_only
547 547
de test gold_silver
547 547
de test gold_silver_bronze
547 547
nl dev gold_only
437 437
nl dev gold_silver
437 437
nl dev gold_silver_bronze
437 437
nl test gold_only
491 491
nl test gold_silver
491 491
nl test gold_silver_bronze
491 491
it dev gold_only
540 540
it dev gold_silver
540 540
it dev gold_silver_bronze
540 540
it test gold_only
461 461
it test gold_silver
461 461
it test gold_silver_bronze
461 461


In [4]:
False and True

False