In [3]:
import pandas as pd
from pathlib import Path

# Directory the next cell reads `{split}_gold_only.out` from and writes
# `seq2seq_gold_only_{split}.txt` into; resolved to an absolute path.
DATA_DIR = Path('../data/seq2seq/raw').resolve()
# Directory holding the per-split `{split}.txt` id lists.
# NOTE(review): a later cell rebinds IDS_DIR to the relative path 'data/splits' —
# confirm which location is the intended one.
IDS_DIR = Path('../data/splits').resolve()

In [9]:
# Pair every line of the raw output file with the document id on the same line
# of the split's id list, and write the pairs as `<id>,<line>` rows per split.
for split in ('dev', 'eval', 'test'):
    predictions_path = DATA_DIR / f'{split}_gold_only.out'
    ids_path = IDS_DIR / f'{split}.txt'

    # Trailing whitespace is stripped first so a final newline does not yield
    # an extra empty entry.
    lines = predictions_path.read_text().rstrip().split('\n')
    ids = ids_path.read_text().rstrip().split('\n')

    # The two files are matched purely by line position.
    assert len(ids) == len(lines)

    paired_rows = '\n'.join(
        f'{doc_id},{prediction}' for doc_id, prediction in zip(ids, lines)
    )
    (DATA_DIR / f'seq2seq_gold_only_{split}.txt').write_text(paired_rows)

In [16]:
from synse.sbn import SBNGraph

# NOTE(review): this rebinds IDS_DIR (set to '../data/splits' in the first cell)
# to a path relative to a different working directory — confirm which is intended.
IDS_DIR = Path('data/splits')

# Directory of the English gold documents, as laid out in the extracted PMB data.
GOLD_DIR = '../../data/pmb_dataset/pmb-extracted/pmb-4.0.0/data/en/gold'


def _gold_file(pmb_id, filename):
    """Return the path (as a string) of `filename` in the gold directory of `pmb_id`.

    `pmb_id` carries an `en/` prefix in the results files (e.g. `en/p20/d2343`);
    the prefix is removed because GOLD_DIR already points at the `en/gold` tree.
    """
    return f'{GOLD_DIR}/{pmb_id.replace("en/", "")}/{filename}'


def _read_raw_sentence(pmb_id):
    """Return the text of the `en.raw` file belonging to `pmb_id`."""
    return Path(_gold_file(pmb_id, 'en.raw')).read_text()


# For every split: find the ids that have no row in the results file, try to load
# their gold SBN, and append a "Dataset error" row for each one that fails to load.
# Ids whose gold file loads fine are left out of the results, as before.
Path('seq').mkdir(exist_ok=True)  # output directory used by to_csv below

for split in ['dev', 'eval', 'test']:
    # Blank lines (e.g. from a trailing newline) are skipped: they would become
    # a bogus 'en/' id whose gold files do not exist.
    ids = {
        f'en/{line.strip()}'
        for line in Path(IDS_DIR / f'{split}.txt').read_text().split('\n')
        if line.strip()
    }

    df = pd.read_csv(f'seq2seq_results_{split}.csv')
    ids_missing = ids.difference(set(df.pmb_id.tolist()))

    error_records = []
    # Sorted so the appended rows come out in the same order on every run
    # (set iteration order of strings is not stable across interpreter runs).
    for _id in sorted(ids_missing):
        try:
            SBNGraph().from_path(_gold_file(_id, 'en.drs.sbn'))
        except Exception as e:
            # Deliberately broad: any failure to load the gold graph is recorded
            # as a dataset problem rather than aborting the whole run.
            error_records.append(
                {
                    'pmb_id': _id,
                    'raw_sent': _read_raw_sentence(_id),
                    'error': f'Dataset error: {str(e)}',
                }
            )

    frames = [df]
    if error_records:
        frames.append(pd.DataFrame.from_records(error_records))
    # ignore_index gives the combined frame unique row labels, so the
    # label-aligned assignment below cannot hit duplicate index values.
    df_final = pd.concat(frames, ignore_index=True)

    # Fill in sentences that are absent from the results file from the gold data.
    missing_raw = df_final['raw_sent'].isnull()
    df_final.loc[missing_raw, 'raw_sent'] = (
        df_final.loc[missing_raw, 'pmb_id'].apply(_read_raw_sentence)
    )

    # Flatten each sentence onto one line and drop trailing whitespace.
    df_final['raw_sent'] = (
        df_final['raw_sent'].str.replace('\n', ' ', regex=False).str.rstrip()
    )
    df_final.to_csv(f'seq/seq2seq_results_{split}.csv', index=False)






{'en/p20/d2343', 'en/p10/d1823', 'en/p00/d2229', 'en/p10/d3022', 'en/p40/d2979', 'en/p30/d1528', 'en/p10/d2815'}
{'en/p70/d2267', 'en/p60/d0784'}
{'en/p21/d3487', 'en/p81/d2490', 'en/p01/d1840', 'en/p81/d2197'}


In [3]:
# Load a single gold SBN file directly; the resulting graph object is the
# cell's displayed value.
sbn_path = '../../data/pmb_dataset/pmb-extracted/pmb-4.0.0/data/en/gold/p10/d3022/en.drs.sbn'
SBNGraph().from_path(sbn_path)

Initialized cyclic SBN graph, this will work for most tasks, but can cause problems later on when exporting to Penman for instance.


<synse.sbn.SBNGraph at 0x7fd90d65f250>

In [31]:
# Print the mean strict / lenient F1 per split. Rows flagged as "Dataset error"
# are excluded, and missing scores count as 0.
for split in ['dev', 'eval', 'test']:
    results = pd.read_csv(f'seq/seq2seq_results_{split}.csv')

    # Rows without an error are NaN after read_csv; normalise to '' first so the
    # string match below works on every row.
    results['error'] = results['error'].fillna('')
    is_dataset_error = results['error'].str.contains('Dataset error', regex=False)

    # .assign builds a new frame instead of writing columns into a filtered
    # slice, which is the pattern that raises SettingWithCopyWarning.
    df = results.loc[~is_dataset_error].assign(
        f1=lambda frame: frame['f1'].fillna(0),
        f1_lenient=lambda frame: frame['f1_lenient'].fillna(0),
    )
    print(f'{split}: f1_strict {df.f1.mean():.3} f1_lenient {df.f1_lenient.mean():.3}')


dev: f1_strict 0.925 f1_lenient 0.933
eval: f1_strict 0.917 f1_lenient 0.926
test: f1_strict 0.924 f1_lenient 0.931


In [32]:
# Mean strict / lenient F1 for the dev negation results file.
# The label is hard-coded rather than taken from `split`: at this point `split`
# only holds the last value of an earlier loop ('test'), which mislabels this
# dev file.
# NOTE(review): unlike the per-split summary, missing f1 values are not filled
# with 0 here, so .mean() skips them — confirm that this is intended.
df = pd.read_csv('data/results/dev/dev_results_negation.csv')
print(f'dev (negation): f1_strict {df.f1.mean():.3} f1_lenient {df.f1_lenient.mean():.3}')

test: f1_strict 0.767 f1_lenient 0.779
