In [131]:
import pandas as pd
from ud_boxer.helpers import PMB
from ud_boxer.sbn_spec import get_doc_id
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="darkgrid")
sns.despine()

DPI = 200

lang_mapping = {
    "en": "English",
    "de": "German",
    "nl": "Dutch",
    "it": "Italian",
}

placement = {
    0: (0),
    1: (1),
    2: (2),
    3: (3),
}

palette = {
    "U - Stanza": "#50b131",
    "U - Trankit": "#f77189",
    "N - Gold": "#bb9832",
    "N - Gold & Silver": "#464d49",
    "N - Gold (strict)": "#3ba3ec",
    "N - Gold & Silver (strict)": "#e866f4",
}

# fig, axes = plt.subplots(4, 1, figsize=(8, 12), dpi=700)

# for i, lang in enumerate(["de", "nl", "it", "en"]):
for i, lang in enumerate(["it"]):
    df = pd.DataFrame()
    for data_split in ["dev", "test", "eval"]:
        if lang != "en" and data_split == "eval":
            continue

        s_df = pd.read_csv(
            f"data/results/rewrite/{lang}/{data_split}/final_stanza.csv"
        )
        t_df = pd.read_csv(
            f"data/results/rewrite/{lang}/{data_split}/final_trankit.csv"
        )
        s_df["ud_system"] = "U - Stanza"
        t_df["ud_system"] = "U - Trankit"

        df = pd.concat([df, s_df, t_df], ignore_index=True)

        if lang == "en":
            g_df = pd.read_csv(
                f"data/results/seq2seq/{data_split}/results_{data_split}_gold_only.csv"
            )
            gs_df = pd.read_csv(
                f"data/results/seq2seq/{data_split}/results_{data_split}_gold_and_silver.csv"
            )
            g_df["ud_system"] = "N - Gold"
            gs_df["ud_system"] = "N - Gold & Silver"

            df = pd.concat([df, g_df, gs_df], ignore_index=True)

            g_df = pd.read_csv(
                f"data/results/seq2seq/{data_split}/results_strict_indices_gold.csv"
            )
            gs_df = pd.read_csv(
                f"data/results/seq2seq/{data_split}/results_strict_indices_gold_silver.csv"
            )
            g_df["ud_system"] = "N - Gold (strict)"
            gs_df["ud_system"] = "N - Gold & Silver (strict)"

            df = pd.concat([df, g_df, gs_df], ignore_index=True)

        pmb = PMB(data_split, lang)
        for path in pmb.generator(
            f"../../data/pmb_dataset/pmb-extracted/pmb-4.0.0/data/{lang}/gold",
            "**/*.drs.sbn",
        ):
            doc_id = get_doc_id(lang, path)
            df.loc[df["pmb_id"] == doc_id, "gold_sbn_str"] = path.read_text().rstrip()

    df["f1"] = df["f1"].fillna(0)
    df["precision"] = df["precision"].fillna(0)
    df["recall"] = df["recall"].fillna(0)

    # ax = sns.lineplot(
    #     x="n_tokens",
    #     y="value",
    #     hue="ud_system",
    #     ci=None,
    #     data=pd.melt(
    #         df[["f1", "n_tokens", "ud_system"]],
    #         ["n_tokens", "ud_system"],
    #         var_name="Scores",
    #     ),
    #     palette=palette,
    #     linewidth=2.0,
    #     ax=axes[placement[i]],
    #     legend=i==3,
    # )
    # ax.set(ylim=(0, 1.0), xlim=(0, 35), xlabel="", ylabel="")
    # if i != 3:
    #     ax.tick_params(bottom=False)
    #     ax.set(xticklabels=[])

    # if i == 3:
    #     ax.set(xlabel="Number of tokens")

    # ax.legend(loc="lower left", facecolor="white", fontsize="10")
    # ax.set(ylabel=lang_mapping[lang])
    # ax.set_ylabel(ax.get_ylabel(), rotation=0, horizontalalignment="right")

# fig.supylabel(" ")
# # fig.suptitle(f'Scores by input length for {lang_mapping[lang]}')
# fig.suptitle(f"")
# fig.tight_layout()

# fig.savefig(f"analysis/scores_vs_len.pdf")


1686it [00:00, 7261.97it/s]
1686it [00:00, 5892.27it/s]


<Figure size 432x288 with 0 Axes>

In [132]:
df.to_csv('all_italian.csv', index=False)

In [79]:
selection = (
    (df['ud_system'].str.contains('Trankit')) & 
    (df.gold_sbn_str.str.count('NEGATION') == 1)
)
df[selection].sort_values('f1', ascending=False).sort_values(by="raw_sent", key=lambda x: x.str.len(), ascending=False)

Unnamed: 0,pmb_id,source,raw_sent,sbn_str,error,precision,recall,f1,precision_lenient,recall_lenient,f1_lenient,ud_system,gold_sbn_str
792,de/p00/d1391,GREW,Wieviele Menschen in den USA haben keine Krank...,time.n.08 EQU now\nwieviel.n.01...,,0.531915,0.641026,0.581395,0.525,0.617647,0.567568,U - Trankit,%%% This output was generated by the following...
1906,de/p31/d3277,GREW,Tom süßt seinen Tee mit Honig statt mit Zucker.,time.n.08 EQU now\nfemale.n.02 Name Tom\nsüß...,,0.804878,0.66,0.725275,0.771429,0.627907,0.692308,U - Trankit,%%% This output was generated by the following...
853,de/p00/d1777,GREW,Seine Schwester geht nicht nach Amerika.,NEGATION -1\ntime.n.08 EQU now\nSch...,,0.83871,0.604651,0.702703,0.814815,0.594595,0.6875,U - Trankit,%%% This output was generated by the following...
1047,de/p10/d2041,GREW,Ich habe keine giftigen Pilze gegessen!,time.n.08 EQU now\nperson.n.01 EQU speaker\n...,,0.658537,0.72973,0.692308,0.657143,0.71875,0.686567,U - Trankit,%%% This output was generated by the following...
1861,de/p41/d2274,GREW,Sie haben das Gesetz nicht gebrochen.,NEGATION -1\ntime.n.08 EQU now\nperson....,,0.862069,0.641026,0.735294,0.88,0.647059,0.745763,U - Trankit,%%% This output was generated by the following...
935,de/p40/d1552,GREW,Sie hat nicht mit der Wimper gezuckt.,NEGATION -1\ntime.n.08 EQU now\nfemale.n....,,0.793103,0.793103,0.793103,0.8,0.8,0.8,U - Trankit,%%% This output was generated by the following...
787,de/p00/d1757,GREW,Der Wecker hat Tom nicht aufgeweckt.,NEGATION -1\ntime.n.08 EQU now\nWec...,,0.774194,0.615385,0.685714,0.814815,0.647059,0.721311,U - Trankit,%%% This output was generated by the following...
1009,de/p10/d3307,GREW,Es ist elf Uhr und er kommt nicht.,NEGATION -1\ntime.n.08 EQU now\nperson.n....,,0.513514,0.542857,0.527778,0.53125,0.548387,0.539683,U - Trankit,%%% This output was generated by the following...
921,de/p40/d0769,GREW,Ich bin überhaupt nicht überzeugt.,NEGATION -1\ntime.n.08 EQU now\nper...,,0.733333,0.88,0.8,0.730769,0.863636,0.791667,U - Trankit,%%% This output was generated by the following...
2081,de/p01/d1404,GREW,"Ich weiß nicht, wo Toms Büro ist.",NEGATION -1\ntime.n.08 EQU now\nperson.n....,,0.686275,0.555556,0.614035,0.659091,0.527273,0.585859,U - Trankit,%%% This output was generated by the following...


In [80]:
# Loop over data split in pmb and add original sbn string to csv
# negation precision recall f1 sbn_str vs gold_sbn_str

# Best scoring sentences
df[selection].sort_values('f1', ascending=False).head(10).sort_values(by="raw_sent", key=lambda x: x.str.len(), ascending=False)

Unnamed: 0,pmb_id,source,raw_sent,sbn_str,error,precision,recall,f1,precision_lenient,recall_lenient,f1_lenient,ud_system,gold_sbn_str
817,de/p00/d1693,GREW,Deine Witze sind nicht lustig.,NEGATION -1\ntime.n.08 EQU now\nWitze.n.0...,,0.869565,0.8,0.833333,0.85,0.772727,0.809524,U - Trankit,%%% This output was generated by the following...
2157,de/p21/d0875,GREW,In diesem Haus wohnt niemand.,NEGATION -1\ntime.n.08 EQU now\ndies.n.01...,,0.862069,0.862069,0.862069,0.84,0.84,0.84,U - Trankit,%%% This output was generated by the following...
1877,de/p91/d2487,GREW,Die Ampel funktioniert nicht.,NEGATION -1\ntime.n.08 EQU ...,,0.826087,0.826087,0.826087,0.85,0.85,0.85,U - Trankit,%%% This output was generated by the following...
2166,de/p21/d1918,GREW,Die Waffe ist nicht geladen.,NEGATION -1\ntime.n.08 EQU now\nWaffe.n.01\...,,0.826087,0.826087,0.826087,0.85,0.85,0.85,U - Trankit,%%% This output was generated by the following...
1862,de/p91/d1766,GREW,Wir sind nicht verheiratet.,NEGATION -1\ntime.n.08 EQU now\...,,0.88,0.88,0.88,0.863636,0.863636,0.863636,U - Trankit,%%% This output was generated by the following...
1054,de/p10/d1997,GREW,Ich habe dich nie gesehen.,NEGATION -1\ntime.n.08 EQU now\nperson.n....,,0.848485,0.848485,0.848485,0.827586,0.827586,0.827586,U - Trankit,%%% This output was generated by the following...
869,de/p00/d1830,GREW,Ich bin nicht kreativ.,NEGATION -1\ntime.n.08 EQU now\nperson....,,0.92,0.92,0.92,0.909091,0.909091,0.909091,U - Trankit,%%% This output was generated by the following...
1761,de/p51/d2339,GREW,Ist sie nicht süß?,NEGATION -1\ntime.n.08 EQU now\nfemale.n....,,0.913043,0.913043,0.913043,0.9,0.9,0.9,U - Trankit,%%% This output was generated by the following...
829,de/p00/d1994,GREW,Ken rannte nicht.,NEGATION -1\ntime.n.08 TPR now\nfemale.n....,,0.84,0.84,0.84,0.818182,0.818182,0.818182,U - Trankit,%%% This output was generated by the following...
868,de/p00/d2720,GREW,Ich lüge nie.,NEGATION -1\ntime.n.08 EQU now\nperson.n....,,0.88,0.88,0.88,0.909091,0.909091,0.909091,U - Trankit,%%% This output was generated by the following...


In [145]:
file = 'all_german.csv'
df = pd.read_csv(file)

full = ''
for system in ['Stanza', 'Trankit']:
    for i in range(1, 5):
        selection = (
            # (df['ud_system'] == 'N - Gold (strict)') & 
            (df['ud_system'].str.contains(system)) & 
            (df.sbn_str.str.count('NEGATION') == i)
        )
        msg = f"""
        SYSTEM: {system}
        COUNT:  {i}
        LANG:   {file}

        AVERAGE P (strict):  {100 * df[selection]["precision"].mean():.3}
        AVERAGE R (strict): {100 * df[selection]["recall"].mean():.3}
        AVERAGE F1 (strict): {100 * df[selection]["f1"].mean():.3}

        """
        full += f' & {100 * df[selection]["precision"].mean():.3} & {100 * df[selection]["recall"].mean():.3} & {100 * df[selection]["f1"].mean():.3}'
        print(msg)
    print(full)


        SYSTEM: Stanza
        COUNT:  1
        LANG:   all_german.csv

        AVERAGE P (strict):  80.4
        AVERAGE R (strict): 77.3
        AVERAGE F1 (strict): 78.2

        

        SYSTEM: Stanza
        COUNT:  2
        LANG:   all_german.csv

        AVERAGE P (strict):  84.4
        AVERAGE R (strict): 72.3
        AVERAGE F1 (strict): 77.4

        

        SYSTEM: Stanza
        COUNT:  3
        LANG:   all_german.csv

        AVERAGE P (strict):  nan
        AVERAGE R (strict): nan
        AVERAGE F1 (strict): nan

        

        SYSTEM: Stanza
        COUNT:  4
        LANG:   all_german.csv

        AVERAGE P (strict):  nan
        AVERAGE R (strict): nan
        AVERAGE F1 (strict): nan

        
 & 80.4 & 77.3 & 78.2 & 84.4 & 72.3 & 77.4 & nan & nan & nan & nan & nan & nan

        SYSTEM: Trankit
        COUNT:  1
        LANG:   all_german.csv

        AVERAGE P (strict):  79.9
        AVERAGE R (strict): 78.0
        AVERAGE F1 (strict): 78.4

        



In [160]:
df = pd.read_csv('all_dutch.csv')

full_cnt = ""
df_tmp = df.drop_duplicates(['pmb_id'])
for i in range(0, 10):
    cnt = len(df_tmp[df_tmp.gold_sbn_str.str.count('NEGATION') == i])
    print(f'{i} -> {cnt}')
    full_cnt += f'& {cnt}'
print(full_cnt)

0 -> 841
1 -> 65
2 -> 20
3 -> 2
4 -> 0
5 -> 0
6 -> 0
7 -> 0
8 -> 0
9 -> 0
& 841& 65& 20& 2& 0& 0& 0& 0& 0& 0


In [156]:
print('\n\n'.join(df[df.gold_sbn_str.str.count('NEGATION') == 4].gold_sbn_str.tolist()))

%%% This output was generated by the following command:
            NEGATION -1                        % Jeder  [0-5]
person.n.01                                    %        
            NEGATION -1                        %        
            NEGATION -1                        % jeden. [12-18]
person.n.01                                    %        
            NEGATION -1                        %        
know.v.04   Experiencer -2 Stimulus -1 Time +1 % kennt  [6-11]
time.n.08   EQU now                            %

%%% This output was generated by the following command:
            NEGATION -1                        % Jeder  [0-5]
person.n.01                                    %        
            NEGATION -1                        %        
            NEGATION -1                        % jeden. [12-18]
person.n.01                                    %        
            NEGATION -1                        %        
know.v.04   Experiencer -2 Stimulus -1 Time +1 % kennt  [6-11]
time

In [185]:
from sklearn.metrics import classification_report

# for file in ['all_english.csv', 'all_dutch.csv', 'all_italian.csv', 'all_german.csv']:
for file in ['all_english.csv']:
    df = pd.read_csv(file)
    # for system in ['Stanza', 'Trankit']:
    for system in ['N - Gold (strict)', 'N - Gold & Silver (strict)']:
        selection = (
            # df['ud_system'].str.contains(system)
            df['ud_system'] == system
        )
        y_true = df[selection].gold_sbn_str.str.count('NEGATION')
        y_pred = df[selection].sbn_str.str.count('NEGATION').fillna(-1)
        
        print(f'{file} -> {system}')
        print(classification_report(y_true, y_pred, digits=3))
        print(f'\n\n')

all_english.csv -> N - Gold (strict)
              precision    recall  f1-score   support

        -1.0      0.000     0.000     0.000         0
         0.0      0.983     0.910     0.945      2676
         1.0      0.765     0.810     0.787       253
         2.0      0.780     0.419     0.545        93
         3.0      0.000     0.000     0.000         7
         4.0      0.500     0.400     0.444         5

    accuracy                          0.883      3034
   macro avg      0.505     0.423     0.454      3034
weighted avg      0.956     0.883     0.916      3034




all_english.csv -> N - Gold & Silver (strict)
              precision    recall  f1-score   support

        -1.0      0.000     0.000     0.000         0
         0.0      0.995     0.965     0.980      2676
         1.0      0.964     0.953     0.958       253
         2.0      0.941     0.860     0.899        93
         3.0      0.600     0.429     0.500         7
         4.0      1.000     0.200     0.333   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [188]:
df[df['pmb_id'] == 'en/p01/d2144']

Unnamed: 0,pmb_id,source,raw_sent,sbn_str,error,precision,recall,f1,precision_lenient,recall_lenient,f1_lenient,ud_system,gold_sbn_str
7667,en/p01/d2144,GREW,Everyone knows everyone.,NEGATION -1\nNEGATION -1\ntime.n.08 ...,,0.8,0.571429,0.666667,0.818182,0.580645,0.679245,U - Stanza,%%% This output was generated by the following...
8714,en/p01/d2144,GREW,Everyone knows everyone.,NEGATION -1\nNEGATION -1\ntime.n.08 ...,,0.8,0.571429,0.666667,0.818182,0.580645,0.679245,U - Trankit,%%% This output was generated by the following...
9757,en/p01/d2144,SEQ2SEQ,Everyone knows everyone.,NEGATION -1\nperson.n.01\nNEGATION -1\nN...,,0.971429,0.971429,0.971429,0.967742,0.967742,0.967742,N - Gold,%%% This output was generated by the following...
10802,en/p01/d2144,SEQ2SEQ,Everyone knows everyone.,NEGATION -1\nperson.n.01\nNEGATION -1\nN...,,0.772727,0.971429,0.860759,0.769231,0.967742,0.857143,N - Gold & Silver,%%% This output was generated by the following...
11841,en/p01/d2144,SEQ2SEQ,Everyone knows everyone.,NEGATION -1\nperson.n.01\nNEGATION -1\nN...,,0.971429,0.971429,0.971429,,,,N - Gold (strict),%%% This output was generated by the following...
12889,en/p01/d2144,SEQ2SEQ,Everyone knows everyone.,NEGATION -1\nperson.n.01\nNEGATION -1\nN...,,0.772727,0.971429,0.860759,,,,N - Gold & Silver (strict),%%% This output was generated by the following...


In [187]:
print('\n\n'.join(df[df.sbn_str.str.count('NEGATION') == 6].sbn_str.tolist()))

NEGATION    -1
person.n.01
NEGATION    -1
NEGATION    -1
person.n.01
NEGATION    -1
NEGATION    -1
person.n.01
NEGATION    -1
know.v.04   Experiencer -2 Stimulus -1 Time +1
time.n.08   EQU now

NEGATION    -1
person.n.01
NEGATION    -1
NEGATION    -1
person.n.01
NEGATION    -1
NEGATION    -1
person.n.01
NEGATION    -1
know.v.04   Experiencer -2 Stimulus -1 Time +1
time.n.08   EQU now


In [7]:
from sklearn.metrics import classification_report
import pandas as pd

for file in ['all_english.csv', 'all_dutch.csv', 'all_italian.csv', 'all_german.csv']:
# for file in ['all_english.csv']:
    df = pd.read_csv(file)
    for system in ['Stanza', 'Trankit']:
    # for system in ['N - Gold (strict)', 'N - Gold & Silver (strict)']:
        full = ""
        for i in [1,2]:
            df['y_true'] = df.gold_sbn_str.str.count('NEGATION')
            df['y_pred'] = df.sbn_str.str.count('NEGATION').fillna(-1)
            selection = (
                (df['ud_system'].str.contains(system)) &
                # (df['ud_system'] == system) &
                (df['y_true'] == df['y_pred']) &
                (df['y_true'] == i)
            )
            
            print(f'{file} -> {system}')
            msg = f"""
            SYSTEM: {system}
            COUNT:  {i}
            LANG:   {file}

            AVERAGE P (strict):  {100 * df[selection]["precision"].mean():.3}
            AVERAGE R (strict): {100 * df[selection]["recall"].mean():.3}
            AVERAGE F1 (strict): {100 * df[selection]["f1"].mean():.3}

            """
            full += f' & {100 * df[selection]["precision"].mean():.3} & {100 * df[selection]["recall"].mean():.3} & {100 * df[selection]["f1"].mean():.3}'
            print(msg)
        print(full)

all_english.csv -> Stanza

            SYSTEM: Stanza
            COUNT:  1
            LANG:   all_english.csv

            AVERAGE P (strict):  82.7
            AVERAGE R (strict): 80.0
            AVERAGE F1 (strict): 80.7

            
all_english.csv -> Stanza

            SYSTEM: Stanza
            COUNT:  2
            LANG:   all_english.csv

            AVERAGE P (strict):  85.3
            AVERAGE R (strict): 72.0
            AVERAGE F1 (strict): 77.5

            
 & 82.7 & 80.0 & 80.7 & 85.3 & 72.0 & 77.5
all_english.csv -> Trankit

            SYSTEM: Trankit
            COUNT:  1
            LANG:   all_english.csv

            AVERAGE P (strict):  84.1
            AVERAGE R (strict): 80.2
            AVERAGE F1 (strict): 81.4

            
all_english.csv -> Trankit

            SYSTEM: Trankit
            COUNT:  2
            LANG:   all_english.csv

            AVERAGE P (strict):  86.3
            AVERAGE R (strict): 71.7
            AVERAGE F1 (strict): 77.8

      