In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

In [None]:
# Both input files are tab-separated and use a comma as the decimal mark.
tsv_read_options = {"sep": "\t", "decimal": ","}

sims_single = pd.read_csv("data/similarities_bsbbert_kelex_only.tsv", **tsv_read_options)
sims_ex = pd.read_csv("data/similarities_ex_bsbbert_kelex_only.tsv", **tsv_read_options)

In [None]:
# Show the frame read from data/similarities_bsbbert_kelex_only.tsv.
sims_single

In [None]:
# Show the frame read from data/similarities_ex_bsbbert_kelex_only.tsv.
sims_ex

In [None]:
def relative_diff(frame, value_col, base_col):
    """Return ``(value - base) / base`` for every row of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding both columns.
    value_col : str
        Column that is compared against the base.
    base_col : str
        Column used both as the subtrahend and as the denominator.

    Returns
    -------
    pandas.Series
        Float series aligned with ``frame.index``. Rows whose base value is
        zero come out as ``inf``/``NaN`` instead of raising.
    """
    # Cast explicitly so the arithmetic is done on floats whatever dtype
    # read_csv inferred for the column.
    value = frame[value_col].astype(float)
    base = frame[base_col].astype(float)
    return (value - base) / base


# The similarity difference is taken relative to the BERT value, while the two
# distance differences are taken relative to the pseudoword value (with the
# operands swapped), exactly as in the column pairs listed below.
for sims_frame in (sims_single, sims_ex):
    sims_frame["sim_diff"] = relative_diff(sims_frame, "pseudword_sim", "bert_sim")
    sims_frame["euclidean_diff"] = relative_diff(sims_frame, "bert_euclidean", "pseudword_euclidean")
    sims_frame["manhattan_diff"] = relative_diff(sims_frame, "bert_manhattan", "pseudword_manhattan")

In [None]:
# Show sims_single again, now including the sim_diff, euclidean_diff and
# manhattan_diff columns.
sims_single

In [None]:
# Show sims_ex again, now including the sim_diff, euclidean_diff and
# manhattan_diff columns.
sims_ex

In [None]:
# Mean relative difference per construction, one series per metric.
constr_avg_sims = sims_single.groupby('constr')["sim_diff"].mean()
constr_avg_euclidean = sims_single.groupby('constr')["euclidean_diff"].mean()
constr_avg_manhattan = sims_single.groupby('constr')["manhattan_diff"].mean()

# The cosine-based series defines the left-to-right order of the constructions;
# the other two series are looked up in that same order when they are plotted.
constr_avg_sims = constr_avg_sims.sort_values(ascending=False)
constr_strings = [str(i) for i in list(constr_avg_sims.index)]

# Mean of the three per-column means over all rows; drawn as a dashed line.
overall_avg = sims_single[['sim_diff', 'euclidean_diff', 'manhattan_diff']].mean().mean()

plt.rcParams['font.family'] = 'Libertinus Serif'
fig, ax = plt.subplots(figsize=(8, 3))

# Three bars per construction, placed side by side.
bar_width = 0.25
bar_positions_sims = np.arange(len(constr_strings))
bar_positions_euclidean = bar_positions_sims + bar_width
bar_positions_manhattan = bar_positions_sims + 2 * bar_width

bars_sims = ax.bar(bar_positions_sims, constr_avg_sims.values, color='dimgrey', width=bar_width, zorder=4)
bars_euclidean = ax.bar(bar_positions_euclidean, constr_avg_euclidean[constr_avg_sims.index], color='tab:blue', width=bar_width, zorder=4)
bars_manhattan = ax.bar(bar_positions_manhattan, constr_avg_manhattan[constr_avg_sims.index], color='tab:orange', width=bar_width, zorder=4)

ax.grid(which='major', axis='y', linestyle='-', color='darkgrey', zorder=0)

# Show the y axis as whole percentages.
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))

mean_line = ax.axhline(overall_avg, color='xkcd:almost black', linestyle='--', zorder=3)

# Pass the handles explicitly so every label is tied to its artist instead of
# relying on the order in which matplotlib happens to collect the artists.
ax.legend(
    handles=[mean_line, bars_sims, bars_euclidean, bars_manhattan],
    labels=[f'Mittelwert: {overall_avg:.2%}', 'gemäß Kosinus-Ähnlichkeit', 'gemäß euklidischer Distanz', 'gemäß Manhattan-Distanz'],
)

ax.set_ylim(-0.35, 0.35)
ax.set_xlabel('Konstruktion (ID)')
ax.set_ylabel('Häufigere Übereinstimmung \n(Allg. Embedding : Pseudowort)')

# Centre each tick under the middle bar of its group of three.
ax.set_xticks(bar_positions_sims + bar_width)
ax.set_xticklabels(constr_strings, rotation=90)

fig.tight_layout()

fig.savefig('ratio_single_in_matches.pdf')
plt.show()

In [None]:
# Mean relative difference per construction, one series per metric.
constr_avg_sims = sims_ex.groupby('constr')["sim_diff"].mean()
constr_avg_euclidean = sims_ex.groupby('constr')["euclidean_diff"].mean()
constr_avg_manhattan = sims_ex.groupby('constr')["manhattan_diff"].mean()

# The cosine-based series defines the left-to-right order of the constructions;
# the other two series are looked up in that same order when they are plotted.
constr_avg_sims = constr_avg_sims.sort_values(ascending=False)
constr_strings = [str(i) for i in list(constr_avg_sims.index)]

# Mean of the three per-column means over all rows; drawn as a dashed line.
overall_avg = sims_ex[['sim_diff', 'euclidean_diff', 'manhattan_diff']].mean().mean()

plt.rcParams['font.family'] = 'Libertinus Serif'
fig, ax = plt.subplots(figsize=(8, 3))

# Three bars per construction, placed side by side.
bar_width = 0.25
bar_positions_sims = np.arange(len(constr_strings))
bar_positions_euclidean = bar_positions_sims + bar_width
bar_positions_manhattan = bar_positions_sims + 2 * bar_width

bars_sims = ax.bar(bar_positions_sims, constr_avg_sims.values, color='dimgrey', width=bar_width, zorder=4)
bars_euclidean = ax.bar(bar_positions_euclidean, constr_avg_euclidean[constr_avg_sims.index], color='tab:blue', width=bar_width, zorder=4)
bars_manhattan = ax.bar(bar_positions_manhattan, constr_avg_manhattan[constr_avg_sims.index], color='tab:orange', width=bar_width, zorder=4)

ax.grid(which='major', axis='y', linestyle='-', color='darkgrey', zorder=0)

# Show the y axis as whole percentages.
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))

mean_line = ax.axhline(overall_avg, color='xkcd:almost black', linestyle='--', zorder=3)

# Pass the handles explicitly so every label is tied to its artist instead of
# relying on the order in which matplotlib happens to collect the artists.
ax.legend(
    handles=[mean_line, bars_sims, bars_euclidean, bars_manhattan],
    labels=[f'Mittelwert: {overall_avg:.2%}', 'gemäß Kosinus-Ähnlichkeit', 'gemäß euklidischer Distanz', 'gemäß Manhattan-Distanz'],
)

ax.set_ylim(-0.35, 0.35)
ax.set_xlabel('Konstruktion (ID)')
ax.set_ylabel('Häufigere Übereinstimmung \n(Allg. Embedding : Pseudowort)')

# Centre each tick under the middle bar of its group of three.
ax.set_xticks(bar_positions_sims + bar_width)
ax.set_xticklabels(constr_strings, rotation=90)

fig.tight_layout()

fig.savefig('ratio_ex_in_matches.pdf')
plt.show()

Show more clearly how often the pseudoword embeddings compare favourably with the regular embeddings:

In [None]:
# Flag, per metric, the rows whose relative difference is strictly positive.
# The comparison is vectorised over the whole column, so no row-wise apply is
# needed and an empty frame is handled as well.
sims_single["pseudo_pred_sim"] = sims_single["sim_diff"] > 0
sims_single["pseudo_pred_euclidean"] = sims_single["euclidean_diff"] > 0
sims_single["pseudo_pred_manhattan"] = sims_single["manhattan_diff"] > 0
sims_single

In [None]:
# One pie chart per metric, showing how many rows of sims_single have the
# pseudo_pred_<metric> flag set versus not set. Each chart is saved as a PDF.
for sim in ("sim", "euclidean", "manhattan"):
    flag_column = "pseudo_pred_" + sim
    constr_counts = sims_single[flag_column].value_counts().sort_values(ascending=False)
    # The index of the counts holds the flag values; map them to display labels.
    constr_strings = [
        "Konstruktion\nvorhergesagt" if flag else "keine Konstr.\nvorhergesagt"
        for flag in constr_counts.index
    ]

    plt.rcParams['font.family'] = 'Libertinus Serif'
    fig, ax = plt.subplots(figsize=(3, 3))

    ax.pie(
        constr_counts,
        labels=constr_strings,
        autopct='%1.1f%%',
        labeldistance=1.2,
        colors=["darkgrey", "grey"],
    )

    fig.tight_layout()

    fig.savefig(f'pseudo_vs_common_in_matches_{sim}.pdf')
    plt.show()

In [None]:
# Flag, per metric, the rows whose relative difference is strictly positive.
# The comparison is vectorised over the whole column, so no row-wise apply is
# needed and an empty frame is handled as well.
sims_ex["pseudo_pred_sim"] = sims_ex["sim_diff"] > 0
sims_ex["pseudo_pred_euclidean"] = sims_ex["euclidean_diff"] > 0
sims_ex["pseudo_pred_manhattan"] = sims_ex["manhattan_diff"] > 0
sims_ex

In [None]:
# One pie chart per metric, showing how many rows of sims_ex have the
# pseudo_pred_<metric> flag set versus not set. Each chart is saved as a PDF.
for sim in ("sim", "euclidean", "manhattan"):
    flag_column = "pseudo_pred_" + sim
    constr_counts = sims_ex[flag_column].value_counts().sort_values(ascending=False)
    # The index of the counts holds the flag values; map them to display labels.
    constr_strings = [
        "Konstruktion\nvorhergesagt" if flag else "keine Konstr.\nvorhergesagt"
        for flag in constr_counts.index
    ]

    fig, ax = plt.subplots(figsize=(3, 3))

    plt.rcParams['font.family'] = 'Libertinus Serif'

    # Wedges are drawn clockwise here (counterclock=False).
    ax.pie(
        constr_counts,
        labels=constr_strings,
        autopct='%1.1f%%',
        labeldistance=1.2,
        colors=["grey", "darkgrey"],
        counterclock=False,
    )

    fig.tight_layout()

    fig.savefig(f'pseudo_vs_common_in_matches_{sim}_ex.pdf')
    plt.show()

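Finally, rank the examples of each construction by a quality score $q = \max(m, 0) \cdot \frac{\Delta + 1}{2} = \max(m, 0) \cdot \frac{\frac{\Delta_{\cos} + \Delta_{2} + \Delta_{1}}{3} + 1}{2} \in [0, 1]$, where $\Delta_{\cos}$, $\Delta_{2}$ and $\Delta_{1}$ are the relative differences for cosine similarity, Euclidean distance and Manhattan distance: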

In [None]:
def example_quality(frame):
    """Return the quality score ``q`` for every row of ``frame``.

    ``q = ((sim_diff + euclidean_diff + manhattan_diff) / 3 + 1) / 2 * max(m, 0)``

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with the columns ``sim_diff``, ``euclidean_diff``,
        ``manhattan_diff`` and ``m``.

    Returns
    -------
    pandas.Series
        Float series aligned with ``frame.index``.
    """
    mean_diff = (frame["sim_diff"] + frame["euclidean_diff"] + frame["manhattan_diff"]) / 3
    # Negative m values are clipped to 0, so those rows get q = 0.
    weight = frame["m"].astype(float).clip(lower=0)
    return (mean_diff + 1) / 2 * weight


# Each table is scored from its own rows. Previously the score for sims_ex was
# computed from sims_single, which put the other table's values into sims_ex.
sims_single["q"] = example_quality(sims_single)
sims_ex["q"] = example_quality(sims_ex)

In [None]:
# Show sims_single including the q column.
sims_single

In [None]:
# Show sims_ex including the q column.
sims_ex

Create a pretty output for human annotation:

In [None]:
# Mean q per (constr, sentence) pair, ordered by construction (ascending) and,
# within a construction, by q (descending).
q_single = (
    sims_single[["constr", "sentence", "q"]]
    .groupby(["constr", "sentence"])
    .mean()
    .sort_values(by=["constr", "q"], ascending=[True, False])
)
# Written tab-separated with a decimal comma.
q_single.to_csv("../../out/matches/q_single.tsv", sep="\t", decimal=",")
q_single

In [None]:
# Mean q per (constr, sentence) pair, ordered by construction (ascending) and,
# within a construction, by q (descending).
q_ex = (
    sims_ex[["constr", "sentence", "q"]]
    .groupby(["constr", "sentence"])
    .mean()
    .sort_values(by=["constr", "q"], ascending=[True, False])
)
# Written tab-separated with a decimal comma.
q_ex.to_csv("../../out/matches/q_ex.tsv", sep="\t", decimal=",")
q_ex

In [None]:
# Stage 1: mean q per (constr, sentence, m), ordered by construction
# (ascending) and, within a construction, by q (descending).
q_single_ranked = (
    sims_single[["constr", "sentence", "m", "q"]]
    .groupby(["constr", "sentence", "m"])
    .mean()
    .sort_values(by=["constr", "q"], ascending=[True, False])
)

# Stage 2: keep the first five rows of every construction, i.e. the five
# highest q values given the ordering above.
q_single_top5 = q_single_ranked.reset_index().groupby(["constr"]).head(5)

# Stage 3: turn all four columns into MultiIndex levels and keep zero data
# columns, so the result consists of the index only.
q_single_comp = pd.MultiIndex.from_frame(q_single_top5).to_frame().iloc[:, 0:0]

#q_single_comp.to_latex("latex/matches_single.tex", multirow=True, column_format="clrr", longtable=True)
q_single_comp.to_csv("../../out/matches/q_single_comp.tsv", sep="\t", decimal=",")
q_single_comp

In [None]:
# Stage 1: mean q per (constr, sentence, m), ordered by construction
# (ascending) and, within a construction, by q (descending).
q_ex_ranked = (
    sims_ex[["constr", "sentence", "m", "q"]]
    .groupby(["constr", "sentence", "m"])
    .mean()
    .sort_values(by=["constr", "q"], ascending=[True, False])
)

# Stage 2: keep the first five rows of every construction, i.e. the five
# highest q values given the ordering above.
q_ex_top5 = q_ex_ranked.reset_index().groupby(["constr"]).head(5)

# Stage 3: turn all four columns into MultiIndex levels and keep zero data
# columns, so the result consists of the index only.
q_ex_comp = pd.MultiIndex.from_frame(q_ex_top5).to_frame().iloc[:, 0:0]

#q_ex_comp.to_latex("latex/matches_ex.tex", multirow=True, column_format="clrr", longtable=True)
q_ex_comp.to_csv("../../out/matches/q_ex_comp.tsv", sep="\t", decimal=",")
q_ex_comp