# Evaluation
## Setup

In [None]:
import plotly.express as px
import plotly.figure_factory as ff
import pandas as pd
from os.path import join as path_join

import aggregation.aggregator as ag
import data.file_handler as fh
import data.file_name_generator as fg
from aggregation.html_generator import render_dictionary_in_html
from aggregation.pipeline_blocks import add_sense_counts_to_id_map, calc_ari, calc_ari_per_token, calc_silhouette_score_per_token
from clustering.metric_name import MetricName
from data.corpus_handler import CorpusName, CorpusHandler

In [None]:
# String names of every corpus that the corpus-level cells below iterate over.
CORPUS_NAMES = [
    member.value
    for member in (
        CorpusName.SEMCOR,
        CorpusName.SENSEVAL2,
        CorpusName.SENSEVAL3,
        CorpusName.SEMEVAL07,
        CorpusName.SEMEVAL13,
        CorpusName.SEMEVAL15,
    )
]

# Cache location handed to every CorpusHandler in this notebook.
CORPUS_CACHE_PATH = './data/corpus_cache'

# Directory that holds the experiment results (stats, dictionaries, word vectors).
# NOTE(review): presumably add_and_get_abs_path creates the folder if missing - confirm.
RESULTS_PATH = fh.add_and_get_abs_path('./data/results')

In [None]:
# Output switches shared by all plotting cells:
#   SHOW_FIG - call ``.show()`` on each figure.
#   SAVE_FIG - write each figure as PDF and print the matching LaTeX snippet.
SHOW_FIG, SAVE_FIG = False, False

In [None]:
# Maps DataFrame column names to the human-readable axis/legend labels
# that are passed to plotly express via ``labels=LABEL_DICT``.
LABEL_DICT = dict(
    unique_sense_count='Unique Sense Count',
    total_token_count='Total Token Count',
    unique_token_count='Unique Token Count',
    ari='ARI',
    silhouette_score='Silhouette Score',
)

## Corpus Evaluation
### Corpus Statistics as LaTeX Table
Presents statistics about corpora as LaTeX tabular rows.

In [None]:
# Statistic keys in the left-to-right column order of the LaTeX table.
STATISTIC_KEYS = (
    'unique_sense_count',
    'total_sense_count',
    'unique_token_count',
    'total_token_count',
    'unique_monosemous_token_count',
    'total_monosemous_token_count',
    'unique_polysemous_token_count',
    'total_polysemous_token_count',
)

# Print one LaTeX tabular row per corpus.
for corpus_name in CORPUS_NAMES:
    corpus = CorpusHandler(corpus_name, CORPUS_CACHE_PATH)
    st = ag.calc_corpus_statistics_for_tagged_senses(corpus)
    # Each count is set in math mode with thousands separators.
    cells = ' & '.join(f"${st[key]:,}$" for key in STATISTIC_KEYS)
    print(f"    \\textbf{{{corpus.corpus_name}}} & {cells} \\\\")

### Our Baselines as LaTeX Table
Presents the No Disambiguation baseline (one cluster/sense per unique token) and Complete Disambiguation baseline (one cluster/sense per token) per corpus as LaTeX tabular rows.

In [None]:
# Print one LaTeX tabular row per corpus with the ARI of both baselines.
for corpus_name in CORPUS_NAMES:
    corpus = CorpusHandler(corpus_name, CORPUS_CACHE_PATH)
    tagged_tokens = corpus.get_tagged_tokens()

    # Complete Disambiguation: every token occurrence gets its own sense id.
    occurrence_count = tagged_tokens.token.count()
    tagged_tokens['sense'] = range(occurrence_count)
    complete_result = calc_ari(corpus.get_tagged_tokens(), tagged_tokens)
    ari_complete_disambiguation = complete_result['ari']

    # No Disambiguation: all occurrences of the same token share one sense id.
    token_codes, _unique_tokens = tagged_tokens.token.factorize()
    tagged_tokens['sense'] = token_codes
    no_disambiguation_result = calc_ari(corpus.get_tagged_tokens(), tagged_tokens)
    ari_no_disambiguation = no_disambiguation_result['ari']

    latex_row = f"    \\textbf{{{corpus_name}}} & ${ari_complete_disambiguation:.4f}$ & ${ari_no_disambiguation:.4f}$ \\\\"
    print(latex_row)

### Relationship Between Sense Counts and Token Counts as Scatter Plot

In [None]:
# Figure size in pixels.
WIDTH = 512
HEIGHT = 512

# One log-log scatter plot per corpus: unique sense count (x) against the
# number of unique tokens (y); marker size and color show the total token count.
for corpus_name in CORPUS_NAMES:
    corpus = CorpusHandler(corpus_name, CORPUS_CACHE_PATH)
    tagged_tokens = corpus.get_tagged_tokens()

    # Only rows flagged by the ``tagged_sense`` column take part in the counts.
    sense_tagged_tokens = tagged_tokens[tagged_tokens.tagged_sense]
    tagged_tokens_and_counts = ag.count_unique_senses_per_token(sense_tagged_tokens)
    token_counts_per_sense_count = ag.count_tokens_per_sense_count(tagged_tokens_and_counts)

    fig = px.scatter(
        token_counts_per_sense_count,
        x='unique_sense_count',
        y='unique_token_count',
        color='total_token_count',
        size='total_token_count',
        color_continuous_scale=px.colors.sequential.Greys,
        log_x=True,
        log_y=True,
        labels=LABEL_DICT,
        template='plotly_white',
        width=WIDTH,
        height=HEIGHT,
    )

    colorbar_style = {
        'title_side': 'right',
        'thicknessmode': "fraction",
        'thickness': 0.04,
        'ticks': "outside",
    }
    fig.update_layout(coloraxis_colorbar=colorbar_style)
    # Black outlines keep light grey markers visible on the white background.
    fig.update_traces(marker=dict(line=dict(color='black')))

    if SHOW_FIG:
        fig.show()
    if SAVE_FIG:
        fig_name = f'fig_{corpus_name.lower()}-tokens_and_senses-scatter'
        fig.write_image(f'./data/plots/{fig_name}.pdf')
        print(f"  \\subfloat[][{corpus_name}]{{\\includegraphics[width=0.38\\textwidth]{{./fig/{fig_name}}}}} \\;")

## Dictionary Evaluation
### ARI per Unique Sense Count as Bar Plot (Multiple Linkage Criteria)
Combined bar plot for one corpus with unique sense counts on the x-axis and their average ARI on the y-axis.
Presents different linkage criteria with separate colors and adds a corresponding legend.

In [None]:
# Figure size in pixels.
WIDTH = 1024
HEIGHT = 387

# Experiments to compare; all of them must belong to the same corpus
# (typically one experiment per linkage criterion).
EXPERIMENT_NAMES = []

if not EXPERIMENT_NAMES:
    # pd.concat would fail on an empty list anyway; fail early with a clear hint.
    raise ValueError('EXPERIMENT_NAMES is empty; list at least one experiment to plot.')

sense_stats = list()
for experiment_name in EXPERIMENT_NAMES:
    stats = fh.load_stats(RESULTS_PATH, fg.gen_stats_file_name(experiment_name))
    corpus = CorpusHandler(stats['corpus_name'], CORPUS_CACHE_PATH)
    dictionary = fh.load_df(RESULTS_PATH, fg.gen_dictionary_file_name(experiment_name))
    dictionary = calc_ari_per_token(corpus.get_tagged_tokens(), dictionary)
    if 'unique_sense_count' not in dictionary.columns:
        dictionary = add_sense_counts_to_id_map(corpus.get_tagged_tokens(), dictionary)
    # Mean ARI per unique sense count, restricted to rows flagged as tagged tokens.
    ari_per_sense_count = dictionary[dictionary.tagged_token].groupby(by='unique_sense_count').aggregate({'ari': 'mean'}).reset_index()
    ari_per_sense_count['Linkage'] = stats['linkage_name']
    ari_per_sense_count['Corpus'] = corpus.corpus_name
    sense_stats.append(ari_per_sense_count)

ari_per_sense_count = pd.concat(sense_stats)
assert ari_per_sense_count.Corpus.nunique() == 1
# Positional access: after concatenating several experiments the index label 0
# occurs once per experiment, so ``.loc[0, 'Corpus']`` would return a Series.
corpus_name = ari_per_sense_count['Corpus'].iloc[0]

fig_ari_per_sense_count = px.bar(
    ari_per_sense_count, x='unique_sense_count', y='ari',
    pattern_shape='Linkage', color='Linkage', barmode='group',
    labels=LABEL_DICT, template='plotly_white', width=WIDTH, height=HEIGHT)
fig_ari_per_sense_count.update_xaxes(
    showgrid=False, ticks="outside", tickson="labels", ticklen=4,
    nticks=int(ari_per_sense_count.unique_sense_count.max()))

if SHOW_FIG:
    # Show the figure, not the DataFrame (DataFrames have no ``show`` method).
    fig_ari_per_sense_count.show()
if SAVE_FIG:
    fig_name = f'fig_{corpus_name.lower()}-affinity_euclidean-ari-bar'
    fig_ari_per_sense_count.write_image(f'./data/plots/{fig_name}.pdf')
    print(f"  \\subfloat[][{corpus_name}]{{\\includegraphics[width=0.49\\textwidth]{{./fig/{fig_name}}}}} \\;")

### Silhouette Coefficient per Unique Sense Count as Bar Plot
Combined bar plot for one corpus with unique sense counts on the x-axis and their average Silhouette Coefficient on the y-axis.
Presents different linkage criteria with separate colors and adds a corresponding legend.

In [None]:
# Figure size in pixels.
WIDTH = 1024
HEIGHT = 387

# Experiments to compare; all of them must belong to the same corpus
# (typically one experiment per linkage criterion).
EXPERIMENT_NAMES = []

if not EXPERIMENT_NAMES:
    # pd.concat would fail on an empty list anyway; fail early with a clear hint.
    raise ValueError('EXPERIMENT_NAMES is empty; list at least one experiment to plot.')

sense_stats = list()
for experiment_name in EXPERIMENT_NAMES:
    stats = fh.load_stats(RESULTS_PATH, fg.gen_stats_file_name(experiment_name))
    corpus = CorpusHandler(stats['corpus_name'], CORPUS_CACHE_PATH)
    word_vectors = fh.load_matrix(RESULTS_PATH, fg.gen_word_vec_file_name(corpus.corpus_name))
    dictionary = fh.load_df(RESULTS_PATH, fg.gen_dictionary_file_name(experiment_name))
    dictionary = calc_silhouette_score_per_token(word_vectors, dictionary, MetricName.EUCLIDEAN)
    dictionary = calc_ari_per_token(corpus.get_tagged_tokens(), dictionary)
    if 'unique_sense_count' not in dictionary.columns:
        dictionary = add_sense_counts_to_id_map(corpus.get_tagged_tokens(), dictionary)
    # Mean silhouette score per unique sense count, restricted to rows flagged
    # as tagged tokens; sense counts without any score are dropped.
    silhouette_score_per_sense_count = dictionary[dictionary.tagged_token].groupby(by='unique_sense_count').aggregate({'silhouette_score': 'mean'}).dropna().reset_index()
    silhouette_score_per_sense_count['Linkage'] = stats['linkage_name']
    silhouette_score_per_sense_count['Corpus'] = corpus.corpus_name
    sense_stats.append(silhouette_score_per_sense_count)

silhouette_score_per_sense_count = pd.concat(sense_stats)
assert silhouette_score_per_sense_count.Corpus.nunique() == 1
# Positional access: after concatenating several experiments the index label 0
# occurs once per experiment, so ``.loc[0, 'Corpus']`` would return a Series.
corpus_name = silhouette_score_per_sense_count['Corpus'].iloc[0]

fig_silhouette_score_per_sense_count = px.bar(
    silhouette_score_per_sense_count, x='unique_sense_count', y='silhouette_score',
    pattern_shape='Linkage', color='Linkage', barmode='group',
    labels=LABEL_DICT, template='plotly_white', width=WIDTH, height=HEIGHT)
fig_silhouette_score_per_sense_count.update_xaxes(
    showgrid=False, ticks="outside", tickson="labels", ticklen=4,
    nticks=int(silhouette_score_per_sense_count.unique_sense_count.max()))
if SHOW_FIG:
    fig_silhouette_score_per_sense_count.show()
if SAVE_FIG:
    fig_name = f'fig_{corpus_name.lower()}-affinity_euclidean-silhouette-bar'
    fig_silhouette_score_per_sense_count.write_image(f'./data/plots/{fig_name}.pdf')
    # The backslash before "includegraphics" is escaped: "\i" is an invalid escape sequence.
    print(f"  \\subfloat[][{corpus_name}]{{\\includegraphics[width=0.49\\textwidth]{{./fig/{fig_name}}}}} \\;")

### Confusion Matrix for Sense Counts
Confusion matrix showing the true sense count vs the predicted sense count.

In [None]:
# Figure size in pixels.
WIDTH = 512
HEIGHT = 512

EXPERIMENT_NAMES = [
    'silhouette/exp_silhouette_SemEval2007',
    'silhouette/exp_silhouette_SemEval2013',
    'silhouette/exp_silhouette_SemEval2015',
    'silhouette/exp_silhouette_Senseval2',
    'silhouette/exp_silhouette_Senseval3',
]

# One heatmap per experiment: number of distinct predicted senses per token
# (rows) against the number of distinct true senses per token (columns).
for experiment_name in EXPERIMENT_NAMES:
    stats = fh.load_stats(RESULTS_PATH, fg.gen_stats_file_name(experiment_name))
    corpus = CorpusHandler(stats['corpus_name'], CORPUS_CACHE_PATH)

    id_map = fh.load_df(RESULTS_PATH, fg.gen_dictionary_file_name(experiment_name))
    tagged_tokens = corpus.get_tagged_tokens()
    id_map = ag.unpack_and_sort_per_token_id(id_map, ['sentence_id', 'token_id', 'sense'])

    # Restrict both frames to the rows flagged by ``tagged_sense``.
    has_sense_tag = tagged_tokens.tagged_sense
    id_map = id_map[has_sense_tag]
    tagged_tokens = tagged_tokens[has_sense_tag]

    # Put predicted and true sense next to each other (aligned on the index).
    id_map = id_map.rename(columns={'sense': 'pred_sense'})
    id_map = id_map.assign(true_sense=tagged_tokens.sense)

    # Count the distinct predicted and true senses of every token.
    distinct_senses = {'pred_sense': 'nunique', 'true_sense': 'nunique'}
    id_map = id_map.groupby(by='token').agg(distinct_senses).reset_index()
    confusion_matrix = pd.crosstab(id_map.pred_sense, id_map.true_sense)

    fig_confusion_matrix = ff.create_annotated_heatmap(
        confusion_matrix.to_numpy(),
        showscale=True,
        x=confusion_matrix.columns.to_list(),
        y=confusion_matrix.index.to_list(),
    )
    fig_confusion_matrix.update_layout(width=WIDTH, height=HEIGHT, template='plotly_white')
    fig_confusion_matrix.update_xaxes(title='True Sense Count', side='bottom')
    fig_confusion_matrix.update_yaxes(title='Predicted Sense Count')

    if SHOW_FIG:
        fig_confusion_matrix.show()
    if SAVE_FIG:
        fig_name = f'{experiment_name}-sense_count_confusion_matrix'
        fig_confusion_matrix.write_image(f'./data/plots/{fig_name}.pdf')
        print(f"  \\subfloat[][{stats['corpus_name']}]{{\\includegraphics[width=0.49\\textwidth]{{./fig/{fig_name}}}}} \\;")

### Histogram for Last Merge Linkage Distances
Combined histogram for one corpus with last merge linkage distances on the x-axis and their frequencies on the y-axis.
Presents different linkage criteria with separate colors and adds a corresponding legend.

In [None]:
# Figure size in pixels.
WIDTH = 1024
HEIGHT = 387

# One experiment per linkage criterion; all of them must share the same corpus.
EXPERIMENT_NAMES = [
    'experiment_linkage_criterion/Senseval3_Euclidean_Average_known_senses',
    'experiment_linkage_criterion/Senseval3_Euclidean_Complete_known_senses',
    'experiment_linkage_criterion/Senseval3_Euclidean_Single_known_senses',
]

per_experiment_distances = []
for experiment_name in EXPERIMENT_NAMES:
    stats = fh.load_stats(RESULTS_PATH, fg.gen_stats_file_name(experiment_name))
    dictionary = fh.load_df(RESULTS_PATH, fg.gen_dictionary_file_name(experiment_name))

    # Keep the last merge distance of entries with at least two tokens and
    # label every row with the linkage criterion and corpus of the experiment.
    has_multiple_tokens = dictionary.total_token_count >= 2
    dictionary = dictionary.loc[has_multiple_tokens, ['last_merge_dist']].assign(
        Linkage=stats['linkage_name'],
        Corpus=stats['corpus_name'],
    )
    per_experiment_distances.append(dictionary)

distance_stats = pd.concat(per_experiment_distances).reset_index()
assert distance_stats.Corpus.nunique() == 1
corpus_name = distance_stats.loc[0, 'Corpus']

fig = px.histogram(
    distance_stats,
    x='last_merge_dist',
    color='Linkage',
    barmode='overlay',
    width=WIDTH,
    height=HEIGHT,
    template='plotly_white',
)
fig.update_xaxes(title='Last Merge Linkage Distance')
fig.update_yaxes(title='Count')

if SHOW_FIG:
    fig.show()
if SAVE_FIG:
    fig_name = f'{corpus_name}-linkage_distance_hist'
    fig.write_image(f'./data/plots/linkage_distances/{fig_name}.pdf')
    print(f"  \\subfloat[][{corpus_name}]{{\\includegraphics[width=0.49\\textwidth]{{./fig/linkage_distances/{fig_name}}}}} \\;")

### Average and Standard Deviation of Last Merge Linkage Distances as LaTeX Table
Presents the mean and standard deviation of linkage distances per corpus as LaTeX tabular rows.
Each linkage criterion has its own column.

In [None]:
# Every corpus combined with every linkage criterion (Average, Complete, Single).
EXPERIMENT_NAMES = [
    'experiment_linkage_criterion/SemCor_Euclidean_Average_known_senses',
    'experiment_linkage_criterion/SemCor_Euclidean_Complete_known_senses',
    'experiment_linkage_criterion/SemCor_Euclidean_Single_known_senses',
    'experiment_linkage_criterion/Senseval2_Euclidean_Average_known_senses',
    'experiment_linkage_criterion/Senseval2_Euclidean_Complete_known_senses',
    'experiment_linkage_criterion/Senseval2_Euclidean_Single_known_senses',
    'experiment_linkage_criterion/Senseval3_Euclidean_Average_known_senses',
    'experiment_linkage_criterion/Senseval3_Euclidean_Complete_known_senses',
    'experiment_linkage_criterion/Senseval3_Euclidean_Single_known_senses',
    'experiment_linkage_criterion/SemEval2007_Euclidean_Average_known_senses',
    'experiment_linkage_criterion/SemEval2007_Euclidean_Complete_known_senses',
    'experiment_linkage_criterion/SemEval2007_Euclidean_Single_known_senses',
    'experiment_linkage_criterion/SemEval2013_Euclidean_Average_known_senses',
    'experiment_linkage_criterion/SemEval2013_Euclidean_Complete_known_senses',
    'experiment_linkage_criterion/SemEval2013_Euclidean_Single_known_senses',
    'experiment_linkage_criterion/SemEval2015_Euclidean_Average_known_senses',
    'experiment_linkage_criterion/SemEval2015_Euclidean_Complete_known_senses',
    'experiment_linkage_criterion/SemEval2015_Euclidean_Single_known_senses',
]

# Linkage columns of the LaTeX table, left to right.
LINKAGE_COLUMNS = ('Average', 'Complete', 'Single')

per_experiment_summaries = []
for experiment_name in EXPERIMENT_NAMES:
    stats = fh.load_stats(RESULTS_PATH, fg.gen_stats_file_name(experiment_name))
    dictionary = fh.load_df(RESULTS_PATH, fg.gen_dictionary_file_name(experiment_name))

    # Mean and standard deviation of the last merge distance over entries with
    # at least two tokens; transposed so that 'mean' and 'std' become columns.
    multi_token_entries = dictionary[dictionary.total_token_count >= 2].reset_index()
    summary = multi_token_entries.aggregate({'last_merge_dist': ['mean', 'std']})
    dictionary = summary.transpose().assign(
        linkage=stats['linkage_name'],
        corpus=stats['corpus_name'],
    )
    per_experiment_summaries.append(dictionary)

distance_stats = pd.concat(per_experiment_summaries)
# One row per corpus, columns indexed by (statistic, linkage).
distance_stats = distance_stats.pivot(index='corpus', columns='linkage', values=['mean', 'std'])
for corpus_name, stats in distance_stats.iterrows():
    # Mean followed by standard deviation for each linkage criterion.
    cells = ' & '.join(
        f"${stats.loc[(measure, linkage)]:.4f}$"
        for linkage in LINKAGE_COLUMNS
        for measure in ('mean', 'std')
    )
    print(f"    \\textbf{{{corpus_name}}} & {cells} \\\\")

### Linkage Distances for All Corpora

In [None]:
# Figure size in pixels.
WIDTH = 1024
HEIGHT = 387
# Maximum number of linkage distances per token; None keeps every distance
# because it is used as the upper bound of a slice.
MAX_LEN = None

# Every corpus combined with every linkage criterion (Average, Complete, Single).
EXPERIMENT_NAMES = [
    'experiment_linkage_criterion/SemCor_Euclidean_Average_known_senses',
    'experiment_linkage_criterion/SemCor_Euclidean_Complete_known_senses',
    'experiment_linkage_criterion/SemCor_Euclidean_Single_known_senses',
    'experiment_linkage_criterion/Senseval2_Euclidean_Average_known_senses',
    'experiment_linkage_criterion/Senseval2_Euclidean_Complete_known_senses',
    'experiment_linkage_criterion/Senseval2_Euclidean_Single_known_senses',
    'experiment_linkage_criterion/Senseval3_Euclidean_Average_known_senses',
    'experiment_linkage_criterion/Senseval3_Euclidean_Complete_known_senses',
    'experiment_linkage_criterion/Senseval3_Euclidean_Single_known_senses',
    'experiment_linkage_criterion/SemEval2007_Euclidean_Average_known_senses',
    'experiment_linkage_criterion/SemEval2007_Euclidean_Complete_known_senses',
    'experiment_linkage_criterion/SemEval2007_Euclidean_Single_known_senses',
    'experiment_linkage_criterion/SemEval2013_Euclidean_Average_known_senses',
    'experiment_linkage_criterion/SemEval2013_Euclidean_Complete_known_senses',
    'experiment_linkage_criterion/SemEval2013_Euclidean_Single_known_senses',
    'experiment_linkage_criterion/SemEval2015_Euclidean_Average_known_senses',
    'experiment_linkage_criterion/SemEval2015_Euclidean_Complete_known_senses',
    'experiment_linkage_criterion/SemEval2015_Euclidean_Single_known_senses',
]

per_experiment_frames = []
for experiment_name in EXPERIMENT_NAMES:
    stats = fh.load_stats(RESULTS_PATH, fg.gen_stats_file_name(experiment_name))
    dictionary = fh.load_df(RESULTS_PATH, fg.gen_dictionary_file_name(experiment_name))

    # One row per dictionary entry holding its (optionally truncated) distances.
    truncated_dists = [dist_arr[:MAX_LEN] for dist_arr in dictionary.linkage_dists.tolist()]
    experiment_frame = pd.DataFrame({
        'Linkage': stats['linkage_name'],
        'linkage_dists': truncated_dists,
    })
    per_experiment_frames.append(experiment_frame)

# Flatten the per-entry distance arrays into one row per distance.
linkage_distances = pd.concat(per_experiment_frames)
linkage_distances = linkage_distances.explode(column='linkage_dists', ignore_index=True)
linkage_distances = linkage_distances.dropna()

fig = px.histogram(
    linkage_distances,
    x='linkage_dists',
    color='Linkage',
    barmode='overlay',
    width=WIDTH,
    height=HEIGHT,
    template='plotly_white',
)
fig.update_xaxes(title='Linkage Distance')
fig.update_yaxes(title='Count')

if SHOW_FIG:
    fig.show()
if SAVE_FIG:
    fig_name = 'all_corpora-linkage_distance_hist'
    fig.write_image(f'./data/plots/linkage_distances/{fig_name}.pdf')
    print(f"  \\subfloat[][All Corpora]{{\\includegraphics[width=0.49\\textwidth]{{./fig/linkage_distances/{fig_name}}}}} \\;")

### Browse a Dictionary as DataFrame

In [None]:
# Name of the experiment whose dictionary should be displayed.
EXPERIMENT_PREFIX = ''
dictionary = fh.load_df(RESULTS_PATH, fg.gen_dictionary_file_name(EXPERIMENT_PREFIX))

# Unpack the dictionary per token id, then pack it again per (token, sense).
unpacked_dictionary = ag.unpack_and_sort_per_token_id(
    dictionary, ['sentence_id', 'token_id', 'sense'])
# Bare expression on the last line so that the notebook renders the result.
ag.pack_sentence_ids_and_token_ids(unpacked_dictionary, ['token', 'sense'])

### Browse a Dictionary as Website

The following code generates an HTML file from the given dictionary.

In [None]:
# Name of the experiment whose dictionary is rendered as a website.
EXPERIMENT_PREFIX = 'toy-affinity_cosine-linkage_single-dist_0.4'
dictionary = fh.load_df(RESULTS_PATH, fg.gen_dictionary_file_name(EXPERIMENT_PREFIX))
stats = fh.load_stats(RESULTS_PATH, fg.gen_stats_file_name(EXPERIMENT_PREFIX))
corpus = CorpusHandler(stats['corpus_name'], CORPUS_CACHE_PATH)

html_dictionary = render_dictionary_in_html(
    dictionary, corpus.get_sentences_as_list(),
    EXPERIMENT_PREFIX)

html_file_name = fg.gen_html_dictionary_file_name(EXPERIMENT_PREFIX)
# Write with an explicit encoding: the platform default (e.g. cp1252 on Windows)
# can raise UnicodeEncodeError on non-ASCII corpus text.
with open(path_join(RESULTS_PATH, html_file_name), "w", encoding="utf-8") as f:
    f.write(html_dictionary)
print(f"HTML file at: {html_file_name}")