In [None]:
import analysis.aggregator as ag
import data.file_handler as fh
from analysis.pipeline_blocks import evaluate_clustering
from data.corpus_handler import CorpusName, CorpusHandler
import plotly.express as px

In [None]:
# Directory holding the pickled result files (resolved through the project's file handler).
ABS_PATH = fh.add_and_get_abs_path('./data/results')

# Result file to analyse. Swap in 'SemCor-dist_0.4-dictionary.pkl' to analyse
# the SemCor run instead of the toy run.
DICTIONARY_FILE = 'Toy-linkage_complete-dist_0.4-dictionary.pkl'

# NOTE(review): the file is a pickle; only load result files produced by this project.
dictionary = fh.load_df(ABS_PATH, DICTIONARY_FILE)

In [None]:
# Step 1: unpack the dictionary on its per-word-vector columns.
# (presumably yields one row per word vector -- confirm in analysis.aggregator)
per_vector_columns = ['reference_id', 'word_vector_id', 'sense']
unpacked_dictionary = ag.unpack_per_word_vector(dictionary, per_vector_columns)

# Step 2: collect references and word vectors over the ('token', 'sense') columns;
# as the cell's last expression, the result is displayed.
ag.collect_references_and_word_vectors(unpacked_dictionary, ['token', 'sense'])

## Corpus Baseline

In [None]:
# Path handed to the corpus handler (presumably its on-disk cache directory -- confirm).
CORPUS_CACHE_DIR = './data/corpus_cache'

# SemCor corpus handler and the tagged tokens it provides; both are reused by the cells below.
corpus = CorpusHandler(CorpusName.SEMCOR, CORPUS_CACHE_DIR)
tagged_tokens = corpus.get_tagged_tokens()

In [None]:
# Compute the aggregator's statistics for the loaded corpus; as the cell's
# last expression, the return value is rendered as the cell output.
ag.calc_corpus_statistics(corpus)

Baseline: one cluster/sense per token occurrence (every occurrence forms its own cluster)

In [None]:
# Baseline: give every row of the tagged-token frame its own cluster id.
#
# len() is used instead of token.count(): count() skips missing values, so a
# single NaN token would make the id range shorter than the frame and the
# column assignment would fail with a length mismatch.
#
# assign() returns a relabelled copy, so `tagged_tokens` itself is left
# untouched and this cell gives the same result however often it is re-run.
one_sense_per_token = tagged_tokens.assign(sense=range(len(tagged_tokens)))

# Displayed as (number of clusters, clustering evaluation against the corpus).
one_sense_per_token.sense.nunique(), evaluate_clustering(corpus, one_sense_per_token)

Baseline: one cluster/sense per unique token (all occurrences of the same token share one cluster)

In [None]:
# Baseline: rows with the same token value share one cluster id.
#
# factorize() returns (codes, uniques); the integer codes serve as cluster ids.
# assign() returns a relabelled copy instead of overwriting the 'sense' column
# of `tagged_tokens` in place, so the input frame is not changed by this cell.
one_sense_per_unique_token = tagged_tokens.assign(sense=tagged_tokens.token.factorize()[0])

# Displayed as (number of clusters, clustering evaluation against the corpus).
one_sense_per_unique_token.sense.nunique(), evaluate_clustering(corpus, one_sense_per_unique_token)

## Plots

In [None]:
# Human-readable labels for the count columns, passed to plotly via `labels=`.
LABEL_DICT = {
    'unique_sense_count': 'Unique Sense Count',
    'total_token_count': 'Total Token Count',
    'unique_token_count': 'Unique Token Count',
}

# Figure dimensions used for the plots (plotly interprets them as pixels).
WIDTH = 512
HEIGHT = 256

In [None]:
# Fetch the tagged tokens again so the plots section does not depend on
# whatever earlier cells did to `tagged_tokens`.
# NOTE(review): this only isolates the section if get_tagged_tokens() returns
# a fresh frame rather than a shared cached object -- confirm in CorpusHandler.
tagged_tokens = corpus.get_tagged_tokens()
# Per-token sense counts; the result has a `unique_sense_count` column (filtered on in a later cell).
tagged_tokens_and_counts = ag.count_unique_senses_per_token(tagged_tokens)
# Aggregation by sense count; the plot below reads `unique_sense_count`,
# `unique_token_count` and `total_token_count` from this frame.
token_counts_per_sense_count = ag.count_tokens_per_sense_count(tagged_tokens_and_counts)
# Preview the first rows only.
token_counts_per_sense_count.head()

In [None]:
# Log-log bubble chart: x = unique sense count, y = unique token count;
# bubble size and grey shade both encode the total token count.
fig = px.scatter(
    token_counts_per_sense_count,
    x='unique_sense_count',
    y='unique_token_count',
    color='total_token_count',
    size='total_token_count',
    color_continuous_scale=px.colors.sequential.Greys,
    log_x=True,
    log_y=True,
    labels=LABEL_DICT,
    template='plotly_white',
    width=WIDTH,
    # NOTE(review): height uses WIDTH (square figure) although HEIGHT is
    # defined -- confirm this is intentional.
    height=WIDTH,
)

# Slim colorbar: title on the right, outside ticks every 10000 tokens.
colorbar_style = {
    'title_side': 'right',
    'thicknessmode': 'fraction',
    'thickness': 0.04,
    'ticks': 'outside',
    'dtick': 10000,
}
fig.update_layout(coloraxis_colorbar=colorbar_style)

# Black marker outlines keep light-grey bubbles visible on the white background.
fig.update_traces(marker=dict(line=dict(color='black')))

fig.show()
# Uncomment to export the figure as a PDF:
#fig.write_image('data/plots/fig_semcor_tokens_and_senses.pdf')

In [None]:
# Rows whose unique sense count exceeds 50, ordered by ascending sense count.
has_many_senses = tagged_tokens_and_counts.unique_sense_count > 50
tagged_tokens_and_counts[has_many_senses].sort_values(by='unique_sense_count')