In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported modules
# before executing code, so edits to the local helper modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the cell output.
%matplotlib inline

# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import numpy as np
import analysis_functions as af
import analysis_functions2 as af2
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json

In [None]:
# Load the score dictionary that the cells below analyse. Later cells read it as
# a mapping whose values hold a 'system_summaries' dict, each entry of which has
# 'system_summary' and 'scores' fields.
# NOTE(review): af.get_pickle presumably unpickles this file; unpickling can run
# arbitrary code, so only point sd_path at files from a trusted source.
sd_path = "tac.pkl"
sd = af.get_pickle(sd_path)

# EDA

### Print score ranges

In [None]:
# Print the score ranges found in `sd`.
# NOTE(review): presumably one range per metric; confirm in af.print_score_ranges.
af.print_score_ranges(sd)

### How many summaries per document?

In [None]:
# Average number of system summaries attached to a document in `sd`.
summary_counts = [len(doc['system_summaries']) for doc in sd.values()]
print(np.mean(summary_counts))

### Which metrics were found for each document?

In [None]:
# Ask af.get_metrics_list for the metrics present in `sd`, then display the result.
mlist = af.get_metrics_list(sd)
mlist

### Filter some metrics

In [None]:
# Fixed subset of metrics used by every analysis from here on.
mlist = [
    'bert_recall_score',
    'mover_score',
    'rouge_1_recall',
    'rouge_2_recall',
    'rouge_l_recall',
    'js-2',
]

# Properties of documents

## Ease of summarization

In [None]:
# Ease of summarization: call af2.plot_doc for the metrics in `mlist` with the
# x/y quantity keys below.
# NOTE(review): the meaning of 'avg__all_max_m' and 'ktau' is defined inside
# af2.plot_doc; confirm there.
x_type = 'avg__all_max_m'
y_type = 'ktau'
metrics = mlist
plot_options = dict(
    abs_pr_plot_cutoff=0,
    fit_line_window=10,
    metrics_list=mlist,
    show_title=True,
    save_fig=True,
)
for x_modifier in ('doc',):
    af2.plot_doc(sd, x_type, y_type, x_modifier=x_modifier, **plot_options)

## Abstractiveness

In [None]:
# Abstractiveness: call af2.plot_doc for the metrics in `mlist`, with the y label
# hidden but the y tick labels kept.
# NOTE(review): the meaning of 'ref_abstractiveness_wrt_doc' and 'ktau' is defined
# inside af2.plot_doc; confirm there.
x_type = 'ref_abstractiveness_wrt_doc'
y_type = 'ktau'
metrics = mlist
plot_options = dict(
    abs_pr_plot_cutoff=0,
    fit_line_window=15,
    metrics_list=mlist,
    show_y_label=False,
    show_yticklabels=True,
    show_title=True,
    save_fig=True,
)
for x_modifier in ('doc',):
    af2.plot_doc(sd, x_type, y_type, x_modifier=x_modifier, **plot_options)

## Coverage

In [None]:
# Coverage: call af2.plot_doc for the metrics in `mlist` with no x modifier, no
# cutoff metric and no percentile filter; the y label is hidden but the y tick
# labels are kept.
x_type = 'coverage'
y_type = 'ktau'

af2.plot_doc(
    sd,
    x_type,
    y_type,
    x_modifier=None,
    abs_pr_plot_cutoff=0,
    fit_line_window=10,
    cutoff_metric=None,
    percentile=None,
    metrics_list=mlist,
    show_y_label=False,
    show_yticklabels=True,
    show_title=True,
    save_fig=True,
)

# Disagreement

In [None]:
from disagreement_utils import get_pairwise_disagreement
from disagreement_utils import non_cum_get_pairwise_disagreement
import pandas as pd
import seaborn as sns; sns.set(color_codes=True)
import itertools

In [None]:
# Short display labels for each metric key, used when building plot legend entries.
short_name = {
    'bert_recall_score': 'BScore',
    'mover_score': 'MS',
    'rouge_1_recall': 'R1',
    'rouge_2_recall': 'R2',
    'rouge_l_recall': 'RL',
    'js-2': 'JS2',
}

In [None]:
# Flatten `sd` into {doc_id: [record, ...]}, one record per system summary.
# Each record holds the summary text under 'text' plus every entry of that
# summary's 'scores' dict as a top-level key.
data = {}
for doc_id, summaries in sd.items():
    records = []
    for summ in summaries['system_summaries'].values():
        record = {'text': summ['system_summary']}
        # Scores are merged after 'text', so a score key named 'text' would win,
        # exactly as with dict unpacking.
        record.update(summ['scores'])
        records.append(record)
    data[doc_id] = records

## Cumulative

In [None]:
# Cumulative pairwise disagreement: one curve per unordered pair of metrics in
# `mlist`, all drawn on a single shared axes.
# The axes is created up front, so every curve is plotted the same way (no
# "first iteration" special case) and `ax` exists even when `mlist` yields no pairs.
fig, ax = plt.subplots(figsize=(10, 7))
for metric_a, metric_b in itertools.combinations(mlist, 2):
    # NOTE(review): `mat` is assumed to be two-column data (x value, disagreement),
    # as implied by the two column names below; confirm in disagreement_utils.
    mat = get_pairwise_disagreement(metric_a, metric_b, [data])
    metric_string = f'({short_name[metric_a]}, {short_name[metric_b]})'
    pair_df = pd.DataFrame(mat, columns=['average pair score', metric_string])
    pair_df.plot(x='average pair score', y=metric_string, ax=ax, linewidth=3)

ax.legend(prop={'size': 16}, loc='upper left')
# Clear the x label that pandas derives from the x column name.
ax.set_xlabel('')
ax.set_ylabel('Disagreement (%)', fontsize=35)
ax.tick_params(axis='both', labelsize=25)

## Non-cumulative

In [None]:
# Non-cumulative pairwise disagreement: one curve per unordered pair of metrics
# in `mlist`, all drawn on a single shared axes.
# The axes is created up front, so every curve is plotted the same way and `ax`
# exists even when `mlist` yields no pairs.
fig, ax = plt.subplots(figsize=(10, 7))
for metric_a, metric_b in itertools.combinations(mlist, 2):
    # NOTE(review): `mat` is assumed to be two-column data (x value, disagreement),
    # as implied by the two column names below; confirm in disagreement_utils.
    mat = non_cum_get_pairwise_disagreement(metric_a, metric_b, [data])
    metric_string = f'({short_name[metric_a]}, {short_name[metric_b]})'
    pair_df = pd.DataFrame(mat, columns=['average pair score', metric_string])
    # legend=False: this figure is deliberately drawn without a legend, so none is
    # created instead of creating one per call and removing it afterwards.
    pair_df.plot(x='average pair score', y=metric_string, ax=ax, linewidth=3, legend=False)

# No y label is set on this figure either.
# NOTE(review): presumably it is shown beside the cumulative plot, which carries
# the legend and the y label; confirm.
# Clear the x label that pandas derives from the x column name.
ax.set_xlabel('')
ax.tick_params(axis='both', labelsize=25)

# Proportion of overall improvements

In [None]:
from disagreement_utils import proportion_better
from disagreement_utils import proportion_worse
from plotting_utils import plot_proportion_improvements
from plotting_utils import plot_proportion_worse

In [None]:
# Collect proportion_better datapoints on the real scores: first one run with
# m=1000 and the default min_avg, then one run per (m, min_avg) setting below,
# in this order, and plot them all together.
datapoints = proportion_better([data], m=1000)
for extra_m, extra_min_avg in [(1000, 0.2), (1500, 0.3), (2500, 0.35)]:
    datapoints.extend(proportion_better([data], m=extra_m, min_avg=extra_min_avg))
plot_proportion_improvements(datapoints)

# Proportion of overall improvements with random data

In [None]:
import random
def get_random_scores(mlist, rng=None):
    """Return a random score for every metric name in ``mlist``.

    Args:
        mlist: Iterable of metric names; each becomes a key of the result.
        rng: Optional generator exposing a ``random()`` method (for example a
            seeded ``random.Random`` instance). When omitted, the module-level
            ``random.random`` is used, as before.

    Returns:
        A dict mapping each metric name to an independent float drawn uniformly
        from [0.0, 1.0). Empty if ``mlist`` is empty.
    """
    draw = random.random if rng is None else rng.random
    return {m: draw() for m in mlist}

# Random baseline: keep every summary's text but replace its metric scores with
# independent random draws, one per metric in `mlist`.
# Seed the stdlib RNG first so the random scores are identical on every
# Restart & Run All; the seed value itself is arbitrary.
# NOTE(review): this also fixes any later sampling that uses the stdlib `random`
# module; whether proportion_better samples that way is not visible here.
random.seed(0)
data_r = {}
for doc_id, summs in data.items():
    data_r[doc_id] = [{
        'text': summ['text'],
        **get_random_scores(mlist)
    } for summ in summs]

# Same sequence of (m, min_avg) settings as used for the real scores.
datapoints = proportion_better([data_r], m=1000)
datapoints.extend(proportion_better([data_r], m=1000, min_avg=0.2))
datapoints.extend(proportion_better([data_r], m=1500, min_avg=0.3))
datapoints.extend(proportion_better([data_r], m=2500, min_avg=0.35))
plot_proportion_improvements(datapoints, show_ylabel=False)

# Proportion worse

In [None]:
# Collect proportion_worse datapoints on the real scores: first one run with
# m=1000 and the default min_avg, then one run per (m, min_avg) setting below,
# in this order, and plot them all together.
datapoints = proportion_worse([data], m=1000)
for extra_m, extra_min_avg in [(1000, 0.2), (1500, 0.3), (2500, 0.35)]:
    datapoints.extend(proportion_worse([data], m=extra_m, min_avg=extra_min_avg))
plot_proportion_worse(datapoints)

## Proportion worse with random data

In [None]:
# Collect proportion_worse datapoints on the random-score baseline `data_r`:
# first one run with m=1000 and the default min_avg, then one run per
# (m, min_avg) setting below, in this order; plotted without a y label.
datapoints = proportion_worse([data_r], m=1000)
for extra_m, extra_min_avg in [(1000, 0.2), (1500, 0.3), (2500, 0.35)]:
    datapoints.extend(proportion_worse([data_r], m=extra_m, min_avg=extra_min_avg))
plot_proportion_worse(datapoints, show_ylabel=False)