In [1]:
from tqdm import tqdm
import itertools
import os
import json
from collections import defaultdict
import itertools
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import numpy as np
import pickle
import random
import seaborn as sns

In [2]:
# Move up one directory so the relative paths used below ('data/...', 'final_df/...')
# are resolved from the parent folder; -q suppresses the directory echo.
# NOTE(review): the move is relative to the current directory, so re-running this
# cell climbs one more level each time — run it once per kernel.
%cd -q ..

# Save each version of the dataset in the following form:

$$M = \begin{bmatrix} &  & M_{1} & M_{2} & \cdots & M_{|M|} & H_{1} & H_{2} & \cdots &H_{|H|} \\
S_1 & u^{S_1}_1  &  &   &   &   &   &   &   &  \\
S_1 & \cdots  &  &   &   &   &   &   &   &  \\
S_1 & u^{S_1}_{|S_1|}   &  &   &   &   &   &   &   &  \\
 \cdots &     &  &   &   &   &   &   &   &  \\
  \cdots &    &  &   &   &   &   &   &   &  \\
S_{|S|} & u^{S_{|S|}}_1  &  &   &   &   &   &   &   &  \\
S_{|S|} & \cdots  &  &   &   &   &   &   &   &  \\
S_{|S|} & u^{S_{|S|}}_{|S_{|S|}|}   &  &   &   &   &   &   &   &  \\
\end{bmatrix}$$


## Helpers

In [3]:
def prepare_df_for_muli(dic_df):
    """Stack several wide score tables into one long, multi-indexed frame.

    Parameters
    ----------
    dic_df : dict
        Maps an output column name to a ``pandas.DataFrame``. Each row of a
        frame becomes one value of the ``System`` index level (its 0-based
        row position) and each column label one value of the ``Utterance``
        level.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``('System', 'Utterance')`` with one column per key of
        ``dic_df``; the per-key frames are aligned on that index by
        ``pd.concat(..., axis=1)``.

    Raises
    ------
    ValueError
        Propagated from ``pd.concat`` when ``dic_df`` is empty or one of the
        frames has no rows.
    """
    long_frames = []
    for name, wide in dic_df.items():
        by_label = wide.transpose()
        pieces = []
        # Walk the original rows by position: a label lookup (``v[i]``) raises
        # KeyError as soon as the input index is not exactly 0..n-1.
        for position in range(by_label.shape[1]):
            scores = by_label.iloc[:, position]
            pieces.append(pd.DataFrame({
                'Utterance': by_label.index,
                'Score': scores.to_numpy(),
                'System': position,  # scalar is broadcast over the piece
            }))
        long_df = pd.concat(pieces, axis=0).set_index(['System', 'Utterance'])
        # Only the score column is left after set_index; name it after the key.
        long_df.columns = [name]
        long_frames.append(long_df)

    return pd.concat(long_frames, axis=1)

In [4]:
# Folder (relative to the working directory) that holds the pickled score tables.
path = 'data/human_annotations'

# Pickle files this notebook knows about, one entry per line for easier diffing.
AVAILABLE_HUMAN_DATASETS = [
    'Dialogue_dataframe.pickle',
    'Dialogue_qa_engaging_dataframe.pickle',
    'Dialogue_qa_maintain_context_dataframe.pickle',
    'Dialogue_qa_natural_dataframe.pickle',
    'Dialogue_qa_overall_dataframe.pickle',
    'Dialogue_qa_understandable_dataframe.pickle',
    'Dialogue_qa_uses_knowledge_dataframe.pickle',
    'Flickr_dataframe.pickle',
    'Flickr_qa_overall_dataframe.pickle',
    'mlqe-pe_human_dataframe.pickle',
    'mlqe-pe_metric_dataframe.pickle',
    'REALSumm_dataframe.pickle',
    'REALSumm_pyr_dataframe.pickle',
    'SUMM_v2_dataframe.pickle',
    'SUMM_v2_qa_coherence_dataframe.pickle',
    'SUMM_v2_qa_consistency_dataframe.pickle',
    'SUMM_v2_qa_fluency_dataframe.pickle',
    'SUMM_v2_qa_relevance_dataframe.pickle',
    'TAC_dataframe.pickle',
    'TAC_pyr_dataframe.pickle',
    'TAC_responsiveness_dataframe.pickle',
]

# Default selection; the assert catches typos in the file name.
dataset_h = 'TAC_responsiveness_dataframe.pickle'
assert dataset_h in AVAILABLE_HUMAN_DATASETS

In [5]:
# Pickle files holding the automatic-metric tables.
AVAILABLE_ORIGINAL_DATASETS = [
    'Dialogue_dataframe.pickle',
    'Flickr_dataframe.pickle',
    'REALSumm_dataframe.pickle',
    'SUMM_v2_dataframe.pickle',
    'TAC_dataframe.pickle',
]

# Default selection; the assert catches typos in the file name.
dataset_o = 'TAC_dataframe.pickle'
assert dataset_o in AVAILABLE_ORIGINAL_DATASETS

# TAC 

In [6]:
# TAC: two human-judgement pickles and one automatic-metric pickle.
dataset_h = [
    'TAC_pyr_dataframe.pickle',
    'TAC_responsiveness_dataframe.pickle',
]
dataset_o = 'TAC_dataframe.pickle'

# Column label -> loaded pickle, paired with dataset_h in order.
df_humans = {
    label: pd.read_pickle(os.path.join(path, file_name))
    for label, file_name in zip(('H:pyr', 'H:resp'), dataset_h)
}
df_original = pd.read_pickle(os.path.join(path, dataset_o))

In [7]:
# The metric pickle is expected to nest everything under one top-level key.
assert len(df_original.keys()) == 1
key_name_o = [*df_original.keys()][0]
print('Keys:',key_name_o)
# Second-level keys of the metric pickle.
sub_keys_o = [*df_original[key_name_o].keys()]
print('Sub Keys:',sub_keys_o)

Keys: TAC
Sub Keys: ['08', '09', '11']


In [8]:
# Same inspection for the human pickle stored under 'H:pyr'.
key_name_h = [*df_humans['H:pyr'].keys()][0]
print('Keys:',key_name_h)
sub_keys_h = [*df_humans['H:pyr'][key_name_h].keys()]
print('Sub Keys:',sub_keys_h)

Keys: TAC_pyr
Sub Keys: ['08', '09', '11']


In [9]:
# Human and metric pickles must expose the same second-level keys, in the same order.
assert sub_keys_h == sub_keys_o

### SAVING

In [10]:
# Write one CSV per second-level key, with human scores and automatic metrics as columns.
os.makedirs('final_df', exist_ok=True)  # DataFrame.to_csv does not create missing directories
for TAC_name in sub_keys_h:
    final_to_save = {}
    for k, v in df_humans.items():
        if len(v.keys()) == 0:
            # Otherwise the inner loop would not run and a stale (or undefined)
            # human_df would be stored under k.
            raise ValueError('no human annotation table found for {}'.format(k))
        # When a pickle holds several top-level keys, the last one wins.
        for sub_vkeys in v.keys():
            print('Sub_keys',sub_vkeys)
            human_df = v[sub_vkeys][TAC_name]
        final_to_save[k] = human_df
    # Metric name -> score table for this key; added after the human columns.
    metric_df = df_original[key_name_o][TAC_name]
    for k, v in metric_df.items():
        final_to_save[k] = v

    processed_df = prepare_df_for_muli(final_to_save)
    processed_df.to_csv('final_df/TAC_{}.csv'.format(TAC_name))


    

Sub_keys TAC_pyr
Sub_keys TAC_responsiveness
Sub_keys TAC_pyr
Sub_keys TAC_responsiveness
Sub_keys TAC_pyr
Sub_keys TAC_responsiveness


In [11]:
# Preview the frame left over from the saving loop above; processed_df is reassigned
# on every iteration, so this is the table for the last key processed.
processed_df

Unnamed: 0_level_0,Unnamed: 1_level_0,H:pyr,H:resp,S3_pyr,S3_resp,ROUGE_WE_1,ROUGE_WE_2,JS_1,JS_2,ROUGE_L,ROUGE_1,ROUGE_2,BLEU,Chrfpp,BERTScore,MoverScore
System,Utterance,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,M0,0.675,1.0,0.723394,0.798821,0.555831,0.393484,-0.320665,-0.465778,0.401985,0.580645,0.333333,100.000000,0.421909,0.726998,0.413697
0,M1,1.062,1.0,0.740880,0.802141,0.578164,0.398496,-0.301063,-0.465248,0.399504,0.593052,0.325815,100.000000,1.000000,0.741266,0.444839
0,M2,1.030,1.0,0.735437,0.795142,0.560794,0.393484,-0.304503,-0.478472,0.401985,0.588089,0.320802,100.000000,0.468354,0.731971,0.431084
0,M3,0.763,1.0,0.678842,0.753617,0.533499,0.385965,-0.343083,-0.475570,0.372208,0.553350,0.310777,100.000000,0.391787,0.720984,0.401022
0,M4,0.553,0.8,0.277277,0.402272,0.347395,0.155388,-0.469595,-0.637978,0.141439,0.354839,0.067669,6.190456,0.364575,0.588147,0.156335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43,M49,0.098,0.2,0.093416,0.304547,0.199005,0.067839,-0.558379,-0.674522,0.119403,0.211443,0.022613,2.258185,0.283628,0.506886,0.017145
43,M50,0.146,0.4,0.164041,0.336601,0.258706,0.118090,-0.538582,-0.665251,0.121891,0.271144,0.040201,3.413756,0.356994,0.562395,0.141237
43,M51,0.244,0.4,0.159321,0.349101,0.248756,0.087940,-0.538704,-0.652819,0.134328,0.276119,0.052764,4.249683,0.340977,0.553288,0.111457
43,M52,0.293,0.4,0.239507,0.381235,0.288557,0.135678,-0.480853,-0.657634,0.116915,0.315920,0.047739,2.442528,0.351633,0.566271,0.138926


# DIALOG DATAFRAME

In [12]:
# Dialogue: six human-judgement pickles and one automatic-metric pickle.
dataset_h = [
    'Dialogue_qa_engaging_dataframe.pickle',
    'Dialogue_qa_maintain_context_dataframe.pickle',
    'Dialogue_qa_natural_dataframe.pickle',
    'Dialogue_qa_overall_dataframe.pickle',
    'Dialogue_qa_understandable_dataframe.pickle',
    'Dialogue_qa_uses_knowledge_dataframe.pickle',
]
dataset_o = 'Dialogue_dataframe.pickle'

# Column label -> loaded pickle, paired with dataset_h in order.
df_humans = {
    label: pd.read_pickle(os.path.join(path, file_name))
    for label, file_name in zip(
        ('H:engaging', 'H:context', 'H:natural', 'H:overall', 'H:understandable', 'H:useknowledge'),
        dataset_h,
    )
}
df_original = pd.read_pickle(os.path.join(path, dataset_o))

In [13]:
# The metric pickle is expected to nest everything under one top-level key.
assert len(df_original.keys()) == 1
key_name_o = [*df_original.keys()][0]
print('Keys:',key_name_o)
# Second-level keys of the metric pickle.
sub_keys_o = [*df_original[key_name_o].keys()]
print('Sub Keys:',sub_keys_o)

Keys: Dialogue
Sub Keys: ['tc', 'pc']


In [14]:
# Inspect the first human entry: its label and the keys of the pickle behind it.
key_name_h = [*df_humans.keys()][0]
print('Keys:',key_name_h)
sub_keys_h = [*df_humans[key_name_h].keys()]
print('Sub Keys:',sub_keys_h)

Keys: H:engaging
Sub Keys: ['Dialogue_qa_engaging']


In [15]:
# Inspect the raw nested structure of the metric pickle.
# NOTE(review): this displays the whole object; showing only its keys or a single
# table would keep the output readable.
df_original

{'Dialogue': {'tc': {'S3_pyr':           M0        M1        M2        M3        M4
   0  -0.029711  0.013919 -0.085604  0.034792 -0.032013
   1   0.036826  0.064506 -0.000927  0.029123  0.101405
   2   0.085593 -0.018293  0.070064  0.032954  0.187842
   3  -0.086544  0.003041 -0.094708  0.060452 -0.058103
   4   0.300050  0.126067 -0.102334  0.041370  0.171972
   5   0.131952  0.063243  0.047517 -0.091006  0.200616
   6  -0.084546 -0.084546 -0.064130 -0.084546  0.016450
   7   0.154773  0.121887  0.319098  0.450337  0.324301
   8   1.188523  1.241371  1.203002  0.408331  1.095598
   9  -0.039604 -0.034296 -0.066023 -0.054407 -0.017405
   10  0.304444  0.113840  0.359809  0.102536  0.300157
   11  0.032645 -0.054604 -0.015623 -0.025246  0.051001
   12  0.103129  0.074483  0.118823  0.002891  0.296029
   13 -0.080388  0.064381 -0.067963 -0.035136 -0.049757
   14  0.074648  0.178965  0.152989 -0.066524  0.784585
   15  0.235173 -0.054604  0.335995 -0.080388  0.133937
   16 -0.044407  0.0

In [16]:
# Look at one table of the nested pickle: top-level key 'Dialogue', sub-key 'tc', metric 'BERTScore'.
df_original['Dialogue']['tc']["BERTScore"]

Unnamed: 0,M0,M1,M2,M3,M4
0,0.504437,0.533874,0.465017,0.554774,0.490507
1,0.580885,0.553735,0.510175,0.485242,0.516849
2,0.460653,0.45684,0.473013,0.487738,0.540503
3,0.459751,0.498177,0.417464,0.525212,0.445403
4,0.581841,0.49728,0.428881,0.511646,0.619344
5,0.468969,0.402806,0.486724,0.448577,0.536342
6,0.398995,0.377736,0.407099,0.388871,0.472796
7,0.513905,0.543891,0.602393,0.590127,0.644078
8,0.746474,0.807496,0.758556,0.640802,0.738469
9,0.431347,0.42679,0.388713,0.462369,0.475962


### SAVING

In [17]:
# Write one CSV per second-level key, with human scores and automatic metrics as columns.
os.makedirs('final_df', exist_ok=True)  # DataFrame.to_csv does not create missing directories
for DIALOG_name in sub_keys_o:
    final_to_save = {}
    for k, v in df_humans.items():
        if len(v.keys()) == 0:
            # Otherwise the inner loop would not run and a stale (or undefined)
            # human_df would be stored under k.
            raise ValueError('no human annotation table found for {}'.format(k))
        # When a pickle holds several top-level keys, the last one wins.
        for sub_vkeys in v.keys():
            human_df = v[sub_vkeys][DIALOG_name]
        final_to_save[k] = human_df
    # Metric name -> score table for this key; added after the human columns.
    metric_df = df_original[key_name_o][DIALOG_name]
    for k, v in metric_df.items():
        final_to_save[k] = v

    processed_df = prepare_df_for_muli(final_to_save)
    processed_df.to_csv('final_df/DIALOG_{}.csv'.format(DIALOG_name))
    

In [18]:
# Preview the frame left over from the saving loop above; processed_df is reassigned
# on every iteration, so this is the table for the last key processed.
processed_df

Unnamed: 0_level_0,Unnamed: 1_level_0,H:engaging,H:context,H:natural,H:overall,H:understandable,H:useknowledge,S3_pyr,S3_resp,ROUGE_WE_1,ROUGE_WE_2,JS_1,JS_2,ROUGE_L,ROUGE_1,ROUGE_2,BLEU,Chrfpp,BERTScore,MoverScore
System,Utterance,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,M0,3.000000,1.0,3.000000,2.000000,1.000000,0.666667,-0.045194,0.228619,0.166667,0.000000,-0.693147,-0.693147,0.166667,0.166667,0.000000,1.419919,0.102026,0.432498,-0.087815
0,M1,1.666667,3.0,1.666667,3.000000,1.000000,0.000000,0.205413,0.378592,0.333333,0.000000,-0.412726,-0.693147,0.333333,0.333333,0.000000,3.747777,0.133028,0.605510,0.295771
0,M2,1.666667,1.0,2.333333,2.000000,0.333333,0.000000,-0.045194,0.228619,0.166667,0.000000,-0.693147,-0.693147,0.166667,0.166667,0.000000,2.383854,0.074627,0.523330,0.122039
0,M3,3.000000,3.0,3.000000,5.000000,1.000000,1.000000,-0.045194,0.228619,0.166667,0.000000,-0.693147,-0.693147,0.166667,0.166667,0.000000,1.873888,0.108561,0.476112,-0.011978
1,M0,1.666667,1.0,1.666667,1.333333,0.333333,0.000000,-0.102334,0.196061,0.000000,0.000000,-0.693147,-0.693147,0.000000,0.000000,0.000000,2.024446,0.119021,0.411848,-0.096912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,M3,3.000000,3.0,3.000000,5.000000,1.000000,1.000000,-0.147933,0.153592,0.125000,0.000000,-0.693147,-0.693147,0.000000,0.000000,0.000000,1.012223,0.179232,0.414108,-0.066690
59,M0,2.666667,2.0,2.333333,3.000000,0.666667,0.000000,0.141499,0.308119,0.187500,0.200000,-0.576714,-0.615040,0.187500,0.187500,0.066667,5.464223,0.146788,0.471078,-0.023760
59,M1,1.333333,2.0,2.333333,2.333333,1.000000,0.000000,-0.102334,0.196061,0.000000,0.000000,-0.693147,-0.693147,0.000000,0.000000,0.000000,1.792860,0.105058,0.415581,-0.163929
59,M2,2.333333,1.0,2.333333,2.333333,1.000000,0.000000,-0.102334,0.196061,0.000000,0.000000,-0.693147,-0.693147,0.000000,0.000000,0.000000,2.083729,0.095809,0.455860,-0.050552


# Flickr

In [19]:
# Flickr: a single human-judgement pickle and one automatic-metric pickle.
dataset_h = 'Flickr_qa_overall_dataframe.pickle'
dataset_o = 'Flickr_dataframe.pickle'

df_humans = {}
df_humans['H:overall'] = pd.read_pickle(os.path.join(path, dataset_h))
df_original = pd.read_pickle(os.path.join(path, dataset_o))

In [20]:
# The metric pickle is expected to nest everything under one top-level key.
assert len(df_original.keys()) == 1
key_name_o = [*df_original.keys()][0]
print('Keys:',key_name_o)
# Second-level keys of the metric pickle.
sub_keys_o = [*df_original[key_name_o].keys()]
print('Sub Keys:',sub_keys_o)

Keys: Flickr
Sub Keys: ['S3_pyr', 'S3_resp', 'ROUGE_WE_1', 'ROUGE_WE_2', 'JS_1', 'JS_2', 'ROUGE_L', 'ROUGE_1', 'ROUGE_2', 'BLEU', 'Chrfpp', 'BERTScore', 'MoverScore']


In [21]:
# Exactly one human-judgement entry is expected for this dataset.
assert len(df_humans.keys()) == 1
key_name_h = [*df_humans.keys()][0]
print('Keys:',key_name_h)
# Keys of the pickle behind that entry.
sub_keys_h = [*df_humans[key_name_h].keys()]
print('Sub Keys:',sub_keys_h)

Keys: H:overall
Sub Keys: ['Flickr_qa_overall']


### SAVING

In [22]:
# Merge the human scores and every automatic metric into one long-format table and save it.
os.makedirs('final_df', exist_ok=True)  # DataFrame.to_csv does not create missing directories
final_to_save = {}
for k, v in df_humans.items():
    if len(v.keys()) == 0:
        # Otherwise the inner loop would not run and a stale (or undefined)
        # human_df would be stored under k.
        raise ValueError('no human annotation table found for {}'.format(k))
    # When a pickle holds several top-level keys, the last one wins.
    for sub_vkeys in v.keys():
        human_df = v[sub_vkeys]
    final_to_save[k] = human_df
# Metric name -> score table; added after the human columns.
metric_df = df_original[key_name_o]
for k, v in metric_df.items():
    final_to_save[k] = v

processed_df = prepare_df_for_muli(final_to_save)
processed_df.to_csv('final_df/FLICKR.csv')
    

# REALSumm

In [23]:
# REALSumm: a single human-judgement pickle and one automatic-metric pickle.
dataset_h = 'REALSumm_pyr_dataframe.pickle'
dataset_o = 'REALSumm_dataframe.pickle'

df_humans = {}
df_humans['H:pyr'] = pd.read_pickle(os.path.join(path, dataset_h))
df_original = pd.read_pickle(os.path.join(path, dataset_o))

In [24]:
# The metric pickle is expected to nest everything under one top-level key.
assert len(df_original.keys()) == 1
key_name_o = [*df_original.keys()][0]
print('Keys:',key_name_o)
# Second-level keys of the metric pickle.
sub_keys_o = [*df_original[key_name_o].keys()]
print('Sub Keys:',sub_keys_o)

Keys: REALSumm
Sub Keys: ['bert_f_score', 'bert_precision_score', 'bert_recall_score', 'js-2', 'mover_score', 'rouge_1_f_score', 'rouge_1_precision', 'rouge_1_recall', 'rouge_2_f_score', 'rouge_2_precision', 'rouge_2_recall', 'rouge_l_f_score', 'rouge_l_precision', 'rouge_l_recall']


In [25]:
# Exactly one human-judgement entry is expected for this dataset.
assert len(df_humans.keys()) == 1
key_name_h = [*df_humans.keys()][0]
print('Keys:',key_name_h)
# Keys of the pickle behind that entry.
sub_keys_h = [*df_humans[key_name_h].keys()]
print('Sub Keys:',sub_keys_h)

Keys: H:pyr
Sub Keys: ['REALSumm']


### SAVING

In [26]:
# Merge the human scores and every automatic metric into one long-format table and save it.
os.makedirs('final_df', exist_ok=True)  # DataFrame.to_csv does not create missing directories
final_to_save = {}
for k, v in df_humans.items():
    if len(v.keys()) == 0:
        # Otherwise the inner loop would not run and a stale (or undefined)
        # human_df would be stored under k.
        raise ValueError('no human annotation table found for {}'.format(k))
    # When a pickle holds several top-level keys, the last one wins.
    for sub_vkeys in v.keys():
        human_df = v[sub_vkeys]
    final_to_save[k] = human_df
# Metric name -> score table; added after the human columns.
metric_df = df_original[key_name_o]
for k, v in metric_df.items():
    final_to_save[k] = v

processed_df = prepare_df_for_muli(final_to_save)
processed_df.to_csv('final_df/REAL_SUM.csv')
    

# SumEval

In [27]:
# SUMM_v2: four human-judgement pickles and one automatic-metric pickle.
dataset_h = [
    'SUMM_v2_qa_coherence_dataframe.pickle',
    'SUMM_v2_qa_consistency_dataframe.pickle',
    'SUMM_v2_qa_fluency_dataframe.pickle',
    'SUMM_v2_qa_relevance_dataframe.pickle',
]
dataset_o = 'SUMM_v2_dataframe.pickle'

# Column label -> loaded pickle, paired with dataset_h in order.
df_humans = {
    label: pd.read_pickle(os.path.join(path, file_name))
    for label, file_name in zip(
        ('H:coherence', 'H:consistency', 'H:fluency', 'H:relevance'),
        dataset_h,
    )
}
df_original = pd.read_pickle(os.path.join(path, dataset_o))

In [28]:
# The metric pickle is expected to nest everything under one top-level key.
assert len(df_original.keys()) == 1
key_name_o = [*df_original.keys()][0]
print('Keys:',key_name_o)
# Second-level keys of the metric pickle.
sub_keys_o = [*df_original[key_name_o].keys()]
print('Sub Keys:',sub_keys_o)

Keys: SUMM_v2
Sub Keys: ['S3_pyr', 'S3_resp', 'ROUGE_WE_1', 'ROUGE_WE_2', 'JS_1', 'JS_2', 'ROUGE_L', 'ROUGE_1', 'ROUGE_2', 'BLEU', 'Chrfpp', 'BERTScore', 'MoverScore']


In [29]:
# Inspect the first human entry: its label and the keys of the pickle behind it.
key_name_h = [*df_humans.keys()][0]
print('Keys:',key_name_h)
sub_keys_h = [*df_humans[key_name_h].keys()]
print('Sub Keys:',sub_keys_h)

Keys: H:coherence
Sub Keys: ['SUMM_v2_qa_coherence']


### SAVING

In [30]:
# Merge the human scores and every automatic metric into one long-format table and save it.
os.makedirs('final_df', exist_ok=True)  # DataFrame.to_csv does not create missing directories
final_to_save = {}
for k, v in df_humans.items():
    if len(v.keys()) == 0:
        # Otherwise the inner loop would not run and a stale (or undefined)
        # human_df would be stored under k.
        raise ValueError('no human annotation table found for {}'.format(k))
    # When a pickle holds several top-level keys, the last one wins.
    for sub_vkeys in v.keys():
        human_df = v[sub_vkeys]
    final_to_save[k] = human_df
# Metric name -> score table; added after the human columns.
metric_df = df_original[key_name_o]
for k, v in metric_df.items():
    final_to_save[k] = v

processed_df = prepare_df_for_muli(final_to_save)
processed_df.to_csv('final_df/SUM_EVAL.csv')

# MLQE

In [31]:
# MLQE-PE: a single human-judgement pickle and one automatic-metric pickle.
dataset_h = 'mlqe-pe_human_dataframe.pickle'
dataset_o = 'mlqe-pe_metric_dataframe.pickle'

df_humans = {}
df_humans['H:mlqe'] = pd.read_pickle(os.path.join(path, dataset_h))
df_original = pd.read_pickle(os.path.join(path, dataset_o))

In [32]:
# The metric pickle is expected to nest everything under one top-level key.
assert len(df_original.keys()) == 1
key_name_o = [*df_original.keys()][0]
print('Keys:',key_name_o)
# Second-level keys of the metric pickle.
sub_keys_o = [*df_original[key_name_o].keys()]
print('Sub Keys:',sub_keys_o)

Keys: MLQE-PE
Sub Keys: ['XBERTScore', 'XMoverScore (UMD)', 'XMoverScore (CLP)', 'Unsupervised XMoverScore', 'Unsupervised ContrastScore', 'Unsupervised XmoverScore + ContrastScore', 'Fine-tuned XMoverScore', 'Fine-tuned ContrastScore', 'Fine-tuned XMoverScore + ContrastScore']


In [33]:
# Exactly one human-judgement entry is expected for this dataset.
assert len(df_humans.keys()) == 1
key_name_h = [*df_humans.keys()][0]
print('Keys:',key_name_h)
# Keys of the pickle behind that entry.
sub_keys_h = [*df_humans[key_name_h].keys()]
print('Sub Keys:',sub_keys_h)

Keys: H:mlqe
Sub Keys: ['MLQE-PE']


### SAVING

In [34]:
# Merge the human scores and every automatic metric into one long-format table and save it.
os.makedirs('final_df', exist_ok=True)  # DataFrame.to_csv does not create missing directories
final_to_save = {}
for k, v in df_humans.items():
    if len(v.keys()) == 0:
        # Otherwise the inner loop would not run and a stale (or undefined)
        # human_df would be stored under k.
        raise ValueError('no human annotation table found for {}'.format(k))
    # When a pickle holds several top-level keys, the last one wins.
    for sub_vkeys in v.keys():
        human_df = v[sub_vkeys]
    final_to_save[k] = human_df
# Metric name -> score table; added after the human columns.
metric_df = df_original[key_name_o]
for k, v in metric_df.items():
    final_to_save[k] = v

processed_df = prepare_df_for_muli(final_to_save)
processed_df.to_csv('final_df/MLQE.csv')