In [1]:
import pandas as pd
import qgrid
import numpy as np
from scipy.stats import pearsonr, spearmanr, wasserstein_distance
import json, bz2, pickle
from pprint import pprint
from tqdm.autonotebook import tqdm
from sklearn.preprocessing import minmax_scale

  import sys


In [2]:
with bz2.open('./convai2_results.pickle.bz2') as fin:
    convai2_data = pickle.load(fin)
len(convai2_data)

The history saving thread hit an unexpected error (OperationalError('disk I/O error',)).History will not be written to the database.


2471

In [3]:
dialogue_scores = list()
indices = list()
dialogue_data = dict()

for d in tqdm(convai2_data):
    d_item = dict()
    dialogue_data[str(d['dialog_id'])] = d
    indices.append(str(d['dialog_id']))
    d_item['quality'] = d['quality']
    
    pred_keys = list(d['predictions'].keys())
    bert_keys = list(filter(lambda x: 'bert' in x, pred_keys))
    
    for pred_key in bert_keys:
        pred_sum = sum([np.log(x) for x in d['predictions'][pred_key] if x != 0])
        pred_avg = pred_sum / len(d['predictions'][pred_key])
        d_item['{}_log_sum'.format(pred_key)] = pred_sum
        d_item['{}_log_avg'.format(pred_key)] = pred_avg
        
        pred_sum = sum(d['predictions'][pred_key])
        pred_avg = pred_sum / len(d['predictions'][pred_key])
        d_item['{}_sum'.format(pred_key)] = pred_sum
        d_item['{}_avg'.format(pred_key)] = pred_avg
        
        d_item['{}_prd'.format(pred_key)] = np.prod(d['predictions'][pred_key])        
        d_item['{}_prd_avg'.format(pred_key)] = d_item['{}_prd'.format(pred_key)] / len(d['predictions'][pred_key])
        
    prob_keys = list(filter(lambda x: 'prob' in x and x not in bert_keys, pred_keys))
    
    for pred_key in prob_keys: 
        s_sums = [sum(x) for x in d['predictions'][pred_key]]
        s_sums_d_sum = sum([x for x in s_sums if x != 0])
        s_sums_d_avg = s_sums_d_sum / len(s_sums)
        
        d_item['{}_s_sums_d_sum'.format(pred_key)] = pred_sum
        d_item['{}_s_sums_d_avg'.format(pred_key)] = pred_avg
        
        s_sums = [sum([np.log(x_1) for x_1 in x if x_1 != 0]) for x in d['predictions'][pred_key]]
        s_sums_d_sum = sum([x for x in s_sums if x != 0])
        s_sums_d_avg = s_sums_d_sum / len(s_sums)
        
        d_item['{}_s_log_sums_d_sum'.format(pred_key)] = pred_sum
        d_item['{}_s_log_sums_d_avg'.format(pred_key)] = pred_avg
        
        s_prd = [np.prod(x) for x in d['predictions'][pred_key]]
        s_prd_d_sum = sum(s_sums)
        s_prd_d_avg = s_sums_d_sum / len(s_sums)
        
        d_item['{}_s_prod_d_sum'.format(pred_key)] = pred_sum
        d_item['{}_s_prod_d_avg'.format(pred_key)] = pred_avg
        
        s_avg = [float(sum([np.log(x_1) for x_1 in x if x_1 != 0]) / len(x)) for x in d['predictions'][pred_key] if len(x) > 0]        
        s_avg_d_sum = sum(s_avg)
        s_avg_d_avg = s_avg_d_sum / len(s_avg)        
        
        d_item['{}_s_log_avg_d_sum'.format(pred_key)] = s_avg_d_sum
        d_item['{}_s_log_avg_d_avg'.format(pred_key)] = s_avg_d_avg
        
        s_avg = [float(sum(x) / len(x)) for x in d['predictions'][pred_key] if len(x) > 0]        
        s_avg_d_sum = sum(s_avg)
        s_avg_d_avg = s_avg_d_sum / len(s_avg)        
        
        d_item['{}_s_avg_d_sum'.format(pred_key)] = s_avg_d_sum
        d_item['{}_s_avg_d_avg'.format(pred_key)] = s_avg_d_avg
        
    dialogue_scores.append(d_item)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2471.0), HTML(value='')))




In [4]:
dialogue_scores = pd.DataFrame(dialogue_scores)
dialogue_scores.index = indices
dialogue_scores.head()

Unnamed: 0,quality,bert-base-uncased_nsp_0_log_sum,bert-base-uncased_nsp_0_log_avg,bert-base-uncased_nsp_0_sum,bert-base-uncased_nsp_0_avg,bert-base-uncased_nsp_0_prd,bert-base-uncased_nsp_0_prd_avg,bert-base-uncased_nsp_1_log_sum,bert-base-uncased_nsp_1_log_avg,bert-base-uncased_nsp_1_sum,...,gpt2-large_sentences_best_word_probs_s_sums_d_sum,gpt2-large_sentences_best_word_probs_s_sums_d_avg,gpt2-large_sentences_best_word_probs_s_log_sums_d_sum,gpt2-large_sentences_best_word_probs_s_log_sums_d_avg,gpt2-large_sentences_best_word_probs_s_prod_d_sum,gpt2-large_sentences_best_word_probs_s_prod_d_avg,gpt2-large_sentences_best_word_probs_s_log_avg_d_sum,gpt2-large_sentences_best_word_probs_s_log_avg_d_avg,gpt2-large_sentences_best_word_probs_s_avg_d_sum,gpt2-large_sentences_best_word_probs_s_avg_d_avg
0xab38710,1.0,-3.881846,-0.388185,9.020201,0.90202,0.020613,0.002061273,-98.081533,-9.808153,0.979799,...,0.002793,0.000279,0.002793,0.000279,0.002793,0.000279,-16.465518,-1.646552,2.37464,0.237464
0x7fcf7907,1.0,-0.026552,-0.003319,7.973796,0.996725,0.973798,0.1217247,-86.976495,-10.872062,0.026204,...,0.001093,0.000137,0.001093,0.000137,0.001093,0.000137,-11.353486,-1.419186,2.444737,0.305592
0x7ebe8afe,2.0,-7.814247,-0.48839,14.129607,0.8831,0.000404,2.524617e-05,-132.596779,-8.287299,1.870392,...,1.932181,0.120761,1.932181,0.120761,1.932181,0.120761,-23.252595,-1.453287,4.700146,0.293759
0x7d519415,4.0,-0.004033,-0.000288,13.99597,0.999712,0.995975,0.07114108,-146.911929,-10.493709,0.00403,...,0.001678,0.00012,0.001678,0.00012,0.001678,0.00012,-18.444747,-1.317482,4.799712,0.342837
0x1d81519,1.0,-12.826997,-1.425222,6.97182,0.774647,3e-06,2.985819e-07,-56.654462,-6.29494,2.028179,...,0.992469,0.110274,0.992469,0.110274,0.992469,0.110274,-15.517803,-1.7242,2.266936,0.251882


In [5]:
print(dialogue_scores.shape)
dialogue_scores = dialogue_scores.dropna()
print(dialogue_scores.shape)

(2471, 125)
(2279, 125)


In [6]:
for col in dialogue_scores.columns:
    dialogue_scores[col] = minmax_scale(dialogue_scores[col])

dialogue_scores.head()

Unnamed: 0,quality,bert-base-uncased_nsp_0_log_sum,bert-base-uncased_nsp_0_log_avg,bert-base-uncased_nsp_0_sum,bert-base-uncased_nsp_0_avg,bert-base-uncased_nsp_0_prd,bert-base-uncased_nsp_0_prd_avg,bert-base-uncased_nsp_1_log_sum,bert-base-uncased_nsp_1_log_avg,bert-base-uncased_nsp_1_sum,...,gpt2-large_sentences_best_word_probs_s_sums_d_sum,gpt2-large_sentences_best_word_probs_s_sums_d_avg,gpt2-large_sentences_best_word_probs_s_log_sums_d_sum,gpt2-large_sentences_best_word_probs_s_log_sums_d_avg,gpt2-large_sentences_best_word_probs_s_prod_d_sum,gpt2-large_sentences_best_word_probs_s_prod_d_avg,gpt2-large_sentences_best_word_probs_s_log_avg_d_sum,gpt2-large_sentences_best_word_probs_s_log_avg_d_avg,gpt2-large_sentences_best_word_probs_s_avg_d_sum,gpt2-large_sentences_best_word_probs_s_avg_d_avg
0xab38710,0.0,0.990521,0.94347,0.010007,0.901918,0.020613,0.004122574,0.988179,0.222702,0.010018,...,2.9e-05,0.000327,2.9e-05,0.000327,2.9e-05,0.000327,0.989993,0.582809,0.007627,0.353607
0x7fcf7907,0.0,0.999935,0.999517,0.008846,0.996724,0.973804,0.2434511,0.989517,0.138378,0.000268,...,1.1e-05,0.000159,1.1e-05,0.000159,1.1e-05,0.000159,0.99347,0.706141,0.007867,0.503101
0x7ebe8afe,0.25,0.980918,0.928877,0.015676,0.882978,0.000404,5.049267e-05,0.984019,0.343243,0.019124,...,0.01977,0.142464,0.01977,0.142464,0.01977,0.142464,0.985376,0.687643,0.015609,0.477136
0x7d519415,0.75,0.99999,0.999959,0.015528,0.999715,0.995982,0.1422831,0.982294,0.168366,4.1e-05,...,1.7e-05,0.000139,1.7e-05,0.000139,1.7e-05,0.000139,0.988647,0.761309,0.015951,0.584827
0x1d81519,0.0,0.968677,0.792449,0.007734,0.774407,3e-06,5.971678e-07,0.993172,0.501155,0.020738,...,0.010155,0.130092,0.010155,0.130092,0.010155,0.130092,0.990638,0.54069,0.007257,0.385244


In [7]:
def rmse(predictions, targets):
    return np.sqrt(np.mean((predictions-targets)**2))

all_scores = {col:dict() for col in dialogue_scores.columns[1:]}

for col in dialogue_scores.columns[1:]:
    for f in (pearsonr, spearmanr, wasserstein_distance, rmse):
        scores = f(dialogue_scores.quality, dialogue_scores[col])
        if np.isscalar(scores):
            scores = [scores]
        
        for score, name in zip(scores, [f.__name__, f.__name__+'_p']):
            all_scores[col][name] = round(score, 3)

all_scores = pd.DataFrame.from_dict(all_scores, orient='index')
qgrid.show_grid(all_scores,
               grid_options={
    # SlickGrid options
    'fullWidthRows': True,
    'syncColumnCellResize': True,
    'forceFitColumns': False,
    'defaultColumnWidth': 80,
    'rowHeight': 28,
    'enableColumnReorder': False,
    'enableTextSelectionOnCells': True,
    'editable': True,
    'autoEdit': False,
    'explicitInitialization': True,

    # Qgrid options
    'maxVisibleRows': 15,
    'minVisibleRows': 8,
    'sortable': True,
    'filterable': True,
    'highlightSelectedCell': False,
    'highlightSelectedRow': True
})

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…

In [8]:
key = '0x6cf6296a'
d = dialogue_data[key]
pprint(d['quality'])
pprint(list((idx, u) for idx,u in enumerate(d['utterances'])))
pprint({k:[(idx,'__'.join(u)) for idx,u in enumerate(v)] for k,v in d['predictions'].items() if 'best_words' in k})
pprint(dialogue_scores[dialogue_scores.index == key].to_dict('list'))

5
[(0, 'Yo!'),
 (1, 'Hi!'),
 (2, 'Hi! i am in alaska.'),
 (3, 'Cool! What are you doing there?'),
 (4, 'I am a bit of a yoga instructor.'),
 (5, 'Wow! Are you man or woman?'),
 (6, 'I am! i am a woman.'),
 (7, 'How nice!  Do you have a boyfriend?'),
 (8, 'I do not. i am a single mom.'),
 (9, 'Sad. How old are you?'),
 (10, 'I am 25. i am a bit older than you.'),
 (11, "No, I'am older! Do you have any hobbies?"),
 (12, 'I like to read and write. i love to read.'),
 (13, 'What books do you like to read?'),
 (14, 'I like to read a lot'),
 (15, 'What books?'),
 (16, "I don't know."),
 (17, "I don't believe you."),
 (18, 'I am sorry to hear that. i am a bit of a loner.')]
{'gpt2-large_sentences_best_words': [(0, '!__ĠI'),
                                     (1, "!__Ċ__'m__Ġa__Ġthe__aska__Ġand__Ġi"),
                                     (2, '!__ĠI__Ġis__Ġyou__Ġdoing__Ġin__?__Ċ'),
                                     (3,
                                      "'m__Ġa__Ġstudent__Ġof__Ġa__Ġfan_