In [203]:
import utils
import relations

import pandas as pd
import os
import csv

from collections import defaultdict

pd.set_option('display.max_rows', None)


def get_overview(metrics, level, model_name, evidence_type):
    path_dir = f'../analysis/{model_name}/{level}'
    overview_dict = defaultdict(dict)
    if level == 'properties':
        target = 'property'
    elif level == 'relations':
        target = 'relation'
    for metric in metrics:
        path = f'{path_dir}/{metric}.csv'
        with open(path) as infile:
            data = list(csv.DictReader(infile))
        for d in data:
            if target in d:
                prop = d[target]
            else:
                prop = d['']
            sc = d[evidence_type]
            if sc == '':
                sc = 0
            overview_dict[prop][metric] = float(sc)
    df = pd.DataFrame(overview_dict)
    return df.T

# Overview properties

## Giga

In [189]:
metrics = ['proportion', 'diversity', 'coherence',
           #, 'coherence', 
           'dist-mean', 'dist-max', 
           'str-mean', 'str-max'
          ]
level = 'properties'
evidence_type = 'u'
model_name = 'giga_full_updated'
df = get_overview(metrics, level, model_name, evidence_type)
df = utils.raw_to_distance(df, score_names = metrics, reference_name = 'median', 
                            score = 'dist-percent', sum_scores = ['sum', 'bin'])

df = df.sort_values('sum', ascending=False)
#print(df[3:7].round(4).to_latex())
df

Unnamed: 0,proportion,diversity,coherence,dist-mean,dist-max,str-mean,str-max,sum,bin
median-reference,0.921154,722.0,0.100993,0.614247,0.822578,0.002555,0.028974,103.498643,1.0
roll,0.04396,4.174515,-0.087623,0.058955,0.085406,-0.139451,0.354987,0.641536,0.714286
square,0.039399,0.99446,0.0146,0.048517,0.215691,0.234612,1.197943,0.392175,1.0
warm,0.023252,1.750693,-0.086445,-0.194613,-0.06938,-0.008776,0.984035,0.342681,0.428571
blue,0.003471,1.860111,-0.082255,0.068068,0.103284,-0.291829,0.252356,0.273315,0.714286
cold,0.012285,0.396122,-0.014383,-0.054938,0.091753,0.386357,0.976742,0.256277,0.714286
red,0.026615,1.314404,-0.066213,-0.135015,0.049739,-0.288636,0.042667,0.134795,0.571429
juicy,-0.05111,-0.019391,0.123604,-0.016785,-0.013744,0.402592,0.450158,0.125047,0.428571
black,0.03174,0.512465,-0.044269,-0.052819,-0.061746,-0.086024,0.328263,0.089659,0.428571
swim,0.043146,0.83795,-0.091815,-0.118913,0.125639,-0.230697,0.006234,0.081649,0.571429


In [190]:
metrics = ['proportion', 'diversity', 'coherence', 
           'dist-mean', 'dist-max', 
           'str-mean', 'str-max']
level = 'properties'
evidence_type = 'prop-specific'
model_name = 'giga_full_updated'
df = get_overview(metrics, level, model_name, evidence_type)
df = utils.raw_to_distance(df, score_names = metrics, reference_name = 'median', 
                           score = 'dist-percent', sum_scores = ['sum', 'bin'])
#df['sum'] = df.sum(axis=1)
df = df.sort_values('sum', ascending=False)
#print(df.round(2).fillna('-').to_latex())
#df.round(2)
df

Unnamed: 0,proportion,diversity,coherence,dist-mean,dist-max,str-mean,str-max,sum,bin
female,11.531947,0.2,0.673054,-0.152607,-0.123384,-0.383965,-0.444442,1.614372,0.428571
sweet,9.741669,-0.2,0.351083,0.096445,0.121969,0.173015,0.177552,1.494533,0.857143
dangerous,2.881079,8.2,-0.310735,-0.033313,0.060196,-0.56567,-0.32637,1.415027,0.428571
hot,5.713543,0.2,-0.193906,-0.027047,-0.019015,1.548077,0.830291,1.150278,0.571429
juicy,-0.535854,-0.2,0.840266,0.060196,0.0,3.304115,3.376023,0.977821,0.571429
lay_eggs,4.080519,-0.2,1.395491,0.111017,0.063633,0.431466,-0.055215,0.832416,0.714286
used_in_cooking,1.62908,2.2,0.110738,0.101309,0.174255,0.308937,0.531356,0.722239,1.0
median-reference,0.00532,2.5,0.346576,0.747309,0.792711,0.007886,0.014082,0.630555,1.0
wheels,2.081626,0.6,0.112627,0.060358,0.063682,0.48359,0.505446,0.55819,1.0
yellow,2.614985,-0.6,,0.057362,-0.003198,0.905098,0.066838,0.506848,0.666667


# Both models

In [223]:
from collections import defaultdict
import numpy as np
import csv
import os

metrics = [
        'proportion', 
        'diversity', 
           'coherence', 
           'dist-mean', 'dist-max', 
           'str-mean', 'str-max'
]

level = 'properties'
evidence_types = ['all', 'prop-specific', 'non-specific', 'u']

model_name = 'wiki_updated'
model_name_d = 'wiki_corpus'
# model_name = 'giga_full_updated'
# model_name_d = 'giga_corpus'
# load diagnostic results
path = '../data/diagnostic_classification/selectivity-giga_corpus_wiki_corpus_googlenews.csv'
df_d = pd.read_csv(path, index_col=0)


model_dict = defaultdict(dict)
new_model_dict = dict()

for evidence_type in evidence_types:
    df = get_overview(metrics, level, model_name, evidence_type)
    df = utils.raw_to_distance(df, score_names = metrics, reference_name = 'median', 
                           score = 'dist-percent', sum_scores = ['sum', 'bin'])
    
    learned = df_d[model_name_d]['learned'].split(' ')
    not_learned = df_d[model_name_d]['not learned'].split(' ')
    for i, row in df.iterrows():
        model_dict[i][f'{evidence_type}-sum'] = round(row['sum'], 2)
        model_dict[i][f'{evidence_type}-bin'] = round(row['bin'], 2)
for p, d in model_dict.items():
    if p not in ['median', 'median-reference']:
        if p in learned:
            diag = True
        elif p in not_learned:
            diag = False
        elif p == 'female':
            diag = True
        d = model_dict[p]
        d[f'cl'] = diag
        enough_scores = set()
        evidence_types_enough = set()
        for e in evidence_types:
            s = d[f'{e}-sum']
            b = d[f'{e}-bin']
            if s > 0.0 and b > 0.5:
                enough_scores.add(True)
                evidence_types_enough.add(e)
        d['et_enough'] = ' '.join(sorted(list(evidence_types_enough)))
        new_model_dict[p] = d

             
df = pd.DataFrame(new_model_dict).T.fillna('-')
# print(df.to_latex())    
#print(df.to_latex())
path_dir = '../analysis/diagnostic_classification'
os.makedirs(path_dir, exist_ok=True)
df.to_csv(f'{path_dir}/comparison-{model_name}.csv')

df.sort_values('cl')

Unnamed: 0,all-sum,all-bin,prop-specific-sum,prop-specific-bin,non-specific-sum,non-specific-bin,u-sum,u-bin,cl,et_enough
warm,0.22,0.29,-0.11,0.43,0.1,0.43,0.42,0.57,False,u
black,0.06,0.29,-0.03,0.29,-0.18,0.14,0.22,0.43,False,
red,0.1,0.14,-0.02,0.5,0.07,0.29,0.16,0.43,False,
round,0.11,0.29,-0.23,0.14,-0.01,0.57,0.28,0.57,False,u
cold,0.03,0.57,-0.25,0.0,-0.25,0.14,0.19,0.71,False,all u
sweet,0.15,0.57,1.09,0.83,0.79,0.71,-0.3,0.29,False,all non-specific prop-specific
hot,0.04,0.43,2.57,0.71,0.28,0.86,-0.09,0.29,False,non-specific prop-specific
roll,0.81,0.57,0.31,0.43,0.14,0.29,1.08,0.71,False,all u
green,-0.08,0.43,-0.17,0.5,0.11,0.29,-0.06,0.29,False,
blue,0.29,0.43,-0.13,0.5,0.26,0.43,0.37,0.57,False,u


In [231]:
# compare evidence type scores

def get_comparison_evidence(evidence_type, model_name, metrics):
    
    path_dir = '../analysis/diagnostic_classification'
    df = pd.read_csv(f'{path_dir}/comparison-{model_name}.csv', index_col = 0)
    prop_dict = df.to_dict('index')
    
    level = 'properties'
    print(evidence_type)
    df_overview = get_overview(metrics, level, model_name, evidence_type)
    prop_overview_dict = df_overview.to_dict('index')
   
    new_df_dict_enough = dict()
    new_df_dict_not_enough = dict()
    df_total = dict()
    
    cl_dict = dict()

    for prop, d in prop_overview_dict.items():
        if prop not in ['median', 'median-reference']:
            cl = prop_dict[prop]['cl']
            if cl == False:
                new_df_dict_not_enough[(prop+'-'+evidence_type)] = d
                df_total[prop] = dict()
                df_total[prop].update(d)
                df_total[prop]['cl'] = cl

            else:
                et_types = prop_dict[prop]['et_enough']
                if type(et_types) == str:
                    if evidence_type in et_types:
                        new_df_dict_enough[(prop+'-'+evidence_type)] = d
                        df_total[prop] = dict()
                        df_total[prop].update(d)
                        df_total[prop]['cl'] = cl
            
            #df_total[prop]['i'] = i
            
           
    df_enough = pd.DataFrame(new_df_dict_enough).T.sort_index()

    df_mean = pd.DataFrame()

    df_mean[('mean', 'enough')] = df_enough.mean(axis = 0)
    df_mean[('median', 'enough' )] = df_enough.median(axis = 0)
    df_mean[('std', 'enough')] = df_enough.std(axis = 0)

    df_not_enough = pd.DataFrame(new_df_dict_not_enough).T.sort_index()
    df_mean[('mean', 'not-enough')] = df_not_enough.mean(axis = 0, skipna = True)
    df_mean[('median', 'not-enough' )] = df_not_enough.median(axis = 0, skipna = True)
    df_mean[('std', 'not-enough' )] = df_not_enough.std(axis = 0, skipna = True)
    print('n enough', len(new_df_dict_enough.keys()))
    print('n not enough', len(new_df_dict_not_enough.keys()))
    
    df_total = pd.DataFrame(df_total).T
    return df_mean, df_total




In [232]:

metrics = [
            'proportion',
            'diversity', 
           'coherence', 
           'dist-mean', 'dist-max', 
           'str-mean', 'str-max'
          ]


evidence_type = 'all' #'prop-specific'#, 'non-specific', 'u']
#model_name = 'giga_full_updated'
model_name = 'wiki_updated'
df_mean, df_total = get_comparison_evidence(evidence_type, model_name, metrics)
df_mean.round(4).T.sort_index()
#df_total.sort_values('cl')
#df_mean

all
n enough 5
n not enough 10


Unnamed: 0,Unnamed: 1,proportion,diversity,coherence,dist-mean,dist-max,str-mean,str-max
mean,enough,1.0,296.4,0.1349,0.6649,0.8837,0.0058,0.0449
mean,not-enough,1.0,1087.1,0.1098,0.5945,0.8317,0.0036,0.0357
median,enough,1.0,122.0,0.1311,0.661,0.8926,0.0056,0.0436
median,not-enough,1.0,927.0,0.1008,0.5882,0.8163,0.0028,0.0373
std,enough,0.0,269.2727,0.0141,0.0383,0.0546,0.0021,0.0074
std,not-enough,0.0,1002.8202,0.0176,0.0666,0.07,0.002,0.0104


In [233]:
df_total.sort_values('cl')

Unnamed: 0,proportion,diversity,coherence,dist-mean,dist-max,str-mean,str-max,cl
warm,1,1181,0.0996022,0.505907,0.758864,0.00304041,0.0559694,False
black,1,820,0.097678,0.615303,0.773284,0.00247025,0.0429802,False
red,1,1275,0.100407,0.537008,0.780208,0.00210826,0.029776,False
sweet,1,26,0.150881,0.689588,0.80821,0.00798246,0.0366528,False
hot,1,92,0.129011,0.631017,0.830864,0.00654335,0.0294876,False
green,1,550,0.113325,0.553604,0.843946,0.00289506,0.0187873,False
blue,1,1819,0.0983739,0.672057,1.0,0.00197607,0.0253429,False
roll,1,3484,0.0969261,0.657345,0.888729,0.00247937,0.041851,False
cold,1,590,0.11094,0.522039,0.812768,0.00359904,0.0378891,False
round,1,1034,0.101197,0.561045,0.819798,0.00280233,0.0385253,False


In [234]:
evidence_type = 'prop-specific'#, 'non-specific', 'u']
df_mean, df_total = get_comparison_evidence(evidence_type, model_name, metrics)
df_mean.round(4).T.sort_index()
#df_total.sort_values('cl')

prop-specific
n enough 6
n not enough 10


Unnamed: 0,Unnamed: 1,proportion,diversity,coherence,dist-mean,dist-max,str-mean,str-max
mean,enough,0.0326,5.8333,0.3409,0.7629,0.8215,0.0129,0.0199
mean,not-enough,0.015,3.9,0.1589,0.6834,0.74,0.0101,0.0154
median,enough,0.0181,5.5,0.3846,0.7652,0.8269,0.0093,0.0227
median,not-enough,0.0036,3.0,0.2031,0.6856,0.7773,0.0096,0.0142
std,enough,0.0317,2.9944,0.1813,0.0559,0.0478,0.0116,0.0114
std,not-enough,0.0277,3.3483,0.1437,0.1297,0.1155,0.0045,0.0079


In [235]:
df_total.sort_values('cl')

Unnamed: 0,proportion,diversity,coherence,dist-mean,dist-max,str-mean,str-max,cl
warm,0.00592718,7,0.338158,0.459522,0.528743,0.0033631,0.00536185,False
black,0.00487805,4,0.27363,0.613907,0.773284,0.0065165,0.0174455,False
red,0.000784314,1,0.0,0.780208,0.780208,0.0169457,0.0169457,False
sweet,0.0384615,1,0.0,0.78555,0.78555,0.015178,0.015178,False
hot,0.0869565,8,0.232203,0.702513,0.830864,0.0133791,0.0294876,False
green,0.00181818,1,0.0,0.843946,0.843946,0.0106611,0.0106611,False
blue,0.000549753,1,0.0,0.830071,0.830071,0.0132659,0.0132659,False
roll,0.00287026,10,0.256339,0.668639,0.774423,0.00855531,0.0273939,False
cold,0.00338983,2,0.314954,0.523592,0.535868,0.00757405,0.0099351,False
round,0.00386847,4,0.173969,0.625804,0.716757,0.00529467,0.00791909,False


In [217]:
evidence_type = 'non-specific'#, 'non-specific', 'u']
df_mean, df_total = get_comparison_evidence(evidence_type, model_name, metrics)
df_mean.round(4).T.sort_index()

non-specific
n enough 7
n not enough 12


Unnamed: 0,Unnamed: 1,proportion,diversity,coherence,dist-mean,dist-max,str-mean,str-max
mean,enough,0.2956,87.7143,0.258,0.7293,0.8832,0.0101,0.042
mean,not-enough,0.0871,79.25,0.1683,0.6073,0.7599,0.0054,0.0241
median,enough,0.2703,36.0,0.2733,0.7279,0.8684,0.0096,0.0428
median,not-enough,0.0603,81.0,0.1415,0.586,0.7503,0.0053,0.0224
std,enough,0.2199,124.3741,0.0512,0.0479,0.0479,0.0022,0.0164
std,not-enough,0.0644,48.86,0.0486,0.066,0.0463,0.0026,0.0122


In [218]:
evidence_type = 'u'#, 'non-specific', 'u']
df_mean, df_total = get_comparison_evidence(evidence_type, model_name, metrics)
df_mean.round(4).T.sort_index()

u
n enough 3
n not enough 12


Unnamed: 0,Unnamed: 1,proportion,diversity,coherence,dist-mean,dist-max,str-mean,str-max
mean,enough,0.9549,1222.6667,0.097,0.6168,0.9753,0.0023,0.0416
mean,not-enough,0.9044,1195.1667,0.1009,0.5911,0.8279,0.0027,0.0314
median,enough,0.9574,1327.0,0.0968,0.644,1.0,0.002,0.0319
median,not-enough,0.9284,994.5,0.0976,0.5785,0.831,0.0026,0.0332
std,enough,0.0076,284.2434,0.0054,0.0663,0.0428,0.0007,0.0192
std,not-enough,0.0724,1044.3117,0.0125,0.0597,0.0583,0.0009,0.0163


# Overview relations

In [128]:
metrics = ['proportion', 'diversity', 'coherence',
            'dist-mean', 'dist-max', 'str-mean', 'str-max']
level = 'relations'
evidence_type = 'prop-specific'
model_name = 'giga_full_updated'
df = get_overview(metrics, level, model_name, evidence_type)
df = utils.raw_to_distance(df, score_names = metrics, reference_name = 'all', 
                           score = 'dist-percent', sum_scores = ['sum', 'bin'])
df['sum'] = df.sum(axis=1)
df.round(6)

Unnamed: 0,proportion,diversity,coherence,dist-mean,dist-max,str-mean,str-max,sum,bin
pos,-0.472386,-0.5,0.071302,-0.023468,-0.021723,-0.175199,-0.29089,-1.471274,0.142857
neg,,,0.143935,-0.074833,-0.053654,-0.809087,-0.829011,-1.747181,0.2
all,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
some,-0.730388,-0.5,0.222055,-0.039449,-0.044725,-0.302367,-0.489796,-2.011052,0.142857
few,,,0.143935,-0.074833,-0.053654,-0.809087,-0.829011,-1.747181,0.2
evidence,-0.432051,-0.5,0.071302,-0.013354,-0.021723,-0.174682,-0.262281,-1.380331,0.142857
no_evidence_pos,-0.920131,-0.75,0.591446,-0.093247,-0.073916,-0.566273,-0.711933,-2.741775,0.142857
no_evidence_neg,,,0.09712,-0.057819,-0.052993,-0.825176,-0.849827,-1.826434,0.2
implied_category,-0.75158,-0.5,-0.047619,-0.001321,-0.029476,-0.801536,-0.870005,-3.430329,0.0
typical_of_concept,-0.478179,0.333333,0.168314,0.018863,0.0,1.803397,0.812856,3.752666,0.714286


In [129]:
metrics = ['proportion', 'diversity', 'coherence',
           'dist-mean', 'dist-max', 'str-mean', 'str-max']
level = 'relations'
evidence_type = 'prop-specific'
model_name = 'wiki_updated'
df = get_overview(metrics, level, model_name, evidence_type)
df = utils.raw_to_distance(df, score_names = metrics, reference_name = 'all', 
                           score = 'dist-percent', sum_scores = ['sum', 'bin'])
df['sum'] = df.sum(axis=1)
df.round(4)

Unnamed: 0,proportion,diversity,coherence,dist-mean,dist-max,str-mean,str-max,sum,bin
pos,-0.4847,0.0,0.1151,-0.0107,-0.0068,-0.065,-0.1528,-0.5484,0.1429
neg,,,0.2009,-0.0521,-0.0297,-0.7618,-0.7804,-1.5077,0.2
all,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
some,-0.6565,0.0,0.1744,-0.0526,-0.0095,-0.1093,-0.3254,-0.976,0.1429
few,,,0.2009,-0.0521,-0.0297,-0.7618,-0.7804,-1.5077,0.2
evidence,-0.4023,0.0,0.1151,-0.011,-0.0068,-0.0719,-0.1322,-0.439,0.1429
no_evidence_pos,-0.8933,-0.5,0.6056,-0.0791,-0.0156,-0.2662,-0.5349,-1.781,0.1429
no_evidence_neg,,,0.1744,-0.0415,-0.0159,-0.804,-0.8221,-1.611,0.2
implied_category,-0.7373,-0.25,-0.1343,-0.0929,-0.0654,-0.3355,-0.6771,-2.6202,0.0
typical_of_concept,3.1889,1.6667,0.0358,0.0819,0.0643,-0.2544,-0.6225,5.4694,0.7143
