# General overview


In [1]:
import glob
import csv 
import utils
from DataSet import DataSet
import pandas as pd
import inflect
#import seaborn as sns
#sns.set_theme(style="whitegrid")
import json
from collections import defaultdict

# print full dataframes:
pd.set_option("display.max_rows", None, "display.max_columns", None)
pd.set_option('display.max_colwidth', 0)

In [2]:
lexical_data = utils.load_lexical_data()


leopard = [('experiment1', '3'), ('experiment2', '4')]
current = [('experiment3', '5_pilot'), ('scalar_heat','5_scalar_heat')]
total = leopard  +current
print(total)

[('experiment1', '3'), ('experiment2', '4'), ('experiment3', '5_pilot'), ('scalar_heat', '5_scalar_heat')]


In [3]:
# clean
source = 'clean'

all_data = []
for group, run in total:
    d = utils.load_data(run, group, source)
    print(group, run, len(d))
    all_data.extend(d)
print(len(all_data))
data_set_clean = DataSet(all_data, lexical_data, metric = 'prop_true')
#data/clean/diagnostic_dataset/annotations_clean_contradictions_batch_0.5

../data/clean/diagnostic_dataset/annotations_clean_contradictions_batch_0.5/run3-group_experiment1/*.csv
experiment1 3 32994
../data/clean/diagnostic_dataset/annotations_clean_contradictions_batch_0.5/run4-group_experiment2/*.csv
experiment2 4 108022
../data/clean/diagnostic_dataset/annotations_clean_contradictions_batch_0.5/run5_pilot-group_experiment3/*.csv
experiment3 5_pilot 98714
../data/clean/diagnostic_dataset/annotations_clean_contradictions_batch_0.5/run5_scalar_heat-group_scalar_heat/*.csv
scalar_heat 5_scalar_heat 14145
253875


In [17]:
# pilot data properties 

path = '../data/aggregated/pilot_blackbox/blackbox_dataset.csv'

with open(path) as infile:
    data = list(csv.DictReader(infile))
    
properties = set()

for d in data:
    properties.add(d['property'])

print(', '.join(properties))

is_found_in_kitchens, does_live_in_water, is_white, is_eaten_edible, has_skin, is_pink, is_a_plant, has_leaves, has_a_stalk_stem, has_teeth, is_dangerous, is_clothing, is_a_vegetable, made_of_metal, has_an_engine, made_of_fabric_cloth_material, has_a_seat_seats, made_of_glass, is_green, has_a_tail, does_make_music, is_an_insect, made_of_wood, is_red, is_pretty_attractive_beautiful, is_soft, is_sharp, is_tasty, has_skin_peel, is_fast, is_long, is_a_mammal, is_food, is_a_vehicle, is_strong, is_juicy, has_claws, is_healthy, has_pips_seeds, is_sweet, has_flesh, is_circular_round, is_small, is_found_in_seas, is_an_animal, has_roots, made_of_cotton, is_a_fruit, is_tall, has_fur_hair, is_grown, does_grow, has_legs, is_for_children, is_a_tool, does_lay_eggs, is_useful, has_four_legs, does_smell_is_smelly, is_played_does_play, does_fly, is_hard, has_a_beak, is_a_bird, is_expensive, is_a_weapon, made_of_plastic, keeps_warm_makes_warm, has_wings, does_swim, does_kill, is_yellow, has_keys, has_eye

In [6]:
for p, pair in data_set_clean.pairs.items():
    print(p, pair.ml_label)
    break

('red', 'fenugreek') few


In [4]:
source = 'raw'

all_data = []
for group, run in total:
    d = utils.load_data(run, group, source)
    print(group, run, len(d))
    all_data.extend(d)
print(len(all_data))
data_set_full = DataSet(all_data, lexical_data, metric = 'prop_true')

../data/raw_annonymised/diagnostic_dataset/run3-group_experiment1/*.csv
experiment1 3 0
../data/raw_annonymised/diagnostic_dataset/run4-group_experiment2/*.csv
experiment2 4 0
../data/raw_annonymised/diagnostic_dataset/run5_pilot-group_experiment3/*.csv
experiment3 5_pilot 0
../data/raw_annonymised/diagnostic_dataset/run5_scalar_heat-group_scalar_heat/*.csv
scalar_heat 5_scalar_heat 0
0


In [9]:
def get_general_overview(dataset_clean):
    
    data_dict = dict()
    data_dict['total n'] =  {
    'units': len(dataset_clean.units),
    'pairs': len(dataset_clean.pairs),
    'concepts': len(dataset_clean.concept_sets),
    'properties': len(dataset_clean.prop_sets),
    'fine-grained relations': len(dataset_clean.relation_sets),
    'coarse-grained relations': len(set([pair.ml_label for p, pair in 
                                         dataset_clean.pairs.items() if pair.ml_label is not None]))
    }
    print(set([pair.ml_label for p, pair in dataset_clean.pairs.items()]))
    return data_dict
    
overview_dict = get_general_overview(data_set_clean)
df = pd.DataFrame(overview_dict)
print(df.to_latex())

{'few', 'all-some', None, 'few-some', 'some', 'all'}
\begin{tabular}{lr}
\toprule
{} &  total n \\
\midrule
coarse-grained relations &  5 \\
concepts                 &  1756 \\
fine-grained relations   &  12 \\
pairs                    &  3304 \\
properties               &  21 \\
units                    &  30650 \\
\bottomrule
\end{tabular}



In [10]:
def overview_processing(data_set):
    data_dict = {
   
        #'annotations': len(data_set.annotations),
        'mean annotations per unit': sum([len(u.annotations) 
                                          for u in data_set.units])/len(data_set.units),
        'Krip. alpha ': data_set.get_alpha(),

        'mean duration per unit': data_set.get_clean_average_seconds(),
    }
    return data_dict


data_dict_full = dict()
data_dict_full['raw'] = overview_processing(data_set_full)
data_dict_full['clean'] = overview_processing(data_set_clean)

df  = pd.DataFrame(data_dict_full).round(2)
print(df.to_latex())

\begin{tabular}{lrr}
\toprule
{} &    raw &  clean \\
\midrule
mean annotations per unit &  10.08 &  8.06 \\
Krip. alpha               &  0.36 &  0.40 \\
mean duration per unit    &  9.24 &  9.25 \\
\bottomrule
\end{tabular}



In [26]:
def overview_props(dataset):
    
    data_dict = dict()
    data_dict['mean'] = dict()
    
#     pos_labels = ['all', 'all-some', 'some', 'few-some']
#     neg_labels = ['few']
    
    total_pos = []
    total_neg = []
    total_no = []
    
    for prop, prop_set in dataset.prop_sets.items():
        labels = []
        total_pos.append(len(prop_set.pos.pairs))
        total_neg.append(len(prop_set.neg.pairs))
        total_no.append(len(prop_set.nolabel.pairs))
    data_dict['mean']['examples pos'] = sum(total_pos)/len(total_pos)
    data_dict['mean']['examples neg'] = sum(total_neg)/len(total_neg)
    data_dict['mean']['examples no label'] = sum(total_no)/len(total_no)
    return data_dict
    

data_dict = overview_props(data_set_clean)  
df = pd.DataFrame(data_dict)
print(df.round(2).to_latex())

\begin{tabular}{lr}
\toprule
{} &   mean \\
\midrule
examples neg      &  57.43 \\
examples no label &  13.81 \\
examples pos      &  86.10 \\
\bottomrule
\end{tabular}



In [6]:
# relation overview

rel_type = 'fine-grained'
rel_dict = data_set_clean.get_relation_overview(rel_type)
df = pd.DataFrame(rel_dict).round(2)
print(df.to_latex(index=False))
#df

\begin{tabular}{lrrrrr}
\toprule
            relation &  alpha &  seconds &  pairs &  properties &  candidate pairs \\
\midrule
 creative &  0.17 &  7.55 &  400 &  21 &  3095 \\
 rare &  0.28 &  7.07 &  630 &  21 &  3095 \\
 impossible &  0.41 &  8.04 &  617 &  21 &  3095 \\
 implied\_category &  0.53 &  7.73 &  1060 &  21 &  3045 \\
 typical\_of\_property &  0.42 &  7.23 &  633 &  21 &  3045 \\
 unusual &  0.30 &  8.11 &  839 &  21 &  3095 \\
 variability\_open &  0.32 &  9.05 &  672 &  17 &  2422 \\
 typical\_of\_concept &  0.55 &  6.84 &  1056 &  21 &  3045 \\
 affording\_activity &  0.51 &  7.22 &  732 &  17 &  2422 \\
 variability\_limited &  0.34 &  9.32 &  967 &  21 &  3045 \\
 afforded\_unusual &  0.27 &  9.40 &  107 &  4 &  623 \\
 afforded\_usual &  0.63 &  7.81 &  214 &  4 &  623 \\
\bottomrule
\end{tabular}



In [5]:
# coarse-grained

rel_type = 'coarse-grained'
rel_dict = data_set_clean.get_relation_overview(rel_type)
df = pd.DataFrame(rel_dict).round(2)
print(df.to_latex(index=False))

\begin{tabular}{lrrrrr}
\toprule
 relation &  alpha &  seconds &  pairs &  properties &  candidate pairs \\
\midrule
 few &  0.36 &  7.66 &  2289 &  21 &  10908 \\
 all &  0.53 &  7.87 &  3092 &  21 &  7996 \\
 None &  0.07 &  7.64 &  50 &  21 &  2410 \\
 all-some &  0.49 &  8.28 &  827 &  19 &  1936 \\
 some &  0.29 &  9.32 &  1475 &  20 &  6690 \\
 few-some &  0.29 &  6.99 &  194 &  17 &  710 \\
\bottomrule
\end{tabular}



In [36]:

def get_concept_overview(data_set):
    concept_dict = defaultdict(list)
    for p, pair in data_set.pairs.items():
        prop, concept = p
        concept_dict[c].append(pair)
        
    total_pairs = []
    total_pos = []
    total_neg = []
    total_no = []
    
    for c, pairs in concept_dict.items():
        labels = []
        
        

In [47]:
def get_concept_distribution(data_set):
    concept_label_dict = defaultdict(list)
    pos_labels = ['all', 'all-some', 'some', 'few-some']
    neg_labels = ['few']
    for c, concept in data_set.concept_sets.items():
        for p, pair in concept.pairs.items():
            concept_label_dict[c].append('properties')
            ml_label = pair.ml_label
            if ml_label in pos_labels:
                concept_label_dict[c].append('positive examples')
            elif ml_label in neg_labels:
                concept_label_dict[c].append('negative examples')
            else:
                concept_label_dict[c].append('invalid examples')

    cols = ['properties', 'positive examples', 'negative examples', 'invalid examples']

    data_dict = dict()

    for col in cols:
        d = dict()
        counts = []
        for c, labels in concept_label_dict.items():
            cnt = labels.count(col)
            counts.append(cnt)
        mean = sum(counts)/len(counts)
        d['mean'] = mean
        data_dict[col] = d
        
    return data_dict

data_dict = get_concept_distribution(data_set_clean)
df = pd.DataFrame(data_dict)
print(df.round(2).T.to_latex())

\begin{tabular}{lr}
\toprule
{} &  mean \\
\midrule
properties        &  1.88 \\
positive examples &  1.03 \\
negative examples &  0.69 \\
invalid examples  &  0.17 \\
\bottomrule
\end{tabular}



In [10]:
from collections import defaultdict

In [8]:
# relation configurations

def get_configs(data_set, single=True):
    
    config_pair_dict = defaultdict(list)
    config_prop_dict = defaultdict(set)
    
    
    for p, pair in data_set.pairs.items():
        prop, concept = p
        rels = pair.relations
        config = ' '.join((sorted(rels)))
        if single==True:
            if len(rels) == 1:
                config_pair_dict[config].append(p)
                config_prop_dict[config].add(prop)
        else:
            config_pair_dict[config].append(p)
            config_prop_dict[config].add(prop)
            
        
    config_dict = dict()
    for conf, pairs in config_pair_dict.items():
        d = dict()
        d['pairs'] = len(pairs)
        d['properties'] = len(config_prop_dict[conf])
        config_dict[conf] = d
    return config_dict
    
   

In [9]:
config_dict = get_configs(data_set_clean, single=False)
df = pd.DataFrame(config_dict)
print(df.T.sort_values('pairs', ascending = False)[:11].to_latex())

\begin{tabular}{lrr}
\toprule
{} &  pairs &  properties \\
\midrule
impossible                                                                                     &  313 &  19 \\
                                                                                               &  274 &  21 \\
rare unusual                                                                                   &  242 &  19 \\
affording\_activity implied\_category typical\_of\_concept typical\_of\_property variability\_limited &  179 &  12 \\
variability\_open                                                                               &  175 &  12 \\
creative impossible                                                                            &  170 &  18 \\
affording\_activity implied\_category typical\_of\_concept typical\_of\_property                     &  155 &  13 \\
variability\_limited                                                                            &  114 &  15 \\
affording\_activity implied\_

In [21]:
config_dict = get_configs(data_set_clean, single=True)
df = pd.DataFrame(config_dict)
df.T.sort_values('pairs', ascending = False)

Unnamed: 0,pairs,properties
impossible,313,19
variability_open,175,12
variability_limited,114,15
unusual,84,20
creative,23,14
afforded_unusual,20,2
rare,19,10
implied_category,16,8
affording_activity,4,3
typical_of_concept,4,4
