# Analyze based on semantic categories

1.) change tfidf so we compare equivalent categories only - done
2.) update ranking accordingly

In [1]:
import os
from collections import Counter, defaultdict
import csv
import pandas as pd
import numpy as np


pd.set_option('display.max_rows', None)
from sklearn.metrics import precision_recall_fscore_support

In [3]:
f_original = os.listdir('../contexts/giga_full/vocab')
print(len(f_original))
f_update = os.listdir('../contexts/giga_full_updated/vocab')
print(len(f_update))

1446
1636


In [2]:
f_original = os.listdir('../contexts/wiki/vocab')
print(len(f_original))
f_update = os.listdir('../contexts/wiki_updated/vocab')
print(len(f_update))

1669
1874


In [10]:
def get_categories(prop, model_name):
    analysis_type = 'tfidf-raw-10000/each_target_vs_corpus_per_category'
    path_dir = f'../results/{model_name}/{analysis_type}'
    path_dir = f'{path_dir}/{prop}'
    categories = set()
    for d in os.listdir(path_dir):
        categories.add(d)
    return categories

def get_context_cnts(prop, cat, label, model_name):
    
    analysis_type = 'tfidf-raw-10000/each_target_vs_corpus_per_category'
    path_dir = f'../results/{model_name}/{analysis_type}'
    path_dir = f'{path_dir}/{prop}'
    path_label = f'{path_dir}/{cat}/{label}'
    
    context_cnt = Counter()
    for f in os.listdir(path_label):
        full_path = f'{path_label}/{f}'
        if full_path.endswith('.csv'):
            with open(full_path) as infile:
                data = list(csv.DictReader(infile))
            for d in data:
                context = d['']
                diff = float(d['diff'])
                if diff > 0:
                    context_cnt[context] += 1
    return context_cnt
    
def get_n_concepts_total(prop, cat, model_name):
    
    analysis_type = 'tfidf-raw-10000/each_target_vs_corpus_per_category'
    path_dir = f'../results/{model_name}/{analysis_type}'
    path_dir = f'{path_dir}/{prop}'
    label = 'pos'
    path_pos = f'{path_dir}/{cat}/{label}'
    label = 'neg'
    path_neg = f'{path_dir}/{cat}/{label}'
    
    files_pos = [f for f in os.listdir(path_pos) if f.endswith('.csv')]
    files_neg = [f for f in os.listdir(path_neg) if f.endswith('.csv')]
    
    return len(files_pos), len(files_neg)

def get_f1_distinctiveness(n_pos, n_neg, total_pos, total_neg):
    
   
    total_instances = total_pos + total_neg
    labels = []
    [labels.append('pos') for i in range(total_pos)]
    [labels.append('neg') for i in range(total_neg)]
    pred_labels_pos = []
    for i in range(total_pos):
        if i < n_pos:
            pred_labels_pos.append('pos')
        else:
            pred_labels_pos.append('neg')
#     print(n_pos, total_pos)
#     print(pred_labels_pos.count('pos'), pred_labels_pos.count('neg'))
    
    pred_labels_neg = []
    for i in range(total_neg):
        if i < n_neg:
            pred_labels_neg.append('pos')
        else:
            pred_labels_neg.append('neg')
#     print(n_neg, total_neg)
#     print(pred_labels_neg.count('pos'), pred_labels_neg.count('neg'))
    
    predictions = pred_labels_pos + pred_labels_neg
    
    
    #print(len(labels), len(predictions))
    #print(pos_predictions, neg_predictions)
    
    p, r, f1, supp = precision_recall_fscore_support(labels, predictions, average = 'weighted', 
                                                     zero_division=0)
    #average='weighted'
    
    return p, r, f1


    
def aggregate_contexts(prop, cutoff, model_name):
    aggregation_name = 'aggregated-tfidf-raw-10000-categories'
    path_dir_agg = f'../analysis/{model_name}/{aggregation_name}/{prop}'
    os.makedirs(path_dir_agg, exist_ok = True)
    
    context_cnts_all = Counter()
    context_cat_dict = defaultdict(set)

    cats = get_categories(prop, model_name)

    for cat in cats:
        context_cnts_pos = get_context_cnts(prop, cat, 'pos', model_name)
        context_cnts_neg = get_context_cnts(prop, cat, 'neg', model_name)
        total_pos, total_neg = get_n_concepts_total(prop, cat, model_name)
        
        context_f1_dict = Counter()
        context_score_dict = defaultdict(dict)
        
        # get distinctiveness
        for c, cnt_pos in context_cnts_pos.most_common():
            cnt_neg = context_cnts_neg[c]
            p, r, f1 = get_f1_distinctiveness(cnt_pos, cnt_neg, total_pos, total_neg)
            context_f1_dict[c] = f1
            context_score_dict[c] = {'p': p,'r':r, 'f1': f1}
        
        table = []
        for c, f1 in context_f1_dict.most_common():
            scores = context_score_dict[c]
            d = dict()
            d['context'] = c
            d.update(scores)
            d['n_pos'] = context_cnts_pos[c]
            d['total_pos'] = total_pos
            d['n_neg'] = context_cnts_neg[c]
            d['total_neg'] = total_neg
            table.append(d)
        
        # collect and write to file
        f = f'{path_dir_agg}/{cat}.csv'
        
        header = table[0].keys()
        with open(f, 'w') as outfile:
            writer = csv.DictWriter(outfile, fieldnames = header)
            writer.writeheader()
            for d in table:
                writer.writerow(d)
        
                
def prepare_annotation(prop, model_name, cutoff=3, cutoff_concepts = 5):
    
    annotation_name = f'annotation-tfidf-top_{cutoff}_{cutoff_concepts}-raw-10000-categories'
    path_dir_annotation = f'../analysis/{model_name}/{annotation_name}/{prop}'
    os.makedirs(path_dir_annotation, exist_ok = True)
    f_annotation = f'../analysis/{model_name}/{annotation_name}/{prop}/annotation-updated.csv'
    
    # paths aggregated files:
    aggregation_name = 'aggregated-tfidf-raw-10000-categories'
    path_dir_agg = f'../analysis/{model_name}/{aggregation_name}/{prop}'

    
    # get categories
    cats = get_categories(prop, model_name)
    
    # collect all contexts and categories 
    context_cats_dict = defaultdict(set)
    
    # load top per category
    for cat in cats:
        path = f'{path_dir_agg}/{cat}.csv'
        with open(path) as infile:
            data = list(csv.DictReader(infile))
        # sort by f1
        f1_dict  = defaultdict(list)
        for d in data:
            f1 = d['f1']
            f1_dict[f1].append(d)
        scores = sorted(list(f1_dict.keys()), reverse=True)
        top_scores = scores[:cutoff]
        top_context_dicts = []
        for ts in top_scores:
            dicts = f1_dict[ts]
            for d in dicts:
                n_pos = int(d['n_pos'])
                if n_pos > cutoff_concepts:
                    top_context_dicts.append(d)
    
        contexts = [d['context'] for d in top_context_dicts]
        # record categories
        for c in contexts:
            context_cats_dict[c].add(cat)
    
    with open(f_annotation, 'w') as outfile:
        outfile.write('context,evidence_type,categories\n')
        for c, cats in context_cats_dict.items():
            outfile.write(f'{c}, ,{" ".join(cats)}\n')

def get_properties():
    properties = []
    for path in os.listdir('../data/aggregated/'):
        prop = path.split('.')[0]
        if 'female-' not in prop and prop != '':
            properties.append(prop)
    return properties

def get_top_distinctive_contexts(properties, model_name, top_cutoff=3, concept_cutoff=3):
    aggregation_name = 'aggregated-tfidf-raw-10000-categories'
    ann_name = f'annotation-tfidf-top_{top_cutoff}_{concept_cutoff}-raw-10000-categories'
    path_results = f'../results/{model_name}/tfidf-raw-10000/each_target_vs_corpus_per_category'
    table = []
    for prop in properties:
        path_dir_agg = f'../analysis/{model_name}/{aggregation_name}/{prop}'
        path = path = f'{path_dir_agg}/all.csv'
        # load file containing all contexts
        with open(path) as infile:
            data = list(csv.DictReader(infile))
        # top distinctive context
        d_prop = dict()
        d_prop['property'] = prop
        # sort data by f1
        f1_dict = defaultdict(list)
        for d in data:
            f1 = d['f1']
            f1_dict[f1].append(d)
            
        # get n extracted candidates
        f_ann =  f'../analysis/{model_name}/{ann_name}/{prop}/annotation-updated.csv'
        with open(f_ann) as infile:
            data = list(csv.DictReader(infile))
        n_contexts = len(data)
        
        # get number concepts
        dir_results = f'{path_results}/{prop}/all/pos/'
        n_files = len([f for f in os.listdir(dir_results) if f.endswith('.csv')])
        
        top_score = max(list(f1_dict.keys()))
        top_dicts = f1_dict[top_score]
        top_context_dict = top_dicts[0]
        top_contexts = ' '.join([d['context'] for d in top_dicts])
        d_prop['n_contexts'] = n_contexts
        d_prop['n_concepts'] = n_files

        for k, v in top_context_dict.items():
            if k != 'context':
                v = float(v)
                d_prop[k] = v
        d_prop['contexts'] = top_contexts
        table.append(d_prop)
    return table

In [6]:
model_name = 'wiki_updated'
properties = get_properties()
#properties_test = ['dangerous', 'cold', 'lay_eggs']
#properties = [p for p in properties if p not in properties_test]
#properties = properties_test
cutoff = 3
cutoff_concepts = 3

for prop in properties:
    print(prop)
    
    aggregate_contexts(prop, cutoff, model_name)
    prepare_annotation(prop, model_name, cutoff, cutoff_concepts) 

square
warm
black
red
fly
dangerous
wings
sweet
hot
used_in_cooking
juicy
green
made_of_wood
blue
yellow
roll
female
cold
round
wheels
lay_eggs
swim


In [16]:
# get top distinctive contexts per prop

model_name = 'giga_full_updated'
properties = get_properties()
table = get_top_distinctive_contexts(properties, model_name)
df = pd.DataFrame(table)
df.sort_values('f1', ascending = False).round(2)

Unnamed: 0,property,n_contexts,n_concepts,p,r,f1,n_pos,total_pos,n_neg,total_neg,contexts
9,used_in_cooking,572,101,0.96,0.95,0.95,93.0,101.0,0.0,56.0,add recipe
4,fly,952,44,0.88,0.87,0.87,29.0,44.0,2.0,90.0,flew
0,square,1504,87,0.91,0.84,0.86,70.0,87.0,0.0,21.0,built
20,lay_eggs,74,33,0.86,0.84,0.83,20.0,33.0,1.0,57.0,eggs
16,female,45,109,0.85,0.84,0.83,77.0,109.0,9.0,144.0,herself
7,sweet,35,91,0.87,0.82,0.83,65.0,91.0,1.0,63.0,sweet
5,dangerous,1114,65,0.86,0.82,0.82,45.0,65.0,1.0,51.0,killed
13,blue,2234,59,0.87,0.83,0.81,31.0,59.0,0.0,106.0,magic
6,wings,560,60,0.85,0.82,0.81,36.0,60.0,1.0,77.0,bird
19,wheels,244,70,0.87,0.79,0.8,51.0,70.0,1.0,25.0,drove wheel


In [33]:
# latex table for paper:
cols = ['property', 'n_concepts', 'n_contexts', 'f1', 'contexts']
df = df.sort_values('f1', ascending = False).round(2)
print(df[cols].to_latex(index=False))

\begin{tabular}{lrrrl}
\toprule
        property &  n\_concepts &  n\_contexts &    f1 &    contexts \\
\midrule
 used\_in\_cooking &         102 &         705 &  0.92 &        meat \\
           wings &          81 &         332 &  0.89 &       birds \\
             fly &          63 &          55 &  0.88 &        bird \\
          female &         122 &         109 &  0.87 &         she \\
            blue &          60 &        1819 &  0.83 &     evening \\
          wheels &          78 &         105 &  0.83 &     chassis \\
            roll &          55 &        3485 &  0.83 &        from \\
          yellow &          42 &         111 &  0.81 &       tribe \\
          square &          90 &         442 &  0.81 &       built \\
           green &          91 &         550 &  0.81 &       green \\
       dangerous &          76 &         627 &  0.79 &     killing \\
        lay\_eggs &          72 &          26 &  0.79 &        bird \\
             hot &         102 &          92

In [14]:
# get top distinctive contexts per prop

# model_name = 'wiki_updated'
# properties = get_properties()
# table = get_top_distinctive_contexts(properties, model_name)
# df = pd.DataFrame(table)
# df.sort_values('f1', ascending = False).round(2)

### Transfer old annotations to new files


In [2]:
properties = get_properties()
model_name_current = 'giga_full_updated'
model_name_old = 'giga_full'


for prop in properties:
    # current file:
    annotation_name = 'annotation-tfidf-top20-raw-10000-categories'
    path_dir_annotation = f'../analysis/{model_name}/{annotation_name}/{prop}'
    f_annotation_new = f'{path_dir_annotation}/annotation.csv'
    f_annotation_tr = f'{path_dir_annotation}/annotation-transferred.csv'

    # old file:
    annotation_name = 'annotation-tfidf-top20-raw-10000'
    path_dir_annotation = f'../analysis/{model_name_old}/{annotation_name}/{prop}-pos'
    f_annotation_old = f'{path_dir_annotation}/annotation-done.csv'

    # load old annotations
    context_annotation_dict=dict()
    with open(f_annotation_old) as infile:
        data = list(csv.DictReader(infile))
        for d in data:
            c = d['context']
            et = d['evidence']
            context_annotation_dict[c] = et
            #c = d['context']

    # load new candidates

    with open(f_annotation_new) as infile:
        data = list(csv.DictReader(infile))

    # fill in old annotations
    for d in data:
        c = d['context']
        if c in context_annotation_dict:
            et = context_annotation_dict[c]
        else:
            et = 'NA'
        d['evidence_type'] = et

    # write to new file

    with open(f_annotation_tr, 'w') as outfile:
        writer = csv.DictWriter(outfile, fieldnames = data[0].keys())
        writer.writeheader()
        for d in data:
            writer.writerow(d)

NameError: name 'get_properties' is not defined

In [29]:
# transfer new annotations to updated f1 scores

properties = get_properties()
#properties = ['dangerous']
model_name_current = 'giga_full_updated'
model_name_old = 'giga_full'


for prop in properties:
    # current file:
    annotation_name = 'annotation-tfidf-top_3_3-raw-10000-categories'
    path_dir_annotation = f'../analysis/{model_name}/{annotation_name}/{prop}'
    f_annotation_new = f'{path_dir_annotation}/annotation-updated.csv'
    f_annotation_tr = f'{path_dir_annotation}/annotation-transferred-updated.csv'

    # old file:
    annotation_name = 'annotation-tfidf-top_3_5-raw-10000-categories'
    path_dir_annotation = f'../analysis/{model_name}/{annotation_name}/{prop}'
    f_annotation_old = f'{path_dir_annotation}/annotation-updated-done.csv'

    # load old annotations
    if os.path.isfile(f_annotation_old):
        print('found file')
        context_annotation_dict=dict()
        with open(f_annotation_old) as infile:
            data = list(csv.DictReader(infile))
            for d in data:
                c = d['context']
                et = d['evidence_type']
                context_annotation_dict[c] = et
                #c = d['context']

        # load new candidates

        with open(f_annotation_new) as infile:
            data = list(csv.DictReader(infile))

        # fill in old annotations
        for d in data:
            c = d['context']
            if c in context_annotation_dict:
                et = context_annotation_dict[c]
            else:
                et = 'NA'
            d['evidence_type'] = et

        # write to new file

        with open(f_annotation_tr, 'w') as outfile:
            writer = csv.DictWriter(outfile, fieldnames = data[0].keys())
            writer.writeheader()
            for d in data:
                writer.writerow(d)

found file
found file
found file


In [12]:
# transfer giga annotations to wiki

properties = get_properties()
#properties = ['dangerous']
model_name_current = 'wiki_updated'
model_name_old = 'giga_full_updated'


for prop in properties:
    print(prop)
    # current file:
    annotation_name = 'annotation-tfidf-top_3_3-raw-10000-categories'
    path_dir_annotation = f'../analysis/{model_name_current}/{annotation_name}/{prop}'
    f_annotation_new = f'{path_dir_annotation}/annotation-updated.csv'
    f_annotation_tr = f'{path_dir_annotation}/annotation-transferred-updated.csv'

    # old file:
    annotation_name = 'annotation-tfidf-top_3_3-raw-10000-categories'
    path_dir_annotation = f'../analysis/{model_name_old}/{annotation_name}/{prop}'
    f_annotation_old = f'{path_dir_annotation}/annotation-updated-done.csv'

    # load old annotations
    if os.path.isfile(f_annotation_old):
        print('found file')
        context_annotation_dict=dict()
        with open(f_annotation_old) as infile:
            data = list(csv.DictReader(infile))
            for d in data:
                c = d['context']
                et = d['evidence_type']
                context_annotation_dict[c] = et
                #c = d['context']

        # load new candidates

        with open(f_annotation_new) as infile:
            data = list(csv.DictReader(infile))

        # fill in old annotations
        for d in data:
            c = d['context']
            if c in context_annotation_dict:
                et = context_annotation_dict[c]
            else:
                et = 'NA'
            d['evidence_type'] = et

        # write to new file

        with open(f_annotation_tr, 'w') as outfile:
            writer = csv.DictWriter(outfile, fieldnames = data[0].keys())
            writer.writeheader()
            for d in data:
                writer.writerow(d)

square
found file
warm
found file
black
found file
red
found file
fly
found file
dangerous
found file
wings
found file
sweet
found file
hot
found file
used_in_cooking
found file
juicy
found file
green
found file
made_of_wood
found file
blue
found file
yellow
found file
roll
found file
female
found file
cold
found file
round
found file
wheels
found file
lay_eggs
found file
swim
found file


### Complete annotations

In [13]:
from collections import defaultdict
import os
import csv

In [128]:
def get_annotation_status(model_name, top_cutoff, concept_cutoff):
    dir_path = f'../analysis/{model_name}'
    dir_annotations = f'{dir_path}/annotation-tfidf-top_{top_cutoff}_{concept_cutoff}-raw-10000-categories'
    annotation_dict = defaultdict(set)
    line_dict = dict()

    for f in os.listdir(dir_annotations):
        if  not f.endswith('.csv') and not f.endswith('.ipynb_checkpoints'):
            prop = f.split('/')[-1]
            full_path = f'{dir_annotations}/{f}'
            
            #print(full_path)
            # get categories:
            files = os.listdir(full_path)
            # get number of words
            path_file = f'{full_path}/annotation-updated.csv'
            with open(path_file) as infile:
                lines = infile.read().strip().split('\n')
                not_annotated = [l for l in lines if l.strip().split(',')[1] == 'NA']
            line_dict[prop] = (len(lines), len(not_annotated), len(lines)-len(not_annotated))
            if 'annotation-updated-done.csv' in files:
                annotation_dict['complete'].add(prop)
            else:
                annotation_dict['incomplete'].add(prop)
                
    return annotation_dict, line_dict

def show_annotation_status(model_name, top_cutoff, concept_cutoff):
    annotation_dict, line_dict = get_annotation_status(model_name, 
                                        top_cutoff, concept_cutoff)
    # same category not annotated:
    print('completed:\n')
    for prop in sorted(list(annotation_dict['complete'])):
        # cats open:
        print(prop, line_dict[prop])
    print()
    print('Incomplete:\n')
    for prop in sorted(annotation_dict['incomplete']):
        if prop not in annotation_dict['complete']:
            print(prop, line_dict[prop])
    return annotation_dict
            
            
def get_evidence_dict(model_name, prop, top_cutoff, concept_cutoff):
    
    annotation_name = f'annotation-tfidf-top_{top_cutoff}_{concept_cutoff}-raw-10000-categories'
    path_dir_annotation = f'../analysis/{model_name}/{annotation_name}/{prop}'
    f_annotation = f'{path_dir_annotation}/annotation-updated-done.csv'
    
    ev_dict = dict()
    
    with open(f_annotation) as infile:
        data = list(csv.DictReader(infile))
    for d in data:
        et = d['evidence_type']
        ev = d['context']
        ev_dict[ev] = et
    return ev_dict
        
            

def get_evidence_distribution(model_name, prop, top_cutoff, concept_cutoff):
    
    # current file:
    annotation_name = f'annotation-tfidf-top_{top_cutoff}_{concept_cutoff}-raw-10000-categories'
    path_dir_annotation = f'../analysis/{model_name}/{annotation_name}/{prop}'
    f_annotation = f'{path_dir_annotation}/annotation-updated-done.csv'
    
    ev_dict = get_evidence_dict(model_name, prop, top_cutoff, concept_cutoff)
    
    ev_cnts = Counter()
    
    for e, et in ev_dict.items():
        ev_cnts[et] += 1
        if et != 'u':
            ev_cnts['all'] += 1
        if et in ['p', 'l', 'n']:
            ev_cnts['prop_specific'] += 1
        elif et in ['i', 'r', 'b']:
            ev_cnts['non-specific'] += 1
    
    total_contexts = len(ev_dict)
    
    ev_counts_norm = dict()
    for ev, cnt in ev_cnts.items():
        ev_counts_norm[ev]  = cnt/total_contexts
    return ev_counts_norm

In [129]:
model_name = 'giga_full_updated'
top_cutoff = 3
concept_cutoff = 3
ann_dict = show_annotation_status(model_name, top_cutoff, concept_cutoff)

completed:

black (1150, 0, 1150)
blue (2235, 0, 2235)
cold (1082, 0, 1082)
dangerous (1115, 0, 1115)
female (46, 0, 46)
fly (953, 0, 953)
green (579, 0, 579)
hot (85, 0, 85)
juicy (811, 0, 811)
lay_eggs (75, 0, 75)
made_of_wood (787, 0, 787)
red (1768, 0, 1768)
roll (3887, 0, 3887)
round (521, 0, 521)
square (1505, 0, 1505)
sweet (36, 0, 36)
swim (1382, 0, 1382)
used_in_cooking (573, 0, 573)
warm (2109, 0, 2109)
wheels (245, 0, 245)
wings (561, 0, 561)
yellow (53, 0, 53)

Incomplete:



In [130]:
model_name = 'giga_full_updated'
properties = ann_dict['complete']
top_cutoff = 3
concept_cutoff = 3

cols = ['property', 'u', 'all', 'prop_specific', 'non-specific', 'p', 'n', 'l', 'i', 'r', 'b']
table = []
for prop in properties:
    d = dict()
    d['property'] =  prop
    d.update(get_evidence_distribution(model_name, prop, top_cutoff, concept_cutoff))
    for c in cols:
        if c not in d:
            d[c] = np.nan
    table.append(d)
   
cols = ['property', 'u', 'all', 'prop_specific', 'non-specific', 'p', 'n', 'l', 'i', 'r', 'b']
df = pd.DataFrame(table)[cols]
df = df[cols].sort_values('all', ascending = False).round(3)
df

Unnamed: 0,property,u,all,prop_specific,non-specific,p,n,l,i,r,b
2,used_in_cooking,0.357,0.643,0.014,0.629,0.007,0.003,0.003,0.302,0.327,
15,sweet,0.4,0.6,0.057,0.543,0.057,,,0.543,,
8,female,0.667,0.333,0.111,0.222,,0.067,0.044,0.089,0.022,0.111
14,hot,0.702,0.298,0.036,0.262,0.024,0.012,,0.167,0.095,
13,lay_eggs,0.703,0.297,0.027,0.27,0.027,,,0.176,0.095,
4,green,0.839,0.161,0.002,0.159,0.002,,,0.145,0.014,
20,wheels,0.84,0.16,0.016,0.143,0.008,,0.008,0.049,0.094,
19,wings,0.85,0.15,0.011,0.139,0.002,,0.009,0.045,0.068,0.027
18,juicy,0.88,0.12,0.002,0.117,0.002,,,0.102,0.015,
10,dangerous,0.883,0.117,0.021,0.096,0.002,0.006,0.013,0.037,0.054,0.005


In [131]:
print(df.round(3).fillna('-').to_latex(index=False))

\begin{tabular}{lrrlrlllrll}
\toprule
        property &      u &    all & prop\_specific &  non-specific &      p &      n &      l &      i &      r &      b \\
\midrule
 used\_in\_cooking &  0.357 &  0.643 &         0.014 &         0.629 &  0.007 &  0.003 &  0.003 &  0.302 &  0.327 &      - \\
           sweet &  0.400 &  0.600 &         0.057 &         0.543 &  0.057 &      - &      - &  0.543 &      - &      - \\
          female &  0.667 &  0.333 &         0.111 &         0.222 &      - &  0.067 &  0.044 &  0.089 &  0.022 &  0.111 \\
             hot &  0.702 &  0.298 &         0.036 &         0.262 &  0.024 &  0.012 &      - &  0.167 &  0.095 &      - \\
        lay\_eggs &  0.703 &  0.297 &         0.027 &         0.270 &  0.027 &      - &      - &  0.176 &  0.095 &      - \\
           green &  0.839 &  0.161 &         0.002 &         0.159 &  0.002 &      - &      - &  0.145 &  0.014 &      - \\
          wheels &  0.840 &  0.160 &         0.016 &         0.143 &  0.008 &    

## Evidence distribution per semantic category

In [13]:
import json
import csv
import os
from collections import Counter, defaultdict
import pandas as pd
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [45]:
def load_prop_data(prop):
    
    path = f'../data/aggregated_semantic_info/{prop}.json'
    with open(path) as infile:
        concept_dict = json.load(infile)
    return concept_dict


def load_concept_evidence(concept, prop, model_name, categories):
    
    categories.add('all')
    contexts = set()
    dir_path = f'../results/{model_name}/tfidf-raw-10000/each_target_vs_corpus_per_category'
    
    for cat in categories:
        f_path = f'{dir_path}/{prop}/{cat}/pos/{concept}.csv'
        if os.path.isfile(f_path):
            with open(f_path) as infile:
                data = list(csv.DictReader(infile))
            for d in data:
                context = d['']
                diff = float(d['diff'])
                if diff > 0:
                    contexts.add(context)
    return contexts  

def get_categories(prop, model_name):
    analysis_type = 'tfidf-raw-10000/each_target_vs_corpus_per_category'
    path_dir = f'../results/{model_name}/{analysis_type}'
    path_dir = f'{path_dir}/{prop}'
    categories = set()
    for d in os.listdir(path_dir):
        if '.' not in d:
            categories.add(d)
    return categories

def get_n_examples_cat(category, prop, model_name):
    
    analysis_type = 'tfidf-raw-10000/each_target_vs_corpus_per_category'
    path_dir = f'../results/{model_name}/{analysis_type}'
    path_dir = f'{path_dir}/{prop}/{category}'
    
    n_example_dict = dict()
    for l in ['pos', 'neg']:
        path = f'{path_dir}/{l}'
        examples = os.listdir(path)
        examples = [f for f in examples if f.endswith('.csv')]
        n_example_dict[l] = len(examples)
    return n_example_dict

def get_top_ev_categories(prop, model_name, top_cutoff, concept_cutoff):
    #table = dict()
    aggregation_name = f'aggregated-tfidf-raw-10000-categories'
    categories = get_categories(prop, model_name)
    
    path_dir_agg = f'../analysis/{model_name}/{aggregation_name}/{prop}'
    evidence_dict = get_evidence_dict(model_name, prop, top_cutoff, concept_cutoff)
    
    et_context_dict = defaultdict(set)
    for c, et in evidence_dict.items():
        et_context_dict[et].add(c)
    et_sorted = ['p', 'n', 'l', 'i', 'r', 'b', 'u'] 
    et_cat_context_perf_dict = defaultdict(dict)
    
    # get top performance per evidence type for each category
    for cat in categories:
        path = f'{path_dir_agg}/{cat}.csv'
        n_example_dict = get_n_examples_cat(cat, prop, model_name)
        # load file containing all concepts
        with open(path) as infile:
            data = list(csv.DictReader(infile))
        perf_data = defaultdict(list)
        for d in data:
            f1 = float(d['f1'])
            perf_data[f1].append(d)
        perf_ranked =sorted(list(perf_data.keys()), reverse=True)
        
        # go through evidence types
        for et in et_sorted:
            contexts = et_context_dict[et]
            for f1 in perf_ranked:
                data = perf_data[f1]
                contexts_f1 = [d['context'] for d in data]
                contexts_et = set()
                for c in contexts_f1:
                    if c in contexts:
                        contexts_et.add(c)
                if len(contexts_et) > 0:
                    et_cat_context_perf_dict[(cat, et)]['f1'] = round(f1, 2)
                    et_cat_context_perf_dict[(cat, et)]['contexts'] = ' '.join(contexts_et)
                    et_cat_context_perf_dict[(cat, et)].update(n_example_dict)
                    break              
                
    return et_cat_context_perf_dict

In [48]:
prop = 'fly'
model_name = 'giga_full_updated'
top_cutoff = 3
concept_cutoff = 3
table = get_top_ev_categories(prop, model_name, top_cutoff, concept_cutoff)
df = pd.DataFrame(table)
df.T

Unnamed: 0,Unnamed: 1,f1,contexts,pos,neg
all,p,0.87,flew,44,90
all,n,0.77,hovering,44,90
all,l,0.75,lands,44,90
all,i,0.81,bird,44,90
all,r,0.82,wings,44,90
all,b,0.64,tourist,44,90
all,u,0.81,overhead,44,90
food,p,1.0,fly,6,14
food,n,0.8,hovering,6,14
food,l,0.64,landing,6,14


In [39]:
# get mean per cat

properties = get_properties()
prop_table = []
for prop in properties:
    table = get_top_ev_categories(prop, model_name, top_cutoff, concept_cutoff)
    type_scores = defaultdict(list)
    ev_type_mean_scores = dict()
    for (cat, ev_type), f1_dict in table.items():
        type_scores[ev_type].append(f1_dict['f1'])
    for ev_type, scores in type_scores.items():
        mean = sum(scores)/len(scores)
        prop_dict = dict()
        ev_type_mean_scores[ev_type] = mean
    prop_dict['property'] = prop
    prop_dict.update(ev_type_mean_scores)
    prop_table.append(prop_dict)
    
cols = ['property', 'p', 'n', 'l', 'i', 'r', 'b', 'u']
df = pd.DataFrame(prop_table)
df = df[cols].sort_values('p', ascending = False).round(2).fillna('-')

In [40]:
df

Unnamed: 0,property,p,n,l,i,r,b,u
9,used_in_cooking,0.95,0.74,0.9,0.98,0.98,-,0.96
4,fly,0.93,0.81,0.83,0.89,0.91,0.75,0.93
19,wheels,0.9,-,0.81,0.92,0.92,-,0.87
12,made_of_wood,0.9,-,-,0.86,0.88,-,0.92
7,sweet,0.9,-,-,0.92,-,-,0.88
15,roll,0.87,-,0.76,0.91,0.92,-,0.95
20,lay_eggs,0.86,-,-,0.92,0.88,-,0.9
11,green,0.86,-,-,0.87,0.81,-,0.93
6,wings,0.85,-,0.85,0.89,0.88,0.81,0.9
10,juicy,0.83,-,-,0.9,0.86,-,0.91


In [41]:
print(df.to_latex(index=False))

\begin{tabular}{llllrllr}
\toprule
        property &     p &     n &     l &     i &     r &     b &     u \\
\midrule
 used\_in\_cooking &  0.95 &  0.74 &   0.9 &  0.98 &  0.98 &     - &  0.96 \\
             fly &  0.93 &  0.81 &  0.83 &  0.89 &  0.91 &  0.75 &  0.93 \\
          wheels &   0.9 &     - &  0.81 &  0.92 &  0.92 &     - &  0.87 \\
    made\_of\_wood &   0.9 &     - &     - &  0.86 &  0.88 &     - &  0.92 \\
           sweet &   0.9 &     - &     - &  0.92 &     - &     - &  0.88 \\
            roll &  0.87 &     - &  0.76 &  0.91 &  0.92 &     - &  0.95 \\
        lay\_eggs &  0.86 &     - &     - &  0.92 &  0.88 &     - &  0.90 \\
           green &  0.86 &     - &     - &  0.87 &  0.81 &     - &  0.93 \\
           wings &  0.85 &     - &  0.85 &  0.89 &  0.88 &  0.81 &  0.90 \\
           juicy &  0.83 &     - &     - &  0.90 &  0.86 &     - &  0.91 \\
             hot &  0.83 &  0.69 &     - &  0.92 &  0.89 &     - &  0.88 \\
       dangerous &  0.81 &  0.92 &  0.8

## Evidence strength


In [116]:

def get_tfidf_scores_context(prop, cat, contexts, model_name):
    
    # collect scores here:
    context_tfidf_dict = defaultdict(dict)

    # get tfidf scores
    path_tfidfs = f'../results/{model_name}/tfidf-raw-10000/each_target_vs_corpus_per_category'
    path_tfidfs = f'{path_tfidfs}/{prop}/{cat}/pos'
    concept_files = [f for f in os.listdir(path_tfidfs) if f.endswith('.csv')]
    
    for cf in concept_files:
        full_path = f'{path_tfidfs}/{cf}'
        concept = cf.split('.')[0]
        with open(full_path) as infile:
            data = list(csv.DictReader(infile))
        for cw_target in contexts:
            found = False
            for d in data:
                cw = d['']
                score = d['target']
                if cw == cw_target:
                    context_tfidf_dict[cw][concept] = (float(score))
                    found = True
                    break
            if found == False:
                context_tfidf_dict[cw_target][concept]  = 0.0
    return context_tfidf_dict


def evidence_strength_to_file(prop, et, contexts, model_name, top_cutoff, concept_cutoff):

    filepath_target = f'../analysis/{model_name}/evidence_{top_cutoff}_{concept_cutoff}-raw-10000-categories'
    filepath_target_et = f'{filepath_target}/{prop}/{et}'
    os.makedirs(filepath_target_et, exist_ok=True)
    categories = get_categories(prop, model_name)
    tfidf_scores_cats = dict()
              
    for context in contexts:
        full_filepath = f'{filepath_target_et}/{context}.csv'
        for cat in categories:
            tfidf_scores_cats[cat] = get_tfidf_scores_context(prop, cat, contexts, model_name)[context]
            #print(cat, tfidf_scores_cats[cat]['wren'])
        df = pd.DataFrame(tfidf_scores_cats)
        df['mean'] = df.mean(numeric_only=True, axis = 1)
        df.to_csv(full_filepath)

In [151]:
model_name = 'giga_full_updated'
properties = get_properties()

top_cutoff = 3
concept_cutoff = 3


for prop in properties:
    table = get_top_ev_categories(prop, model_name, top_cutoff, concept_cutoff)

    ets = ['i', 'r', 'b']
    for et_target in ets:
        contexts = set()
        for (cat, et), d in table.items():
            if et_target == et:
                contexts.update(set(d['contexts'].split(' ')))
        if len(contexts) > 0:
            evidence_strength_to_file(prop, et_target, contexts, model_name, top_cutoff, concept_cutoff)

KeyboardInterrupt: 

In [150]:
# compare evidence strength across properties - may not make too much sense...

ets = ['p', 'l', 'n']
properties = get_properties()

model_name = 'giga_full_updated'

top_cutoff = 3
concept_cutoff = 3

prop_et_strength_dict = defaultdict(dict)

path_evidence = f'../analysis/{model_name}/evidence_{top_cutoff}_{concept_cutoff}-raw-10000-categories'

for prop in properties:
    means_prop = []
    for et in ets:
        means_et = []
        path_dir = f'{path_evidence}/{prop}/{et}'
        if os.path.isdir(path_dir):
            ev_files = [f for f in os.listdir(path_dir) if f.endswith('.csv')]
            for f in ev_files:
                full_path = f'{path_dir}/{f}'
                with open(full_path) as infile:
                    data = list(csv.DictReader(infile))
                    means = [float(d['mean']) for d in data]
                    mean = sum(means)/len(means)
                    #means_et.append(mean)
                    means_prop.append(mean)
    if len(means_prop) > 0:
        mean_prop = sum(means_prop)/len(means_prop)
    else:
        mean_prop = 0.0
    prop_et_strength_dict[prop]['prop_specific'] = mean_prop
            
df = pd.DataFrame(prop_et_strength_dict)
df.T.sort_values('prop_specific', ascending=False).round(4)

Unnamed: 0,prop_specific
juicy,0.0521
red,0.0332
lay_eggs,0.0284
yellow,0.0277
hot,0.0264
female,0.0255
green,0.0224
made_of_wood,0.0208
wheels,0.0149
used_in_cooking,0.0144


## Relation analysis

In [152]:
from statistics import stdev
import numpy as np

In [211]:

  
def get_relation_combinations(properties, combinations):
    
    relation_pair_dict = defaultdict(set)
    for prop in properties:
        prop_dict = load_prop_data(prop)
        for c, d in prop_dict.items():
            ml_label = d['ml_label']
            if ml_label in {'all', 'some', 'all-some', 'few-some'}:
                l = 'pos'
            elif ml_label in {'few'}:
                l = 'neg'
#             relation_pair_dict[l].add((prop, c))
            if l == 'pos':
                rel_dict = d['relations']
                for combination in combinations:
                    relations = set([rel for rel, p in rel_dict.items() if p > 0.5])
                    if combination == relations:
                        l_comb = tuple(sorted(relations))
                        relation_pair_dict[l_comb].add((prop, c))
    return relation_pair_dict   

In [213]:

# get prop concept pairs with a certain relation-configuration:
combinations = [
                    {'implied_category'},
                    {'implied_category', 'variability_limited'},
                    ]

properties = get_properties()
relation_pair_dict = get_relation_combinations(properties, combinations)
for rel, pairs in relation_pair_dict.items():
     print(rel, len(pairs)) 
print()
print(relation_pair_dict[('implied_category',  )]) 
print()
print(relation_pair_dict[('implied_category', 'variability_limited', )])



('implied_category', 'variability_limited') 21
('implied_category',) 16

{('swim', 'cob'), ('round', 'pepperoni'), ('wheels', 'saloon'), ('lay_eggs', 'crane'), ('wings', 'roach'), ('roll', 'bike'), ('wheels', 'underframe'), ('wings', 'cricket'), ('swim', 'bay'), ('wheels', 'tank'), ('round', 'patty'), ('lay_eggs', 'neritidae'), ('round', 'cherry'), ('dangerous', 'pentobarbital'), ('lay_eggs', 'flounder'), ('warm', 'brogue')}

{('made_of_wood', 'girder'), ('green', 'fenugreek'), ('wings', 'beetle'), ('square', 'blackboard'), ('round', 'lemon'), ('juicy', 'anjou'), ('round', 'pineapple'), ('fly', 'fowl'), ('round', 'gourd'), ('sweet', 'carrot'), ('round', 'cabbage'), ('made_of_wood', 'transom'), ('square', 'laptop'), ('made_of_wood', 'ladle'), ('square', 'computer'), ('round', 'nutlet'), ('roll', 'cart'), ('sweet', 'breadfruit'), ('round', 'sapodilla'), ('round', 'onion'), ('red', 'tongue')}


In [214]:
# focus on prop-specific evidence types


def get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff):
    
    path_evidence = f'../analysis/{model_name}/evidence_{top_cutoff}_{concept_cutoff}-raw-10000-categories'
    tfidf_scores = []
    for et in ets:
        path_prop_et = f'{path_evidence}/{prop}/{et}'
        if os.path.isdir(path_prop_et):
            files = [f for f in os.listdir(path_prop_et) if f.endswith('.csv')]
            for f in files:
                full_path = f'{path_prop_et}/{f}'
                with open(full_path) as infile:
                    data = list(csv.DictReader(infile))
                for d in data:
                    if d[''] == concept:
                        tfidf = d['mean']
                        tfidf_scores.append(float(tfidf))
                        break
    if len(tfidf_scores) > 0:
        mean = sum(tfidf_scores)/len(tfidf_scores)
    else:
        mean = 0.0
    return mean
            


In [215]:
# test for one pair:

pair = ('red', 'tomato')

ets = ['p', 'l', 'n']

properties = get_properties()

model_name = 'giga_full_updated'

top_cutoff = 3
concept_cutoff = 3
prop, concept = pair
mean = get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff)
print(mean)

0.027778310835282503


In [191]:
combination = ('implied_category', )
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

0.002814759990160404


In [192]:
combination = ('variability_limited', )
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

0.012240007649826072


In [197]:
combination = ('implied_category', 'variability_limited')
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

0.004005841600065764


In [193]:
combination = ('variability_open', )
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

0.006245812417314189


In [198]:
combination = ('implied_category', 'variability_open')
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

0.0028201754340966283


In [194]:
combination = ('typical_of_property', )
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

ZeroDivisionError: division by zero

In [195]:
combination = ('typical_of_concept', )
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

0.01861648532852521


In [199]:
combination = ('afforded_usual', )
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

0.03678590649896033


In [203]:
combination = ('afforded_unusual', )
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

0.0016719489904147


In [202]:
combination = ('afforded_unusual', 'implied_category')
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

0.0009296119434532655


In [204]:
combination = ('afforded_usual', 'implied_category')
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

0.014956343039217359
