# Analyze based on semantic categories

1.) change tfidf so we compare equivalent categories only - done
2.) update ranking accordingly

In [6]:
import os
from collections import Counter, defaultdict
import csv
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', None)


def get_properties():
    properties = []
    for path in os.listdir('../data/aggregated/'):
        prop = path.split('.')[0]
        if 'female-' not in prop and prop != '':
            properties.append(prop)
    return properties

In [3]:
# check updated corpus extraction
f_original = os.listdir('../contexts/giga_full/vocab')
print(len(f_original))
f_update = os.listdir('../contexts/giga_full_updated/vocab')
print(len(f_update))

1446
1636


## Show top distinctive contexts per property

* calculate mean f1 per sematic category
* rank contexts by mean f1
* only include categories with at least 10 pos and 10 neg examples (small categories distort mean)

In [5]:
# get top distinctive contexts per prop
import get_top_contexts_distinctiveness

model_name = 'giga_full_updated'
properties = get_properties()
table = get_top_contexts_distinctiveness.get_top_distinctive_contexts(properties, model_name)
df = pd.DataFrame(table)
df.sort_values('f1-mean', ascending = False).round(2)

Unnamed: 0,property,n_contexts,f1-mean,p-mean,r-mean,contexts
11,green,578,1.0,1.0,1.0,shade
4,fly,952,0.98,0.98,0.98,payload overhead study experimental hovering
2,black,1149,0.97,0.97,0.97,named
9,used_in_cooking,572,0.95,0.96,0.95,add
10,juicy,810,0.9,0.9,0.9,for
8,hot,84,0.9,0.92,0.89,oven remove
13,blue,2234,0.89,0.9,0.9,european scale bright
7,sweet,35,0.88,0.9,0.88,potato
3,red,1767,0.88,0.89,0.88,red
14,yellow,52,0.87,0.9,0.88,sipping apple bubbly operating apples


In [6]:
# latex table for paper:
cols = ['property', 'n_contexts', 'f1-mean', 'contexts']
df = df.sort_values('f1-mean', ascending = False).round(2)
print(df[cols].to_latex(index=False))

\begin{tabular}{lrrl}
\toprule
        property &  n\_contexts &  f1-mean &                                      contexts \\
\midrule
           green &         578 &     1.00 &                                         shade \\
             fly &         952 &     0.98 &  payload overhead study experimental hovering \\
           black &        1149 &     0.97 &                                         named \\
 used\_in\_cooking &         572 &     0.95 &                                           add \\
           juicy &         810 &     0.90 &                                           for \\
             hot &          84 &     0.90 &                                   oven remove \\
            blue &        2234 &     0.89 &                         european scale bright \\
           sweet &          35 &     0.88 &                                        potato \\
             red &        1767 &     0.88 &                                           red \\
          yellow &         

In [7]:
# get top distinctive contexts per prop
import get_top_contexts_distinctiveness

model_name = 'wiki_updated'
properties = get_properties()
table = get_top_contexts_distinctiveness.get_top_distinctive_contexts(properties, model_name)
df = pd.DataFrame(table)
df.sort_values('f1-mean', ascending = False).round(2)

Unnamed: 0,property,n_contexts,f1-mean,p-mean,r-mean,contexts
4,fly,55,1.0,1.0,1.0,thrust demonstrate flown fly experimental
2,black,820,0.96,0.96,0.96,black size accidentally
16,female,109,0.95,0.96,0.95,birth
14,yellow,111,0.94,0.94,0.94,beer whiskey
13,blue,1819,0.94,0.94,0.94,abundant
11,green,550,0.93,0.94,0.93,forest flora
5,dangerous,627,0.93,0.94,0.93,hand
9,used_in_cooking,705,0.93,0.93,0.93,food
10,juicy,122,0.9,0.92,0.9,she
19,wheels,105,0.9,0.93,0.89,driver


In [8]:
# latex table for paper:
cols = ['property', 'n_contexts', 'f1-mean', 'contexts']
df = df.sort_values('f1-mean', ascending = False).round(2)
print(df[cols].to_latex(index=False))

\begin{tabular}{lrrl}
\toprule
        property &  n\_contexts &  f1-mean &                                   contexts \\
\midrule
             fly &          55 &     1.00 &  thrust demonstrate flown fly experimental \\
           black &         820 &     0.96 &                    black size accidentally \\
          female &         109 &     0.95 &                                      birth \\
          yellow &         111 &     0.94 &                               beer whiskey \\
            blue &        1819 &     0.94 &                                   abundant \\
           green &         550 &     0.93 &                               forest flora \\
       dangerous &         627 &     0.93 &                                       hand \\
 used\_in\_cooking &         705 &     0.93 &                                       food \\
           juicy &         122 &     0.90 &                                        she \\
          wheels &         105 &     0.90 &              

## Prepare context annotation

In [None]:
# import prepare_annotations

# model_name = 'giga_full_updated'
# properties = get_properties()
# #properties_test = ['dangerous', 'cold', 'lay_eggs']
# #properties = [p for p in properties if p not in properties_test]
# #properties = properties_test
# properties = ['dangerous']
# cutoff = 3
# cutoff_concepts = 3

# for prop in properties:
#     print(prop)
    
#     prepare_annotations.aggregate_contexts(prop, cutoff, model_name)
#     prepare_annotations.prepare_annotation(prop, model_name, cutoff, cutoff_concepts)

## Transfer giga to wiki annotations 


In [12]:
# transfer giga annotations to wiki

properties = get_properties()
#properties = ['dangerous']
model_name_current = 'wiki_updated'
model_name_old = 'giga_full_updated'


for prop in properties:
    print(prop)
    # current file:
    annotation_name = 'annotation-tfidf-top_3_3-raw-10000-categories'
    path_dir_annotation = f'../analysis/{model_name_current}/{annotation_name}/{prop}'
    f_annotation_new = f'{path_dir_annotation}/annotation-updated.csv'
    f_annotation_tr = f'{path_dir_annotation}/annotation-transferred-updated.csv'

    # old file:
    annotation_name = 'annotation-tfidf-top_3_3-raw-10000-categories'
    path_dir_annotation = f'../analysis/{model_name_old}/{annotation_name}/{prop}'
    f_annotation_old = f'{path_dir_annotation}/annotation-updated-done.csv'

    # load old annotations
    if os.path.isfile(f_annotation_old):
        print('found file')
        context_annotation_dict=dict()
        with open(f_annotation_old) as infile:
            data = list(csv.DictReader(infile))
            for d in data:
                c = d['context']
                et = d['evidence_type']
                context_annotation_dict[c] = et
                #c = d['context']

        # load new candidates

        with open(f_annotation_new) as infile:
            data = list(csv.DictReader(infile))

        # fill in old annotations
        for d in data:
            c = d['context']
            if c in context_annotation_dict:
                et = context_annotation_dict[c]
            else:
                et = 'NA'
            d['evidence_type'] = et

        # write to new file

        with open(f_annotation_tr, 'w') as outfile:
            writer = csv.DictWriter(outfile, fieldnames = data[0].keys())
            writer.writeheader()
            for d in data:
                writer.writerow(d)

square
found file
warm
found file
black
found file
red
found file
fly
found file
dangerous
found file
wings
found file
sweet
found file
hot
found file
used_in_cooking
found file
juicy
found file
green
found file
made_of_wood
found file
blue
found file
yellow
found file
roll
found file
female
found file
cold
found file
round
found file
wheels
found file
lay_eggs
found file
swim
found file


## Complete annotations

In [3]:
import analyze_annotations
import numpy as np
import pandas as pd

In [4]:
model_name = 'giga_full_updated'
top_cutoff = 3
concept_cutoff = 3
ann_dict = analyze_annotations.show_annotation_status(model_name, top_cutoff, concept_cutoff)

completed:

black (1150, 0, 1150)
blue (2235, 0, 2235)
cold (1082, 0, 1082)
dangerous (1115, 0, 1115)
female (46, 0, 46)
fly (953, 0, 953)
green (579, 0, 579)
hot (85, 0, 85)
juicy (811, 0, 811)
lay_eggs (75, 0, 75)
made_of_wood (787, 0, 787)
red (1768, 0, 1768)
roll (3887, 0, 3887)
round (521, 0, 521)
square (1505, 0, 1505)
sweet (36, 0, 36)
swim (1382, 0, 1382)
used_in_cooking (573, 0, 573)
warm (2109, 0, 2109)
wheels (245, 0, 245)
wings (561, 0, 561)
yellow (53, 0, 53)

Incomplete:



In [5]:
model_name = 'giga_full_updated'
properties = ann_dict['complete']
top_cutoff = 3
concept_cutoff = 3

cols = ['property', 'u', 'all', 'prop_specific', 'non-specific', 'p', 'n', 'l', 'i', 'r', 'b']
table = []
for prop in properties:
    d = dict()
    d['property'] =  prop
    d.update(analyze_annotations.get_evidence_distribution(model_name, prop, top_cutoff, concept_cutoff))
    for c in cols:
        if c not in d:
            d[c] = np.nan
    table.append(d)
   
cols = ['property', 'u', 'all', 'prop_specific', 'non-specific', 'p', 'n', 'l', 'i', 'r', 'b']
df = pd.DataFrame(table)[cols]
df = df[cols].sort_values('all', ascending = False).round(3)
df

Unnamed: 0,property,u,all,prop_specific,non-specific,p,n,l,i,r,b
14,used_in_cooking,0.357,0.643,0.014,0.629,0.007,0.003,0.003,0.302,0.327,
6,sweet,0.4,0.6,0.057,0.543,0.057,,,0.543,,
16,female,0.622,0.378,0.067,0.311,,,0.067,0.156,,0.156
19,hot,0.702,0.298,0.036,0.262,0.024,0.012,,0.167,0.095,
13,lay_eggs,0.703,0.297,0.027,0.27,0.027,,,0.176,0.095,
18,green,0.839,0.161,0.002,0.159,0.002,,,0.145,0.014,
7,wheels,0.84,0.16,0.016,0.143,0.008,,0.008,0.049,0.094,
10,wings,0.85,0.15,0.011,0.139,0.002,,0.009,0.045,0.068,0.027
0,juicy,0.88,0.12,0.002,0.117,0.002,,,0.102,0.015,
17,dangerous,0.883,0.117,0.021,0.096,0.002,0.006,0.013,0.037,0.054,0.005


In [8]:
#print(df.round(3).fillna('-').to_latex(index=False))

## Evidence distribution per semantic category

In [10]:
# import json
# import csv
# import os
# from collections import Counter, defaultdict
import analyze_evidence

import pandas as pd
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [12]:
prop = 'juicy'
model_name = 'giga_full_updated'
top_cutoff = 3
concept_cutoff = 3
table = analyze_evidence.get_top_ev_categories(prop, model_name, top_cutoff, concept_cutoff)
df = pd.DataFrame(table)
df.T

Unnamed: 0,Unnamed: 1,f1,contexts,pos,neg
all,p,0.74,juices,88,60
all,i,0.8,pineapple,88,60
all,r,0.73,ripe,88,60
all,u,0.73,red,88,60
plant,p,0.73,juice juices,59,28
plant,i,0.79,pineapple,59,28
plant,r,0.8,ripe,59,28
plant,u,0.8,banana,59,28
fruit,p,0.8,juice,53,25
fruit,i,0.85,pineapple,53,25


In [14]:
print(df.T.to_latex())

\begin{tabular}{llllll}
\toprule
          &   &    f1 &      contexts & pos & neg \\
\midrule
all & p &  0.74 &        juices &  88 &  60 \\
          & i &   0.8 &     pineapple &  88 &  60 \\
          & r &  0.73 &          ripe &  88 &  60 \\
          & u &  0.73 &           red &  88 &  60 \\
plant & p &  0.73 &  juice juices &  59 &  28 \\
          & i &  0.79 &     pineapple &  59 &  28 \\
          & r &   0.8 &          ripe &  59 &  28 \\
          & u &   0.8 &        banana &  59 &  28 \\
fruit & p &   0.8 &         juice &  53 &  25 \\
          & i &  0.85 &     pineapple &  53 &  25 \\
          & r &  0.81 &         fresh &  53 &  25 \\
          & u &  0.81 &           red &  53 &  25 \\
object & p &  0.79 &        juices &  73 &  50 \\
          & i &  0.81 &     pineapple &  73 &  50 \\
          & r &  0.78 &          ripe &  73 &  50 \\
          & u &  0.76 &        bright &  73 &  50 \\
food & p &  0.71 &        juices &  83 &  41 \\
          & i &  0.79 &   

In [15]:
# get mean per cat

pd.options.display.float_format = "{:,.2f}".format

properties = get_properties()
prop_table = []
for prop in properties:
    table = analyze_evidence.get_top_ev_categories(prop, model_name, top_cutoff, concept_cutoff)
    type_scores = defaultdict(list)
    ev_type_mean_scores = dict()
    for (cat, ev_type), f1_dict in table.items():
        type_scores[ev_type].append(f1_dict['f1'])
    for ev_type, scores in type_scores.items():
        mean = sum(scores)/len(scores)
        prop_dict = dict()
        ev_type_mean_scores[ev_type] = mean
    prop_dict['property'] = prop
    prop_dict.update(ev_type_mean_scores)
    prop_table.append(prop_dict)
    
cols = ['property', 'p', 'n', 'l', 'i', 'r', 'b', 'u']
df = pd.DataFrame(prop_table)
df = df[cols].sort_values('p', ascending = False).fillna('-')#round(2).fillna('-')
df

Unnamed: 0,property,p,n,l,i,r,b,u
9,used_in_cooking,0.90,0.73,0.91,0.92,0.94,-,0.86
4,fly,0.89,0.80,0.78,0.85,0.85,0.68,0.84
20,lay_eggs,0.83,-,-,0.82,0.77,-,0.78
13,blue,0.81,-,-,0.81,0.83,-,0.84
19,wheels,0.81,-,0.70,0.8,0.82,-,0.71
7,sweet,0.81,-,-,0.81,-,-,0.74
11,green,0.78,-,-,0.79,0.77,-,0.83
14,yellow,0.78,-,-,0.78,-,-,0.81
10,juicy,0.76,-,-,0.81,0.77,-,0.79
2,black,0.75,-,0.56,0.74,0.67,-,0.83


In [17]:
#print(df.to_latex(index=False))

## Evidence strength


In [75]:
import analyze_evidence


def get_tfidf_scores_context(prop, cat, contexts, model_name):
    
    # collect scores here:
    context_tfidf_dict = defaultdict(dict)

    # get tfidf scores
    path_tfidfs = f'../results/{model_name}/tfidf-raw-10000/each_target_vs_corpus_per_category'
    paths_tfidf = [f'{path_tfidfs}/{prop}/{cat}/pos', f'{path_tfidfs}/{prop}/{cat}/neg']
    concept_files = []
    for path in paths_tfidf:
        concept_files.extend([f'{path}/{f}' for f in os.listdir(path) if f.endswith('.csv')])
    for cf in concept_files:
        full_path = cf
        #full_path = f'{path_tfidfs}/{cf}'
        concept = os.path.basename(cf).split('.')[0]
        with open(full_path) as infile:
            data = list(csv.DictReader(infile))
        for cw_target in contexts:
            found = False
            for d in data:
                cw = d['']
                score = d['target']
                if cw == cw_target:
                    context_tfidf_dict[cw][concept] = (float(score))
                    found = True
                    break
            if found == False:
                context_tfidf_dict[cw_target][concept]  = 0.0
    return context_tfidf_dict


def evidence_strength_to_file(prop, et, contexts, model_name, top_cutoff, concept_cutoff):

    filepath_target = f'../analysis/{model_name}/evidence_{top_cutoff}_{concept_cutoff}-raw-10000-categories'
    filepath_target_et = f'{filepath_target}/{prop}/{et}'
    os.makedirs(filepath_target_et, exist_ok=True)
    categories = analyze_evidence.get_categories(prop, model_name)
    tfidf_scores_cats = dict()
              
    for context in contexts:
        full_filepath = f'{filepath_target_et}/{context}.csv'
        for cat in categories:
            tfidf_scores_cats[cat] = get_tfidf_scores_context(prop, cat, contexts, model_name)[context]
            #print(cat, tfidf_scores_cats[cat]['wren'])
        df = pd.DataFrame(tfidf_scores_cats).fillna(0)
        df['mean'] = df.mean(numeric_only=True, axis = 1)
        df.to_csv(full_filepath)

In [76]:

# extract mean evidence strength for context words that give prop-specific evidence
model_name = 'giga_full_updated'
properties = get_properties()

#properties = ['dangerous']

top_cutoff = 3
concept_cutoff = 3


for prop in properties:
    table = analyze_evidence.get_top_ev_categories(prop, model_name, top_cutoff, concept_cutoff)

    ets = ['p', 'n', 'l']
    for et_target in ets:
        contexts = set()
        for (cat, et), d in table.items():
            if et_target == et:
                contexts.update(set(d['contexts'].split(' ')))
        if len(contexts) > 0:
            evidence_strength_to_file(prop, et_target, contexts, model_name, top_cutoff, concept_cutoff)

In [77]:
# compare evidence strength across properties - may not make too much sense...

ets = ['p', 'l', 'n']
properties = get_properties()

model_name = 'giga_full_updated'

top_cutoff = 3
concept_cutoff = 3

prop_et_strength_dict = defaultdict(dict)

path_evidence = f'../analysis/{model_name}/evidence_{top_cutoff}_{concept_cutoff}-raw-10000-categories'

for prop in properties:
    means_prop = []
    for et in ets:
        means_et = []
        path_dir = f'{path_evidence}/{prop}/{et}'
        if os.path.isdir(path_dir):
            ev_files = [f for f in os.listdir(path_dir) if f.endswith('.csv')]
            for f in ev_files:
                full_path = f'{path_dir}/{f}'
                with open(full_path) as infile:
                    data = list(csv.DictReader(infile))
                    means = [float(d['mean']) for d in data]
                    if len(means) > 0:
                        mean = sum(means)/len(means)
                    else:
                        mean = 0.0
                    #means_et.append(mean)
                    means_prop.append(mean)
    if len(means_prop) > 0:
        mean_prop = sum(means_prop)/len(means_prop)
    else:
        mean_prop = 0.0
    prop_et_strength_dict[prop]['prop_specific'] = mean_prop
            
df = pd.DataFrame(prop_et_strength_dict)
df.T.sort_values('prop_specific', ascending=False).round(4)

Unnamed: 0,prop_specific
juicy,0.03
made_of_wood,0.02
red,0.02
hot,0.02
sweet,0.02
female,0.02
green,0.02
yellow,0.01
lay_eggs,0.01
blue,0.01


## Relation analysis

In [78]:
from statistics import stdev
import numpy as np
import analyze_evidence

In [79]:

  
def get_relation_combinations(properties, combinations):
    
    relation_pair_dict = defaultdict(set)
    for prop in properties:
        prop_dict = analyze_evidence.load_prop_data(prop)
        for c, d in prop_dict.items():
            ml_label = d['ml_label']
            if ml_label in {'all', 'some', 'all-some', 'few-some'}:
                l = 'pos'
            elif ml_label in {'few'}:
                l = 'neg'
            relation_pair_dict[l].add((prop, c))
            if l == 'pos':
                rel_dict = d['relations']
                for combination in combinations:
                    relations = set([rel for rel, p in rel_dict.items() if p > 0.5])
                    if combination == relations:
                        l_comb = tuple(sorted(relations))
                        relation_pair_dict[l_comb].add((prop, c))
    return relation_pair_dict   

In [115]:

# get prop concept pairs with a certain relation-configuration:
combinations = [
                    {'implied_category'},
                    {'variability_limited'},
                    {'variability_open'},
                    {'afforded_usual'},
                    {'afforded_usual', 'implied_category'},
                    {'afforded_unusual'},
                    {'implied_category', 'variability_limited'},
                    {'typical_of_property'}, 
                    {'typical_of_concept'},
                    {'typical_of_concept', 'typical_of_property'}
                    ]

properties = get_properties()
relation_pair_dict = get_relation_combinations(properties, combinations)
for rel, pairs in relation_pair_dict.items():
     print(rel, len(pairs)) 
# print()
# print(relation_pair_dict[('implied_category',  )]) 
# print()
# print(relation_pair_dict[('variability_limited',  )]) 
# print()
# print(relation_pair_dict[('implied_category', 'variability_limited', )])

print()
print(relation_pair_dict[('typical_of_concept',  )]) 
print()
print(relation_pair_dict[('typical_of_concept', 'typical_of_property' )]) 
print()
print(relation_pair_dict[('implied_category', 'variability_limited', )])
print()
print(relation_pair_dict[('variability_limited', )])


neg 1545
pos 2135
('implied_category', 'variability_limited') 21
('variability_open',) 175
('variability_limited',) 114
('implied_category',) 16
('typical_of_concept', 'typical_of_property') 4
('typical_of_concept',) 4
('afforded_unusual',) 20
('afforded_usual', 'implied_category') 18
('afforded_usual',) 3

{('made_of_wood', 'club'), ('juicy', 'meat'), ('wheels', 'wheel'), ('fly', 'babbler')}

{('red', 'tomato'), ('red', 'watermelon'), ('sweet', 'desert'), ('green', 'jade')}

{('made_of_wood', 'ladle'), ('roll', 'cart'), ('made_of_wood', 'transom'), ('green', 'fenugreek'), ('wings', 'beetle'), ('round', 'cabbage'), ('made_of_wood', 'girder'), ('sweet', 'carrot'), ('round', 'lemon'), ('fly', 'fowl'), ('square', 'computer'), ('square', 'laptop'), ('round', 'pineapple'), ('red', 'tongue'), ('round', 'onion'), ('round', 'gourd'), ('sweet', 'breadfruit'), ('round', 'sapodilla'), ('juicy', 'anjou'), ('square', 'blackboard'), ('round', 'nutlet')}

{('black', 'calf'), ('green', 'berry'), ('mad

In [81]:
# focus on prop-specific evidence types


def get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff):
    
    path_evidence = f'../analysis/{model_name}/evidence_{top_cutoff}_{concept_cutoff}-raw-10000-categories'
    tfidf_scores = []
    for et in ets:
        path_prop_et = f'{path_evidence}/{prop}/{et}'
        if os.path.isdir(path_prop_et):
            files = [f for f in os.listdir(path_prop_et) if f.endswith('.csv')]
            for f in files:
                full_path = f'{path_prop_et}/{f}'
                with open(full_path) as infile:
                    data = list(csv.DictReader(infile))
                for d in data:
                    if d[''] == concept:
                        tfidf = d['mean']
                        tfidf_scores.append(float(tfidf))
                        break
    if len(tfidf_scores) > 0:
        mean = sum(tfidf_scores)/len(tfidf_scores)
    else:
        mean = 0.0
    return mean
            

#### Sanity check: 

Positive relations should have a higher mean tfidf value than negative relations

In [127]:
combination = ('pos')
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
print(len(pairs))
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

2135
0.014551640101557144


In [128]:
combination = ('neg')
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
print(len(pairs))
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

1545
0.0037992636216904333


### Step 1: Check isolated relations

* advantage: no interference from other relations
* disadvantage: small number of prop-concept pairs

### Step 2: Check effects of combinations

* impliedness + relation associated with prop mentions should lead to higher values
* typical_of_concept + relation associated with prop mentions should lead to higher values

#### Impliedness:

* implied_category should have a low mean tfidf value
* Observation: Clearly lower than mean pos value, closer to mean neg value

In [126]:
combination = ('implied_category', )
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
print(len(pairs))
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

16
0.0050416473256107005


#### Variability

* variability_limited should have a high mean tfidf value
* variability_open should have a low mean tfidf value

* Observations:


In [125]:
combination = ('variability_limited', )
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
print(len(pairs))
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

114
0.014277238391360319


In [124]:
combination = ('variability_open', )
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
print(len(pairs))
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

175
0.007375878935121847


#### Variability in combination with impliedness should overrule impliedness

In [131]:
combination = ('implied_category', )
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
print(len(pairs))
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

16
0.0050416473256107005


In [139]:
combination = ('implied_category', 'variability_limited')
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
print(pairs)
print(len(pairs))
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

{('made_of_wood', 'ladle'), ('roll', 'cart'), ('made_of_wood', 'transom'), ('green', 'fenugreek'), ('wings', 'beetle'), ('round', 'cabbage'), ('made_of_wood', 'girder'), ('sweet', 'carrot'), ('round', 'lemon'), ('fly', 'fowl'), ('square', 'computer'), ('square', 'laptop'), ('round', 'pineapple'), ('red', 'tongue'), ('round', 'onion'), ('round', 'gourd'), ('sweet', 'breadfruit'), ('round', 'sapodilla'), ('juicy', 'anjou'), ('square', 'blackboard'), ('round', 'nutlet')}
21
0.004917146590967316


In [142]:
combination = ('variability_limited', )
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
print(pairs)

{('black', 'calf'), ('green', 'berry'), ('made_of_wood', 'chock'), ('sweet', 'hackberry'), ('juicy', 'rumor'), ('blue', 'shark'), ('red', 'ant'), ('made_of_wood', 'corkscrew'), ('red', 'shallot'), ('sweet', 'tomato'), ('square', 'footrest'), ('red', 'dogwood'), ('dangerous', 'doodlebug'), ('blue', 'dogbane'), ('fly', 'machine'), ('sweet', 'walnut'), ('square', 'typewriter'), ('sweet', 'pumpkin'), ('sweet', 'wine'), ('green', 'chutney'), ('made_of_wood', 'axe'), ('blue', 'budgie'), ('green', 'coconut'), ('red', 'echinocereus'), ('juicy', 'ham'), ('black', 'woodpecker'), ('red', 'ambulance'), ('green', 'serviceberry'), ('juicy', 'shallot'), ('square', 'server'), ('red', 'carnation'), ('black', 'human'), ('sweet', 'squash'), ('green', 'kingfisher'), ('black', 'cloud'), ('juicy', 'salad'), ('juicy', 'date'), ('black', 'crypt'), ('fly', 'craft'), ('used_in_cooking', 'pineapple'), ('sweet', 'cob'), ('yellow', 'locust'), ('used_in_cooking', 'sapodilla'), ('made_of_wood', 'clock'), ('sweet', '

In [133]:
combination = ('implied_category', 'variability_open')
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
print(len(pairs))
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

6
0.004328250549120216


#### Typicality

* typical_of_property should have higher values
* typical_of_concept should have lower values
* problem: they are unlikely to occur independently from one another

In [123]:
combination = ('typical_of_property', )
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
print(len(pairs))
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

0


ZeroDivisionError: division by zero

In [122]:
combination = ('typical_of_concept', )
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
print(len(pairs))
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

4
0.020486685150204003


#### Affordedness

In [129]:
combination = ('afforded_usual', )
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
print(len(pairs))
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

3
0.025515742044588785


In [130]:
combination = ('afforded_unusual', )
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
print(len(pairs))
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

20
0.0015403223710659818


In [134]:
combination = ('affording_activity', )
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
print(len(pairs))
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

4
0.01892229580070073


#### Affordedness relations + other relations

*affording_activity*

In [143]:
combination = ('affording_activity', 'implied_category')
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
print(len(pairs))
print(pairs)
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

8
{('hot', 'lightning'), ('wheels', 'equipage'), ('wheels', 'snowplow'), ('warm', 'calfskin'), ('wheels', 'dumper'), ('warm', 'tunic'), ('wheels', 'airplane'), ('made_of_wood', 'bamboo')}
0.012580508606358764


In [136]:
combination = ('affording_activity', 'typical_of_concept')
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
print(len(pairs))
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

5
0.014216872024275748


In [137]:
combination = ('affording_activity', 'implied_category', 'typical_of_concept')
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
print(len(pairs))
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

57
0.006655211593828092


*afforded_usual*

In [138]:
combination = ('afforded_usual', 'implied_category')
combinations = [set(combination)]
relation_pair_dict = get_relation_combinations(properties, combinations)
pairs = relation_pair_dict[combination]
print(len(pairs))
means = []
for prop, concept in pairs:
    means.append(get_tfidf_pair(prop, concept, model_name, top_cutoff, concept_cutoff))
print(sum(means)/len(means))

18
0.025247162010199574
