# Ideas

Can we find evidence among all contexts of positive examples of a property by comparing them to the contexts of all negative examples?

Here, similar concepts should come in handy, as there should be a lot of overlap in their contexts. Distinctive aspects should be all the more salient. 



In [1]:
import pandas as pd
import os

In [None]:
# run extract_contexts.py (vm)
# download contexts - out in ../contexts/
# run get_tfidf.py
# run process_tfidf.py
# annotate evidence candidates
# run 

In [2]:
# Sanity check: compare the size of the reference vocabulary with the number
# of per-word files found in the giga_full and wiki context directories.

path_giga = '../contexts/giga_full/vocab/'
path_wiki =  '../contexts/wiki/vocab/'
path_vocab = '../data/vocab.txt'

# vocab.txt is read as one entry per line; duplicates collapse in the set.
with open(path_vocab) as infile:
    vocab = set(infile.read().strip().split('\n'))
print(len(vocab))

# Everything before the first '.' of each file name is kept
# (presumably the vocabulary word -- TODO confirm the file naming scheme).
giga_vocab = [f.split('.')[0] for f in os.listdir(path_giga)]
print(len(giga_vocab))

# Same extraction for the wiki contexts.
wiki_vocab = [f.split('.')[0] for f in os.listdir(path_wiki)]
print(len(wiki_vocab))

1780
1445
1669


# TF-IDF analysis

In [2]:
# get overview of counts
from process_tfidf import get_table
from analyze_evidence import get_evidence_table, get_properties
import pandas as pd

In [7]:
# Before annotation: load the aggregated context table for one
# property/model/category combination and show the positive side.

prop = 'female'
model = 'giga_full'
cnt_type = 'raw'
max_features = 10000
category = 'all-pos'
# get_table comes from process_tfidf; it returns a pair that is unpacked into
# the records for positive and negative examples. Rows are requested ranked by
# 'n_concepts' (presumably the number of concepts a context word occurs with
# -- TODO confirm against process_tfidf).
pos_dicts, neg_dicts = get_table(model, prop, category, 
                                 cnt_type, max_features, 
                                 rank_by='n_concepts', top_n=20)
df_pos = pd.DataFrame(pos_dicts)
df_neg = pd.DataFrame(neg_dicts)
# Last expression of the cell: display the positive table.
df_pos

../analysis/giga_full/tfidf_aggregated_concept_scores-raw-10000/female-pos/all-pos/female-pos.csv


Unnamed: 0,context,n_concepts,mean_tfidf,mean_diff
0,herself,77,0.02,0.02
1,she,72,0.05,0.04
2,beautiful,68,0.02,0.02
3,lady,65,0.02,0.02
4,anna,63,0.02,0.02
5,love,63,0.02,0.02
6,mary,61,0.03,0.03
7,actress,60,0.03,0.03
8,elizabeth,60,0.04,0.03
9,maria,59,0.03,0.03


In [8]:
df_neg

Unnamed: 0,context,n_concepts,mean_tfidf,mean_diff
0,himself,98,0.01,0.01
1,michael,97,0.01,0.01
2,david,95,0.01,0.01
3,steve,94,0.01,0.01
4,brian,92,0.01,0.01
5,failed,91,0.0,0.0
6,signed,91,0.0,0.0
7,peter,91,0.01,0.01
8,james,90,0.01,0.01
9,bob,89,0.01,0.01


## Annotation procedure


## File handling:

* Annotate per semantic category and for all positive concepts
* Files: ../analysis/giga_full/annotation-tfidf-top10-raw-10000/dangerous-pos/animal/dangerous-pos.csv
* Go to dir
* Copy file: cp dangerous-pos.csv dangerous-pos-annotated.csv
* Annotate and save


## Annotation guidelines


Use labels in table

[insert table]


Ask yourself:

Can this word indicate the property given the semantic category? 

If you are annotating the general bin, just ask yourself whether this word can indicate the property.


## Inspect annotations

In [1]:
from analyze_evidence import get_evidence_table, get_properties
import pandas as pd

In [9]:
# After annotation: build the evidence table for one
# property/model/category combination and display it rounded to 2 decimals.

prop = 'dangerous'
model = 'giga_full'
category = 'animal'
max_features = 10000
# get_evidence_table comes from analyze_evidence; its result is passed
# straight to the DataFrame constructor (presumably a list of row dicts
# -- TODO confirm).
table_dicts = get_evidence_table(prop, model, category, max_features)
df = pd.DataFrame(table_dicts)
# round() returns a new frame; df itself keeps the unrounded values.
df.round(2)
#print(df.to_latex(index=False))

n pos: 10
n neg: 60



Unnamed: 0,evidence,evidence_type,n_pos,mean_tfidf_pos,rank_pos,n_neg,mean_tfidf_neg,rank_neg
0,attacked,r,0.8,0.02,1,0.02,0.02,9847
1,killed,r,0.8,0.02,3,0.02,0.03,11212
2,attack,i,0.8,0.02,2,0.03,0.02,9158
3,shots,i,0.7,0.0,24,0.15,0.01,5101
4,survived,r,0.7,0.01,25,0.15,0.01,5522
5,threatened,r,0.7,0.02,19,0.07,0.01,8276
6,dangerous,p,0.6,0.0,110,0.03,0.01,9118


### Write interface to help with consistency

* display all files that have (not) been annotated

In [22]:
import os
from collections import defaultdict


def get_annotation_status(model):
    """Report which property/category annotation folders have been annotated.

    Walks ``../analysis/{model}/annotation-tfidf-top20-raw-10000``, which is
    expected to contain one sub-directory per property and, inside each, one
    sub-directory per semantic category. A category counts as complete when
    it holds exactly one file whose name ends in ``annotated.csv``; otherwise
    it is filed as incomplete.

    Parameters
    ----------
    model : str
        Name of the model directory under ``../analysis``
        (e.g. ``'giga_full'``).

    Returns
    -------
    collections.defaultdict
        Maps ``'complete'`` / ``'incomplete'`` to
        ``{property: {category, ...}}``. A status under which nothing was
        filed reads as an empty dict.

    Raises
    ------
    FileNotFoundError
        If the annotation directory for ``model`` does not exist.
    """
    dir_annotations = f'../analysis/{model}/annotation-tfidf-top20-raw-10000'
    annotation_dict = defaultdict(dict)

    for prop in os.listdir(dir_annotations):
        prop_dir = os.path.join(dir_annotations, prop)
        # Only property folders are walked. Entries named *.csv were always
        # skipped; other stray files (e.g. .DS_Store) are skipped as well
        # instead of crashing os.listdir below.
        if prop.endswith('.csv') or not os.path.isdir(prop_dir):
            continue
        for cat in os.listdir(prop_dir):
            cat_dir = os.path.join(prop_dir, cat)
            if not os.path.isdir(cat_dir):
                continue
            annotated = [name for name in os.listdir(cat_dir)
                         if name.endswith('annotated.csv')]
            # Exactly one annotated copy means done; zero (or an ambiguous
            # several) means the category still needs attention.
            status = 'complete' if len(annotated) == 1 else 'incomplete'
            annotation_dict[status].setdefault(prop, set()).add(cat)
    return annotation_dict
                    
# NOTE(review): `annotation_dict` is only assigned inside
# get_annotation_status and no visible cell binds it at top level, so this
# lookup presumably relied on leftover kernel state. Bind the function's
# return value first, e.g. annotation_dict = get_annotation_status('giga_full').
annotation_dict['complete']

{'dangerous-pos': {'all-pos',
  'animal',
  'communication',
  'criminal',
  'dangerous',
  'object'}}

# Property evidence in concept-contexts [to adapt to new setup]

In [6]:
# create concept-context overview matrix for each property

from analyze_evidence import get_properties, get_prop_overview
import pandas as pd

In [7]:
# Per-property evidence overview for the wiki model, sorted so the
# properties with the highest 'p_ev_pos' come first.
# get_properties / get_prop_overview come from analyze_evidence.
props = get_properties()
model = 'wiki'
prop_table = get_prop_overview(props, model)
df = pd.DataFrame(prop_table).sort_values('p_ev_pos', ascending=False)
df
#print(df.to_latex(index=False))

Unnamed: 0,prop,n_ev,total_pos,p_ev_pos,total_neg,p_ev_neg
16,female,10,122,0.97,150,0.46
19,wheels,8,75,0.91,27,0.37
14,yellow,4,22,0.91,85,0.21
10,juicy,5,55,0.91,64,0.2
7,sweet,4,54,0.85,64,0.34
21,swim,4,97,0.71,47,0.32
11,green,1,42,0.69,68,0.01
17,cold,2,22,0.68,23,0.17
12,made_of_wood,2,50,0.64,43,0.14
18,round,2,44,0.64,18,0.28


In [8]:
# Same per-property overview as for wiki, now for the giga_full model;
# again sorted by descending 'p_ev_pos'.
props = get_properties()
model = 'giga_full'
prop_table = get_prop_overview(props, model)
df = pd.DataFrame(prop_table).sort_values('p_ev_pos', ascending=False)
df

Unnamed: 0,prop,n_ev,total_pos,p_ev_pos,total_neg,p_ev_neg
10,juicy,8,55,1.0,60,0.18
16,female,6,109,0.95,144,0.33
7,sweet,6,51,0.86,63,0.3
19,wheels,3,67,0.85,25,0.24
11,green,2,42,0.83,67,0.15
8,hot,3,53,0.83,43,0.28
3,red,3,26,0.81,64,0.12
14,yellow,3,22,0.73,74,0.08
9,used_in_cooking,5,87,0.69,56,0.21
21,swim,3,78,0.68,38,0.08


## Property types

In [17]:
from analyze_evidence import get_prop_collection_overview, get_prop_types
import pandas as pd

In [23]:
# Create an overview table with one row per property type, listing the
# properties that belong to it as a single space-separated string.

# get_prop_types (analyze_evidence) returns two mappings; only the second,
# collection -> properties, is used in this cell.
prop_collection_dict, collection_prop_dict = get_prop_types()
table = []
# NOTE: the loop variable rebinds the global name `props`; cells that need
# the full property list must call get_properties() again afterwards.
for collection, props in collection_prop_dict.items():
    props_line = ' '.join(props)
    d = dict()
    d['prop_type'] = collection
    d['properties'] = props_line
    table.append(d)
df = pd.DataFrame(table)
df

Unnamed: 0,prop_type,properties
0,perceptual-shape,round square
1,percetual-heat,warm cold hot
2,perceptual-color,green red blue yellow black
3,activities,roll swim lay_eggs fly
4,complex,used_in_cooking dangerous
5,parts,wheels wings
6,perceptual,sweet juicy
7,part-material,made_of_wood
8,gender,female


In [24]:
print(df.to_latex(index=False))

\begin{tabular}{ll}
\toprule
        prop\_type &                   properties \\
\midrule
 perceptual-shape &                 round square \\
   percetual-heat &                warm cold hot \\
 perceptual-color &  green red blue yellow black \\
       activities &       roll swim lay\_eggs fly \\
          complex &    used\_in\_cooking dangerous \\
            parts &                 wheels wings \\
       perceptual &                  sweet juicy \\
    part-material &                 made\_of\_wood \\
           gender &                       female \\
\bottomrule
\end{tabular}



In [29]:
# Evidence overview aggregated per property collection for the giga_full
# model, sorted alphabetically by 'collection' and printed as LaTeX.
props = get_properties()
model = 'giga_full'
collection_table = get_prop_collection_overview(props, model)
df = pd.DataFrame(collection_table).sort_values('collection')
print(df.to_latex(index=False))

\begin{tabular}{lrrrrrr}
\toprule
       collection &  n\_props &  n\_ev &  total\_pos &  p\_ev\_pos &  total\_neg &  p\_ev\_neg \\
\midrule
       activities &        4 &    15 &        200 &      0.45 &        218 &      0.17 \\
          complex &        2 &     6 &        146 &      0.61 &        107 &      0.18 \\
           gender &        1 &     6 &        109 &      0.95 &        144 &      0.33 \\
    part-material &        1 &     2 &         43 &      0.05 &         33 &      0.18 \\
            parts &        2 &     3 &        125 &      0.46 &        102 &      0.06 \\
       perceptual &        2 &    13 &        106 &      0.77 &        123 &      0.23 \\
 perceptual-color &        5 &    10 &        126 &      0.64 &        356 &      0.09 \\
 perceptual-shape &        2 &     1 &         69 &      0.03 &         39 &      0.05 \\
   percetual-heat &        3 &     5 &        132 &      0.46 &         99 &      0.17 \\
\bottomrule
\end{tabular}



In [28]:
# Evidence overview aggregated per property collection for the wiki model,
# sorted alphabetically by 'collection' and printed as LaTeX.
props = get_properties()
model = 'wiki'
collection_table = get_prop_collection_overview(props, model)
df = pd.DataFrame(collection_table).sort_values('collection')
print(df.to_latex(index=False))

\begin{tabular}{lrrrrrr}
\toprule
       collection &  n\_props &  n\_ev &  total\_pos &  p\_ev\_pos &  total\_neg &  p\_ev\_neg \\
\midrule
       activities &        4 &     8 &        279 &      0.34 &        262 &      0.13 \\
          complex &        2 &     3 &        154 &      0.25 &        123 &      0.07 \\
           gender &        1 &    10 &        122 &      0.97 &        150 &      0.46 \\
    part-material &        1 &     2 &         50 &      0.64 &         43 &      0.14 \\
            parts &        2 &     9 &        154 &      0.69 &        111 &      0.17 \\
       perceptual &        2 &     8 &        109 &      0.72 &        128 &      0.27 \\
 perceptual-color &        5 &     8 &        129 &      0.60 &        379 &      0.09 \\
 perceptual-shape &        2 &     3 &         73 &      0.40 &         39 &      0.15 \\
   percetual-heat &        3 &     5 &        140 &      0.54 &        101 &      0.12 \\
\bottomrule
\end{tabular}



## Relations

In [11]:
# def get hypotheses

from analyze_evidence import get_properties, get_relation_overview
import pandas as pd

In [37]:
# Relation overview for the wiki model with rel_type='top', sorted by
# descending 'p_evidence' and printed as LaTeX.
# get_relation_overview comes from analyze_evidence; the meaning of the
# rel_type values is defined there (not visible in this notebook).
props = get_properties()
# Uncomment the next line to leave the 'female' property out of the overview.
#props = [p for p in props if p != 'female']
model = 'wiki'
relation_table = get_relation_overview(props, model, rel_type = 'top')
df = pd.DataFrame(relation_table)
df = df.sort_values('p_evidence', ascending = False)
print(df.to_latex(index=False))

\begin{tabular}{lrr}
\toprule
            relation &  total\_concepts &  p\_evidence \\
\midrule
          gender-all &             152 &        0.78 \\
 typical\_of\_property &             127 &        0.68 \\
  affording\_activity &             407 &        0.59 \\
  typical\_of\_concept &             572 &        0.57 \\
    implied\_category &             629 &        0.50 \\
      afforded\_usual &             172 &        0.41 \\
          gender-few &             208 &        0.33 \\
    afforded\_unusual &              73 &        0.19 \\
 variability\_limited &             562 &        0.17 \\
             unusual &             578 &        0.16 \\
                rare &             308 &        0.15 \\
            creative &             152 &        0.12 \\
          impossible &             554 &        0.11 \\
    variability\_open &             443 &        0.08 \\
\bottomrule
\end{tabular}



In [33]:
# Relation overview for the wiki model with rel_type='hyp_top', sorted by
# descending 'p_evidence' and printed as LaTeX.
props = get_properties()
model = 'wiki'
relation_table = get_relation_overview(props, model, rel_type = 'hyp_top')
df = pd.DataFrame(relation_table)
df = df.sort_values('p_evidence', ascending = False)
print(df.to_latex(index=False))

\begin{tabular}{lrr}
\toprule
            relation &  total\_concepts &  p\_evidence \\
\midrule
          gender-all &             152 &        0.78 \\
 typical\_of\_property &             211 &        0.59 \\
  affording\_activity &             555 &        0.54 \\
  typical\_of\_concept &              19 &        0.53 \\
      afforded\_usual &             211 &        0.37 \\
          gender-few &             208 &        0.33 \\
    implied\_category &              45 &        0.27 \\
    afforded\_unusual &              50 &        0.24 \\
 variability\_limited &             803 &        0.20 \\
                rare &             244 &        0.15 \\
             unusual &             477 &        0.14 \\
            creative &             140 &        0.13 \\
          impossible &             553 &        0.11 \\
    variability\_open &             285 &        0.04 \\
\bottomrule
\end{tabular}



In [39]:
# Relation overview for the giga_full model with rel_type='top', sorted by
# descending 'p_evidence' and printed as LaTeX.
props = get_properties()
# Uncomment the next line to leave the 'female' property out of the overview.
#props = [p for p in props if p != 'female']
model = 'giga_full'
relation_table = get_relation_overview(props, model, rel_type = 'top')
df = pd.DataFrame(relation_table)
df = df.sort_values('p_evidence', ascending = False)
print(df.to_latex(index=False))


\begin{tabular}{lrr}
\toprule
            relation &  total\_concepts &  p\_evidence \\
\midrule
          gender-all &             152 &        0.68 \\
 typical\_of\_property &             127 &        0.57 \\
  affording\_activity &             407 &        0.53 \\
  typical\_of\_concept &             572 &        0.52 \\
    implied\_category &             629 &        0.41 \\
      afforded\_usual &             172 &        0.38 \\
          gender-few &             208 &        0.23 \\
 variability\_limited &             562 &        0.19 \\
            creative &             152 &        0.17 \\
             unusual &             578 &        0.16 \\
                rare &             308 &        0.15 \\
    afforded\_unusual &              73 &        0.15 \\
          impossible &             554 &        0.10 \\
    variability\_open &             443 &        0.07 \\
\bottomrule
\end{tabular}



In [34]:
# Relation overview for the giga_full model with rel_type='hyp_top', sorted
# by descending 'p_evidence' and printed as LaTeX.
props = get_properties()
model = 'giga_full'
relation_table = get_relation_overview(props, model, rel_type = 'hyp_top')
df = pd.DataFrame(relation_table)
df = df.sort_values('p_evidence', ascending = False)
print(df.to_latex(index=False))

\begin{tabular}{lrr}
\toprule
            relation &  total\_concepts &  p\_evidence \\
\midrule
          gender-all &             152 &        0.68 \\
 typical\_of\_property &             211 &        0.52 \\
  affording\_activity &             555 &        0.49 \\
  typical\_of\_concept &              19 &        0.42 \\
      afforded\_usual &             211 &        0.37 \\
          gender-few &             208 &        0.23 \\
 variability\_limited &             803 &        0.21 \\
    implied\_category &              45 &        0.18 \\
            creative &             140 &        0.18 \\
    afforded\_unusual &              50 &        0.18 \\
             unusual &             477 &        0.14 \\
                rare &             244 &        0.12 \\
          impossible &             553 &        0.10 \\
    variability\_open &             285 &        0.03 \\
\bottomrule
\end{tabular}



## Development

In [216]:
s1 = {1,2,3}
s2 = {2,3,1}
s1 ==s2

True

In [214]:
prop = 'yellow'


In [19]:



# Development scratch: collect the search log for a single property.
# NOTE(review): collect_search_log is not imported or defined in any visible
# cell, so this fails on a fresh kernel. The recorded output is keyed by
# 'swim' although the input here is ['fly'] -- presumably a stale output or
# an issue inside collect_search_log; confirm before relying on it.
properties = ['fly']
prop_cat_syn_dict = collect_search_log(properties)
prop_cat_syn_dict

defaultdict(list,
            {'swim': [{'bird': {"Synset('bird.n.01')", "Synset('bird.n.02')"}},
              {'fish': {"Synset('fish.n.01')", "Synset('fish.n.02')"}},
              {'mammal': {"Synset('mammal.n.01')"}},
              {'bird': {"Synset('bird.n.01')", "Synset('bird.n.02')"}},
              {'fish': {"Synset('fish.n.01')", "Synset('fish.n.02')"}},
              {'mammal': {"Synset('mammal.n.01')"}}]})