In [2]:
import os
from collections import Counter, defaultdict
import csv
import pandas as pd
import numpy as np

# Remove the row limit so displayed DataFrames are shown in full rather than truncated.
pd.set_option('display.max_rows', None)



In [162]:
# proportion of evidence

def get_properties():
    """Return the property names found in ``../data/aggregated/``.

    A property name is the part of a directory entry's name before its first
    ``'.'``. Entries with an empty stem (names starting with a dot) and
    entries whose stem contains ``'female-'`` are left out.

    Returns:
        list of str, in the order produced by ``os.listdir`` (not sorted).
    """
    stems = (entry.split('.')[0] for entry in os.listdir('../data/aggregated/'))
    return [stem for stem in stems if stem != '' and 'female-' not in stem]

def load_evidence_type_dict(prop, model_name):
    """Read the annotation file of a property into a context -> evidence type map.

    The file read is
    ``../analysis/<model_name>/annotation-tfidf-top_3_3-raw-10000-categories/<prop>/annotation-updated-done.csv``
    and must provide the columns ``context`` and ``evidence_type``.

    Args:
        prop: property name (sub-directory of the annotation directory).
        model_name: model name used to build the annotation directory path.

    Returns:
        dict mapping each ``context`` value to its ``evidence_type`` value.
        When a context occurs on several rows, the last row wins.
    """
    dir_corpus = f'../analysis/{model_name}/annotation-tfidf-top_3_3-raw-10000-categories'
    annotation_path = f'{dir_corpus}/{prop}/annotation-updated-done.csv'

    with open(annotation_path) as infile:
        rows = list(csv.DictReader(infile))

    context_to_type = {}
    for row in rows:
        evidence_type = row['evidence_type']
        context_to_type[row['context']] = evidence_type
    return context_to_type
    
    
    
def get_evidence_prop_div(evidence_type_dict, cnt = 'prop'):
    """Summarise how the annotated contexts are distributed over evidence types.

    Every context is counted under its own evidence type. In addition,
    types ``'p'``, ``'n'`` and ``'l'`` are counted under the coarse bucket
    ``'prop-specific'`` and types ``'i'``, ``'r'`` and ``'b'`` under
    ``'non-specific'``. Any other type only appears under its own label.

    Args:
        evidence_type_dict: mapping context -> evidence type.
        cnt: ``'prop'`` to return, per type/bucket, the share of all
            contexts in ``evidence_type_dict``; ``'div'`` to return the
            number of contexts per type/bucket.

    Returns:
        dict mapping type/bucket label -> float (``'prop'``) or int (``'div'``).
        Empty when ``evidence_type_dict`` is empty.

    Raises:
        ValueError: if ``cnt`` is neither ``'prop'`` nor ``'div'``.
    """
    # Fail early with a clear message; previously an unknown mode surfaced
    # as an UnboundLocalError on the final return.
    if cnt not in ('prop', 'div'):
        raise ValueError(f"cnt must be 'prop' or 'div', got {cnt!r}")

    type_evidence_dict = defaultdict(list)
    for context, ev_type in evidence_type_dict.items():
        type_evidence_dict[ev_type].append(context)
        if ev_type in ('p', 'n', 'l'):
            type_evidence_dict['prop-specific'].append(context)
        elif ev_type in ('i', 'r', 'b'):
            type_evidence_dict['non-specific'].append(context)

    if cnt == 'div':
        return {label: len(contexts) for label, contexts in type_evidence_dict.items()}

    # A label only exists if at least one context was seen, so the
    # denominator is never zero inside the comprehension.
    n_candidates = len(evidence_type_dict)
    return {label: len(contexts) / n_candidates
            for label, contexts in type_evidence_dict.items()}
        
        
def get_evidence_prop_div_concept_category(evidence_type_dict, prop, concept, category, cnt = 'prop'):
    """Evidence-type distribution for one concept within one category.

    Reads
    ``../results/giga_full_updated/tfidf-raw-10000/each_target_vs_corpus_per_category/<prop>/<category>/pos/<concept>.csv``,
    keeps the contexts whose ``diff`` value is greater than 0 and that are
    annotated in ``evidence_type_dict``, and summarises them per evidence
    type. Types ``'p'``, ``'n'``, ``'l'`` are additionally counted under
    ``'prop-specific'`` and ``'i'``, ``'r'``, ``'b'`` under ``'non-specific'``.

    Args:
        evidence_type_dict: mapping context -> evidence type.
        prop: property name (path component).
        concept: concept name (file stem).
        category: category name (path component).
        cnt: ``'prop'`` for shares relative to the number of annotated
            contexts kept for this concept; ``'div'`` for raw counts.

    Returns:
        dict mapping type/bucket label -> float (``'prop'``) or int (``'div'``).
        Empty when no kept context is annotated.

    Raises:
        ValueError: if ``cnt`` is neither ``'prop'`` nor ``'div'``.
    """
    # Fail early with a clear message; previously an unknown mode surfaced
    # as an UnboundLocalError on the final return.
    if cnt not in ('prop', 'div'):
        raise ValueError(f"cnt must be 'prop' or 'div', got {cnt!r}")

    dir_path = '../results/giga_full_updated/tfidf-raw-10000/each_target_vs_corpus_per_category'
    full_path = f'{dir_path}/{prop}/{category}/pos/{concept}.csv'

    with open(full_path) as infile:
        rows = list(csv.DictReader(infile))

    # The context string lives in the column whose header is empty
    # (presumably an index column written by pandas -- confirm).
    context_candidates = [
        row[''] for row in rows
        if float(row['diff']) > 0 and row[''] in evidence_type_dict
    ]
    n_candidates = len(context_candidates)

    evidence_context_dict = defaultdict(list)
    for context in context_candidates:
        ev_type = evidence_type_dict[context]
        evidence_context_dict[ev_type].append(context)
        if ev_type in ('p', 'n', 'l'):
            evidence_context_dict['prop-specific'].append(context)
        elif ev_type in ('i', 'r', 'b'):
            evidence_context_dict['non-specific'].append(context)

    if cnt == 'div':
        return {label: len(members) for label, members in evidence_context_dict.items()}

    # A label only exists if at least one candidate was seen, so the
    # denominator is never zero inside the comprehension.
    return {label: len(members) / n_candidates
            for label, members in evidence_context_dict.items()}
    
    
def get_pos_examples(model_name, prop):
    """Collect the concepts listed as positive examples of a property.

    Concepts are taken from the file names in the ``all`` category
    directory ``.../<prop>/all/pos/``: the concept is the part of the name
    before its first ``'.'``. Names with an empty stem are skipped.

    Args:
        model_name: accepted for signature compatibility; not used here
            (the results directory is hard-coded).
        prop: property name (path component).

    Returns:
        set of concept names.
    """
    dir_path = '../results/giga_full_updated/tfidf-raw-10000/each_target_vs_corpus_per_category'
    path_all_pos = f'{dir_path}/{prop}/all/pos/'

    stems = {fname.split('.')[0] for fname in os.listdir(path_all_pos)}
    stems.discard('')
    return stems
    

def get_categories_concept(prop, concept, model_name):
    """Find the categories of a property in which a concept is a positive example.

    A category qualifies when the file ``<category>/pos/<concept>.csv``
    exists below the property's results directory.

    Args:
        prop: property name (path component).
        concept: concept name (file stem).
        model_name: accepted for signature compatibility; not used here
            (the results directory is hard-coded).

    Returns:
        set of category names.
    """
    dir_path = '../results/giga_full_updated/tfidf-raw-10000/each_target_vs_corpus_per_category'
    path_prop = f'{dir_path}/{prop}/'

    return {
        cat for cat in os.listdir(path_prop)
        if os.path.isfile(f'{path_prop}/{cat}/pos/{concept}.csv')
    }
    
def get_evidence_prop_div_concept(prop, concept, model_name, cnt, evidence_type_dict=None):
    """Average a concept's evidence-type distribution over its categories.

    For every category in which ``concept`` is a positive example of
    ``prop``, the per-category distribution is computed and summed per
    evidence type; each sum is then divided by the number of categories.
    A type that is missing in a category therefore contributes 0 to the mean.

    Args:
        prop: property name.
        concept: concept name.
        model_name: model name, used to locate the annotation file when
            ``evidence_type_dict`` is not supplied.
        cnt: ``'prop'`` for proportions, ``'div'`` for counts (forwarded to
            ``get_evidence_prop_div_concept_category``).
        evidence_type_dict: optional mapping context -> evidence type. When
            omitted, the annotations for ``prop`` are loaded with
            ``load_evidence_type_dict``. Pass it explicitly to avoid
            re-reading the annotation file for every concept.

    Returns:
        collections.Counter mapping type/bucket label -> mean value.
    """
    # Previously this read a kernel-global ``evidence_type_dict``, which is
    # undefined on a fresh kernel and may belong to a different property.
    if evidence_type_dict is None:
        evidence_type_dict = load_evidence_type_dict(prop, model_name)

    categories = get_categories_concept(prop, concept, model_name)

    totals = Counter()
    for cat in categories:
        per_category = get_evidence_prop_div_concept_category(evidence_type_dict,
                                                              prop, concept,
                                                              cat, cnt = cnt)
        for ev, value in per_category.items():
            totals[ev] += value

    # ``totals`` is only non-empty when at least one category exists, so
    # the division cannot hit zero.
    n_categories = len(categories)
    return Counter({ev: total / n_categories for ev, total in totals.items()})


def get_evidence_prop_div_properties(model_name, cnt):
    """Build a property x evidence-type table plus a final ``'median'`` row.

    For every property returned by ``get_properties`` the annotation file
    is loaded and summarised with ``get_evidence_prop_div``.

    Args:
        model_name: model name used to locate the annotation files.
        cnt: ``'prop'`` for proportions, ``'div'`` for counts.

    Returns:
        pandas.DataFrame indexed by property name with the columns
        ``prop-specific, non-specific, p, l, n, i, r, b, u`` (always in this
        order). Cells are NaN where a property has no context of that type.
        The last row, labelled ``'median'``, holds the column medians.
    """
    table = []

    for prop in get_properties():
        evidence_type_dict = load_evidence_type_dict(prop, model_name)
        evidence_prop = get_evidence_prop_div(evidence_type_dict, cnt = cnt)
        evidence_prop['property'] = prop
        table.append(evidence_prop)

    columns = ['prop-specific', 'non-specific', 'p', 'l', 'n', 'i', 'r', 'b', 'u']
    # reindex instead of ``[columns]``: a type that no property exhibits
    # becomes an all-NaN column rather than raising KeyError.
    df = pd.DataFrame(table).set_index('property').reindex(columns=columns)
    df.loc['median'] = df.median(axis=0)

    return df


    
def raw_to_distance(df):
    """Convert raw values into signed distances from the ``'median'`` row.

    Args:
        df: DataFrame that contains a row labelled ``'median'``.

    Returns:
        DataFrame with the same columns. Every row other than ``'median'``
        holds ``value - median`` per column; the ``'median'`` row itself is
        carried over unchanged and placed last.
    """
    rows = df.to_dict('index')
    medians = rows['median']

    distances = {
        label: {column: value - medians[column] for column, value in row.items()}
        for label, row in rows.items()
        if label != 'median'
    }
    distances['median'] = medians

    # The outer keys are row labels, so transpose to get them on the index.
    return pd.DataFrame(distances).T


def get_evidence_prop_div_concepts(model_name, prop, cnt):
    """Build a concept x evidence-type table plus a final ``'median'`` row.

    One row is produced for every positive example concept of ``prop``,
    holding the concept's evidence-type values averaged over its categories.

    Args:
        model_name: model name, forwarded to the helper functions.
        prop: property name.
        cnt: ``'prop'`` for proportions, ``'div'`` for counts.

    Returns:
        pandas.DataFrame indexed by concept. Columns are those of
        ``prop-specific, non-specific, p, l, n, i, r, b, u`` that occur for
        at least one concept, in that order. The last row, labelled
        ``'median'``, holds the column medians.
    """
    table = []
    keys = set()
    concepts_pos = get_pos_examples(model_name, prop)

    for concept in concepts_pos:
        # Forward the requested mode; it used to be hard-coded to 'prop',
        # which silently ignored the ``cnt`` argument.
        ev_prop_concept = get_evidence_prop_div_concept(prop, concept, model_name, cnt=cnt)
        keys.update(ev_prop_concept.keys())
        ev_prop_concept['concept'] = concept
        table.append(ev_prop_concept)

    columns = ['prop-specific', 'non-specific', 'p', 'l', 'n', 'i', 'r', 'b', 'u']
    # Keep only the evidence types that were actually observed.
    columns = [c for c in columns if c in keys]
    df = pd.DataFrame(table).set_index('concept')
    df.loc['median'] = df.median(axis=0)
    df = df[columns]

    return df

In [158]:

# Per-property share of each evidence type for the chosen model; the table
# ends with a 'median' row added by get_evidence_prop_div_properties.
model_name = 'giga_full_updated'
df = get_evidence_prop_div_properties(model_name, cnt = 'prop')

df.round(4)

Unnamed: 0_level_0,prop-specific,non-specific,p,l,n,i,r,b,u
property,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
square,,0.0426,,,,0.0412,0.0013,,0.9574
warm,0.0043,0.0532,0.0019,0.0014,0.0009,0.0427,0.0104,,0.9426
black,0.0017,0.0479,0.0009,0.0009,,0.04,0.0078,,0.9504
red,0.0006,0.0538,0.0006,,,0.0504,0.0034,,0.9457
fly,0.0095,0.0441,0.0063,0.0021,0.0011,0.0084,0.0326,0.0032,0.9464
dangerous,0.0206,0.0987,0.0018,0.0126,0.0063,0.0395,0.0539,0.0054,0.8806
wings,0.0125,0.1429,0.0036,0.0089,,0.0482,0.0679,0.0268,0.8446
sweet,0.0571,0.5429,0.0571,,,0.5429,,,0.4
hot,0.0357,0.2619,0.0238,,0.0119,0.1786,0.0833,,0.7024
used_in_cooking,0.014,0.6311,0.007,0.0035,0.0035,0.3287,0.3024,,0.3549


In [160]:
# Re-express every property row as its difference from the 'median' row.
df_dist = raw_to_distance(df)
df_dist.round(4)

Unnamed: 0,prop-specific,non-specific,p,l,n,i,r,b,u
square,,-0.0333,,,,-0.0106,-0.0162,,0.0363
warm,-0.0013,-0.0227,-0.0003,-0.0014,-0.0031,-0.0091,-0.0071,,0.0214
black,-0.0038,-0.028,-0.0013,-0.0019,,-0.0118,-0.0097,,0.0292
red,-0.005,-0.0221,-0.0016,,,-0.0015,-0.0142,,0.0245
fly,0.0039,-0.0317,0.0041,-0.0007,-0.003,-0.0434,0.015,-0.0129,0.0253
dangerous,0.0151,0.0229,-0.0004,0.0098,0.0022,-0.0123,0.0363,-0.0107,-0.0405
wings,0.0069,0.067,0.0014,0.0061,,-0.0036,0.0503,0.0107,-0.0765
sweet,0.0516,0.467,0.0549,,,0.491,,,-0.5212
hot,0.0302,0.1861,0.0216,,0.0078,0.1267,0.0658,,-0.2188
used_in_cooking,0.0084,0.5553,0.0048,0.0007,-0.0006,0.2768,0.2849,,-0.5663


In [163]:
# Concept-level view: evidence-type proportions for the positive example
# concepts of a single property, shown as distance from the median.
# NOTE: this rebinds `df`, replacing the per-property table built in an
# earlier cell.

model_name = 'giga_full_updated'
prop = 'lay_eggs'

  
df = get_evidence_prop_div_concepts(model_name, prop, cnt = 'prop')
df =  raw_to_distance(df)

df.round(4) 

Unnamed: 0,prop-specific,non-specific,p,i,r,u
bass,-0.0021,-0.0124,-0.0021,-0.0074,0.0019,0.0126
crocodile,0.0006,0.0003,0.0006,0.0022,0.0049,-0.0028
crow,,-0.0539,,-0.014,-0.0331,0.0544
egret,,0.0062,,0.0337,-0.0206,-0.0058
smallmouth,,0.0918,,0.0339,0.0647,-0.0913
tortoise,,0.0047,,0.0002,0.0114,-0.0043
halibut,0.0032,0.018,0.0032,0.0098,0.015,-0.0231
platypus,0.0101,0.0226,0.0101,0.0467,-0.0173,-0.0346
stork,,-0.0414,,-0.002,-0.0325,0.0418
rattlesnake,,-0.0104,,-0.0015,-0.002,0.0109
