In [26]:
from statistics import stdev
import numpy as np
import analyze_evidence
import os
import os
from collections import Counter, defaultdict
import csv
import pandas as pd
# Lift pandas' display limits (None = no limit) so that displayed DataFrames
# show full cell contents, every column and every row.
pd.set_option('display.max_colwidth', None) 
pd.set_option("display.max_columns", None)
pd.set_option('display.max_rows', None)


def get_properties():
    """Collect property names from the entries of '../data/aggregated/'.

    Each directory entry name is cut at its first '.'; entries whose
    resulting name is empty or contains 'female-' are left out.

    Returns:
        list of str: the remaining names, in the order `os.listdir` yields them.
    """
    stems = (entry.split('.')[0] for entry in os.listdir('../data/aggregated/'))
    return [stem for stem in stems if stem != '' and 'female-' not in stem]

In [4]:
from itertools import permutations

def load_relation_pairs(combination, order=True, path_dir='../data/relations'):
    """Load the pairs stored for a combination of relations.

    Files are looked up as ``<path_dir>/<rel1>-<rel2>-...txt``. Each non-blank
    line is split on ',' and its first two fields form one pair.

    Args:
        combination: iterable of relation names.
        order: if truthy, only the file matching the given order of
            `combination` is read; otherwise the files for every permutation
            of `combination` are read and their pairs merged.
        path_dir: directory containing the relation files.

    Returns:
        set of 2-tuples of str. Files that do not exist contribute nothing,
        so the result may be empty.

    Raises:
        IndexError: if a non-blank line contains no ','.
    """
    if order:
        orderings = [tuple(combination)]
    else:
        orderings = permutations(combination, len(combination))

    all_pairs = set()
    for ordering in orderings:
        path = f'{path_dir}/' + '-'.join(ordering) + '.txt'
        if not os.path.isfile(path):
            continue
        with open(path) as infile:
            for line in infile:
                line = line.strip()
                # An empty file or a stray blank line must not be parsed as a
                # pair (it used to raise IndexError).
                if not line:
                    continue
                fields = line.split(',')
                all_pairs.add((fields[0], fields[1]))
    return all_pairs


def load_relation_pairs_hyp(target_rel, path_dir='../data/relations'):
    """Load the pairs attributed to `target_rel` across combination files.

    Every file in `path_dir` is named after a '-'-joined combination of
    relations. A file is read when either

    * `target_rel` is the only relation in the combination that belongs to
      the evidence-related set below, or
    * `target_rel` is the first relation in the file name.

    NOTE(review): the second rule also fires when a *different* single
    evidence relation is present further down the name; the original comment
    ("if no relation is associated with evidence, take the top non-evidence
    relation") suggests a narrower intent -- confirm before tightening.

    Args:
        target_rel: name of the relation to collect pairs for.
        path_dir: directory containing the relation files.

    Returns:
        set of tuples of str, one per non-blank line (the line split on ',').
    """
    rels_evidence = {'typical_of_property', 'variability_limited',
                     'afforded_usual', 'affording_activity'}

    all_pairs = set()
    for filename in os.listdir(path_dir):
        path = os.path.join(path_dir, filename)
        if not os.path.isfile(path):
            # Sub-directories and the like cannot be opened as pair files.
            continue
        rels = filename.split('.')[0].split('-')
        rels_ev = [r for r in rels if r in rels_evidence]
        is_sole_evidence = rels_ev == [target_rel]
        is_top = rels[0] == target_rel
        if not (is_sole_evidence or is_top):
            continue
        with open(path) as infile:
            for line in infile:
                line = line.strip()
                # Skip blank lines so that empty files do not add a bogus
                # one-element ('',) tuple to the result.
                if line:
                    all_pairs.add(tuple(line.split(',')))
    return all_pairs
                


In [22]:
# Create a CSV template for sorting properties by type: one row per property
# with an empty `type` cell (presumably filled in by hand afterwards).
# An existing file is never overwritten.

properties = get_properties()

filepath = '../data/property_types.csv'

if not os.path.isfile(filepath):
    with open(filepath, 'w') as outfile:
        outfile.write('property,type\n')
        for prop in properties:
            # Exactly two fields per row, matching the header; a third
            # (trailing-comma) field breaks readers that unpack
            # `prop, prop_type` from each line.
            outfile.write(f'{prop},\n')
else:
    print('file exists')
    

file exists


## Count properties per relation

In [30]:
def get_rel_prop_overview(pair_mode):
    """Summarise, per relation, which properties and property types occur.

    For every relation that has at least one pair, one table row is built with
    * one column per property type holding the raw number of pairs whose
      property has that type, and
    * one column per property holding the fraction of the relation's pairs
      that involve that property.

    The full table is written to
    '../analysis/prop_set/properties_relations-<pair_mode>.csv'; the returned
    frame is restricted to the 'relation' column and the property-type
    columns (sorted by name so the layout is stable between runs).

    Args:
        pair_mode: 'strict' reads only the file of the isolated relation via
            `load_relation_pairs`; 'evidence' uses `load_relation_pairs_hyp`.

    Returns:
        pandas.DataFrame with columns ['relation', <property types...>].

    Raises:
        ValueError: if `pair_mode` is neither 'strict' nor 'evidence'.
        KeyError: if a loaded property is missing from
            '../data/property_types.csv'.
    """
    # Validate first: an unknown mode used to leave `pairs` unbound (or stale
    # from a previous loop iteration).
    if pair_mode not in ('strict', 'evidence'):
        raise ValueError(
            f"unknown pair_mode {pair_mode!r}; expected 'strict' or 'evidence'")

    path_dir = '../analysis/prop_set/'
    os.makedirs(path_dir, exist_ok=True)
    path = os.path.join(path_dir, f'properties_relations-{pair_mode}.csv')

    all_relations = [
                        #'pos', 'neg', 'all', 'some', 'few',
                         #'evidence', 'no_evidence_pos', 'no_evidence_neg',
                         'implied_category',
                         'typical_of_concept', 'typical_of_property',
                         'affording_activity', 'afforded_usual', 'afforded_unusual',
                         'variability_limited', 'variability_open',
                         'variability_limited_scalar', 'variability_open_scalar',
                         'rare', 'unusual', 'impossible', 'creative']

    # Property -> property type. csv.reader tolerates trailing commas and
    # blank lines, which a bare two-way `line.split(',')` unpack does not.
    path_prop_types = '../data/property_types.csv'
    prop_types_dict = dict()
    with open(path_prop_types, newline='') as infile:
        reader = csv.reader(infile)
        next(reader, None)  # skip the header row
        for row in reader:
            if not any(cell.strip() for cell in row):
                continue
            prop_type = row[1].strip() if len(row) > 1 else ''
            prop_types_dict[row[0].strip()] = prop_type

    all_props = list(prop_types_dict)
    all_prop_types = sorted(set(prop_types_dict.values()))

    table = []
    for rel in all_relations:
        if pair_mode == 'strict':
            pairs = load_relation_pairs((rel,), order=False)
        else:
            pairs = load_relation_pairs_hyp(rel)

        # The first field of each pair is the property; entries with fewer
        # than two fields (e.g. produced by blank lines) are ignored.
        props = [pair[0] for pair in pairs if len(pair) >= 2]
        if not props:
            # Relations without pairs get no row.
            continue

        unknown = sorted(set(props) - prop_types_dict.keys())
        if unknown:
            raise KeyError(
                f'properties missing from {path_prop_types}: {unknown}')

        n_pairs = len(props)
        prop_counts = Counter(props)
        type_counts = Counter(prop_types_dict[prop] for prop in props)

        rel_dict = {'relation': rel}
        for prop_type in all_prop_types:
            rel_dict[prop_type] = type_counts[prop_type]  # raw count, not normalised
        for prop in all_props:
            rel_dict[prop] = prop_counts[prop] / n_pairs
        table.append(rel_dict)

    df = pd.DataFrame(table)
    df.to_csv(path)
    cols = ['relation', *all_prop_types]
    # reindex (rather than df[cols]) also works when no relation had any pairs.
    return df.reindex(columns=cols)

In [31]:
# Overview for the 'strict' pair mode. `df.round(2)` returns a rounded copy
# that is shown as the cell output; `df` itself keeps the unrounded values.
pair_mode = 'strict'
df = get_rel_prop_overview(pair_mode)
df.round(2)

Unnamed: 0,relation,color,function-action,material,shape,taxonomic,gender,part,taste,encyclopedic,temperature
0,implied_category,0,3,0,3,3,0,5,0,1,1
1,typical_of_concept,0,1,1,0,0,0,1,1,0,0
2,affording_activity,0,0,0,0,0,0,2,2,0,0
3,afforded_usual,0,1,0,0,2,0,0,0,0,0
4,afforded_unusual,0,20,0,0,0,0,0,0,0,0
5,variability_limited,45,10,13,15,0,0,0,30,0,0
6,variability_open,57,0,0,18,0,0,0,2,0,0
7,variability_limited_scalar,0,0,0,0,0,0,0,0,1,0
8,variability_open_scalar,0,0,0,0,0,0,0,0,1,97
9,rare,9,3,0,0,0,0,0,5,0,2


In [23]:
# Overview for the 'evidence' pair mode; rebinds the globals `pair_mode` and `df`.
# NOTE(review): the saved output below stems from an earlier execution
# (count 23, before the function cell In [30]); it shows per-property columns
# and fractional type values that the current get_rel_prop_overview no longer
# returns -- re-run this cell to refresh it.
pair_mode = 'evidence'
df = get_rel_prop_overview(pair_mode)
df.round(2)

Unnamed: 0,relation,color,function-action,material,shape,taxonomic,gender,part,taste,encyclopedic,...,made_of_wood,blue,yellow,roll,female,cold,round,wheels,lay_eggs,swim
0,implied_category,0.05,0.3,0.02,0.11,0.13,0.0,0.17,0.09,0.06,...,0.02,0.0,0.01,0.07,0.0,0.0,0.09,0.09,0.13,0.09
1,typical_of_concept,0.22,0.2,0.07,0.05,0.01,0.0,0.12,0.1,0.09,...,0.07,0.02,0.05,0.0,0.0,0.04,0.02,0.04,0.01,0.05
2,typical_of_property,0.3,0.14,0.11,0.07,0.0,0.0,0.04,0.07,0.14,...,0.11,0.02,0.11,0.05,0.0,0.04,0.04,0.02,0.0,0.04
3,affording_activity,0.01,0.09,0.03,0.05,0.0,0.0,0.32,0.12,0.07,...,0.03,0.0,0.0,0.0,0.0,0.03,0.04,0.18,0.0,0.0
4,afforded_usual,0.0,0.6,0.0,0.0,0.4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.4,0.29
5,afforded_unusual,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.49,0.0,0.0,0.0,0.0,0.0,0.51
6,variability_limited,0.38,0.11,0.12,0.19,0.0,0.0,0.01,0.19,0.0,...,0.12,0.05,0.03,0.04,0.0,0.0,0.08,0.01,0.0,0.01
7,variability_open,0.61,0.0,0.02,0.33,0.0,0.0,0.01,0.04,0.0,...,0.02,0.16,0.03,0.0,0.0,0.0,0.2,0.01,0.0,0.0
8,variability_limited_scalar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,variability_open_scalar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,...,0.0,0.0,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.0
