# Analyze based on semantic categories

1. Change the tf-idf computation so that only equivalent categories are compared (done).
2. Update the ranking accordingly.

In [1]:
import os
from collections import Counter, defaultdict
import csv
import pandas as pd

In [3]:
# Number of entries in the vocab directory of the original giga_full contexts ...
f_original = os.listdir('../contexts/giga_full/vocab')
print(len(f_original))
# ... and in the vocab directory of the updated giga_full contexts.
f_update = os.listdir('../contexts/giga_full_updated/vocab')
print(len(f_update))

1446
1636


In [2]:
# Number of entries in the vocab directory of the original wiki contexts ...
f_original = os.listdir('../contexts/wiki/vocab')
print(len(f_original))
# ... and in the vocab directory of the updated wiki contexts.
f_update = os.listdir('../contexts/wiki_updated/vocab')
print(len(f_update))

1669
1874


In [2]:

def get_categories(prop, model_name):
    """Return the set of category names that have tf-idf results for a property.

    The names are the directory entries of
    ``../results/<model_name>/tfidf-raw-10000/each_target_vs_corpus_per_category/<prop>``.
    """
    results_root = f'../results/{model_name}'
    analysis_type = 'tfidf-raw-10000/each_target_vs_corpus_per_category'
    return set(os.listdir(f'{results_root}/{analysis_type}/{prop}'))

def get_context_cnts(prop, cat, label, model_name):
    """Count how often each context has a positive diff for one label of a category.

    Every ``.csv`` file in
    ``../results/<model_name>/tfidf-raw-10000/each_target_vs_corpus_per_category/<prop>/<cat>/<label>``
    is read. In each row the context is taken from the column with the empty
    header, and it is counted once whenever the row's ``diff`` value is
    greater than 0.

    Returns a Counter mapping context -> number of rows with a positive diff.
    """
    analysis_type = 'tfidf-raw-10000/each_target_vs_corpus_per_category'
    label_dir = f'../results/{model_name}/{analysis_type}/{prop}/{cat}/{label}'

    positive_contexts = Counter()
    for file_name in os.listdir(label_dir):
        if not file_name.endswith('.csv'):
            continue
        with open(f'{label_dir}/{file_name}') as infile:
            rows = list(csv.DictReader(infile))
        for row in rows:
            context, diff = row[''], float(row['diff'])
            if diff > 0:
                positive_contexts[context] += 1
    return positive_contexts
    
def get_n_concepts_total(prop, cat, model_name):
    """Return how many ``.csv`` files the ``pos`` and ``neg`` folders of a category hold.

    The folders are looked up under
    ``../results/<model_name>/tfidf-raw-10000/each_target_vs_corpus_per_category/<prop>/<cat>``.

    Returns a tuple ``(number of pos files, number of neg files)``.
    """
    analysis_type = 'tfidf-raw-10000/each_target_vs_corpus_per_category'
    cat_dir = f'../results/{model_name}/{analysis_type}/{prop}/{cat}'

    def count_csv_files(label):
        entries = os.listdir(f'{cat_dir}/{label}')
        return sum(1 for name in entries if name.endswith('.csv'))

    n_pos_files = count_csv_files('pos')
    n_neg_files = count_csv_files('neg')
    return n_pos_files, n_neg_files


def get_f1_distinctiveness(n_pos, n_neg, total_pos, total_neg):
    """Score a context as if it were a classifier for the positive concepts.

    n_pos / n_neg: number of positive / negative concepts the context occurs with.
    total_pos / total_neg: number of positive / negative concepts overall.

    Returns a tuple ``(precision, recall, f1)``; each value is 0 when its
    denominator would be zero.
    """
    true_pos = n_pos
    false_pos = n_neg
    false_neg = total_pos - n_pos
    # True negatives do not enter precision, recall or F1; computed for completeness.
    true_neg = total_neg - n_neg

    n_predicted = true_pos + false_pos
    n_actual = true_pos + false_neg

    p = true_pos / n_predicted if n_predicted != 0 else 0
    r = true_pos / n_actual if n_actual != 0 else 0
    f1 = 2 * ((p * r) / (p + r)) if p + r != 0 else 0

    return p, r, f1

    
def aggregate_contexts(prop, cutoff, model_name):
    """Score all contexts of a property per category and write one CSV per category.

    For every category returned by ``get_categories`` the contexts counted for
    the ``pos`` label are scored with ``get_f1_distinctiveness`` against their
    counts for the ``neg`` label. The rows are written, sorted by descending
    f1, to
    ``../analysis/<model_name>/aggregated-tfidf-top20-raw-10000-categories/<prop>/<cat>.csv``
    with the columns context, p, r, f1, n_pos, total_pos, n_neg, total_neg.

    prop: property name (sub-directory of the results and analysis folders).
    cutoff: accepted for call compatibility; not used by this function.
    model_name: name of the model directory under ``../results`` and ``../analysis``.
    """
    aggregation_name = 'aggregated-tfidf-top20-raw-10000-categories'
    path_dir_agg = f'../analysis/{model_name}/{aggregation_name}/{prop}'
    os.makedirs(path_dir_agg, exist_ok=True)

    # Fixed header: a category without any positive context still produces a
    # valid header-only file instead of failing on an empty table.
    header = ['context', 'p', 'r', 'f1', 'n_pos', 'total_pos', 'n_neg', 'total_neg']

    for cat in get_categories(prop, model_name):
        context_cnts_pos = get_context_cnts(prop, cat, 'pos', model_name)
        context_cnts_neg = get_context_cnts(prop, cat, 'neg', model_name)
        total_pos, total_neg = get_n_concepts_total(prop, cat, model_name)

        # get distinctiveness
        table = []
        for c, cnt_pos in context_cnts_pos.most_common():
            cnt_neg = context_cnts_neg[c]
            p, r, f1 = get_f1_distinctiveness(cnt_pos, cnt_neg, total_pos, total_neg)
            table.append({
                'context': c,
                'p': p,
                'r': r,
                'f1': f1,
                'n_pos': cnt_pos,
                'total_pos': total_pos,
                'n_neg': cnt_neg,
                'total_neg': total_neg,
            })
        # Stable sort: contexts with equal f1 stay in descending n_pos order.
        table.sort(key=lambda row: row['f1'], reverse=True)

        # collect and write to file; newline='' as required by the csv module
        f = f'{path_dir_agg}/{cat}.csv'
        with open(f, 'w', newline='') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=header)
            writer.writeheader()
            writer.writerows(table)
        

                
def prepare_annotation(prop, model_name, cutoff=20):
    """Create the annotation sheet for a property from its aggregated category files.

    The first ``cutoff`` rows of every
    ``../analysis/<model_name>/aggregated-tfidf-top20-raw-10000-categories/<prop>/<cat>.csv``
    are collected, and each distinct context is written once to
    ``../analysis/<model_name>/annotation-tfidf-top20-raw-10000-categories/<prop>/annotation.csv``
    with the columns context, evidence_type and categories.

    prop: property name.
    model_name: name of the model directory under ``../analysis``.
    cutoff: number of leading rows taken from each category file.
    """
    annotation_name = 'annotation-tfidf-top20-raw-10000-categories'
    path_dir_annotation = f'../analysis/{model_name}/{annotation_name}/{prop}'
    os.makedirs(path_dir_annotation, exist_ok=True)
    f_annotation = f'{path_dir_annotation}/annotation.csv'

    # paths aggregated files:
    aggregation_name = 'aggregated-tfidf-top20-raw-10000-categories'
    path_dir_agg = f'../analysis/{model_name}/{aggregation_name}/{prop}'

    # collect all contexts and the categories in which they made the cutoff
    context_cats_dict = defaultdict(set)
    for cat in get_categories(prop, model_name):
        path = f'{path_dir_agg}/{cat}.csv'
        with open(path, newline='') as infile:
            top_rows = list(csv.DictReader(infile))[:cutoff]
        for row in top_rows:
            context_cats_dict[row['context']].add(cat)

    # csv.writer quotes contexts that contain commas or quotes, which a
    # hand-formatted line would not do.
    with open(f_annotation, 'w', newline='') as outfile:
        writer = csv.writer(outfile, lineterminator='\n')
        writer.writerow(['context', 'evidence_type', 'categories'])
        for c, context_cats in context_cats_dict.items():
            # evidence_type is a single-space placeholder; categories are
            # sorted so the file content is reproducible between runs.
            writer.writerow([c, ' ', ' '.join(sorted(context_cats))])

def get_properties():
    """Return the property names found in ``../data/aggregated/``.

    A property name is the part of a directory entry before its first dot.
    Empty names and names containing ``female-`` are left out.
    """
    candidates = (entry.split('.')[0] for entry in os.listdir('../data/aggregated/'))
    return [name for name in candidates if name != '' and 'female-' not in name]

def get_top_distinctive_contexts(properties, model_name):
    """Collect the first row of every property's aggregated ``all.csv``.

    For each property the file
    ``../analysis/<model_name>/aggregated-tfidf-top20-raw-10000-categories/<prop>/all.csv``
    is read and its first data row is taken as the top context. All values
    except ``context`` are converted to float.

    Returns a list with one dict per property: ``property`` plus the columns
    of that first row.
    """
    aggregation_name = 'aggregated-tfidf-top20-raw-10000-categories'
    top_rows = []
    for prop in properties:
        all_file = f'../analysis/{model_name}/{aggregation_name}/{prop}/all.csv'
        with open(all_file) as infile:
            rows = list(csv.DictReader(infile))
        # the file is expected to be ranked already, so the first row is the best one
        best = rows[0]
        entry = {'property': prop}
        entry.update(
            (key, value if key == 'context' else float(value))
            for key, value in best.items()
        )
        top_rows.append(entry)
    return top_rows

In [50]:
# Build the aggregated per-category files and the annotation sheet for every property.
model_name = 'giga_full_updated'
properties = get_properties()
cutoff = 20

for prop in properties:
    # aggregate first: prepare_annotation reads the files written by aggregate_contexts
    aggregate_contexts(prop, cutoff, model_name)
    prepare_annotation(prop, model_name, cutoff) 

In [51]:
# get top distinctive contexts per prop

model_name = 'giga_full_updated'
properties = get_properties()
table = get_top_distinctive_contexts(properties, model_name)
df = pd.DataFrame(table)
# last expression of the cell: displays the table sorted by f1, rounded to two decimals
df.sort_values('f1', ascending = False).round(2)

Unnamed: 0,property,context,p,r,f1,n_pos,total_pos,n_neg,total_neg
9,used_in_cooking,add,1.0,0.92,0.96,93.0,101.0,0.0,56.0
0,square,built,1.0,0.8,0.89,70.0,87.0,0.0,21.0
19,wheels,drove,0.98,0.73,0.84,51.0,70.0,1.0,25.0
7,sweet,sweet,0.98,0.71,0.83,65.0,91.0,1.0,63.0
17,cold,variety,1.0,0.71,0.83,48.0,68.0,0.0,24.0
5,dangerous,killed,0.98,0.69,0.81,45.0,65.0,1.0,51.0
10,juicy,pineapple,1.0,0.66,0.79,58.0,88.0,0.0,60.0
16,female,herself,0.9,0.71,0.79,77.0,109.0,9.0,144.0
11,green,green,0.97,0.66,0.79,59.0,89.0,2.0,67.0
1,warm,heavy,0.99,0.65,0.79,81.0,124.0,1.0,32.0


### Transfer old annotations to new files


In [70]:
# Copy the evidence labels of the old annotation round into the new candidate
# files: contexts that were annotated before keep their label, all others get 'NA'.
properties = get_properties()
model_name_current = 'giga_full_updated'
model_name_old = 'giga_full'


for prop in properties:
    # current file:
    # Built from model_name_current rather than the notebook-global
    # `model_name`, whose value depends on which cell was run last.
    annotation_name = 'annotation-tfidf-top20-raw-10000-categories'
    path_dir_annotation = f'../analysis/{model_name_current}/{annotation_name}/{prop}'
    f_annotation_new = f'{path_dir_annotation}/annotation.csv'
    f_annotation_tr = f'{path_dir_annotation}/annotation-transferred.csv'

    # old file:
    annotation_name = 'annotation-tfidf-top20-raw-10000'
    path_dir_annotation = f'../analysis/{model_name_old}/{annotation_name}/{prop}-pos'
    f_annotation_old = f'{path_dir_annotation}/annotation-done.csv'

    # load old annotations (context -> evidence label)
    with open(f_annotation_old, newline='') as infile:
        context_annotation_dict = {
            d['context']: d['evidence'] for d in csv.DictReader(infile)
        }

    # load new candidates; take the header from the reader so that a file
    # without data rows can still be written back
    with open(f_annotation_new, newline='') as infile:
        reader = csv.DictReader(infile)
        fieldnames = list(reader.fieldnames or ['context', 'evidence_type', 'categories'])
        data = list(reader)
    if 'evidence_type' not in fieldnames:
        fieldnames.append('evidence_type')

    # fill in old annotations
    for d in data:
        d['evidence_type'] = context_annotation_dict.get(d['context'], 'NA')

    # write to new file
    with open(f_annotation_tr, 'w', newline='') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

### Complete annotations

In [17]:
def get_annotation_status(model):
    """Group the property folders of a model by whether their annotation is finished.

    Looks at every directory in
    ``../analysis/<model>/annotation-tfidf-top20-raw-10000-categories``. A
    property counts as complete when its folder contains
    ``annotation-done.csv`` and as incomplete otherwise. Entries ending in
    ``.csv`` or ``.ipynb_checkpoints`` and entries that are not directories
    are ignored.

    Returns a defaultdict(set) with the property names under the keys
    ``'complete'`` and ``'incomplete'``; a key is only present when at least
    one property has that status.
    """
    dir_annotations = f'../analysis/{model}/annotation-tfidf-top20-raw-10000-categories'
    annotation_dict = defaultdict(set)

    for f in os.listdir(dir_annotations):
        if f.endswith(('.csv', '.ipynb_checkpoints')):
            continue
        full_path = f'{dir_annotations}/{f}'
        # Stray files (e.g. editor or OS metadata) cannot be listed; skip them.
        if not os.path.isdir(full_path):
            continue
        if 'annotation-done.csv' in os.listdir(full_path):
            annotation_dict['complete'].add(f)
        else:
            annotation_dict['incomplete'].add(f)
    return annotation_dict

def show_annotation_status(model_name):
    """Print the completed and the incomplete properties of a model, each sorted by name."""
    status = get_annotation_status(model_name)

    done = sorted(status['complete'])
    # only list a property as incomplete when it is not also marked complete
    pending = [p for p in sorted(status['incomplete']) if p not in status['complete']]

    print('completed:\n')
    for name in done:
        print(name)
    print()
    print('Incomplete:\n')
    for name in pending:
        print(name)
            

def get_evidence_distribution(model_name, prop):
    """Return the share of each evidence type among the annotated contexts of a property.

    Reads
    ``../analysis/<model_name>/annotation-tfidf-top20-raw-10000-categories/<prop>/annotation-done.csv``
    and counts the values of its ``evidence_type`` column. The extra key
    ``'all'`` counts every row whose evidence type is not ``'u'``.

    Returns a dict mapping evidence type -> count divided by the number of rows.
    """
    annotation_name = 'annotation-tfidf-top20-raw-10000-categories'
    f_annotation = f'../analysis/{model_name}/{annotation_name}/{prop}/annotation-done.csv'

    with open(f_annotation) as infile:
        rows = list(csv.DictReader(infile))

    ev_cnts = Counter()
    for row in rows:
        label = row['evidence_type']
        # every row counts towards its own label; non-'u' rows also towards 'all'
        ev_cnts.update([label] if label == 'u' else [label, 'all'])

    n_rows = len(rows)
    return {label: cnt / n_rows for label, cnt in ev_cnts.items()}

In [26]:
# Print which properties of the updated giga_full model have a finished annotation file.
model_name = 'giga_full_updated'
show_annotation_status(model_name)

completed:

black
blue
cold
dangerous
female
fly
green
hot
juicy

Incomplete:

lay_eggs
made_of_wood
red
roll
round
square
sweet
swim
used_in_cooking
warm
wheels
wings
yellow


In [27]:
# Share of each evidence type per property, shown as a table sorted by 'all'.
model_name = 'giga_full_updated'
properties = ['black', 'blue', 'dangerous', 'female', 'fly', 'green', 'hot', 'juicy']

table = []
for prop in properties:
    d = {'property': prop}
    d.update(get_evidence_distribution(model_name, prop))
    table.append(d)

cols = ['property', 'u', 'all', 'p', 'n', 'l', 'i', 'r', 'b']
df = pd.DataFrame(table).round(2)
df[cols].sort_values('all', ascending=False)

Unnamed: 0,property,u,all,p,n,l,i,r,b
7,juicy,0.73,0.27,0.02,,,0.2,0.05,
2,dangerous,0.74,0.26,0.01,,,0.09,0.16,0.01
6,hot,0.74,0.26,0.01,0.02,,0.1,0.13,
3,female,0.75,0.25,,,0.01,0.16,0.01,0.07
4,fly,0.82,0.18,0.04,0.01,,0.07,0.06,
5,green,0.84,0.16,0.0,,,0.14,0.02,
0,black,0.89,0.11,0.0,,,0.1,0.0,
1,blue,0.93,0.07,0.0,,,0.05,0.02,
