# Analyze based on semantic categories

1.) change tfidf so we compare equivalent categories only - done
2.) update ranking accordingly

In [1]:
import os
from collections import Counter, defaultdict
import csv
import pandas as pd

from sklearn.metrics import precision_recall_fscore_support

In [3]:
f_original = os.listdir('../contexts/giga_full/vocab')
print(len(f_original))
f_update = os.listdir('../contexts/giga_full_updated/vocab')
print(len(f_update))

1446
1636


In [2]:
f_original = os.listdir('../contexts/wiki/vocab')
print(len(f_original))
f_update = os.listdir('../contexts/wiki_updated/vocab')
print(len(f_update))

1669
1874


In [2]:

def get_categories(prop, model_name):
    analysis_type = 'tfidf-raw-10000/each_target_vs_corpus_per_category'
    path_dir = f'../results/{model_name}/{analysis_type}'
    path_dir = f'{path_dir}/{prop}'
    categories = set()
    for d in os.listdir(path_dir):
        categories.add(d)
    return categories

def get_context_cnts(prop, cat, label, model_name):
    
    analysis_type = 'tfidf-raw-10000/each_target_vs_corpus_per_category'
    path_dir = f'../results/{model_name}/{analysis_type}'
    path_dir = f'{path_dir}/{prop}'
    path_label = f'{path_dir}/{cat}/{label}'
    
    context_cnt = Counter()
    for f in os.listdir(path_label):
        full_path = f'{path_label}/{f}'
        if full_path.endswith('.csv'):
            with open(full_path) as infile:
                data = list(csv.DictReader(infile))
            for d in data:
                context = d['']
                diff = float(d['diff'])
                if diff > 0:
                    context_cnt[context] += 1
    return context_cnt
    
def get_n_concepts_total(prop, cat, model_name):
    
    analysis_type = 'tfidf-raw-10000/each_target_vs_corpus_per_category'
    path_dir = f'../results/{model_name}/{analysis_type}'
    path_dir = f'{path_dir}/{prop}'
    label = 'pos'
    path_pos = f'{path_dir}/{cat}/{label}'
    label = 'neg'
    path_neg = f'{path_dir}/{cat}/{label}'
    
    files_pos = [f for f in os.listdir(path_pos) if f.endswith('.csv')]
    files_neg = [f for f in os.listdir(path_neg) if f.endswith('.csv')]
    
    return len(files_pos), len(files_neg)

def get_f1_distinctiveness(n_pos, n_neg, total_pos, total_neg):
    
   
    total_instances = total_pos + total_neg
    labels = []
    [labels.append('pos') for i in range(total_pos)]
    [labels.append('neg') for i in range(total_neg)]
    pred_labels_pos = []
    for i in range(total_pos):
        if i < n_pos:
            pred_labels_pos.append('pos')
        else:
            pred_labels_pos.append('neg')
#     print(n_pos, total_pos)
#     print(pred_labels_pos.count('pos'), pred_labels_pos.count('neg'))
    
    pred_labels_neg = []
    for i in range(total_neg):
        if i < n_neg:
            pred_labels_neg.append('pos')
        else:
            pred_labels_neg.append('neg')
#     print(n_neg, total_neg)
#     print(pred_labels_neg.count('pos'), pred_labels_neg.count('neg'))
    
    predictions = pred_labels_pos + pred_labels_neg
    
    
    #print(len(labels), len(predictions))
    #print(pos_predictions, neg_predictions)
    
    p, r, f1, supp = precision_recall_fscore_support(labels, predictions, average = 'weighted', 
                                                     zero_division=0)
    #average='weighted'
    
    return p, r, f1


    
def aggregate_contexts(prop, cutoff, model_name):
    aggregation_name = 'aggregated-tfidf-raw-10000-categories'
    path_dir_agg = f'../analysis/{model_name}/{aggregation_name}/{prop}'
    os.makedirs(path_dir_agg, exist_ok = True)
    
    context_cnts_all = Counter()
    context_cat_dict = defaultdict(set)

    cats = get_categories(prop, model_name)

    for cat in cats:
        context_cnts_pos = get_context_cnts(prop, cat, 'pos', model_name)
        context_cnts_neg = get_context_cnts(prop, cat, 'neg', model_name)
        total_pos, total_neg = get_n_concepts_total(prop, cat, model_name)
        
        context_f1_dict = Counter()
        context_score_dict = defaultdict(dict)
        
        # get distinctiveness
        for c, cnt_pos in context_cnts_pos.most_common():
            cnt_neg = context_cnts_neg[c]
            p, r, f1 = get_f1_distinctiveness(cnt_pos, cnt_neg, total_pos, total_neg)
            context_f1_dict[c] = f1
            context_score_dict[c] = {'p': p,'r':r, 'f1': f1}
        
        table = []
        for c, f1 in context_f1_dict.most_common():
            scores = context_score_dict[c]
            d = dict()
            d['context'] = c
            d.update(scores)
            d['n_pos'] = context_cnts_pos[c]
            d['total_pos'] = total_pos
            d['n_neg'] = context_cnts_neg[c]
            d['total_neg'] = total_neg
            table.append(d)
        
        # collect and write to file
        f = f'{path_dir_agg}/{cat}.csv'
        
        header = table[0].keys()
        with open(f, 'w') as outfile:
            writer = csv.DictWriter(outfile, fieldnames = header)
            writer.writeheader()
            for d in table:
                writer.writerow(d)
        
                
def prepare_annotation(prop, model_name, cutoff=3, cutoff_concepts = 5):
    
    annotation_name = f'annotation-tfidf-top_{cutoff}_{cutoff_concepts}-raw-10000-categories'
    path_dir_annotation = f'../analysis/{model_name}/{annotation_name}/{prop}'
    os.makedirs(path_dir_annotation, exist_ok = True)
    f_annotation = f'../analysis/{model_name}/{annotation_name}/{prop}/annotation-updated.csv'
    
    # paths aggregated files:
    aggregation_name = 'aggregated-tfidf-raw-10000-categories'
    path_dir_agg = f'../analysis/{model_name}/{aggregation_name}/{prop}'

    
    # get categories
    cats = get_categories(prop, model_name)
    
    # collect all contexts and categories 
    context_cats_dict = defaultdict(set)
    
    # load top 5 per category
    for cat in cats:
        path = f'{path_dir_agg}/{cat}.csv'
        with open(path) as infile:
            data = list(csv.DictReader(infile))
        # sort by f1
        f1_dict  = defaultdict(list)
        for d in data:
            f1 = d['f1']
            f1_dict[f1].append(d)
        scores = sorted(list(f1_dict.keys()), reverse=True)
        top_scores = scores[:cutoff]
        top_context_dicts = []
        for ts in top_scores:
            dicts = f1_dict[ts]
            for d in dicts:
                n_pos = int(d['n_pos'])
                if n_pos > cutoff_concepts:
                    top_context_dicts.append(d)
    
        contexts = [d['context'] for d in top_context_dicts]
        # record categories
        for c in contexts:
            context_cats_dict[c].add(cat)
    
    with open(f_annotation, 'w') as outfile:
        outfile.write('context,evidence_type,categories\n')
        for c, cats in context_cats_dict.items():
            outfile.write(f'{c}, ,{" ".join(cats)}\n')

def get_properties():
    properties = []
    for path in os.listdir('../data/aggregated/'):
        prop = path.split('.')[0]
        if 'female-' not in prop and prop != '':
            properties.append(prop)
    return properties

def get_top_distinctive_contexts(properties, model_name):
    aggregation_name = 'aggregated-tfidf-raw-10000-categories'
    table = []
    for prop in properties:
        path_dir_agg = f'../analysis/{model_name}/{aggregation_name}/{prop}'
        path = path = f'{path_dir_agg}/all.csv'
        # load file containing all concepts and simply load first one
        with open(path) as infile:
            data = list(csv.DictReader(infile))
        # top distinctive context
        d_prop = dict()
        d_prop['property'] = prop
        # sort data by f1
        f1_dict = defaultdict(list)
        for d in data:
            f1 = d['f1']
            f1_dict[f1].append(d)
        
        top_score = max(list(f1_dict.keys()))
        top_dicts = f1_dict[top_score]
        top_context_dict = top_dicts[0]
        top_contexts = ' '.join([d['context'] for d in top_dicts])
        

        for k, v in top_context_dict.items():
            if k != 'context':
                v = float(v)
                d_prop[k] = v
        d_prop['contexts'] = top_contexts
        table.append(d_prop)
    return table

In [20]:
model_name = 'giga_full_updated'
#properties = get_properties()
properties_test = ['dangerous']
properties = [p for p in properties if p not in properties_test]
cutoff = 3
cutoff_concepts = 5

for prop in properties:
    print(prop)
    aggregate_contexts(prop, cutoff, model_name)
    prepare_annotation(prop, model_name, cutoff, cutoff_concepts) 

square
warm
black
red
fly
wings
sweet
hot
used_in_cooking
juicy
green
made_of_wood
blue
yellow
roll
female
cold
round
wheels
lay_eggs
swim


In [31]:
# get top distinctive contexts per prop

model_name = 'giga_full_updated'
properties = get_properties()
table = get_top_distinctive_contexts(properties, model_name)
df = pd.DataFrame(table)
df.sort_values('f1', ascending = False).round(2)

Unnamed: 0,property,p,r,f1,n_pos,total_pos,n_neg,total_neg,contexts
9,used_in_cooking,0.96,0.95,0.95,93.0,101.0,0.0,56.0,add recipe
4,fly,0.88,0.87,0.87,29.0,44.0,2.0,90.0,flew
0,square,0.91,0.84,0.86,70.0,87.0,0.0,21.0,built
20,lay_eggs,0.86,0.84,0.83,20.0,33.0,1.0,57.0,eggs
16,female,0.85,0.84,0.83,77.0,109.0,9.0,144.0,herself
7,sweet,0.87,0.82,0.83,65.0,91.0,1.0,63.0,sweet
5,dangerous,0.86,0.82,0.82,45.0,65.0,1.0,51.0,killed
13,blue,0.87,0.83,0.81,31.0,59.0,0.0,106.0,magic
6,wings,0.85,0.82,0.81,36.0,60.0,1.0,77.0,bird
19,wheels,0.87,0.79,0.8,51.0,70.0,1.0,25.0,drove wheel


### Transfer old annotations to new files


In [4]:
properties = get_properties()
model_name_current = 'giga_full_updated'
model_name_old = 'giga_full'


for prop in properties:
    # current file:
    annotation_name = 'annotation-tfidf-top20-raw-10000-categories'
    path_dir_annotation = f'../analysis/{model_name}/{annotation_name}/{prop}'
    f_annotation_new = f'{path_dir_annotation}/annotation.csv'
    f_annotation_tr = f'{path_dir_annotation}/annotation-transferred.csv'

    # old file:
    annotation_name = 'annotation-tfidf-top20-raw-10000'
    path_dir_annotation = f'../analysis/{model_name_old}/{annotation_name}/{prop}-pos'
    f_annotation_old = f'{path_dir_annotation}/annotation-done.csv'

    # load old annotations
    context_annotation_dict=dict()
    with open(f_annotation_old) as infile:
        data = list(csv.DictReader(infile))
        for d in data:
            c = d['context']
            et = d['evidence']
            context_annotation_dict[c] = et
            #c = d['context']

    # load new candidates

    with open(f_annotation_new) as infile:
        data = list(csv.DictReader(infile))

    # fill in old annotations
    for d in data:
        c = d['context']
        if c in context_annotation_dict:
            et = context_annotation_dict[c]
        else:
            et = 'NA'
        d['evidence_type'] = et

    # write to new file

    with open(f_annotation_tr, 'w') as outfile:
        writer = csv.DictWriter(outfile, fieldnames = data[0].keys())
        writer.writeheader()
        for d in data:
            writer.writerow(d)

In [35]:
# transfer new annotations to updated f1 scores

properties = get_properties()
#properties = ['dangerous']
model_name_current = 'giga_full_updated'
model_name_old = 'giga_full'


for prop in properties:
    # current file:
    annotation_name = 'annotation-tfidf-top_3_5-raw-10000-categories'
    path_dir_annotation = f'../analysis/{model_name}/{annotation_name}/{prop}'
    f_annotation_new = f'{path_dir_annotation}/annotation-updated.csv'
    f_annotation_tr = f'{path_dir_annotation}/annotation-transferred-updated.csv'

    # old file:
    annotation_name = 'annotation-tfidf-top20-raw-10000-categories'
    path_dir_annotation = f'../analysis/{model_name}/{annotation_name}/{prop}'
    f_annotation_old = f'{path_dir_annotation}/annotation-done.csv'

    # load old annotations
    if os.path.isfile(f_annotation_old):
        print('found file')
        context_annotation_dict=dict()
        with open(f_annotation_old) as infile:
            data = list(csv.DictReader(infile))
            for d in data:
                c = d['context']
                et = d['evidence_type']
                context_annotation_dict[c] = et
                #c = d['context']

        # load new candidates

        with open(f_annotation_new) as infile:
            data = list(csv.DictReader(infile))

        # fill in old annotations
        for d in data:
            c = d['context']
            if c in context_annotation_dict:
                et = context_annotation_dict[c]
            else:
                et = 'NA'
            d['evidence_type'] = et

        # write to new file

        with open(f_annotation_tr, 'w') as outfile:
            writer = csv.DictWriter(outfile, fieldnames = data[0].keys())
            writer.writeheader()
            for d in data:
                writer.writerow(d)

found file
found file
found file
found file
found file
found file
found file
found file
found file
found file
found file
found file
found file
found file


### Complete annotations

In [4]:
from collections import defaultdict
import os
import csv

In [13]:
def get_annotation_status(model):
    #dir_annotations = f'../analysis/{model}/annotation-tfidf-top20-raw-10000-categories'
    dir_annotations = f'../analysis/{model}/annotation-tfidf-top_3_5-raw-10000-categories'
    annotation_dict = defaultdict(set)

    for f in os.listdir(dir_annotations):
        if  not f.endswith('.csv') and not f.endswith('.ipynb_checkpoints'):
            prop = f.split('/')[-1]
            full_path = f'{dir_annotations}/{f}'
            #print(full_path)
            # get categories:
            files = os.listdir(full_path)
            if 'annotation-updated-done.csv' in files:
                annotation_dict['complete'].add(prop)
            else:
                annotation_dict['incomplete'].add(prop)
    return annotation_dict

def show_annotation_status(model_name):
    annotation_dict = get_annotation_status(model_name)
    # same category not annotated:
    print('completed:\n')
    for prop in sorted(list(annotation_dict['complete'])):
        # cats open:
        print(prop)
    print()
    print('Incomplete:\n')
    for prop in sorted(annotation_dict['incomplete']):
        if prop not in annotation_dict['complete']:
            print(prop)
            
            
def get_evidence_dict(model_name, prop):
    
    annotation_name = 'annotation-tfidf-top_3_5-raw-10000-categories'
    path_dir_annotation = f'../analysis/{model_name}/{annotation_name}/{prop}'
    f_annotation = f'{path_dir_annotation}/annotation-transferred-done.csv'
    
    ev_dict = dict()
    
    with open(f_annotation) as infile:
        data = list(csv.DictReader(infile))
    for d in data:
        et = d['evidence_type']
        ev = d['context']
        ev_dict[ev] = et
    return ev_dict
        
            

def get_evidence_distribution(model_name, prop):
    
    # current file:
    annotation_name = 'annotation-tfidf-top_3_5-raw-10000-categories'
    path_dir_annotation = f'../analysis/{model_name}/{annotation_name}/{prop}'
    f_annotation = f'{path_dir_annotation}/annotation-updated-done.csv'
    
    ev_dict = get_evidence_dict(model_name, prop)
    
    ev_cnts = Counter()
    
    for e, et in ev_dict.items():
        ev_cnts[et] += 1
        if et != 'u':
            ev_cnts['all'] += 1
        if et in ['p', 'l', 'n']:
            ev_cnts['prop_specific'] += 1
    
    total_contexts = len(ev_dict)
    
    ev_counts_norm = dict()
    for ev, cnt in ev_cnts.items():
        ev_counts_norm[ev]  = cnt/total_contexts
    return ev_counts_norm

In [12]:
model_name = 'giga_full_updated'
show_annotation_status(model_name)

completed:

black

Incomplete:

blue
cold
dangerous
female
fly
green
hot
juicy
lay_eggs
made_of_wood
red
roll
round
square
sweet
swim
used_in_cooking
warm
wheels
wings
yellow


In [8]:
model_name = 'giga_full_updated'
properties = [
    'black'
             ]
cols = ['property', 'u', 'all', 'prop_specific', 'p', 'n', 'l', 'i', 'r', 'b']
table = []
for prop in properties:
    d = dict()
    d['property'] =  prop
    d.update(get_evidence_distribution(model_name, prop))
    for c in cols:
        if c not in d:
            d[c] = ''
    table.append(d)
   
#cols = ['property', 'u', 'all', 'prop_specific', 'p', 'n', 'l', 'i', 'r', 'b']
df = pd.DataFrame(table)
df[cols].sort_values('all', ascending = False).round(2)

Unnamed: 0,property,u,all,prop_specific,p,n,l,i,r,b
0,black,0.89,0.11,0.0,0.0,,,0.1,0.0,


In [214]:
import json
path = '/Users/piasommerauer/Data/winogrande/winogrande_1.1/dev.jsonl'
with open(path) as infile:
    for line in infile:
        d = json.loads(line)
        op1 = d['option1']
        op2 = d['option2']
        if all([ not op1.istitle, not op2.istitle]):
            print(d)
            break

In [204]:
%ls /Users/piasommerauer/Data/winogrande/winogrande_1.1/

README.md                     train_l.jsonl
dev-labels.lst                train_m-labels.lst
dev.jsonl                     train_m.jsonl
eval.py                       train_s-labels.lst
sample-submission-labels.lst  train_s.jsonl
test.jsonl                    train_xl-labels.lst
train_debiased-labels.lst     train_xl.jsonl
train_debiased.jsonl          train_xs-labels.lst
train_l-labels.lst            train_xs.jsonl


## Testing hypotheses

In [2]:
import json
import csv
import os
from collections import Counter, defaultdict
import pandas as pd
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [3]:
def load_prop_data(prop):
    
    path = f'../data/aggregated_semantic_info/{prop}.json'
    with open(path) as infile:
        concept_dict = json.load(infile)
    return concept_dict


def load_concept_evidence(concept, prop, model_name, categories):
    
    categories.add('all')
    contexts = set()
    dir_path = f'../results/{model_name}/tfidf-raw-10000/each_target_vs_corpus_per_category'
    
    for cat in categories:
        f_path = f'{dir_path}/{prop}/{cat}/pos/{concept}.csv'
        if os.path.isfile(f_path):
            with open(f_path) as infile:
                data = list(csv.DictReader(infile))
            for d in data:
                context = d['']
                diff = float(d['diff'])
                if diff > 0:
                    contexts.add(context)
    return contexts  

def get_categories(prop, model_name):
    analysis_type = 'tfidf-raw-10000/each_target_vs_corpus_per_category'
    path_dir = f'../results/{model_name}/{analysis_type}'
    path_dir = f'{path_dir}/{prop}'
    categories = set()
    for d in os.listdir(path_dir):
        categories.add(d)
    return categories


def get_top_ev_categories(prop, model_name):
    table = dict()
    aggregation_name = 'aggregated-tfidf-top20-raw-10000-categories'
    categories = get_categories(prop, model_name)
    
    path_dir_agg = f'../analysis/{model_name}/{aggregation_name}/{prop}'
    evidence_dict = get_evidence_dict(model_name, prop)
    
    et_context_dict = defaultdict(set)
    for c, et in evidence_dict.items():
        et_context_dict[et].add(c)
    
    # get top performance per evidence type for each category
    for cat in categories:
        path = path = f'{path_dir_agg}/{cat}.csv'
        # load file containing all concepts and simply load first one
        with open(path) as infile:
            data = list(csv.DictReader(infile))
        # sort by performance:
        perf_data = defaultdict(list)
        for d in data:
            f1 = d['f1']
            perf_data[f1].append(d)
        perf_ranked = sorted(list(perf_data.keys()), reverse = True)
        for et, contexts in et_context_dict.items():
            for f1 in perf_ranked:
                data = perf_data[f1]
                d_perf = dict()
                for k, v in d.items():
                    if k != 'context':
                        d_perf[k] = round(float(v), 2)
                contexts_ev = set()
                for d in data:
                    context = d['context']
                    if context in contexts:
                        contexts_ev.add(context)
                if contexts_ev:
                    d_perf['n_c'] = len(contexts_ev)
                    d_perf['contexts'] = ' '.join(contexts_ev)
                    table[(cat, et)] = d_perf
                    break
                
    return table

In [6]:
prop = 'lay_eggs'
model_name = 'giga_full_updated'
table = get_top_ev_categories(prop, model_name)

df = pd.DataFrame(table)
df.T

Unnamed: 0,Unnamed: 1,p,r,f1,n_pos,total_pos,n_neg,total_neg,n_c,contexts
communication,u,0.25,1.0,0.4,1,1,3,5,41,fall baked size populations stuffed piano spot...
communication,i,1.0,1.0,1.0,1,1,0,5,12,duck swordfish pike smallmouth shrimp salmon c...
communication,r,1.0,1.0,1.0,1,1,0,5,7,freshwater sea fishing atlantic fishermen angl...
communication,p,1.0,1.0,1.0,1,1,0,5,2,laying eggs
no-cat,u,0.14,0.17,0.15,1,6,6,6,2,habitat endangered
no-cat,i,1.0,1.0,1.0,6,6,0,6,1,animal
no-cat,r,1.0,1.0,1.0,6,6,0,6,2,nest river
no-cat,p,1.0,1.0,1.0,6,6,0,6,1,eggs
mammal,u,0.06,1.0,0.12,1,1,15,49,17,rare man mammal mammals billed genes sty retai...
mammal,i,1.0,1.0,1.0,1,1,0,49,2,duck animals


In [184]:
properties = ['black', 'blue', 'dangerous', 
              'fly', 'green', 
              'hot', 'juicy', 'lay_eggs',
              'made_of_wood']

model_name = 'giga_full_updated'


# get all pairs in which the only relation > 0.5 is implied
# get all pos pairs that do not have implied > 0.5
# get all pos pairs that do not have implied and have some relations leading to mentions
# compare prop mentions

relations_m = {'typical_of_property', 'variability_limited', 'affording_activity', 'afforded_usual'}
relations_nm = {'typical_of_concept', 'implied_category', 'variability_open', 'afforded_usual'}

target = 'implied_category'
target_pairs = set()
comparison_pairs = set()
comparison_pairs_clean = set()


properties = get_properties()
for prop in properties:
    concept_dict = load_prop_data(prop)
    for c, d in concept_dict.items():
        ml_label = d['ml_label']
        if ml_label in ['all', 'all-some', 'some', 'few-some']:
            relations = d['relations']
            relations_r = [r for r, v in relations.items() if v > 0.5]
            if len(relations_r) == 1 and target in relations_r:
                target_pairs.add((prop, c))
            elif target not in relations_r:
                comparison_pairs.add((prop, c))
            elif any([r in relations_m for r in relations_r]):
                comparison_pairs_clean.add((prop, c))
            
print(len(target_pairs), len(comparison_pairs), len(comparison_pairs_clean))          

16 906 989


In [185]:
target_pairs

{('dangerous', 'pentobarbital'),
 ('lay_eggs', 'crane'),
 ('lay_eggs', 'flounder'),
 ('lay_eggs', 'neritidae'),
 ('roll', 'bike'),
 ('round', 'cherry'),
 ('round', 'patty'),
 ('round', 'pepperoni'),
 ('swim', 'bay'),
 ('swim', 'cob'),
 ('warm', 'brogue'),
 ('wheels', 'saloon'),
 ('wheels', 'tank'),
 ('wheels', 'underframe'),
 ('wings', 'cricket'),
 ('wings', 'roach')}

In [186]:
# check how often direct mentions of property words are mentioned 
# in the context of the target pairs vs the rest