In [249]:
from read_write_files import read_json, save_json
from collections import Counter
import itertools
import pandas as pd

In [250]:
def create_sense_zip(alignments):
    attr = "Sense"
    attr_zip = []
    for alignment in alignments:
        line_attr = []
        line_attr += [alignment["gold"][attr][0]]
        
        for pars in alignment["parsers"]:
            if pars == None:
                line_attr += ["None"]
            else:
                line_attr += [pars[attr][0]]
        
        attr_zip += [tuple(line_attr)]
    return attr_zip,alignment["parser_names"]

In [251]:
def sense_statistics(sense_comparison,parser_names,not_mapped_rels):
    zip_sense_comparison = list(zip(*sense_comparison))
    gold_senses = zip_sense_comparison[0]
    #Count all combination of senses
    sense_counter = Counter(sense_comparison)
    #Different Senses
    set_senses = set(gold_senses)
    #How many parsers will be compared
    len_parser = len(zip_sense_comparison)-1
    sense_rows = []
    
    #not mappable predicted relations
    not_mapped_counter = Counter([(rel["Parser"],rel["Sense"][0]) for rel in not_mapped_rels])

    for sense in set_senses:
        tmp_senses = set_senses.copy()
        tmp_senses.remove(sense)
        possible_comb = [[sense]] + [tmp_senses for i in range(len_parser)]
        diff_preds = itertools.product(*possible_comb)

        equal_correct_parsing = sense_counter[tuple([sense]+[sense]*len_parser)]

        all_wrong_parsing = sum([
            sense_counter[tuple(diff_pred)] 
            for diff_pred in diff_preds])

        total_act_sense_count = equal_correct_parsing + sum([1 for i in gold_senses if i == sense])


        equal_wrong_parsing = sum([
            sense_counter[tuple([sense]+[other_sense]*len_parser)] 
            for other_sense in tmp_senses])
        
        parser_columns = []

        #Parser is better than all the other
        parser_better = []
        for index_parser in range(1,len_parser+1):
            tmp_possible_comb = possible_comb[:]
            column_name = parser_names[index_parser-1]
            
            tmp_possible_comb[index_parser] = [sense]
            other_diff_preds = itertools.product(*tmp_possible_comb)
            
            pars_better = sum([sense_counter[other_diff_pred] for other_diff_pred in other_diff_preds])
            total_pred_sense_count = Counter(zip_sense_comparison[index_parser])[sense]#sum([1 for i in zip_sense_comparison[index_parser] if i == sense])
            
            not_mapped = not_mapped_counter[(column_name,sense)]
            
            right_pred_comb = [[sense]] + [set_senses]*len_parser
            right_pred_comb[index_parser] = [sense]
            right_pred = sum([sense_counter[comb] for comb in itertools.product(*right_pred_comb)])
            
            parser_better += [
                pars_better,
                not_mapped,
                right_pred,
                total_pred_sense_count+not_mapped]
            
            
            parser_columns += [string.format(column_name) for string in [
                "{}: unique right",
                "{}: not mapped",
                "{}: right pred",
                "{}: total pred"]]
            
        #Two Parser share opinions
        for parser1,parser2 in itertools.combinations([i for i in range(1,len_parser+1)],2):
            tmp_possible_comb = possible_comb[:]
            tmp_possible_comb[parser1] = [sense]
            tmp_possible_comb[parser2] = [sense]
            
            
            pars_better = sum([sense_counter[comb] for comb in itertools.product(*tmp_possible_comb)])
            total_pred_sense_count = sum(
                [1 for pars1_pred,pars2_pred in zip(*[
                    zip_sense_comparison[parser1],
                    zip_sense_comparison[parser2]
                ])
                if (pars2_pred == sense) != (pars1_pred == sense)])
            
            parser_better += [pars_better,total_pred_sense_count]
            
            column_name = " and ".join([parser_names[parser1-1],parser_names[parser2-1]])
            parser_columns += [
                "{} right".format(column_name),
                "Total Pred ({})".format(column_name)]
        

        #At least one parser is correct
        correct_sense_poss = [[sense]]+[set_senses]*len_parser
        combination_correct = sum(
            [sense_counter[at_least_one_right] 
             for at_least_one_right in itertools.product(*correct_sense_poss) if sense in at_least_one_right[1:]])

        sense_rows += [[sense,equal_correct_parsing,all_wrong_parsing,total_act_sense_count,equal_wrong_parsing,combination_correct]+parser_better]

    columns = ["Sense",
             "Equal Correct",
             "All Wrong",
             "Total Act",
             "Equal Wrong",
             "At least one correct"]+parser_columns
    return pd.DataFrame(
        columns=columns,
        data=sense_rows),columns


In [252]:
alignment_path = "data/project_files/blind/total_alignment.json"
not_mappable_path = "data/project_files/blind/not_mappable.json"

total_alignments = read_json(alignment_path)
not_mappable = read_json(not_mappable_path)

In [253]:
sense_zip,sense_zip_names = create_sense_zip(total_alignments)
sense_counter = Counter(sense_zip)

In [254]:

sense_df,df_columns = sense_statistics(sense_zip,sense_zip_names,not_mappable)

In [255]:
print("Overall Statistics")
sense_df[df_columns[:6]]

Overall Statistics


Unnamed: 0,Sense,Equal Correct,All Wrong,Total Act,Equal Wrong,At least one correct
0,Expansion.Conjunction,115,20,435,13,183
1,Temporal.Asynchronous.Precedence,12,8,62,3,22
2,Temporal.Synchrony,6,8,58,3,17
3,Temporal.Asynchronous.Succession,19,9,77,3,28
4,Contingency.Cause.Result,6,18,58,3,16
5,EntRel,31,16,231,1,102
6,Comparison.Concession,2,62,109,34,7
7,Contingency.Condition,12,0,38,0,12
8,Comparison.Contrast,9,17,63,5,21
9,Expansion.Instantiation,1,24,45,5,5


In [256]:
print("Parser Individual Statistics")
sense_df[df_columns[6:6+len(sense_zip_names)*4]]

Parser Individual Statistics


Unnamed: 0,oslopots: unique right,oslopots: not mapped,oslopots: right pred,oslopots: total pred,nguyenlab: unique right,nguyenlab: not mapped,nguyenlab: right pred,nguyenlab: total pred,steven: unique right,steven: not mapped,steven: right pred,steven: total pred
0,8,4,146,343,4,1,145,277,23,3,155,464
1,0,2,21,35,0,2,20,28,1,2,14,45
2,1,2,13,54,2,3,15,51,0,0,9,36
3,0,5,27,45,0,2,26,41,1,1,21,49
4,0,0,8,20,6,0,15,50,1,0,8,31
5,21,0,88,329,6,0,66,222,4,1,50,208
6,2,0,5,6,1,0,4,14,1,0,3,14
7,0,3,12,24,0,3,12,21,0,2,12,40
8,0,1,17,91,2,0,21,105,0,3,11,135
9,0,0,1,6,3,0,4,10,1,0,2,5


In [257]:
print("Two Parser Aggrement")
sense_df[df_columns[-len(sense_zip_names)*2:]]

Two Parser Aggrement


Unnamed: 0,oslopots and nguyenlab right,Total Pred (oslopots and nguyenlab),oslopots and steven right,Total Pred (oslopots and steven),nguyenlab and steven right,Total Pred (nguyenlab and steven)
0,16,245,7,380,10,377
1,8,13,1,38,0,41
2,5,36,1,50,2,48
3,7,15,1,34,0,39
4,2,50,0,37,1,59
5,25,235,11,300,4,255
6,1,14,0,14,0,24
7,0,13,0,19,0,24
8,8,53,0,110,2,125
9,0,12,0,9,0,13


# Model Training

In [258]:
def get_sense_lists(relations):
    gold_senses = []
    parser_senses = []
    parser_names = relations[0]["parser_names"]
    
    for rel in relations:
        gold_senses += [rel["gold"]["Sense"][0]]
        
        parser_pred = []
        for parser in rel["parsers"]:
            if parser == None:
                parser_pred += ["None"]
            else:
                parser_pred += [parser["Sense"][0]]
        
        parser_senses += tuple([parser_pred])
    parser_senses_zip = list(zip(*parser_senses))
    
    return gold_senses,parser_senses_zip,parser_names
        
            
    

In [259]:
def calc_prob_measure(gold_total,pred_total,tp):
    prec = 0
    if pred_total != 0:
        prec = tp/pred_total
        
    rec = 0
    if gold_total != 0:
        rec = tp/gold_total
    
    f1 = 0
    if prec != 0 and rec != 0:
        f1 = ((prec*rec)/(prec+rec))
    
    return prec,rec,f1

In [None]:
#def calc_sense_category_probs(gold_senses,)

In [260]:
def create_one_parser_model(gold_senses,pred_senses):
    sense_model = dict()
    
    sense_set = set(gold_senses)
    sense_compare_zip = zip(*[gold_senses,pred_senses])
    sense_counter = Counter(sense_compare_zip)
    pred_counter = Counter(pred_senses)
    gold_counter = Counter(gold_senses)
    
    for sense in sense_set:
        gold_total = gold_counter[sense]
        pred_total = pred_counter[sense]
        
        prec,rec,f1 = calc_prob_measure(gold_total,pred_total,sense_counter[(sense,sense)])
        sense_model[sense] = {
            "prec":prec,
            "rec":rec,
            "f1":f1
        }
    
    #TODO
    #sense_model = calc_sense_category_probs(sense_model,sense_set)
    
    return sense_model
    
    

In [265]:
gold_senses,parser_senses,parser_names = get_sense_lists(total_alignments)

In [263]:
model = []
model_path = "data/project_files/blind/sense_model.json"
for parser_name,parser_pred in zip(*[parser_names,parser_senses]):
    tmp_model = create_one_parser_model(gold_senses,parser_pred)
    model += [{
        "parser":parser_name,
        "sense_pred": tmp_model
    }]
    
save_json(model,model_path)