In [1]:
from read_write_files import read_json, save_json
from collections import Counter
import itertools
import pandas as pd
import numpy as np
import conll16st.scorer as scorer

In [299]:
def create_sense_zip(alignments):
    attr = "Sense"
    attr_zip = []
    for alignment in alignments:
        line_attr = []
        line_attr += [alignment["gold"][attr][0]]
        
        for pars in alignment["parsers"]:
            if pars == None:
                line_attr += ["None"]
            else:
                line_attr += [pars[attr][0]]
        
        attr_zip += [tuple(line_attr)]
    return attr_zip,alignment["parser_names"]

In [300]:
def sense_statistics(sense_comparison,parser_names,not_mapped_rels):
    zip_sense_comparison = list(zip(*sense_comparison))
    gold_senses = zip_sense_comparison[0]
    #Count all combination of senses
    sense_counter = Counter(sense_comparison)
    #Different Senses
    set_senses = set(gold_senses)
    #How many parsers will be compared
    len_parser = len(zip_sense_comparison)-1
    sense_rows = []
    
    #not mappable predicted relations
    not_mapped_counter = Counter([(rel["Parser"],rel["Sense"][0]) for rel in not_mapped_rels])

    for sense in set_senses:
        tmp_senses = set_senses.copy()
        tmp_senses.remove(sense)
        possible_comb = [[sense]] + [tmp_senses for i in range(len_parser)]
        diff_preds = itertools.product(*possible_comb)

        equal_correct_parsing = sense_counter[tuple([sense]+[sense]*len_parser)]

        all_wrong_parsing = sum([
            sense_counter[tuple(diff_pred)] 
            for diff_pred in diff_preds])

        total_act_sense_count = equal_correct_parsing + sum([1 for i in gold_senses if i == sense])


        equal_wrong_parsing = sum([
            sense_counter[tuple([sense]+[other_sense]*len_parser)] 
            for other_sense in tmp_senses])
        
        parser_columns = []

        #Parser is better than all the other
        parser_better = []
        for index_parser in range(1,len_parser+1):
            tmp_possible_comb = possible_comb[:]
            column_name = parser_names[index_parser-1]
            
            tmp_possible_comb[index_parser] = [sense]
            other_diff_preds = itertools.product(*tmp_possible_comb)
            
            pars_better = sum([sense_counter[other_diff_pred] for other_diff_pred in other_diff_preds])
            total_pred_sense_count = Counter(zip_sense_comparison[index_parser])[sense]#sum([1 for i in zip_sense_comparison[index_parser] if i == sense])
            
            not_mapped = not_mapped_counter[(column_name,sense)]
            
            right_pred_comb = [[sense]] + [set_senses]*len_parser
            right_pred_comb[index_parser] = [sense]
            right_pred = sum([sense_counter[comb] for comb in itertools.product(*right_pred_comb)])
            
            parser_better += [
                pars_better,
                not_mapped,
                right_pred,
                total_pred_sense_count+not_mapped]
            
            
            parser_columns += [string.format(column_name) for string in [
                "{}: unique right",
                "{}: not mapped",
                "{}: right pred",
                "{}: total pred"]]
            
        #Two Parser share opinions
        for parser1,parser2 in itertools.combinations([i for i in range(1,len_parser+1)],2):
            tmp_possible_comb = possible_comb[:]
            tmp_possible_comb[parser1] = [sense]
            tmp_possible_comb[parser2] = [sense]
            
            
            pars_better = sum([sense_counter[comb] for comb in itertools.product(*tmp_possible_comb)])
            total_pred_sense_count = sum(
                [1 for pars1_pred,pars2_pred in zip(*[
                    zip_sense_comparison[parser1],
                    zip_sense_comparison[parser2]
                ])
                if (pars2_pred == sense) != (pars1_pred == sense)])
            
            parser_better += [pars_better,total_pred_sense_count]
            
            column_name = " and ".join([parser_names[parser1-1],parser_names[parser2-1]])
            parser_columns += [
                "{} right".format(column_name),
                "Total Pred ({})".format(column_name)]
        

        #At least one parser is correct
        correct_sense_poss = [[sense]]+[set_senses]*len_parser
        combination_correct = sum(
            [sense_counter[at_least_one_right] 
             for at_least_one_right in itertools.product(*correct_sense_poss) if sense in at_least_one_right[1:]])

        sense_rows += [[sense,equal_correct_parsing,all_wrong_parsing,total_act_sense_count,equal_wrong_parsing,combination_correct]+parser_better]

    columns = ["Sense",
             "Equal Correct",
             "All Wrong",
             "Total Act",
             "Equal Wrong",
             "At least one correct"]+parser_columns
    return pd.DataFrame(
        columns=columns,
        data=sense_rows),columns


In [301]:
alignment_path = "data/project_files/test/total_alignment.json"
not_mappable_path = "data/project_files/test/not_mappable.json"

total_alignments = read_json(alignment_path)
not_mappable = read_json(not_mappable_path)

In [302]:
sense_zip,sense_zip_names = create_sense_zip(total_alignments)
sense_counter = Counter(sense_zip)

In [287]:
sense_df,df_columns = sense_statistics(sense_zip,sense_zip_names,not_mappable)

In [288]:
print("Overall Statistics")
sense_df[df_columns[:6]]

Overall Statistics


Unnamed: 0,Sense,Equal Correct,All Wrong,Total Act,Equal Wrong,At least one correct
0,Expansion.Conjunction,121,25,441,16,187
1,Temporal.Asynchronous.Precedence,15,5,65,1,26
2,Temporal.Synchrony,10,4,62,1,25
3,Temporal.Asynchronous.Succession,18,10,76,4,24
4,Contingency.Cause.Result,7,22,59,5,15
5,EntRel,23,13,223,4,109
6,Comparison.Concession,1,61,108,35,6
7,Contingency.Condition,10,0,36,0,10
8,Comparison.Contrast,10,16,64,6,20
9,Expansion.Instantiation,2,17,46,2,10


In [289]:
print("Parser Individual Statistics")
sense_df[df_columns[6:6+len(sense_zip_names)*4]]

Parser Individual Statistics


Unnamed: 0,oslopots: unique right,oslopots: not mapped,oslopots: right pred,oslopots: total pred,ecnucs: unique right,ecnucs: not mapped,ecnucs: right pred,ecnucs: total pred,steven: unique right,steven: not mapped,steven: right pred,steven: total pred
0,9,0,148,291,5,1,152,323,22,3,159,442
1,0,2,23,33,2,2,25,63,1,2,16,42
2,0,1,22,51,1,0,24,68,1,3,12,37
3,0,3,21,37,1,1,23,50,1,1,20,45
4,1,0,10,18,3,1,11,38,2,0,10,31
5,20,0,88,302,9,0,69,262,6,1,49,209
6,2,0,4,6,0,1,2,6,2,1,3,15
7,0,4,10,19,0,1,10,29,0,3,10,32
8,0,1,14,78,2,2,17,125,3,1,14,114
9,1,0,3,8,6,0,9,20,0,0,3,5


In [290]:
print("Two Parser Aggrement")
sense_df[df_columns[-len(sense_zip_names)*2:]]

Two Parser Aggrement


Unnamed: 0,oslopots and ecnucs right,Total Pred (oslopots and ecnucs),oslopots and steven right,Total Pred (oslopots and steven),ecnucs and steven right,Total Pred (ecnucs and steven)
0,14,229,4,378,12,363
1,8,32,0,39,0,61
2,12,30,0,52,1,62
3,3,19,0,36,1,41
4,1,33,1,29,0,52
5,31,268,14,300,6,262
6,1,7,0,16,0,15
7,0,13,0,20,0,19
8,4,62,0,106,1,132
9,0,20,0,9,1,19


# Model Training

In [303]:
def get_sense_lists(relations):
    gold_senses = []
    parser_senses = []
    parser_names = relations[0]["parser_names"]
    
    for rel in relations:
        gold_senses += [rel["gold"]["Sense"][0]]
        
        parser_pred = []
        for parser in rel["parsers"]:
            if parser == None:
                parser_pred += ["None"]
            else:
                parser_pred += [parser["Sense"][0]]
        
        parser_senses += tuple([parser_pred])
    parser_senses_zip = list(zip(*parser_senses))
    
    return gold_senses,parser_senses_zip,parser_names
        
            
    

In [304]:
def calc_prob_measure(gold_total,pred_total,tp):
    prec = 0
    if pred_total != 0:
        prec = tp/pred_total
        
    rec = 0
    if gold_total != 0:
        rec = tp/gold_total
    
    f1 = 0
    if prec != 0 and rec != 0:
        f1 = ((prec*rec)/(prec+rec))
    
    return prec,rec,f1

In [349]:
def calc_sense_category_probs(sense_model,senses):
    
    sense_split = [(sense.split("."),sense_model[sense]) for sense in senses]
    
    new_senses = {}
    ready = True
    while ready:
        for index,(splitted_sense,f1) in enumerate(sense_split):
            ready = False
            if len(splitted_sense) > 1:
                ready = True
                new_sense = ".".join(splitted_sense[:-1])
                if new_sense in new_senses:
                    new_senses[new_sense] += [sense_model[".".join(splitted_sense)]]
                else:
                    new_senses[new_sense] = [sense_model[".".join(splitted_sense)]]
                sense_split[index][0] = 
        
    return sense_model

In [353]:
def create_one_parser_model(gold_senses,pred_senses):
    sense_model = dict()
    
    sense_set = set(gold_senses)
    sense_compare_zip = zip(*[gold_senses,pred_senses])
    sense_counter = Counter(sense_compare_zip)
    pred_counter = Counter(pred_senses)
    gold_counter = Counter(gold_senses)
    
    for sense in sense_set:
        gold_total = gold_counter[sense]
        pred_total = pred_counter[sense]
        
        prec,rec,f1 = calc_prob_measure(gold_total,pred_total,sense_counter[(sense,sense)])
        sense_model[sense] = {
            "prec":prec,
            "rec":rec,
            "f1":f1
        }
    
    #TODO
    #sense_model = calc_sense_category_probs(sense_model,sense_set)
    
    return sense_model
    
    

In [None]:
def weight_parser_by_accuracy(gold,parsers):
    
    parser_f1_scores = []
    
    for parser in parsers:
        parser_f1_scores += [(parser["Parser"],scorer.evaluate(gold,parser)[7])]
        
    parser_f1_sum = sum(zip(parser_f1_scores[1]))
    parser_weights = {name:f1_score/parser_f1_sum for name,f1_score in parser_f1_scores}

In [354]:
gold_senses,parser_senses,parser_names = get_sense_lists(total_alignments)

In [355]:
model = []
model_path = "data/project_files/test/sense_model.json"


for parser_name,parser_pred in zip(*[parser_names,parser_senses]):
    tmp_model = create_one_parser_model(gold_senses,parser_pred)
    model += [{
        "parser":parser_name,
        "sense_pred": tmp_model
    }]
    
save_json(model,model_path)

In [358]:
rows = []

for sense in set(gold_senses):
    row = [sense]
    for parser in model:
        parser_senses = parser["sense_pred"]
        if sense in parser_senses:
            row += [parser_senses[sense]["f1"]]
        else:
            row += [0]
    rows += [tuple(row)]


## Sense Model Statistics

In [359]:
pd.DataFrame(data=rows, columns=["Sense"]+parser_names)

Unnamed: 0,Sense,steven,oslopots,ecnucs,tao0920,goethe,nguyenlab,clac,PurdueNLP,gw0,ykido,gtnlp
0,Expansion.Conjunction,0.217604,0.337165,0.32766,0.324775,0.323904,0.320285,0.291399,0.305147,0.116766,0.325914,0.324074
1,Temporal.Asynchronous.Precedence,0.252427,0.425,0.364706,0.382716,0.341772,0.419753,0.381579,0.368421,0.0,0.378378,0.406977
2,Expansion.Alternative,0.25,0.4,0.333333,0.384615,0.444444,0.454545,0.454545,0.4,0.0,0.333333,0.454545
3,Temporal.Synchrony,0.170068,0.30625,0.310976,0.3,0.308176,0.316384,0.29878,0.319527,0.072125,0.319277,0.3
4,Temporal.Asynchronous.Succession,0.225225,0.369369,0.320388,0.369369,0.342593,0.353448,0.28866,0.311321,0.0,0.30303,0.30303
5,Contingency.Cause.Result,0.15566,0.222222,0.180488,0.19457,0.225806,0.206422,0.180328,0.171717,0.0,0.224138,0.213592
6,EntRel,0.194286,0.235686,0.25974,0.269978,0.234568,0.245763,0.200765,0.261053,0.0,0.173267,0.254576
7,Comparison.Concession,0.074627,0.2,0.189655,0.225806,0.153846,0.157143,0.14,0.213115,0.0,0.2,0.102564
8,Contingency.Condition,0.293651,0.4375,0.401869,0.411215,0.380952,0.412844,0.37,0.383178,0.0,0.427273,0.436975
9,Comparison.Contrast,0.256011,0.366469,0.33662,0.346705,0.324963,0.357955,0.329231,0.33752,0.007264,0.346812,0.352638
