# Sense Model

This notebook gives a overall statistical point of view on the parser predictions. Thereby, we analysis some statistical measures like overall agreements, wrong predictions and how often two parsers agree with each other. Thereby, we focus on the three best parsers due to the relevance of the shown measures (e.g. a overall agreement is really unlikely for more than 10 parsers compared to 3 parsers) and computational reasons.

In [2]:
from read_write_files import read_json, save_json, get_parser_paths
from collections import Counter
import itertools
import pandas as pd
import numpy as np
import conll16st.scorer as scorer

## Sense Statistics

In [3]:
def create_sense_zip(alignments):
    # This function extracts the senses for the gold standard and all parsers 
    
    #param alignments      list of the mapped predictions to the gold standard
    
    #return list of gold and predicted senses and parser names
    attr = "Sense"
    attr_zip = []
    for alignment in alignments:
        line_attr = []
        line_attr += [alignment["gold"][attr][0]]
        
        for pars in alignment["parsers"]:
            if pars == None:
                line_attr += ["None"]
            else:
                line_attr += [pars[attr][0]]
        
        attr_zip += [tuple(line_attr)]
    return attr_zip,alignment["parser_names"]

In [19]:
def sense_statistics2(sense_comparison,parser_names,not_mapped_rels):
    # This function calculates some basic statistical measures for the giving parser predictions
    
    #param sense_comparison     output of the previous function
    #param parser_names     names of the parsers
    #param not_mapped_rels    all predicted relations that don't have a corresponding gold relation
    
    #return Dataframe of the statistical measures
    
    #[(gold1,pred11,pred12,pred13),(gold2,pred21,pred22,pred23),(...)]
    # --> [(gold1,gold2),(pred11,pred21),(pred12,pred22),(pred13,pred23),(...)]
    zip_sense_comparison = list(zip(*sense_comparison))
    gold_senses = zip_sense_comparison[0]

    #Count all combination of senses
    sense_counter = Counter(sense_comparison)
    #Different Senses
    set_senses = set(gold_senses)
    #How many parsers will be compared
    len_parser = len(zip_sense_comparison)-1
    sense_rows = []
    
    #not mappable predicted relations
    not_mapped_counter = Counter([(rel["Parser"],rel["Sense"][0]) for rel in not_mapped_rels])
    
    #All combination of parser outputs e.g. (parser1,parser2),(parser2,parser3),(parser1,parser3)
    parser_comb = list(itertools.combinations([i for i in range(1,len_parser+1)],2))
    
    for sense in set_senses:
        
        #initial measures
        equal_correct_parsing = 0
        all_wrong_parsing = 0
        total_act_sense_count = 0
        equal_wrong_parsing = 0
        
        individ_parser_stats = {
            parser_index:[
                0,
                0,
                0,
                0
            ] for parser_index in range(1,len_parser+1)
        }
        double_parser_stats = {
            parsers:[0,0]
            for parsers in parser_comb
        }

        combination_correct = 0
        
        #loop over the relations (each row: gold sense and all predicted senses)
        for sense_comb in sense_comparison:
            if sense_comb[0] == sense:
                total_act_sense_count += 1
                
                #all correct
                if tuple([sense]+[sense]*len_parser) == sense_comb:
                    equal_correct_parsing += 1
                    
                #no one correct
                if not sense in sense_comb[1:]:
                    all_wrong_parsing += 1
                    
                    #all same wrong sense
                    if len(set(list(sense_comb[1:]))) == 1:
                        equal_wrong_parsing += 1
                        
                #at least one correct
                else:
                    combination_correct += 1
                    
                #parser individual measures
                for parser_index in range(1,len_parser+1):
                    parser_pred = sense_comb[parser_index] 
                    other_preds = [pred for ind,pred in enumerate(sense_comb[1:]) if ind+1 != parser_index]
                    
                    #parser correct
                    if parser_pred == sense:
                        individ_parser_stats[parser_index][2] += 1
                        
                        #only one that is correct
                        if sense not in other_preds:
                            individ_parser_stats[parser_index][0] += 1
                    
                    #total predictions
                    individ_parser_stats[parser_index][3] = Counter(zip_sense_comparison[parser_index])[sense]
                
                #two parser predictions
                for p_comb in parser_comb:
                    
                    #both parser equal correct sense
                    if sense_comb[p_comb[0]] == sense and sense_comb[p_comb[1]] == sense:
                        double_parser_stats[p_comb][0] += 1
                    
                    #at least one correct
                    if sense_comb[p_comb[0]] == sense or sense_comb[p_comb[1]] == sense:
                        double_parser_stats[p_comb][1] += 1
        
        #predicted relations without mapped gold relation
        for parser_index in range(1,len_parser+1):                
            individ_parser_stats[parser_index][1] = not_mapped_counter[
                (parser_names[parser_index-1],sense)]
        
        #collect all results for each sense
        sense_row = [[sense,
             equal_correct_parsing,
             all_wrong_parsing,
             total_act_sense_count,
             equal_wrong_parsing,
             combination_correct]]+[
                individ_parser_stats[parser_index]
                for parser_index in range(1,len_parser+1)
            ]+[
                double_parser_stats[parsers]
                for parsers in parser_comb
            ]
        
        sense_row = [measure for part in sense_row for measure in part]
        sense_rows += [sense_row]

    #Column Names
    columns = ["Sense",
         "Equal Correct",
         "All Wrong",
         "Total Act",
         "Equal Wrong",
         "At least one correct"
    ]+[
        string.format(parser_name) for parser_name in parser_names 
        for string in [
            "{}: unique right",
            "{}: not mapped",
            "{}: right pred",
            "{}: total pred"]
        
    ]+[
        string.format(parser_names[ind1-1],parser_names[ind2-1]) for ind1,ind2 in parser_comb
        for string in ["{}/{}: both right","{}/{}: total pred"]
        
    ]


    return pd.DataFrame(
        columns=columns,
        data=sense_rows),columns


### Create Statistics

In [20]:
#overall statistics for all parsers
alignment_path = "data/project_files/test/total_alignment.json"
not_mappable_path = "data/project_files/test/not_mappable.json"

total_alignments = read_json(alignment_path)
not_mappable = read_json(not_mappable_path)

sense_zip,sense_zip_names = create_sense_zip(total_alignments)
sense_counter = Counter(sense_zip)

In [21]:
##statistics for the three best parsers
best3_alignment_path = "data/project_files/test/3best_alignment.json"
best3_not_mappable_path = "data/project_files/test/3best_not_mappable.json"

best3_alignments = read_json(best3_alignment_path)
best3_not_mappable = read_json(best3_not_mappable_path)

best3_sense_zip,best3_sense_zip_names = create_sense_zip(best3_alignments)
best3_sense_counter = Counter(best3_sense_zip)

In [22]:
#Create Statistics
sense_df,df_columns = sense_statistics2(best3_sense_zip,best3_sense_zip_names,best3_not_mappable)

In [23]:
print("Overall Statistics")
sense_df[df_columns[:6]]

Overall Statistics


Unnamed: 0,Sense,Equal Correct,All Wrong,Total Act,Equal Wrong,At least one correct
0,Comparison,0,1,1,0,0
1,Expansion.Conjunction,103,64,391,13,327
2,Expansion.Instantiation,14,35,91,4,56
3,Temporal.Asynchronous.Precedence,19,7,45,0,38
4,Comparison.Concession,3,19,33,4,14
5,Contingency.Cause.Result,15,75,135,5,60
6,Contingency.Cause.Reason,27,75,195,10,120
7,Expansion,0,3,3,0,0
8,Expansion.Alternative,0,0,5,0,5
9,Temporal.Asynchronous.Succession,15,24,69,14,45


This part shows the overall agree- or disagreement of all three best parsers. As you can see, there are always predictions where at least one parser is correct, while the overall agreement (same correct prediction) is pretty low. There are event cases were all of the parsers predict the same wrong sense. In future project a closer look on these relations could find out features that lead to a wrong prediction.

In [24]:
print("Parser Individual Statistics")
sense_df[df_columns[6:6+len(best3_sense_zip_names)*4]]

Parser Individual Statistics


Unnamed: 0,steven: unique right,steven: not mapped,steven: right pred,steven: total pred,oslopots: unique right,oslopots: not mapped,oslopots: right pred,oslopots: total pred,ecnucs: unique right,ecnucs: not mapped,ecnucs: right pred,ecnucs: total pred
0,0,0,0,0,0,0,0,0,0,0,0,0
1,20,1,178,427,41,1,264,392,23,3,231,314
2,1,0,17,36,5,0,33,42,21,0,49,66
3,1,0,26,58,2,2,34,35,1,0,31,40
4,1,0,5,34,1,0,12,27,1,0,11,25
5,5,1,33,77,5,0,42,54,13,0,37,70
6,4,6,42,99,25,1,92,165,22,2,82,152
7,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,2,3,1,0,4,5,0,0,3,4
9,2,2,25,42,3,3,41,42,1,1,33,34


This table shows the individual parser statistics. Comparing the unique right predictions you explore that the steven parser doesn't predict so often relations, that weren't also predicted by the others. On the other hand, oslopots and ecnuncs have a lot of these unique right predictions. Due to the fact, that we only had the sense prediction part of the challenge we have to exclude the "not mapped" relations, because each relation span is the same.

If you look on the right predictions for each sense, it seems that oslopots and ecnucs have a higher accuracy in predicting senses compared to their overall predictions.

In [91]:
print("Two Parser Agreement")
sense_df[df_columns[-len(best3_sense_zip_names)*2:]]

Two Parser Agreement


Unnamed: 0,steven/oslopots: both right,steven/oslopots: total pred,steven/ecnucs: both right,steven/ecnucs: total pred,oslopots/ecnucs: both right,oslopots/ecnucs: total pred
0,0,0,0,0,0,0
1,138,304,123,286,188,307
2,15,35,15,51,27,55
3,23,37,21,36,28,37
4,4,13,3,13,10,13
5,28,47,15,55,24,55
6,36,98,29,95,58,116
7,0,0,0,0,0,0
8,1,5,1,4,2,5
9,22,44,16,42,31,43


While the combination with the steven parser only have one third right predictions compared to the overall predictions, the combination of oslopots and ecnucs gets around one half of them. Considering this fact, we have to say that oslopots and ecnucs have a marginally difference between their predictions.

# Model Training

We calculate the precision, recall and f1-measure for all parsers to get to know the "reliability" for each parser for a specific sense. "Reliability" is the measure of how accurate a parser can predict a giving sense. Moreover, we add a weighting for each parser depending on its overall prediction skills (normalized by all parsers).

In [136]:
def get_sense_lists(relations):
    # This function extracts the senses for the gold standard and all parsers 
    
    #param relations    list of all relations of the alignments
    
    #return gold senses and parser senses seperated and parser names
    gold_senses = []
    parser_senses = []
    parser_names = relations[0]["parser_names"]
    
    for rel in relations:
        gold_senses += [rel["gold"]["Sense"][0]]
        
        parser_pred = []
        for parser in rel["parsers"]:
            if parser == None:
                parser_pred += ["None"]
            else:
                parser_pred += [parser["Sense"][0]]
        
        parser_senses += tuple([parser_pred])
    parser_senses_zip = list(zip(*parser_senses))
    
    return gold_senses,parser_senses_zip,parser_names
        
            
    

In [195]:
def calc_prob_measure(gold_total,pred_total,tp):
    #Calculate precision, recall and f1 for the giving measures
    gold_total = float(gold_total)
    pred_total = float(pred_total)
    tp = float(tp)
    
    prec = 0
    if pred_total != 0:
        prec = (tp/pred_total)
           
    rec = 0
    if gold_total != 0:
        rec = tp/gold_total
    
    f1 = 0
    if prec != 0 and rec != 0:
        f1 = ((prec*rec)/(prec+rec))
    
    return prec,rec,f1

In [196]:
def create_one_parser_model(gold_senses,pred_senses):
    # Creates a model dictionary for the voting systems
    
    #param gold_senses     list of actual senses
    #param pred_senses     list of corresponding senses of a parser
    
    #return dictionary with precision, recall and f1 for all senses for one parser
    
    
    sense_model = dict()
    
    sense_set = set(gold_senses)
    sense_compare_zip = list(zip(*[gold_senses,pred_senses]))
    sense_counter = Counter(sense_compare_zip)
    pred_counter = Counter(pred_senses)
    gold_counter = Counter(gold_senses)
    
    for sense in sense_set:
        gold_total = gold_counter[sense]
        pred_total = pred_counter[sense]
        
        prec,rec,f1 = calc_prob_measure(gold_total,pred_total,sense_counter[(sense,sense)])
        sense_model[sense] = {
            "prec":prec,
            "rec":rec,
            "f1":f1
        }
        
    
    return sense_model
    
    

In [197]:
def weight_parser_by_accuracy(sense_model,gold,parsers):
    # Weights each parser by its overall F1 prediction score
    
    #param sense_model      list of dictionaries of precision, recall, f1 for all senses for all parsers
    
    #return sense_model with added weighting parameter
    
    parser_f1_scores = []
    
    for name,relations in parsers:
        parser_f1_scores += [(name,scorer.evaluate_sense(gold,relations).compute_micro_average_f1()[0])]
    parser_f1_sum = sum(zip(*parser_f1_scores)[1])
    parser_weights = {name:f1_score/parser_f1_sum for name,f1_score in parser_f1_scores}
    
    for tmp_model in sense_model:
        tmp_model["weight"] = parser_weights[tmp_model["parser"]]
        
    return sense_model

In [198]:
#Sense Lists
gold_senses,parser_senses,parser_names = get_sense_lists(total_alignments)

In [199]:
#Create Sense Model for all parsers

model = []
model_path = "data/project_files/test/sense_model_V2.json"

for parser_name,parser_pred in list(zip(*[parser_names,parser_senses])):
    tmp_model = create_one_parser_model(gold_senses,parser_pred)
    model += [{
        "parser":parser_name,
        "sense_pred": tmp_model
    }]
    


In [201]:
gold_path = "data/gold_standard/test/relations.json"
parsers_path = "data/submissions/sense_only/test/"

In [202]:
#Add weighting to each parser model

model = weight_parser_by_accuracy(
    model,
    read_json(gold_path),
    [(name,read_json(path)) for name,path in get_parser_paths(parsers_path)])
    
save_json(model,model_path)

## Sense Model Statistics

This part shows a visual overview of the reliability for each parser and sense. I added a additional dataframe that shows the highest values in a transposed way (panda doesn't supply a easy way to highlight rows).

In [237]:
rows = []

#Create Rows for visualization (select the f1 measure for all senses)
for sense in set(gold_senses):
    row = [sense]
    for parser in model:
        parser_senses = parser["sense_pred"]
        if sense in parser_senses:
            row += [parser_senses[sense]["f1"]]
        else:
            row += [0]
    rows += [tuple(row)]


In [205]:
def highlight_max(data, color='yellow'):
    # highlight the maximum in a Series or DataFrame
    attr = 'background-color: {}'.format(color)
    #remove % and cast to float
    data = data.replace('%','', regex=True).astype(float)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_max = data == data.max()
        return [attr if v else '' for v in is_max]
    else:  # from .apply(axis=None)
        is_max = data == data.max().max()
        return pd.DataFrame(np.where(is_max, attr, ''),
                            index=data.index, columns=data.columns)

In [231]:
pd.options.display.float_format = '{:,.2f}'.format
reliability_df = pd.DataFrame(data=rows, columns=["Sense"]+parser_names)
reliability_df

Unnamed: 0,Sense,steven,oslopots,ecnucs,tao0920,goethe,nguyenlab,clac,PurdueNLP,gw0,ykido,gtnlp
0,Comparison,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Expansion.Conjunction,0.22,0.34,0.33,0.32,0.32,0.32,0.29,0.31,0.12,0.33,0.32
2,Expansion.Instantiation,0.13,0.25,0.31,0.31,0.25,0.19,0.16,0.24,0.0,0.16,0.28
3,Temporal.Asynchronous.Precedence,0.25,0.42,0.36,0.38,0.34,0.42,0.38,0.37,0.0,0.38,0.41
4,Comparison.Concession,0.07,0.2,0.19,0.23,0.15,0.16,0.14,0.21,0.0,0.2,0.1
5,Contingency.Cause.Result,0.16,0.22,0.18,0.19,0.23,0.21,0.18,0.17,0.0,0.22,0.21
6,Contingency.Cause.Reason,0.14,0.26,0.24,0.25,0.23,0.2,0.21,0.24,0.04,0.18,0.25
7,Expansion,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Expansion.Alternative,0.25,0.4,0.33,0.38,0.44,0.45,0.45,0.4,0.0,0.33,0.45
9,Temporal.Asynchronous.Succession,0.23,0.37,0.32,0.37,0.34,0.35,0.29,0.31,0.0,0.3,0.3


In [236]:
transposed_reliability_df = reliability_df[reliability_df.columns[1:]].transpose()
for ind,sense in enumerate(list(zip(*rows)[0])):
    print("{}: {}".format(ind,sense))

transposed_reliability_df.style.apply(highlight_max,axis=0)

0: Comparison
1: Expansion.Conjunction
2: Expansion.Instantiation
3: Temporal.Asynchronous.Precedence
4: Comparison.Concession
5: Contingency.Cause.Result
6: Contingency.Cause.Reason
7: Expansion
8: Expansion.Alternative
9: Temporal.Asynchronous.Succession
10: Contingency.Condition
11: Expansion.Alternative.Chosen alternative
12: Comparison.Contrast
13: EntRel
14: Expansion.Restatement
15: Temporal.Synchrony


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
steven,0,0.217604,0.133858,0.252427,0.0746269,0.15566,0.142857,0,0.25,0.225225,0.293651,0.0909091,0.256011,0.194286,0.0147783,0.170068
oslopots,0,0.337165,0.24812,0.425,0.2,0.222222,0.255556,0,0.4,0.369369,0.4375,0.142857,0.366469,0.235686,0.125,0.30625
ecnucs,0,0.32766,0.312102,0.364706,0.189655,0.180488,0.236311,0,0.333333,0.320388,0.401869,0.0526316,0.33662,0.25974,0.160819,0.310976
tao0920,0,0.324775,0.309942,0.382716,0.225806,0.19457,0.252155,0,0.384615,0.369369,0.411215,0.142857,0.346705,0.269978,0.127796,0.3
goethe,0,0.323904,0.248408,0.341772,0.153846,0.225806,0.233161,0,0.444444,0.342593,0.380952,0.05,0.324963,0.234568,0.146965,0.308176
nguyenlab,0,0.320285,0.188976,0.419753,0.157143,0.206422,0.204787,0,0.454545,0.353448,0.412844,0.142857,0.357955,0.245763,0.090625,0.316384
clac,0,0.291399,0.157407,0.381579,0.14,0.180328,0.208763,0,0.454545,0.28866,0.37,0.1,0.329231,0.200765,0.0888889,0.29878
PurdueNLP,0,0.305147,0.243243,0.368421,0.213115,0.171717,0.237136,0,0.4,0.311321,0.383178,0.142857,0.33752,0.261053,0.108475,0.319527
gw0,0,0.116766,0.0,0.0,0.0,0.0,0.0423077,0,0.0,0.0,0.0,0.0,0.00726392,0.0,0.143954,0.0721248
ykido,0,0.325914,0.157407,0.378378,0.2,0.224138,0.184211,0,0.333333,0.30303,0.427273,0.1,0.346812,0.173267,0.0233645,0.319277


Each column represent a sense, while each row shows the reliability of each parser to predict the corresponding sense.

First, we have to exclude the comparison and expansion relation, because the parser always predicted a more specific relation sense for the relations. Second, the gw0 parser only predict certain relations, while all the others try to predict all of them.

Like expected the oslopots parser has the highest reliability for most of the sense. Nevertheless, you can see that for some senses (e.g. Expansion.Alternative,Comparison.ConcessionContingency.Cause.Result,...) other parser have a higher probability to predict these sense right. Therefore, it is justified to explore different voting systems for the parsers.