In [1]:
from os import listdir
import json
import pandas as pd
import networkx as nx
import numpy as np

## Helper functions

In [8]:
def load_terminology(terminology_file: str):
    """
    Read a terminology file and return its concepts as a list of strings.

    The file is opened as UTF-8 text and is expected to contain one concept
    per line. Each returned entry is the text of a line up to (and excluding)
    its newline character; lines are returned in file order and an empty line
    yields an empty string.
    """
    with open(terminology_file, "r", encoding="utf-8") as handle:
        # partition keeps everything before the first newline of the raw line
        return [raw_line.partition("\n")[0] for raw_line in handle]
    

def find_all_pairs(concepts: list):
    """
    Build every ordered pair (first, second) that can be formed from `concepts`.

    Reflexive pairs (a concept paired with itself) ARE included, so a list of
    n concepts yields n * n tuples, ordered by `first` and then by `second`.
    """
    # To exclude reflexive pairs, append the filter `if first != second`.
    return [(first, second) for first in concepts for second in concepts]



def load_burst(allen_weights, threshold=0):
    """
    Read the csv file with the relations extracted with burst and return them
    as a pandas DataFrame in which the columns "X (prerequisite)" and
    "Y (subsidiary)" are renamed to "prerequisite" and "advanced"; rows are
    filtered on the "weight" column.

    Parameters
    ----------
    allen_weights : 1, "1", "AIED" or "aied"
        Selects which pre-computed edge list is loaded.

        1 / "1":
            the edge list in which all weights are 1.

        "AIED" / "aied":
            the edge list computed with the weights
            {'equals': 2, 'before': 5, 'after': 0, 'meets': 3, 'met-by': 0,
             'overlaps': 7, 'overlapped-by': 1, 'during': 7, 'includes': 7,
             'starts': 4, 'started-by': 2, 'finishes': 2, 'finished-by': 8}

        According to the original notes, inverse relations are NOT used in
        either file.
    threshold : number, default 0
        Relations whose weight is less than or equal to this value are
        discarded.

    Returns
    -------
    pandas.DataFrame
        The relations with weight strictly greater than `threshold`.

    Raises
    ------
    ValueError
        If `allen_weights` is not one of the supported values.
    """

    if allen_weights in [1, "1"]:
        # weights = 1
        file_burst = "risultati_burst/edgelist_burst_prerequisites_con_collis_detect_pesi1.csv"
    elif allen_weights in ["AIED", "aied"]:
        # weights = AIED
        file_burst = "risultati_burst/edgelist_burst_prerequisites_pesiAIED.csv"
    else:
        # without this branch file_burst would be unbound and the read below
        # would fail with an unrelated UnboundLocalError
        raise ValueError(
            "allen_weights must be 1, '1', 'AIED' or 'aied', got %r" % (allen_weights,)
        )

    burst = pd.read_csv(file_burst, sep=",", index_col=0, encoding="utf-8")
    burst = burst.rename(
        columns={"X (prerequisite)": "prerequisite", "Y (subsidiary)": "advanced"}
    )

    return burst[burst["weight"] > threshold]

def load_bs(base_line_method: str, threshold):
    """
    Load the relations extracted with the baseline method and keep only those
    whose weight is strictly greater than `threshold`.

    The semicolon-separated file "BS/M5_1.csv" is read and its "m5" column is
    renamed to "weight"; the filtered pandas DataFrame is returned.

    `base_line_method` is accepted for the caller's convenience but is not
    used: the file and the weight column are fixed.

    NOTE(review): the original documentation lists the resulting columns as
    [prerequisite, advanced, weight]; the only renaming done here is
    "m5" -> "weight", so the other two names must already be present in the
    file - confirm against the data.
    """
    baseline = pd.read_csv("BS/M5_1.csv", sep=";", encoding="utf-8", index_col=False)
    baseline = baseline.rename(columns={"m5": "weight"})

    above_threshold = baseline["weight"] > threshold
    return baseline[above_threshold]


def generate_gold(annotations: str, revisioned: bool):
    """
    Combine the annotation files of one annotation set into a single gold
    DataFrame, keeping only the relations whose two concepts both belong to
    the terminology (the module-level list `concepts`).

    Parameters
    ----------
    annotations : str
        Which set of annotations to combine (sub-folder name):
        "nuove":   the 4 annotations of the latest group of students;
        "vecchie": the annotations of the 2 students of last year + our 4;
        "tutte":   all the 10 annotations.
    revisioned : bool
        False: read the original annotations, JSON files found in
            "annotazioni_originali/<annotations>/" (relations are taken from
            the "savedInsertedRelations" key).
        True: read the revised annotations, CSV files found in
            "annotazioni_revisionate/<annotations>/". Only relations with
            agreement > 1, or whose "Revised" value is 0.5 / 1 (still present
            after revision), are kept.

    Possible calls:
        generate_gold("nuove", True)
        generate_gold("vecchie", True)
        generate_gold("tutte", True)
        generate_gold("nuove", False)
        generate_gold("vecchie", False)
        generate_gold("tutte", False)

    Returns
    -------
    pandas.DataFrame
        Columns ["prerequisite", "advanced", "count"], one row per distinct
        relation in order of first appearance; "count" is the number of times
        the relation was found across the files that were read.

    Raises
    ------
    ValueError
        If `annotations` is not one of "nuove", "vecchie", "tutte".
    """

    if annotations not in ("nuove", "vecchie", "tutte"):
        # otherwise input_dir would never be assigned and listdir would fail
        # with an unrelated UnboundLocalError
        raise ValueError(
            "annotations must be 'nuove', 'vecchie' or 'tutte', got %r" % (annotations,)
        )

    if revisioned:
        input_dir = "annotazioni_revisionate/" + annotations + "/"
    else:
        input_dir = "annotazioni_originali/" + annotations + "/"

    # (prerequisite, advanced) -> number of occurrences; a dict keeps the
    # order of first insertion, which becomes the row order of the result
    relation_counts = {}

    def register(prerequisite, advanced):
        # take only relations with both concepts in the terminology
        if prerequisite in concepts and advanced in concepts:
            key = (prerequisite, advanced)
            relation_counts[key] = relation_counts.get(key, 0) + 1

    for file in listdir(input_dir):
        print(file)

        if revisioned:
            # use only columns ["prerequisite", "advanced", "weight", "agreem", "Revised"]
            curr_df = pd.read_csv(input_dir + file, sep=",", encoding="utf-8", usecols=[1, 2, 4, 5, 9])

            # keep only relations with agreement > 1 or those with agreement 1
            # but still present after revision
            curr_df = curr_df[(curr_df["agreem"] > 1) | (curr_df["Revised"].isin(["0.5", "0,5", "1", 1, 0.5]))]
            print("\tNum of relations:", curr_df.shape[0], "\n")

            for prerequisite, advanced in zip(curr_df["prerequisite"], curr_df["advanced"]):
                register(prerequisite, advanced)
        else:
            with open(input_dir + file, "r", encoding="utf-8") as f:
                data = json.load(f)["savedInsertedRelations"]
            print("\tNum of relations:", len(data), "\n")

            for rel in data:
                register(rel['prerequisite'], rel['advanced'])

    # build the frame once instead of growing it row by row
    gold = pd.DataFrame(
        [(prerequisite, advanced, count) for (prerequisite, advanced), count in relation_counts.items()],
        columns=["prerequisite", "advanced", "count"],
    )

    print("\nNum of relations in the combined gold:", gold.shape[0])

    return gold

    
# Column layout of the per-relation report written by the metric functions.
_METRIC_COLUMNS = [
    "prerequisite", "advanced", "Direct Edge", "Path", "Not in Gold",
    "TP", "TN", "FP", "FN", "Path chain", "FN Path",
]


def _edges_and_graph(relations, role):
    """
    Normalise one evaluation input to a (nx.DiGraph, list of edge tuples) pair.

    A DataFrame is read through its ["prerequisite", "advanced"] columns
    (duplicate rows are collapsed, first occurrence wins); a DiGraph is used
    as is. For any other type an error is printed and None is returned.
    """
    if isinstance(relations, pd.DataFrame):
        pairs = [tuple(x) for x in relations[["prerequisite", "advanced"]].values]
        edges = list(dict.fromkeys(pairs))
        graph = nx.DiGraph()
        graph.add_edges_from(edges)
        return graph, edges
    if isinstance(relations, nx.DiGraph):
        return relations, list(relations.edges())
    print("Input error for '" + role + "': accept only pd.DataFrame or nx.Digraph.")
    return None


def _metric_row(rel, flags):
    """Build one report row for the relation `rel`; `flags` overrides the all-zero defaults."""
    row = {'prerequisite': rel[0], 'advanced': rel[1], 'Direct Edge': 0, 'Path': 0, 'Not in Gold': 0,
           "TP": 0, "TN": 0, "FP": 0, "FN": 0, "Path chain": '', "FN Path": 0}
    row.update(flags)
    return row


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _evaluate_relations(automatic, manual, use_closure, terminology, results_file):
    """
    Shared implementation of compute_metrics / compute_trans_closure_metrics.

    `use_closure` selects which gold relations count as positives when looking
    for false negatives: the direct gold edges (False) or the edges of the
    transitive closure of the gold graph (True).
    """
    extracted = _edges_and_graph(automatic, "automatic")
    if extracted is None:
        return
    automatic_graph, automatic_edges = extracted
    if isinstance(automatic, pd.DataFrame):
        print("The compute metrics input automatic is a data frame")

    reference = _edges_and_graph(manual, "manual")
    if reference is None:
        return
    manual_graph, manual_edges = reference
    if isinstance(manual, nx.DiGraph):
        print("The compute metrics input manual is a Graph")

    if use_closure:
        positive_edges = list(nx.transitive_closure(manual_graph, reflexive=False).edges())
    else:
        positive_edges = manual_edges

    # sets give constant-time membership tests in the loops below
    automatic_set = set(automatic_edges)
    manual_set = set(manual_edges)
    positive_set = set(positive_edges)

    all_possibles = find_all_pairs(concepts if terminology is None else terminology)

    TP = FP = TN = FN = HP = 0
    # NOTE(review): in the original code the "FN Path" case was only tested for
    # gold relations that are already among the extracted edges, for which a
    # path trivially exists, so it could never be counted. The counter is kept
    # (always 0) to leave the printed report unchanged; the intended definition
    # should be confirmed before it is given a real value.
    FNHP = 0
    results_list = []

    for rel in automatic_edges:
        if rel in manual_set:
            # the extracted relation is a direct edge of the gold
            TP += 1
            results_list.append(_metric_row(rel, {"Direct Edge": 1, "TP": 1}))
        elif manual_graph.has_node(rel[0]) and manual_graph.has_node(rel[1]):
            if nx.has_path(manual_graph, rel[0], rel[1]):
                # not a direct edge, but the gold connects the two concepts through a path
                TP += 1
                HP += 1
                results_list.append(_metric_row(rel, {
                    "Path": 1, "TP": 1,
                    "Path chain": nx.shortest_path(manual_graph, rel[0], rel[1]),
                }))
            else:
                # both concepts are in the gold, but it does not relate them in this direction
                FP += 1
                results_list.append(_metric_row(rel, {"FP": 1}))
        else:
            # at least one of the two concepts does not appear in the gold at all
            FP += 1
            results_list.append(_metric_row(rel, {"Not in Gold": 1, "FP": 1}))

    # FN: relations that exist in the gold but were not extracted as direct edges
    for rel in positive_edges:
        if rel not in automatic_set:
            FN += 1
            results_list.append(_metric_row(rel, {"FN": 1}))
        else:
            print("For the gold relation there is also a path in the burst", rel)

    # TN: pairs of concepts that are neither gold positives nor extracted.
    # Gold positives are excluded through positive_set so that, with the
    # transitive closure, a pair is never reported both as FN and as TN.
    for rel in all_possibles:
        if rel not in positive_set and rel not in automatic_set:
            TN += 1
            results_list.append(_metric_row(rel, {"TN": 1}))

    results_df = pd.DataFrame(results_list, columns=_METRIC_COLUMNS)
    if results_file is not None:
        results_df.to_csv(results_file, sep=";", encoding="utf-8")

    # guarded ratios: empty inputs give 0.0 instead of ZeroDivisionError
    precision = _safe_ratio(TP, TP + FP)
    recall = _safe_ratio(TP, TP + FN)
    print("\nTP =", TP, "\nFP =", FP, "\nTN =", TN, "\nFN =", FN, "\nFNHP =", FNHP, "precision =", precision, "\nrecall =", recall)
    F1_score = 2 * _safe_ratio(precision * recall, precision + recall)
    print("precision =", precision, "\nrecall =", recall, "\nF1 =", F1_score)
    print("number of paths TP", HP)


def compute_metrics(automatic, manual, terminology=None, results_file="burst_FN_path_results.csv"):
    """
    Compare the automatically extracted relations with the gold relations,
    print TP / FP / TN / FN, precision, recall and F1, and write a
    per-relation report to `results_file`. Nothing is returned.

    An extracted relation is a TP when it is a direct gold edge or when the
    gold graph contains a path from its prerequisite to its advanced concept;
    otherwise it is a FP. A gold edge that is not among the extracted edges is
    a FN. A pair of concepts that is neither a gold edge nor an extracted edge
    is a TN.

    Parameters
    ----------
    automatic, manual : pd.DataFrame or nx.DiGraph
        Extracted and gold relations. DataFrames must have the columns
        "prerequisite" and "advanced"; they are converted to directed graphs.
        For any other type an error message is printed and nothing is computed.
    terminology : list, optional
        Concepts used to enumerate all candidate pairs for the TN count.
        Defaults to the module-level list `concepts`.
    results_file : str or None, optional
        Path of the semicolon-separated report; None skips writing it.
    """
    _evaluate_relations(automatic, manual, False, terminology, results_file)


def compute_trans_closure_metrics(automatic, manual, terminology=None, results_file="burst_FN_path_results.csv"):
    """
    Same evaluation as compute_metrics, except that false negatives are looked
    for among the edges of the transitive closure of the gold graph instead of
    its direct edges only; closure edges are likewise never counted as TN.

    Prints the metrics and writes the per-relation report to `results_file`
    (the default is the same file used by compute_metrics). Nothing is
    returned. Parameters are the same as in compute_metrics.
    """
    _evaluate_relations(automatic, manual, True, terminology, results_file)



def build_freq_matrix(index_df, save=False):
    """
    Builds a square matrix (concepts x concepts, taken from the module-level
    list `concepts`) whose weights count concept co-occurrences in a window of
    +/- 3 sentences.

    For every occurrence of a lemma c1 in sentence s, the cell [c1, c2] is
    increased by the number of occurrences of every other lemma c2 whose
    sentence id lies between s - 3 and s + 3 (both included).

    index_df: index df with columns ["Lemma", "idFrase", "idParolaStart"]
    save: if True the matrix is also written, tab-separated, to the file
        given below.

    Returns the matrix as a pandas DataFrame.
    """

    freq_df = pd.DataFrame(0, index=concepts, columns=concepts)

    # computed once instead of at every iteration of the inner loops
    lemmas = index_df["Lemma"].unique().tolist()
    sentences_by_lemma = {
        lemma: index_df.loc[index_df["Lemma"] == lemma, "idFrase"] for lemma in lemmas
    }

    for c1 in lemmas:
        for curr_sent in sentences_by_lemma[c1]:
            # sentence ids inside the +/- 3 window around this occurrence of c1
            window = list(range(curr_sent - 3, curr_sent + 4))

            for c2 in lemmas:
                if c2 != c1:
                    count = int(sentences_by_lemma[c2].isin(window).sum())

                    freq_df.at[c1, c2] = freq_df.at[c1, c2] + count

    if save:
        # NOTE(review): Windows-style separator; on other systems this names a
        # single file containing a backslash - kept as is because the notebook
        # reads the matrix back from the very same string.
        freq_df.to_csv("risultati_freq\\freq.csv", sep="\t", encoding="utf-8")

    return freq_df
    

def create_graphs(dataset):
    """
    Build a directed networkx graph from a pandas dataframe: every row adds
    the edge row["prerequisite"] -> row["advanced"].
    """
    G_nx = nx.DiGraph()
    G_nx.add_edges_from(
        (row["prerequisite"], row["advanced"]) for _, row in dataset.iterrows()
    )
    return G_nx



def detect_and_remove_cycles(graph):
    """
    Detect and delete cycles from the graph of the gold, repeating the
    detection after every round of removals until no cycle is left.

    In each round, 2-nodes-cycles (i.e. reciprocal relations) are fixed
    automatically if present, using the annotators agreement (the "count"
    column of the module-level DataFrame `gold`) or, if the agreement is equal
    in both directions, the temporal order in which the two concepts first
    appear in the text (sentence id, then token position, read from the
    module-level DataFrame `index_df`).
    When there is no 2-nodes-cycle, or none of them could be decided
    automatically, the user is prompted to choose which edge to remove from
    the first of the shortest cycles.

    The input graph is not modified: the function works on a copy and returns
    that copy once it has no cycles.
    """

    G_gold_no_trans = graph.copy()

    def agreement(source, target):
        """Number of annotators recorded in `gold` for the relation source -> target."""
        return gold[(gold["prerequisite"] == source) & (gold["advanced"] == target)].iloc[0]["count"]

    def edge_to_drop(c1, c2):
        """
        Decide which edge of the reciprocal pair c1 <-> c2 must be removed.
        Returns ((source, target), criterion) or None when neither the
        agreement nor the temporal order can tell the two directions apart.
        """
        from_c1_to_c2 = agreement(c1, c2)
        from_c2_to_c1 = agreement(c2, c1)

        if from_c1_to_c2 > from_c2_to_c1:
            return (c2, c1), "agreement"
        if from_c2_to_c1 > from_c1_to_c2:
            return (c1, c2), "agreement"

        if from_c1_to_c2 == from_c2_to_c1:
            # edges have the same agreement: use temporal order to decide
            occurrences_c1 = index_df[index_df["Lemma"] == c1]
            occurrences_c2 = index_df[index_df["Lemma"] == c2]
            first_sent_c1 = occurrences_c1["idFrase"].min()
            first_sent_c2 = occurrences_c2["idFrase"].min()

            if first_sent_c1 < first_sent_c2:
                return (c2, c1), "temporal order"
            if first_sent_c2 < first_sent_c1:
                return (c1, c2), "temporal order"

            if first_sent_c1 == first_sent_c2:
                # occur in the same sentence: check number of token
                first_token_c1 = occurrences_c1[occurrences_c1["idFrase"] == first_sent_c1]["idParolaStart"].min()
                first_token_c2 = occurrences_c2[occurrences_c2["idFrase"] == first_sent_c2]["idParolaStart"].min()

                if first_token_c1 < first_token_c2:
                    return (c2, c1), "temporal order"
                if first_token_c2 < first_token_c1:
                    return (c1, c2), "temporal order"

        return None

    while True:
        # detect simple cycles (i.e. closed paths where no node appears twice)
        cycles = [list(cycle) for cycle in nx.simple_cycles(G_gold_no_trans)]

        print("Number of cycles found:\t", len(cycles))

        if len(cycles) == 0:
            # no cycles (left): the work is done
            return G_gold_no_trans

        # length of the cycles is measured in number of nodes
        print("Number of nodes in the longest cycle:", max(len(cycle) for cycle in cycles), "\n")

        reciprocal_cycles = [cycle for cycle in cycles if len(cycle) == 2]

        if len(reciprocal_cycles) > 0:

            print("Found cycles with only 2 nodes that will be fixed automatically using annotators agreement or temporal order of occurrence.\n")

            removed_any = False

            for cycle in reciprocal_cycles:
                print("\tAnalyzing cycle:\t", cycle)
                c1, c2 = cycle

                decision = edge_to_drop(c1, c2)

                if decision is None:
                    print("\tnot captured:\n", c1, "\t-->\t", c2, "\nand viceversa.\n")
                    continue

                (source, target), criterion = decision
                G_gold_no_trans.remove_edge(source, target)
                print("\tremoved:\t", source, "\t-->\t", target)
                print("\tcriterion: " + criterion + "\n")
                removed_any = True

            if removed_any:
                print("Finished to fix cycles with 2 nodes. Cycles will be now detected again in the new graph.\n")
                # detect cycles again in the updated graph
                continue

            # nothing changed: detecting again would find the very same cycles
            # forever, so the decision is handed over to the user
            print("None of the cycles with 2 nodes could be fixed automatically. You need to choose manually which edge remove.\n")

        else:
            print("There are no cycles with only 2 nodes. You need to choose manually which edge remove.\n")

        shortest_length = min(len(cycle) for cycle in cycles)
        first_cycle = next(cycle for cycle in cycles if len(cycle) == shortest_length)

        # consecutive nodes of the cycle, with the last node linked back to the first
        edges_list = list(zip(first_cycle, first_cycle[1:] + first_cycle[:1]))

        print("\tAnalyzing cycle:\n")

        for i, x in enumerate(edges_list):
            print("\t", i, ":", x)

        # ask again until the answer is the number of one of the listed edges
        while True:
            answer = input("\tInsert the number of the edge to remove:\t")
            try:
                choice = int(answer)
            except ValueError:
                choice = -1
            if 0 <= choice < len(edges_list):
                break
            print("\tInvalid choice: insert a number between 0 and", len(edges_list) - 1)

        to_remove = edges_list[choice]

        G_gold_no_trans.remove_edge(to_remove[0], to_remove[1])
        print("\tremoved:\t", to_remove[0], "\t-->\t", to_remove[1], "\n")

        print("Fixed one cycle with more than 2 nodes. Cycles will be now detected again in the new graph.\n")

## Open data

In [9]:
# load terminology: `concepts` is the list of terms, one per line of the file.
# It is a notebook-level variable that several helper functions read directly
# (generate_gold, compute_metrics, compute_trans_closure_metrics, build_freq_matrix).
terminology_file = "new_terminologia_pret_v2_3.txt"
concepts = load_terminology(terminology_file)
# every ordered pair of concepts, reflexive pairs included
all_possibles = find_all_pairs(concepts)
# load burst: edge list with all weights equal to 1, keeping weights > 0
burst = load_burst(allen_weights=1, threshold=0)
# load gold: revised annotations of the "nuove" set. `gold` is also read as a
# notebook-level variable by detect_and_remove_cycles.
gold = generate_gold("nuove", True)
# load bs: baseline relations, keeping weights > 0.0
bs = load_bs("m5", threshold=0.0)

# --- disabled: selection of the top burst relations ---
# select top burst relations
# top= 100
# select top n burst (n=number of relations in the gold)
# top = gold.shape[0]
#gold = gold.sort_values(by="count", ascending=False).iloc[:top]
# burst_top = burst.sort_values(by="weight", ascending=False).iloc[:top]
# print("\nnumber of relations in the top burst", top)
# NOTE(review): the expression that would list the excluded relations is
# commented out below, so this cell only prints the heading.
print("\nRelations with the same weight of the last relation in the top list that were excluded from the top chart:")
#burst[burst["weight"] == burst_top["weight"].min()]

Revisione_Annotazione2020_Sava.csv
	Num of relations: 130 

Revisione_Annotazione2020_Mirenda.csv
	Num of relations: 236 

Revisione_Annotazione2020_Parizzi.csv
	Num of relations: 143 

Revisione_Annotazione2020_Moggio.csv
	Num of relations: 184 


Num of relations in the combined gold: 350

Relations with the same weight of the last relation in the top list that were excluded from the top chart:


In [10]:
# Turn the burst relations and the gold relations into directed graphs
# (edge: prerequisite -> advanced) and evaluate the former against the latter.
# The same evaluation for the baseline relations is currently disabled.
# bs_graph = create_graphs(bs)
burst_graph = create_graphs(burst)
gold_graph = create_graphs(gold)
# compute_metrics(bs_graph, gold_graph)
compute_metrics(burst_graph, gold_graph)

The compute metrics input manual is a Graph
For the gold relation there is also a path in the burst ('INTERNET', 'PROTOCOL')
For the gold relation there is also a path in the burst ('INTERNET', 'TRADITIONAL TELEPHONE')
For the gold relation there is also a path in the burst ('INTERNET', 'TIER-1 INTERNET SERVICE PROVIDER')
For the gold relation there is also a path in the burst ('INTERNET', 'TIER-2 INTERNET SERVICE PROVIDER')
For the gold relation there is also a path in the burst ('INTERNET', 'INTRANET')
For the gold relation there is also a path in the burst ('INTERNET', 'TIER-3 INTERNET SERVICE PROVIDER')
For the gold relation there is also a path in the burst ('INTERNET', 'WEBSERVERS')
For the gold relation there is also a path in the burst ('INTERNET', 'GATEWAY')
For the gold relation there is also a path in the burst ('INTERNET', 'INTERNET SERVICE PROVIDER')
For the gold relation there is also a path in the burst ('INTERNET', 'ACCESS INTERNET SERVICE PROVIDER')
For the gold relati

In [5]:
#compute_metrics(burst, gold)

In [6]:
#compute_metrics(burst_top, gold)

## Co-occurrence

In [7]:
# file with occurrences: tab-separated index of the concept occurrences; only
# the columns at positions 0, 3, 4 and 5 are loaded.
# `index_df` is a notebook-level variable: detect_and_remove_cycles and the
# co-occurrence cells below read its "Lemma", "idFrase" and "idParolaStart" columns.
index_df = pd.read_csv("april_2020_index_con_collision_detection.csv", sep="\t", encoding="utf-8", usecols=[0,3,4,5])
index_df.head()

Unnamed: 0,Lemma,idFrase,idParolaStart,Lunghezza
0,"CARRIER SENSE , MULTIPLE ACCESS WITH COLLISION...",57,6,8
1,"CARRIER SENSE , MULTIPLE ACCESS WITH COLLISION...",531,13,8
2,"CARRIER SENSE , MULTIPLE ACCESS WITH COLLISION...",43,23,8
3,"CARRIER SENSE , MULTIPLE ACCESS WITH COLLISION...",51,9,8
4,"CARRIER SENSE , MULTIPLE ACCESS WITH COLLISION...",52,3,8


In [8]:
# build undirected matrix from occurrences
# TAKES TIME! USE PRE-COMPUTED MATRIX SAVED IN CSV FILE
# (to recompute the matrix instead of loading it, enable the next line)
#freq_df = build_freq_matrix(index_df, save=False)
# The first column of the file is used as the index (concept names).
# NOTE(review): the path uses a Windows-style backslash separator; on other
# systems it names a single file containing a backslash - confirm where the
# pre-computed matrix is expected to live.
freq_df = pd.read_csv("risultati_freq\\freq.csv", sep="\t", encoding="utf-8", index_col=0)
freq_df.head()

FileNotFoundError: [Errno 2] File b'risultati_freq\\freq.csv' does not exist: b'risultati_freq\\freq.csv'

In [None]:
# give directionality to co-occurrence using temporal order:
# the concept that appears first in the text (sentence id, then token
# position inside the sentence) is taken as the prerequisite of the other.

freq_rels = []

# Each unordered pair of concepts is examined once. Looping over all ordered
# pairs would examine every pair in both orders and, since the direction is
# decided by the order of appearance and not by the loop order, append every
# relation twice.
for position, c1 in enumerate(concepts):

    sub_df_c1 = index_df[index_df["Lemma"] == c1]
    first_sent_c1 = sub_df_c1["idFrase"].min()

    for c2 in concepts[position + 1:]:

        if freq_df.at[c1, c2] != 0 and freq_df.at[c2, c1] != 0:

            sub_df_c2 = index_df[index_df["Lemma"] == c2]
            first_sent_c2 = sub_df_c2["idFrase"].min()

            if first_sent_c1 < first_sent_c2:
                freq_rels.append((c1, c2, freq_df.at[c1, c2]))
            elif first_sent_c1 > first_sent_c2:
                freq_rels.append((c2, c1, freq_df.at[c2, c1]))

            # same sentence: check number of token
            elif first_sent_c1 == first_sent_c2:
                same_sent = first_sent_c1
                sent_sub_df_c1 = sub_df_c1[sub_df_c1["idFrase"] == same_sent]
                sent_sub_df_c2 = sub_df_c2[sub_df_c2["idFrase"] == same_sent]

                if sent_sub_df_c1["idParolaStart"].min() < sent_sub_df_c2["idParolaStart"].min():
                    freq_rels.append((c1, c2, freq_df.at[c1, c2]))
                elif sent_sub_df_c1["idParolaStart"].min() > sent_sub_df_c2["idParolaStart"].min():
                    freq_rels.append((c2, c1, freq_df.at[c2, c1]))
                else:
                    print(c1, "\t->\t", c2, "\t", "not captured (1)")
            else:
                # reached when the first sentences cannot be compared
                # (e.g. a concept with no occurrence in the index)
                print(c1, "\t->\t", c2, "\t", "not captured (2)")

freq_rels_df = pd.DataFrame(freq_rels, columns=["prerequisite", "advanced", "count"])
freq_rels_df.head()

In [None]:
# Evaluate the directed co-occurrence relations against the gold relations
# (both are passed as DataFrames with "prerequisite" and "advanced" columns).
compute_metrics(freq_rels_df, gold)

## Transitives

In [None]:
# Collect the transitive relations annotated after revision and remove them
# from the gold.
input_dir = "annotazioni_revisionate/nuove/"

# set of tuples, one per row of the sheets read below
trans = set()

for file in listdir(input_dir):

    # the sheet name is the file name without the fixed prefix and the extension
    curr_name = file.replace("Revisione_Annotazione2020_", "").replace(".csv", "")

    print(curr_name)

    # NOTE(review): every row of the sheet becomes a tuple that is later
    # subtracted from the (prerequisite, advanced) pairs of the gold; this
    # presumably requires each sheet to have exactly those two columns, in
    # that order - confirm against the workbook.
    trans_df = pd.read_excel("transitive_weak/dopo_revisione/transitive_dopo_revisione.xlsx", sheet_name=curr_name)
    trans = trans.union({tuple(x) for x in trans_df.values})

    print("\tNum of transitives:", len({tuple(x) for x in trans_df.values}))

print("\nNum of unique transitives:", len(trans))

# gold relations as a set of (prerequisite, advanced) pairs, minus the transitive ones
gold_with_trans = {tuple(x) for x in gold.drop("count", axis=1, inplace=False).values}
gold_no_trans = gold_with_trans - trans
# back to a DataFrame (the "count" column is not carried over; row order follows the set)
gold_no_trans = pd.DataFrame(data=list(gold_no_trans), columns=["prerequisite", "advanced"])

print("\nNum of relations in the combined gold with no transitives:", gold_no_trans.shape[0])
gold_no_trans.head()

In [None]:
# Directed graph of the burst relations (edge: prerequisite -> advanced).
G_burst = create_graphs(burst)

# Diameter of the undirected version of the graph and length of the longest
# directed path.
# NOTE(review): per the networkx documentation, diameter needs a connected
# graph and dag_longest_path_length / transitive_reduction need a DAG -
# confirm that the burst graph satisfies both.
diameter = nx.diameter(G_burst.to_undirected())
longest_path = nx.dag_longest_path_length(G_burst)
print("diameter:\t", diameter, "\nlongest path:\t", longest_path)

# Transitive reduction of the burst graph; the number of edges before and
# after the reduction is printed.
G_burst_no_trans = nx.dag.transitive_reduction(G_burst)
print("Burst after transitive reduction:\n", len(G_burst.edges()), "\t-->\t", len(G_burst_no_trans.edges()))

In [None]:
G_gold = create_graphs(gold_no_trans)

# Transitive reduction is only defined on a DAG, so cycles are deleted first.
# detect_and_remove_cycles is applied until the graph is acyclic, so the graph
# handed to the reduction is a DAG however many rounds a single call performs.
G_gold_acyclic = G_gold
while not nx.is_directed_acyclic_graph(G_gold_acyclic):
    G_gold_acyclic = detect_and_remove_cycles(G_gold_acyclic)

# The reduction is applied in every case, also after cycles had to be removed.
G_gold_no_trans = nx.dag.transitive_reduction(G_gold_acyclic)

In [None]:
# Number of edges of the gold graph before (G_gold) and after (G_gold_no_trans) the previous cell.
print("Gold after transitive reduction:\n", len(G_gold.edges()), "\t-->\t", len(G_gold_no_trans.edges()))

In [None]:
# Evaluate the transitively reduced burst graph against the gold graph built above.
compute_metrics(G_burst_no_trans, G_gold_no_trans)