In [79]:
"""check spartan outputs
- auprc for graph embedding methods
- precision at k for graph embedding methods
- raw TP and FP for precision at k,
    - check whether the FPs appear in later years: if an FP appears in 2016, 2017, 2018, ..., 2021, then it is not 
        really an FP, and the error comes from the limitation of the experimental settings
    - start with sdne year==[2002,2021], k==10. 
-- observations of errors (FPs) from precision at k
-- For observations of errors among top predictions, see "analyze_top_predictions.ipynb"
"""

"check spartan outputs\n- auprc for local similarity measures\n- auprc for graph embedding methods\n- precision at k for graph embedding methods\n- raw TP and FP for precision at k, choosing Year as 2015\n    - whether the FPs appear in the next years. Whether the FPs appear in 2016,2017,2018...,2021, then the FPs are not \n        really FPs, but the errors come from the limitation of the experimental settings\n    - start with sdne year==[2002,2021], k==10. \n    \n-- observations of errors (FPs) from precision at k:\n    - first error for the TP and FP, a few rubbish labels aren't assigned smeantic types, they should be excluded from the annotation, e.g. rdfs:label\n    - second error ... third error ...\n\n-- observations of FNs:\n    - \n\n"

In [2]:
import os
import json
import pandas as pd
import numpy as np
import xmltodict
from collections import Counter,defaultdict
import warnings
import sys
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
sys.path.insert(1, '/Users/yidesdo21/Projects/codes/04_generate_graph')
from create_edge import create_yr_link
import altair as alt
from IPython.display import display
alt.data_transformers.disable_max_rows()
import copy

In [3]:
# Input/output locations, all relative to the directory the notebook is run from.
path = "../../outputs/17_dgl_results/spartan_output/auprc/"   ## "../" is the parent of the working directory; "./" is the working directory itself
sim_path = "../../outputs/17_dgl_results/similarity_measures/"
prec_at_k_path = "../../outputs/17_dgl_results/spartan_output/"   # root of the prec_at_<k> and prec_at_<k>_raw folders
src_path = "../../outputs/16_dgl_csv/1977_" 
network_path = "../../outputs/24_dgl_networks/"   # one sub-folder per dataset holding edges.csv
error_path = "../../outputs/17_dgl_results/error_analysis/" 
meta_path = "../../outputs/12_time_slicing/metadata/" 
fns_raw_path = "../../outputs/17_dgl_results/sdne_fns/" 
fns_idx_path = "../../outputs/17_dgl_results/spartan_output/prec_at_10_raw/sdne/" 

In [7]:
# Per-model, per-dataset AUPRC values: {model: {dataset: score}}.
# Each sub-folder of `path` is one model; its name is the last component of the walked directory.
pr_results = dict()

for root, dirs, files in os.walk(path):
    model = root.split("/")[-1]
    model_results = dict()

    # keep the json result files, skipping the "not_cogdl" variants
    result_files = [name for name in files
                    if name.endswith('.json') and not name.endswith('not_cogdl.json')]

    for file in result_files:
        with open(root+"/"+file, 'r') as f:
            data = json.load(f)

        if model == "sim":
            # similarity measures are not collected here
            continue

        if model == "dummy":
            # for dummy, we are using precision when recall is 1, instead of auprc.
            # It is the worst case scenario. The file is a list of {dataset: values} dicts.
            pr_data = {dataset: values[1] for entry in data for dataset, values in entry.items()}
        else:
            # first element of each value is the auprc (rounding is done in the dataframe)
            pr_data = {dataset: values[0] for dataset, values in data.items()}

        model_results.update(pr_data)

    if len(model) > 0:   # the top-level folder itself gives an empty model name
        pr_results[model] = model_results

# print(pr_results)

In [9]:
# column layout of the auprc results table

# (method, category) pairs, listed in the column order of the table
method_category = [
    ("dummy", "Baseline"),
    ("grarep", "Matrix factorization-based"),
    ("hope", "Matrix factorization-based"),
    ("node2vec", "Random walk-based"),
    ("deepwalk", "Random walk-based"),
    ("line", "Neural network-based"),
    ("gcn", "Neural network-based"),
    ("graphsage", "Neural network-based"),
    ("sdne", "Neural network-based"),
]
ne_method = [method for method, category in method_category]
ne_categories = [category for method, category in method_category]

# two-level header: category on top, method underneath
ne_cols = pd.MultiIndex.from_arrays([ne_categories,ne_method])
ne_cols


MultiIndex([(                  'Baseline',     'dummy'),
            ('Matrix factorization-based',    'grarep'),
            ('Matrix factorization-based',      'hope'),
            (         'Random walk-based',  'node2vec'),
            (         'Random walk-based',  'deepwalk'),
            (      'Neural network-based',      'line'),
            (      'Neural network-based',       'gcn'),
            (      'Neural network-based', 'graphsage'),
            (      'Neural network-based',      'sdne')],
           )

In [11]:
# Build the AUPRC table (rows: train/test datasets, columns: models grouped by category).
# Values are shown as percentages with two decimals.
pd.set_option('display.float_format', lambda x: '%.2f' % x)

df_lsts = list()

for model_name,year_prc in pr_results.items():
    if model_name == "graphsage":   # original note: "need to rank" -- order the datasets by name
        year_prc = dict(sorted(year_prc.items()))
    df_lsts.append(pd.DataFrame.from_dict(year_prc,orient='index', columns=[model_name]))

# NOTE: the table used to be concatenated with `sim_df`, which is not defined anywhere in this
# notebook (NameError on a fresh kernel). Its columns were dropped by the reindex below anyway,
# so only the embedding/baseline results are combined here.
df = pd.concat(df_lsts, axis=1)

# keep the models of interest in a fixed order, then add the category level on top
# df = df.reindex(columns=["dummy", "grarep", "hope", "struc2vec","node2vec","deepwalk","line","gcn","graphsage","sdne"])
df = df.reindex(columns=["dummy", "grarep", "hope","node2vec","deepwalk","line","gcn","graphsage","sdne"])
df.columns = pd.MultiIndex.from_arrays([ne_categories,df.columns])

df.loc['mean'] = df.mean()     # average of each model over all datasets
df["mean"] = df.mean(axis=1)   # add average to each pair of train/test dataset

# Convert to percentages first and only then mark missing results: replacing NaN with the
# string "tbd" before multiplying would turn the column into objects and break the scaling.
df = df.mul(100).round(2).replace(np.nan,"tbd")
df


Unnamed: 0_level_0,Baseline,Matrix factorization-based,Matrix factorization-based,Random walk-based,Random walk-based,Neural network-based,Neural network-based,Neural network-based,Neural network-based,mean
Unnamed: 0_level_1,dummy,grarep,hope,node2vec,deepwalk,line,gcn,graphsage,sdne,Unnamed: 10_level_1
1977_2002,2.44,10.02,6.46,9.42,14.56,7.55,14.0,2.67,16.05,9.24
1977_2003,0.62,4.58,3.99,4.37,2.61,1.04,8.86,0.74,9.33,4.02
1977_2004,0.34,3.82,3.62,2.94,1.86,0.45,7.04,0.39,7.54,3.11
1977_2005,0.27,3.7,3.74,3.16,1.99,0.36,6.56,0.29,7.02,3.01
1977_2006,0.19,3.54,4.05,2.87,1.91,0.28,5.14,0.21,5.39,2.62
1977_2007,0.05,0.95,0.92,0.88,0.63,0.08,1.37,0.06,1.48,0.71
1977_2008,0.12,2.69,2.78,2.54,1.74,0.17,4.05,0.12,4.17,2.04
1977_2009,0.12,3.51,3.73,3.04,2.27,0.17,4.54,0.13,5.23,2.53
1977_2010,0.09,2.52,2.46,1.93,1.61,0.13,3.44,0.1,3.72,1.77
1977_2011,0.06,2.01,2.08,1.51,1.25,0.08,2.94,0.07,3.18,1.46


In [12]:
# print the AUPRC table as LaTeX source, keeping the dataset names (the index) as the first column
print(df.to_latex(index=True))

\begin{tabular}{lrrrrrrrrrr}
\toprule
{} & Baseline & \multicolumn{2}{l}{Matrix factorization-based} & \multicolumn{2}{l}{Random walk-based} & \multicolumn{4}{l}{Neural network-based} & mean \\
{} &    dummy &                     grarep & hope &          node2vec & deepwalk &                 line &   gcn & graphsage & \multicolumn{2}{l}{sdne} \\
\midrule
1977\_2002 &     2.44 &                      10.02 & 6.46 &              9.42 &    14.56 &                 7.55 & 14.00 &      2.67 & 16.05 & 9.24 \\
1977\_2003 &     0.62 &                       4.58 & 3.99 &              4.37 &     2.61 &                 1.04 &  8.86 &      0.74 &  9.33 & 4.02 \\
1977\_2004 &     0.34 &                       3.82 & 3.62 &              2.94 &     1.86 &                 0.45 &  7.04 &      0.39 &  7.54 & 3.11 \\
1977\_2005 &     0.27 &                       3.70 & 3.74 &              3.16 &     1.99 &                 0.36 &  6.56 &      0.29 &  7.02 & 3.01 \\
1977\_2006 &     0.19 &                    

In [13]:
## retrieve the results of the last train/test split
## (position -2 because the final row of df is the 'mean' row added with df.loc['mean'])
df.iloc[[-2]]


Unnamed: 0_level_0,Baseline,Matrix factorization-based,Matrix factorization-based,Random walk-based,Random walk-based,Neural network-based,Neural network-based,Neural network-based,Neural network-based,mean
Unnamed: 0_level_1,dummy,grarep,hope,node2vec,deepwalk,line,gcn,graphsage,sdne,Unnamed: 10_level_1
1977_2021,0.0,0.06,0.08,0.04,0.04,0.0,0.15,0.0,0.16,0.06


In [14]:
## process the precision at k results
## precision at k for the model sdne
## result: model_pat10 = {model: {dataset: precision at k}}
# precision_at_k = ["prec_at_10","prec_at_20","prec_at_30","prec_at_40"]
precision_at_k = ["prec_at_10"]
# models = ['graphsage', 'sdne', 'deepwalk', 'node2vec', 'line', 'struc2vec', 'grarep', 'hope', 'gcn']
models = ["sdne"]
model_pat10 = dict()

for k in precision_at_k:
    for model in models:
        # One dict per model, stored inside the model loop. Previously a single dict was shared
        # by all models and stored once after the loop, so with several models only the last
        # name was kept and it contained the merged values of every model.
        model_patk = dict()

        for root, dirs, files in os.walk(prec_at_k_path+k+"/"+model):
            for file in files:
                if file.endswith('.json') and file.startswith('yr_results'):
                    with open(root+"/"+file, 'r') as f:
                        data=json.load(f)
                    for yr, yr_result in data.items():
                        # the last element of each entry is the precision at k
                        model_patk[yr] = yr_result[-1]

        model_pat10[model] = model_patk

print(model_pat10)


{'sdne': {'1977_2002': 0.30424528301886794, '1977_2003': 0.27310924369747897, '1977_2004': 0.2678275290215589, '1977_2005': 0.21872816212438853, '1977_2006': 0.17642907551164433, '1977_2007': 0.078125, '1977_2008': 0.18152661962796665, '1977_2009': 0.17901234567901234, '1977_2010': 0.1511864406779661, '1977_2011': 0.1423699914748508, '1977_2012': 0.14339826839826839, '1977_2013': 0.1793020457280385, '1977_2014': 0.1202361782071927, '1977_2015': 0.1915159076731129, '1977_2016': 0.10863509749303621, '1977_2017': 0.11301369863013698, '1977_2018': 0.10751932536893886, '1977_2019': 0.0, '1977_2020': 0.0710594315245478, '1977_2021': 0.0}}


In [15]:
## build a dataframe from the {model: {dataset: precision}} dictionary:
## one column per model, one row per dataset
## floats are displayed in scientific notation
pd.options.display.float_format = '{:.2e}'.format

prec_k = pd.DataFrame(model_pat10)
prec_k


Unnamed: 0,sdne
1977_2002,0.304
1977_2003,0.273
1977_2004,0.268
1977_2005,0.219
1977_2006,0.176
1977_2007,0.0781
1977_2008,0.182
1977_2009,0.179
1977_2010,0.151
1977_2011,0.142


In [16]:
# print the precision at k table as LaTeX source, keeping the dataset names (the index)
print(prec_k.to_latex(index=True))

\begin{tabular}{lr}
\toprule
{} &     sdne \\
\midrule
1977\_2002 & 3.04e-01 \\
1977\_2003 & 2.73e-01 \\
1977\_2004 & 2.68e-01 \\
1977\_2005 & 2.19e-01 \\
1977\_2006 & 1.76e-01 \\
1977\_2007 & 7.81e-02 \\
1977\_2008 & 1.82e-01 \\
1977\_2009 & 1.79e-01 \\
1977\_2010 & 1.51e-01 \\
1977\_2011 & 1.42e-01 \\
1977\_2012 & 1.43e-01 \\
1977\_2013 & 1.79e-01 \\
1977\_2014 & 1.20e-01 \\
1977\_2015 & 1.92e-01 \\
1977\_2016 & 1.09e-01 \\
1977\_2017 & 1.13e-01 \\
1977\_2018 & 1.08e-01 \\
1977\_2019 & 0.00e+00 \\
1977\_2020 & 7.11e-02 \\
1977\_2021 & 0.00e+00 \\
\bottomrule
\end{tabular}



In [18]:
## look at raw node pairs of precision at k outputs
## result: k_model_patk = {k: {model: {"potential_link": [...], "true_labels": [...]}}}
## a model is only kept when at least one of its predictions is a true link
precision_at_k_raw = ["prec_at_20_raw","prec_at_30_raw","prec_at_40_raw"]
models = ['node2vec', 'grarep', 'hope', 'gcn']
k_model_patk = dict()

for k in precision_at_k_raw:
    model_patk = dict()

    for model in models:
        # Reset everything per model: previously `save`, `raw_dict` and `idx_inv` leaked from
        # the previous model (or were undefined), so a model with missing files either raised a
        # NameError or was silently stored with another model's data.
        link_label = dict()
        idx_inv, raw_dict = None, None
        save = False

        for root, dirs, files in os.walk(prec_at_k_path+k+"/"+model):
            for file in files:
                if file.endswith('node_idx.json'):
                    with open(root+"/"+file, 'r') as f:
                        idx_dict=json.load(f)
                    # invert the mapping so that a node index gives back the node name
                    idx_inv = {idx: node for node, idx in idx_dict.items()}
                elif file.endswith("raw_dict.json"):
                    with open(root+"/"+file, 'r') as f:
                        raw_dict=json.load(f)

            if idx_inv is None or raw_dict is None:
                continue   # nothing (complete) loaded for this model yet

            true_labels = raw_dict.get("true_labels")

            if 1 in true_labels:
                potential_link = list()
                for node_pair in raw_dict.get("node_pairs"):
                    n1,n2 = idx_inv.get(node_pair[0]),idx_inv.get(node_pair[1])
                    potential_link.append(n1+" "+n2)

                link_label["potential_link"] = potential_link
                link_label["true_labels"] = true_labels
                save = True
            else:
                save = False

        if save:
            model_patk[model] = link_label

    k_model_patk[k] = model_patk

# print(k_model_patk)


In [21]:
node_type = "../../outputs/12_time_slicing/metadata/uid_type.json"

# load the uid -> node type mapping; uids are lower-cased so lookups are case-insensitive
with open(node_type) as type_file:
    uid_type = {uid.lower(): ntype for uid, ntype in json.load(type_file).items()}

In [22]:
## link nio uid to mention 

def merge(D1,D2):
    """Return a new dict holding the items of D1 and D2.

    When a key is present in both, the value from D2 wins. Neither input is modified.
    """
    combined = dict(D1)
    combined.update(D2)
    return combined

    
# NOTE(review): absolute, machine-specific path -- the notebook only runs on the author's machine
xml_path = "/Users/yidesdo21/Projects/inputs/dictionary/"
ptc_mention = "../../outputs/12_time_slicing/metadata/ptc_mention_uid.txt"

# read the NIO/ADO dictionary (xml) and map every token id to its canonical name
with open(xml_path+"nio_ado_case.xml") as f:
    nio_xml = f.read()

nio_parsed = xmltodict.parse(nio_xml)
nio_dict = nio_parsed["synonym"]["token"]
nio_uid_canonical = dict()

for nio in nio_dict:
    token_id, cano = nio["@id"], nio["@canonical"]
    nio_uid_canonical[token_id] = cano

## open the dictionary that links ptc uids and the ptc mentions
## the text file is parsed by plain string splitting (it looks like the printed form of a
## defaultdict of sets, see the special case below -- TODO confirm): "}, '" separates the
## entries and "': {'" separates a uid from its mentions
with open(ptc_mention) as f:
    lines = f.read().split("}, '")
    split = [l.split("': {'") for l in lines]
    
    ptc_uid_mention = dict()
    # special case: the first chunk starts with "defaultdict(<class 'set'>, {'MESH:D009369",
    # so it is skipped in the loop below and its entry is written by hand instead
    ptc_uid_mention["MESH:D009369"] = ["'cancer', 'spheroids', 'tumour', 'neoplasia', 'cancers', 'tumors', 'lesions', 'tumoral', 'neoplasms', 'neoplasm', 'keratocarcinomata', 'cancerous', 'carcinomas', 'Tumors', 'malignancies', 'tumor', 'carcinoma', 'malignancy', 'tumours', 'disease', 'Cancer'"]

    for s in split[1:]:
        # the value is a list of raw text pieces (s[1:]), not a list of individual mentions;
        # the mentions inside a piece are still separated by "', '"
        uid, mention = s[0], s[1:]
        ptc_uid_mention[uid] = mention
        
## add two dictionaries together (ptc entries override nio entries with the same uid)
## so a value is either a canonical string (nio) or a list of raw mention text (ptc)
uid_canonical =(merge(nio_uid_canonical,ptc_uid_mention))
# lower-case the uids so lookups are case-insensitive
uid_canonical = dict((k.lower(), v) for k, v in uid_canonical.items())

## calculate statistics for link types
## Step 01: add genotypes and phenotypes to the nodes
# every semantic group (node type) is assigned to either "Genotype" or "Phenotype"
geno_pheno = {
    "AD-DTM": "Genotype",
    "BrainArea": "Phenotype",
    "NeuroTest": "Phenotype",
    "CogFunc": "Phenotype",
    "ADPrep": "Phenotype",
    "Gene": "Genotype",
    "Variant": "Genotype",
    "Disease": "Phenotype",
    "Chemical": "Genotype",
    "Species": "Phenotype",
    "CellLine": "Genotype",
}



In [90]:
## the network after adding all links until 2021 data
network_df = pd.read_csv(network_path+"1977_2021/"+"edges.csv")

# represent every edge as a set of its two end points, so the direction of an edge
# does not matter when a predicted pair is compared against the network
network_df['link'] = (
    (network_df["Source"].astype('str') + "::" + network_df["Target"].astype('str'))
    .str.split('::')
    .apply(set)
)
all_links = network_df["link"].tolist()


In [27]:
def plot_correlation(fp_df):
    """Plot the classifier probabilities of the FPs, split by whether they are real FPs.

    Draws a jittered strip plot with one facet per value of ``true_fp`` and the predicted
    probability on the y axis, and displays it.

    Parameters
    ----------
    fp_df : pandas.DataFrame
        Needs the columns "potential_link", "link_type", "probabilities" and "true_fp".
        The frame is not modified: the plot works on its own copy. (Previously the
        "potential_link" column of the caller's frame was converted to strings in place.)

    Returns
    -------
    None
    """
    # altair needs plain values, so the links are turned into strings on a copy of the data
    plot_df = fp_df[["potential_link","link_type","probabilities","true_fp"]].assign(
        potential_link=lambda frame: frame["potential_link"].astype(str)
    )
    # start the y axis at the smallest probability, the values are all very close to 1
    min_prob = plot_df["probabilities"].min()

    stripplot =  alt.Chart(plot_df).mark_circle(size=12).encode(
        x=alt.X(
            'jitter:Q',
            title=None,
            axis=alt.Axis(values=[0], ticks=True, grid=False, labels=False),
            scale=alt.Scale(),
        ),
        y=alt.Y('probabilities:Q',
                scale=alt.Scale(domain=[min_prob,1])),    # min - 0.9995425871
        color=alt.Color('true_fp:N', legend=None),
        column=alt.Column(
            'true_fp:N',
            header=alt.Header(
                labelAngle=-90,
                titleOrient='top',
                labelOrient='bottom',
                labelAlign='right',
                labelPadding=3,
            ),
        ),
        tooltip=['true_fp', 'probabilities',]
    ).transform_calculate(
        # Generate Gaussian jitter with a Box-Muller transform
        jitter='sqrt(-2*log(random()))*cos(2*PI*random())'
    ).configure_facet(
        spacing=0
    ).configure_view(
        stroke=None
    ).properties(
        width=200, height=450,
    ).interactive()

    stripplot.display() 

In [28]:
def prec_k_type(tp_df,fp_df,remove_type=True):
    """Recalculate the precision at k separately for every genetic type.

    For each value of the "geno_type" column (e.g. genotype-genotype, genotype-phenotype,
    phenotype-phenotype) the precision is tp / (tp + fp), where k is the number of predictions
    of that genetic type. FPs that appear in later years (``true_fp == 0``) are counted as TPs.

    Parameters
    ----------
    tp_df : pandas.DataFrame
        True positives; needs the columns "geno_type", "type1" and "type2".
    fp_df : pandas.DataFrame
        False positives; needs the same columns plus "true_fp"
        (1 = really an FP, 0 = the link appears later).
    remove_type : bool, default True
        When True, pairs in which either node has the type 'ADPrep' are ignored.

    Returns
    -------
    dict
        {geno_type value: precision}. The keys are the "geno_type" values themselves; they are
        counted in plain Python so that they do not depend on how the installed pandas version
        reports group keys. NaN is returned for a type without any usable prediction.
    """
    def _without_adprep(frame):
        # drop the pairs that involve an ADPrep node
        return frame[(frame['type1'] != 'ADPrep') & (frame['type2'] != 'ADPrep')]

    if remove_type:
        tp_df = _without_adprep(tp_df)
        fp_df = _without_adprep(fp_df)

    # true positives per genetic type (missing types are ignored, keys in sorted order)
    tp_cnt = dict(sorted(Counter(tp_df["geno_type"].dropna()).items()))

    # split the FPs per genetic type: those appearing later count as TPs, the rest stay FPs
    appear_later = Counter(fp_df.loc[fp_df["true_fp"] == 0, "geno_type"].dropna())
    stay_fp = Counter(fp_df.loc[fp_df["true_fp"] == 1, "geno_type"].dropna())

    fp_cnt = dict()
    for key in sorted(set(fp_df["geno_type"].dropna())):
        tp_cnt[key] = tp_cnt.get(key, 0) + appear_later.get(key, 0)
        fp_cnt[key] = stay_fp.get(key, 0)

    recal_type = dict()
    for key, tp in tp_cnt.items():
        total = tp + fp_cnt.get(key, 0)
        recal_type[key] = tp/total if total else float("nan")

    return recal_type
    

In [29]:
def find_dup(k_df,remove_type=True):
    """Find entity resolution problems: names that are shared by more than one uid.

    Parameters
    ----------
    k_df : pandas.DataFrame
        Needs the columns "uid1", "uid2", "cano1", "cano2", "type1" and "type2". A canonical
        name is either a string or a list whose first element is raw text with several
        mentions separated by "', '". Rows without a canonical name (e.g. None) are skipped
        instead of raising an error.
    remove_type : bool, default True
        When True, pairs in which either node has the type 'ADPrep' are ignored.

    Returns
    -------
    dict
        {lower-cased name: set of lower-cased uids}, only for names with more than one uid.
    """
    if remove_type:
        k_df = k_df[(k_df['type1'] != 'ADPrep') & (k_df['type2'] != 'ADPrep')]

    entities = defaultdict(set)

    # the same bookkeeping is needed for both ends of the pairs
    for cano_col, uid_col in (("cano1", "uid1"), ("cano2", "uid2")):
        for cano, uid in zip(k_df[cano_col], k_df[uid_col]):
            uid = uid.lower()

            if isinstance(cano, str):
                names = [cano.lower()]
            elif isinstance(cano, (list, tuple)) and len(cano) > 0:
                # only the first element is used; it is split into the individual mentions.
                # NOTE(review): the first and last mention keep a stray quote character after
                # the split, so they may not match the same name elsewhere -- confirm intended
                names = cano[0].lower().split("', '")
            else:
                continue   # uid without a canonical name

            for name in names:
                entities[name].add(uid)

    # keep only the names that point to more than one uid
    return {name: uids for name, uids in entities.items() if len(uids) > 1}


In [31]:
## create a map that maps other uids to ado uids so that the entity resolution effect will be erased 
## NOTE(review): `entity_resolution` is not defined in any visible cell; presumably it is the
## {name: set of uids} result of find_dup -- confirm before running on a fresh kernel
map_to_ado = dict()

for cano,uids in entity_resolution.items():
    # only the first two uids (in alphabetical order) are inspected by the checks below
    sorted_uids = sorted(uids)
    uid1,uid2 = sorted_uids[0],sorted_uids[1]

    # an "alzheimerontology" uid is preferred as the target of the mapping
    if uid1.startswith("alzheimerontology"):
        map_to_ado[uid2] = uid1
    
    elif uid2.startswith("alzheimerontology"):
        map_to_ado[uid1] = uid2 

    # three uids and none of the first two is an ado uid: map both to the last one
    elif len(sorted_uids) == 3:
        uid3 = sorted_uids[-1]
        map_to_ado[uid1] = uid3
        map_to_ado[uid2] = uid3

    elif uid2.startswith("ndduo") or uid2.startswith("obo"):
        map_to_ado[uid1] = uid2

    else:  # any other case: the two manually chosen mappings below are (re)written
        map_to_ado['nddo:nddo_10000065'] = 'nddo:nddo_20000369'
        map_to_ado["mesh:d009447"] = "cvcl_0531;ncbitaxid:9606"

print(map_to_ado)

{'mesh:d002118': 'alzheimerontology:calcium', 'obo:fma_61826': 'alzheimerontology:pareital_lobe', 'mesh:d010100': 'alzheimerontology:oxygen', 'obo:fma_61824': 'alzheimerontology:frontal_lobe', 'mesh:d058225': 'obo:fma_61992', 'obo:fma_67944': 'alzheimerontology:cerebellum', 'mesh:d018698': 'alzheimerontology:glutamate', 'mesh:d007511': 'alzheimerontology:necrosis', 'mesh:d007501': 'alzheimerontology:iron', 'ndduo:transcription_factors': 'alzheimerontology:transcription_factors', 'obo:fma_62429': 'alzheimerontology:neocortex', 'mesh:d002784': 'alzheimerontology:cholesterol', 'mesh:d005978': 'alzheimerontology:glutathione', 'obo:nd_0000188': 'alzheimerontology:cerebral_amyloid_angiopathy', 'nddo:nddo_00000106': 'ndduo:person', 'obo:fma_62007': 'alzheimerontology:thalamus', '7227': 'ndduo:drosophila', 'mesh:d009422': 'ndduo:neurodegeneration', 'mesh:d020271': 'ndduo:neurodegeneration', 'mesh:d003224': 'alzheimerontology:red', 'mesh:d000109': 'alzheimerontology:acetylcholine', 'obo:fma_618

In [33]:
def rm_res(k_ent_df, dup_map=map_to_ado, remove_type=True):
    """Erase the entity resolution effect by mapping duplicated uids to one unique uid.

    Both uid columns are lower-cased and every uid that is a key of ``dup_map`` is replaced
    by the uid it maps to. The frame is changed in place and also returned.
    ``remove_type`` is accepted but not used.
    """
    for uid_col in ("uid1", "uid2"):
        lowered = k_ent_df[uid_col].apply(lambda uid: uid.lower())
        k_ent_df[uid_col] = lowered.replace(dup_map)

    return k_ent_df
    

In [34]:
def get_patk_model(t,models,precision_at_k_raw,prec_at_k_path):
    """Turn the raw precision at k output of each model into the format used downstream.

    For every k folder and model the files ending with "<t>_node_idx.json" (node name -> node
    index) and "<t>_raw_dict.json" (keys "node_pairs", "true_labels", "predicted_labels") are
    read from anywhere below ``prec_at_k_path + k + "/" + model``.

    Parameters
    ----------
    t : int or str
        Year identifying the dataset; only files whose name ends with this year are used.
    models : list of str
        Model folder names.
    precision_at_k_raw : list of str
        Names of the raw precision at k folders, e.g. "prec_at_10_raw".
    prec_at_k_path : str
        Folder that contains the k folders; it is used as a plain prefix, so it has to end
        with a "/".

    Returns
    -------
    dict
        {k: {model: {"potential_link": ["n1::n2", ...], "true_labels": [...],
        "probabilities": [...]}}}. A model for which one of the two files is missing gets an
        empty dict. (Previously the data of the previous model was silently reused, or an
        UnboundLocalError was raised for the first model.)
    """
    idx_suffix = str(t)+'_node_idx.json'
    raw_suffix = str(t)+"_raw_dict.json"

    k_model_patk = dict()

    for k in precision_at_k_raw:
        model_patk = dict()

        for model in models:
            # start from scratch for every model so that nothing leaks between models
            idx_inv, raw_dict = None, None

            for root, dirs, files in os.walk(prec_at_k_path+k+"/"+model):
                for file in files:
                    if file.endswith(idx_suffix):
                        with open(root+"/"+file, 'r') as f:
                            idx_dict=json.load(f)
                        # invert the mapping so that a node index gives back the node name
                        idx_inv = {idx: node for node, idx in idx_dict.items()}
                    elif file.endswith(raw_suffix):
                        with open(root+"/"+file, 'r') as f:
                            raw_dict=json.load(f)

            link_label = dict()

            if idx_inv is not None and raw_dict is not None:
                # "::" is the separator because node names may contain spaces
                link_label["potential_link"] = [
                    idx_inv.get(node_pair[0])+"::"+idx_inv.get(node_pair[1])
                    for node_pair in raw_dict.get("node_pairs")
                ]
                link_label["true_labels"] = raw_dict.get("true_labels")
                # probabilities of predicting the node pair to be a True link
                link_label["probabilities"] = raw_dict.get("predicted_labels")

            model_patk[model] = link_label

        k_model_patk[k] = model_patk

    return k_model_patk

In [35]:
def link_uids(k_df):
    """Annotate both uids of every pair with node type, canonical name and genetic type.

    Adds, in this order, the columns type1, type2, cano1, cano2, gen1, gen2, geno_type
    ([gen1, gen2]) and link_type ([type1, type2]). Values are looked up in the notebook-level
    dictionaries uid_type, uid_canonical and geno_pheno; an unknown key gives None.
    The frame is changed in place and also returned.
    """
    sides = ("1", "2")

    ## node type and canonical name of each uid
    for prefix, lookup in (("type", uid_type), ("cano", uid_canonical)):
        for side in sides:
            k_df[prefix+side] = k_df["uid"+side].apply(lambda uid, table=lookup: table.get(uid))

    ## genetic type (genotype / phenotype) of each node type
    for side in sides:
        k_df["gen"+side] = k_df["type"+side].apply(lambda ntype: geno_pheno.get(ntype))

    ## pair-level annotations, one two-element list per row
    k_df["geno_type"] = k_df[["gen1","gen2"]].values.tolist()
    k_df["link_type"] = k_df[["type1","type2"]].values.tolist()

    return k_df    

In [36]:
def rm_false_fps(k_df,original_fps,fp_check,recal_patk,rm_type=False):
    """Check which FPs are not really FPs and recalculate the precision at k.

    An FP whose node pair is a link of the full network (the notebook-level ``all_links``)
    appears in a later year, so it is counted as a TP when the precision is recalculated.

    Parameters
    ----------
    k_df : pandas.DataFrame
        Predictions annotated by link_uids; needs the columns "potential_link" (a set of two
        node names -- assumed to match the sets stored in ``all_links``), "true_labels",
        "link_type", "geno_type", "type1" and "type2". Rows with missing values (uids without
        a type) are dropped. The caller's frame is not modified.
    original_fps : list
        Receives the number of FPs of this dataset.
    fp_check : list
        Receives (number of FPs appearing later, proportion of FPs appearing later).
    recal_patk : list
        Receives the recalculated precision at k.
    rm_type : bool, default False
        When True, pairs in which either node has the type 'ADPrep' are removed first.

    Returns
    -------
    tuple
        (tp_df, fp_df, original_fps, fp_check, recal_patk). fp_df has the extra column
        "true_fp": 0 if the FP appears in the full network, otherwise 1. The three lists are
        the objects passed in, each extended by one entry. The proportion and the recalculated
        precision are NaN when there is nothing to divide by (previously a ZeroDivisionError).
    """
    # work on a copy: rows without a type are dropped and two columns are rewritten
    k_df = k_df.dropna().copy()

    # sorted tuples make the pair types hashable and independent of the node order
    k_df["link_type"] = k_df["link_type"].apply(lambda x:tuple(sorted(x)))
    k_df["geno_type"] = k_df["geno_type"].apply(lambda x:tuple(sorted(x)))

    tp_df = k_df[k_df["true_labels"] == 1]
    fp_df = k_df[k_df["true_labels"] == 0]

    if rm_type:
        tp_df = tp_df[(tp_df['type1'] != 'ADPrep') & (tp_df['type2'] != 'ADPrep')]
        fp_df = fp_df[(fp_df['type1'] != 'ADPrep') & (fp_df['type2'] != 'ADPrep')]

    fp_df = fp_df.copy()   # a column is added below, so do not write into a slice of k_df

    fps = list(fp_df["potential_link"])
    original_fps.append(len(fps))

    ## check how many fps appear in later years, which means they are not actually fps
    ## the full network is used for the check, so it is not known when the link appears
    # hashing the links once replaces a scan of the whole link list for every single fp
    known_links = {frozenset(link) for link in all_links}
    # if fps appear in later years, label as 0, otherwise 1
    true_fps = [0 if frozenset(fp) in known_links else 1 for fp in fps]
    fp_df["true_fp"] = true_fps

    fp_cnt = true_fps.count(0)
    fp_prop = fp_cnt/len(fps) if fps else float("nan")
    fp_check.append((fp_cnt,fp_prop))

    ## recalculate precision at k: the FPs appearing in the future are viewed as TPs
    tp_num,fp_num = tp_df.shape[0],fp_df.shape[0]
    total = tp_num+fp_num
    recal = (fp_cnt+tp_num)/total if total else float("nan")
    recal_patk.append(recal)

    return tp_df,fp_df,original_fps,fp_check,recal_patk

In [37]:
# display every row of a dataframe instead of a truncated view
pd.set_option('display.max_rows', None)

In [38]:
def examine_types(t,fp_ent_df):
    """Count the FPs and the FPs in which at least one node has the type 'ADPrep'.

    ``t`` (the year) is accepted but not used.

    Returns a tuple (number of rows of fp_ent_df, number of rows with an ADPrep node).
    """
    has_adprep = fp_ent_df['type1'].eq('ADPrep') | fp_ent_df['type2'].eq('ADPrep')

    return len(fp_ent_df), int(has_adprep.sum())

In [39]:
## process the precision at k results
## precision at 10 for Year 2002-2021, for model sdne
## look at raw node pairs of precision at k outupts 
## Every list below gets one entry per dataset year, every dict one entry per "1977-<year>".
precision_at_k_raw = ["prec_at_10_raw"]
# precision_at_k = ["prec_at_10","prec_at_20","prec_at_30","prec_at_40"]
models = ["sdne"]
# models = ['graphsage', 'sdne', 'deepwalk', 'node2vec', 'line', 'struc2vec', 'grarep', 'hope', 'gcn']
# k_model_patk = dict()  
# naming: no suffix = raw predictions, "_ent" = entity resolution effect removed,
#         "_ent_ty" = entity resolution effect removed and ADPrep pairs removed
# (the "_ent_only" / "_ad_only" containers are only used by the commented-out variants below)
fp_check,fp_ent_check,fp_ent_ty_check = list(),list(),list()
recal_patk,recal_ent_patk,recal_ent_only_patk,recal_ent_ty_patk,recal_ad_only_patk = list(),list(),list(),list(),list()
recal_types,recal_types_fil,recal_ent_types,recal_ent_only_types,recal_ent_types_fil = dict(),dict(),dict(),dict(),dict()
original_fps,original_ent_fps,original_ent_only_fps,original_ent_ty_fps = list(),list(),list(),list()
# entities_res = list()
fps_bf_adprep,fps_aft_adprep = list(),list()

for t in range(2002,2022):   # t == 2021 is the last year, there's no more FPs actually not FPs error. But it is better to keep the data and show FPs turning to be TPs as 0.

    ## get the precision at k results from the raw data
    k_model_patk = get_patk_model(t=t,models=models,precision_at_k_raw=precision_at_k_raw,prec_at_k_path=prec_at_k_path)
    # print(k_model_patk)
    ## examine tps and fps for sdne for the dataset of year t
    ## check if FPs of year t appear in later years
    # print(k_model_patk)
    links = k_model_patk.get("prec_at_10_raw").get("sdne")
    # print(links)
    k_df = pd.DataFrame.from_dict(links)

    # display(k_df)
    # k_df["potential_link"] = k_df["potential_link"].apply(lambda x:tuple(sorted(x)))
    k_df[['uid1','uid2']] = k_df.potential_link.str.split("::",expand=True)   # using space to split the links has issues, nodes like "T cell" will be separated 
    k_df["uid1"] = k_df["uid1"].apply(lambda x: x.lower())
    k_df["uid2"] = k_df["uid2"].apply(lambda x: x.lower())
    k_df = k_df[["uid1","uid2","potential_link","true_labels","probabilities"]]
    # the link itself becomes an unordered set of the two original (not lower-cased) node names
    k_df['potential_link'] = k_df['potential_link'].str.split('::').apply(set)

    ## remove the effect of entity resolution
    ## replace the entity resolution uids to the ado uids
    ## (a deep copy is passed because rm_res changes the frame it receives)
    ## NOTE(review): only uid1/uid2 are remapped, "potential_link" keeps the original node names
    k_ent_df = rm_res(copy.deepcopy(k_df),dup_map=map_to_ado,remove_type=True)
    # pairs that became identical after the remapping are kept once (the last occurrence)
    k_ent_df = k_ent_df.drop_duplicates(subset=["uid1","uid2"], keep='last')    

    ## link uids to node types, genetic types, link types
    k_df = link_uids(k_df)
    k_ent_df = link_uids(k_ent_df)
    
    ## calculate precision at k before and after removing the false FPs -- the FPs turning out to be TPs in the future
    ## 'tp_ent_df': removing the entity resolution effects
    tp_df,fp_df,original_fps,fp_check,recal_patk = rm_false_fps(k_df,original_fps,fp_check,recal_patk)
    tp_ent_df,fp_ent_df,original_ent_fps,fp_ent_check,recal_ent_patk = rm_false_fps(k_ent_df,original_ent_fps,fp_ent_check,recal_ent_patk)
    
    # only removing the entity resolution effect:
    # tp_ent_only_df,fp_ent_only_df,original_ent_only_fps,fp_ent_only_check,recal_ent_only_patk = rm_false_fps(k_ent_df,original_ent_only_fps,fp_ent_check,recal_ent_only_patk,ent_only=True)

    ## examine semantic types for the FPs after removing the false FPs and erasing the entity resolution effects
    ## examine_types returns (number of FPs, number of FPs that involve an ADPrep node)
    fp_bf_adprep,fp_aft_adprep = examine_types(t,fp_ent_df)
    fps_bf_adprep.append(fp_bf_adprep)
    fps_aft_adprep.append(fp_aft_adprep)

    ## the FPs not only excluding those becoming TPs in the future, but also removing the ADPrep node type
    tp_ent_ty_df,fp_ent_ty_df,original_ent_ty_fps,fp_ent_ty_check,recal_ent_ty_patk = rm_false_fps(k_ent_df,original_ent_ty_fps,fp_ent_ty_check,recal_ent_ty_patk,rm_type=True)

    ## only removing the ADPrep node type
    # tp_ad_only_df,fp_ad_only_df,original_ad_only_fps,fp_ad_only_check,recal_ad_only_patk = rm_false_fps(k_ent_df,original_ent_ty_fps,fp_ent_ty_check,recal_ent_ty_patk,rm_type=True,ent_only=True)
    

    ## calculate precision at k for each single genetic type, removing the link type ADPrep
    ## note: these keys use "1977-<year>" (hyphen) while the result files use "1977_<year>"
    recal_type_fil = prec_k_type(tp_df,fp_df,remove_type=True)
    recal_types_fil["1977-"+str(t)] = recal_type_fil
    recal_ent_type_fil = prec_k_type(tp_ent_df,fp_ent_df,remove_type=True)
    recal_ent_types_fil["1977-"+str(t)] = recal_ent_type_fil

    ## keep the link type ADPrep
    recal_type = prec_k_type(tp_df,fp_df,remove_type=False)
    recal_types["1977-"+str(t)] = recal_type    
    recal_ent_type = prec_k_type(tp_ent_df,fp_ent_df,remove_type=False)
    recal_ent_types["1977-"+str(t)] = recal_ent_type
    # recal_ent_only_types = prec_k_type(tp_ent_only_df,fp_ent_only_df,remove_type=False)
    # recal_ent_only_types["1977-"+str(t)] = recal_ent_only_types


    ## find the entity resolutions
    # entity_res = find_dup(k_df,remove_type=True)
    # entities_res.append(entity_res)


    # break


In [43]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that also accepts NumPy scalars, NumPy arrays and sets.

    Pass it as ``cls=NpEncoder`` to ``json.dump`` / ``json.dumps`` when the
    object being saved contains values the standard encoder rejects.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        Conversions performed:
        - ``np.bool_``           -> ``bool``
        - ``np.integer``         -> ``int``
        - ``np.floating``        -> ``float``
        - ``np.ndarray``         -> nested ``list`` (via ``tolist()``)
        - ``set`` / ``frozenset`` -> ``list`` (element order is the set's
          iteration order, so it is not guaranteed to be stable)

        Any other type is delegated to ``json.JSONEncoder.default``, which
        raises ``TypeError``.
        """
        # np.bool_ is not a subclass of np.integer, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, (set, frozenset)):
            return list(obj)
        return super().default(obj)

In [45]:
## how many fps appear in the links in the later years, 
## quantify the timestamp 
## k=10, model=sdne
# fp_later_cnt is the number of fps that appear later
pd.set_option('display.float_format', lambda x: '%.4f' % x)

def create_recal_patk(model_pat10,fp_check,original_fps,recal_patk,years=None,model="sdne"):
    """Build a per-dataset table comparing original and recalculated precision at k.

    Parameters
    ----------
    model_pat10 : dict
        Maps a model name to a dict whose values are the original
        precision-at-k scores, one per dataset, in dataset order.
    fp_check : sequence of 2-item sequences
        One ``(fp_later_cnt, fp_later_prop)`` pair per dataset.
    original_fps : sequence
        Stored in the ``original_fps`` column, one entry per dataset.
    recal_patk : sequence
        Stored in the ``recal_patk`` column, one entry per dataset.
    years : iterable of int, optional
        End years used to label the datasets as ``"1977-<year>"``.
        Defaults to 2002..2021 inclusive (the previously hard-coded range).
    model : str, optional
        Key of ``model_pat10`` to read the original scores from
        (default ``"sdne"``).

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``dataset``, ``original_fps``, ``fp_later_cnt``,
        ``fp_later_prop``, ``original_patk``, ``recal_patk``.

    Raises
    ------
    ValueError
        Raised by pandas when the inputs do not all have one entry per year.
    """
    if years is None:
        years = range(2002, 2022)
    original_patk = list(model_pat10.get(model).values())

    fp_check_df = pd.DataFrame(fp_check, columns=['fp_later_cnt', 'fp_later_prop'])
    fp_check_df["dataset"] = ["1977-"+str(year) for year in years]
    fp_check_df["original_fps"] = original_fps
    fp_check_df["original_patk"] = original_patk
    fp_check_df["recal_patk"] = recal_patk
    # Values are kept unrounded; the displayed precision comes from the
    # 'display.float_format' option set above.
    return fp_check_df[["dataset","original_fps","fp_later_cnt","fp_later_prop","original_patk","recal_patk"]]


In [47]:
## baseline table: precision at k before and after counting FPs that appear in later years as true
## (the ent / ent_ty variants are built in the following cells from their own inputs)
fp_check_df = create_recal_patk(model_pat10,fp_check,original_fps,recal_patk)
fp_check_df

Unnamed: 0,dataset,original_fps,fp_later_cnt,fp_later_prop,original_patk,recal_patk
0,1977-2002,276,258,0.9348,0.3042,0.9554
1,1977-2003,679,626,0.9219,0.2731,0.9422
2,1977-2004,865,776,0.8971,0.2678,0.9241
3,1977-2005,1025,870,0.8488,0.2187,0.8895
4,1977-2006,1281,552,0.4309,0.1764,0.4855
5,1977-2007,480,417,0.8688,0.0781,0.877
6,1977-2008,1204,920,0.7641,0.1815,0.8095
7,1977-2009,1329,916,0.6892,0.179,0.7447
8,1977-2010,1289,796,0.6175,0.1512,0.6655
9,1977-2011,981,627,0.6391,0.1424,0.694


In [48]:
## precision at k results after removing the entity resolution effect.
## recal_patk is the precision at k after counting the FPs that are actually TPs as true.
fp_check_ent_df = create_recal_patk(model_pat10,fp_ent_check,original_ent_fps,recal_ent_patk)
fp_check_ent_df

Unnamed: 0,dataset,original_fps,fp_later_cnt,fp_later_prop,original_patk,recal_patk
0,1977-2002,251,235,0.9363,0.3042,0.9566
1,1977-2003,621,572,0.9211,0.2731,0.9424
2,1977-2004,753,678,0.9004,0.2678,0.928
3,1977-2005,892,760,0.852,0.2187,0.8935
4,1977-2006,1223,514,0.4203,0.1764,0.4768
5,1977-2007,445,385,0.8652,0.0781,0.8739
6,1977-2008,1092,832,0.7619,0.1815,0.8088
7,1977-2009,1204,840,0.6977,0.179,0.7534
8,1977-2010,1208,749,0.62,0.1512,0.6691
9,1977-2011,906,584,0.6446,0.1424,0.701


In [49]:
## precision at k results after removing the entity resolution effect and the ADPrep node type.
## recal_patk is the precision at k after counting the FPs that are actually TPs as true.
fp_check_ent_ad_df = create_recal_patk(model_pat10,fp_ent_ty_check,original_ent_ty_fps,recal_ent_ty_patk)
fp_check_ent_ad_df


Unnamed: 0,dataset,original_fps,fp_later_cnt,fp_later_prop,original_patk,recal_patk
0,1977-2002,183,167,0.9126,0.3042,0.937
1,1977-2003,459,415,0.9041,0.2731,0.9294
2,1977-2004,616,548,0.8896,0.2678,0.9181
3,1977-2005,732,614,0.8388,0.2187,0.884
4,1977-2006,1121,447,0.3988,0.1764,0.4534
5,1977-2007,362,312,0.8619,0.0781,0.8708
6,1977-2008,908,676,0.7445,0.1815,0.7947
7,1977-2009,1016,698,0.687,0.179,0.7429
8,1977-2010,1020,614,0.602,0.1512,0.6539
9,1977-2011,793,504,0.6356,0.1424,0.6922


In [None]:
## attach the recalculated precision at k of the two other variants to the baseline table:
##   recal_patk_res    <- recal_patk of fp_check_ent_df
##   recal_patk_res_ad <- recal_patk of fp_check_ent_ad_df
## NOTE: this adds columns to fp_check_df in place; the assignment aligns rows on the index.
rm_res = fp_check_ent_df["recal_patk"]
rm_ad = fp_check_ent_ad_df["recal_patk"]
fp_check_df["recal_patk_res"] = rm_res
fp_check_df["recal_patk_res_ad"] = rm_ad
# fp_check_df

Unnamed: 0,dataset,original_fps,fp_later_cnt,fp_later_prop,original_patk,recal_patk,recal_patk_res,recal_patk_res_ad
0,1977-2002,276,258,0.9348,0.3042,0.9554,0.9566,0.937
1,1977-2003,679,626,0.9219,0.2731,0.9422,0.9424,0.9294
2,1977-2004,865,776,0.8971,0.2678,0.9241,0.928,0.9181
3,1977-2005,1025,870,0.8488,0.2187,0.8895,0.8935,0.884
4,1977-2006,1281,552,0.4309,0.1764,0.4855,0.4768,0.4534
5,1977-2007,480,417,0.8688,0.0781,0.877,0.8739,0.8708
6,1977-2008,1204,920,0.7641,0.1815,0.8095,0.8088,0.7947
7,1977-2009,1329,916,0.6892,0.179,0.7447,0.7534,0.7429
8,1977-2010,1289,796,0.6175,0.1512,0.6655,0.6691,0.6539
9,1977-2011,981,627,0.6391,0.1424,0.694,0.701,0.6922


In [51]:
## presentation table: FP counts per dataset with the proportion rendered as a percentage string
fp_later_df = (
    fp_check_df
    .loc[:, ["dataset", "original_fps", "fp_later_cnt", "fp_later_prop"]]
    .rename(columns={
        "dataset": "Dataset",
        "original_fps": "#FP one year",
        "fp_later_cnt": "#FP later years",
        "fp_later_prop": "Proportion",
    })
    # e.g. 0.9348 -> "93.48%"
    .assign(Proportion=lambda frame: frame["Proportion"].astype(float).map("{:.2%}".format))
)
fp_later_df

Unnamed: 0,Dataset,#FP one year,#FP later years,Proportion
0,1977-2002,276,258,93.48%
1,1977-2003,679,626,92.19%
2,1977-2004,865,776,89.71%
3,1977-2005,1025,870,84.88%
4,1977-2006,1281,552,43.09%
5,1977-2007,480,417,86.88%
6,1977-2008,1204,920,76.41%
7,1977-2009,1329,916,68.92%
8,1977-2010,1289,796,61.75%
9,1977-2011,981,627,63.91%


In [52]:
## print the table as LaTeX source, without the row index
print(fp_later_df.to_latex(index=False))

\begin{tabular}{lrrl}
\toprule
  Dataset &  \#FP one year &  \#FP later years & Proportion \\
\midrule
1977-2002 &           276 &              258 &     93.48\% \\
1977-2003 &           679 &              626 &     92.19\% \\
1977-2004 &           865 &              776 &     89.71\% \\
1977-2005 &          1025 &              870 &     84.88\% \\
1977-2006 &          1281 &              552 &     43.09\% \\
1977-2007 &           480 &              417 &     86.88\% \\
1977-2008 &          1204 &              920 &     76.41\% \\
1977-2009 &          1329 &              916 &     68.92\% \\
1977-2010 &          1289 &              796 &     61.75\% \\
1977-2011 &           981 &              627 &     63.91\% \\
1977-2012 &          1515 &              854 &     56.37\% \\
1977-2013 &          1291 &              750 &     58.09\% \\
1977-2014 &          1594 &              588 &     36.89\% \\
1977-2015 &          1345 &              512 &     38.07\% \\
1977-2016 &          1275 &  

In [53]:
## read true fps 
## group true fps with link type
def grouped_true_fps(t):
    """Return the share of stored FP rows per link type for one dataset.

    Reads the pickled DataFrame at ``error_path + str(t) + "_sdne"``
    (``error_path`` is a module-level variable) and groups its rows by the
    ``link_type`` column.

    Parameters
    ----------
    t : int or str
        Identifier used to build the file name; presumably the dataset's
        end year -- confirm against the caller.

    Returns
    -------
    dict
        Maps each ``link_type`` value to the percentage of rows that have
        it, rounded to 3 decimals. Empty when the frame has no rows.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that were produced by this project.
    df = pd.read_pickle(error_path+str(t)+"_sdne")
    num_fps = df.shape[0]

    # Group on the scalar column name rather than a one-element list:
    # with a list, pandas >= 2.0 yields 1-tuple keys such as ("x",) instead
    # of the plain link-type value.
    return {
        link_type: round(100*(rows.shape[0]/num_fps),3)
        for link_type, rows in df.groupby(by="link_type")
    }


In [55]:
## for the sdne model
## the right-hand block of columns holds the precision at k results after removing the ADPrep node type
def rm_type(recal_types,recal_types_fil):
    """Place the unfiltered and filtered per-type results side by side.

    Both arguments are dicts keyed by dataset label whose values are dicts
    of per-type scores. Each becomes a DataFrame with one row per dataset;
    the two frames are concatenated column-wise (unfiltered first, so
    column names may repeat), missing values are replaced with 0, and the
    dataset labels are moved from the index into a ``dataset`` column.
    """
    unfiltered, filtered = (
        pd.DataFrame.from_dict(per_dataset, orient="index")
        for per_dataset in (recal_types, recal_types_fil)
    )
    side_by_side = pd.concat([unfiltered, filtered], axis=1)
    side_by_side = side_by_side.fillna(0)
    return side_by_side.reset_index().rename(columns={"index": "dataset"})

In [56]:
# Entity resolution effect NOT removed.
#   left block of columns:  ADPrep node type kept
#   right block of columns: ADPrep node type removed
genetic_result = rm_type(recal_types,recal_types_fil)

# Entity resolution effect removed.
#   left block of columns:  ADPrep node type kept
#   right block of columns: ADPrep node type removed
genetic_ent_result = rm_type(recal_ent_types,recal_ent_types_fil)

In [62]:
## for altair visualization
## four columns are used: original_patk, recal_patk, recal_patk_res, recal_patk_res_ad
##   (the original precision at k, and the precision at k after removing each of the three errors)
## each long-format frame pairs the original with one recalculated column
def _melt_patk_pair(patk_df, recal_col, label_map, error_name):
    """Return (wide, long) frames comparing ``original_patk`` with ``recal_col``.

    ``wide`` keeps the Dataset column plus the two score columns. ``long``
    is its melted form with columns Dataset / patk_label / patk, where the
    values are relabelled through ``label_map`` and an ``Error`` column is
    set to ``error_name``.
    """
    wide = patk_df[["Dataset", "original_patk", recal_col]]
    long = wide.melt(id_vars=["Dataset"], var_name="patk_label", value_name="patk")
    long = long.replace(label_map)
    long["Error"] = error_name
    return wide, long


fp_check_df = fp_check_df.rename(columns={"dataset": "Dataset"})

first_error, first_melt = _melt_patk_pair(
    fp_check_df,
    "recal_patk",
    {"original_patk": "short-term context", "recal_patk": "longer-term context"},
    "Prediction space",
)

second_error, second_melt = _melt_patk_pair(
    fp_check_df,
    "recal_patk_res",
    {"original_patk": "With limitation", "recal_patk_res": "Remove limitation"},
    "Multiple annotators",
)

third_error, third_melt = _melt_patk_pair(
    fp_check_df,
    "recal_patk_res_ad",
    {"original_patk": "With limitation", "recal_patk_res_ad": "Remove limitation"},
    "Semantic entity types",
)

melt_df = pd.concat([first_melt, second_melt, third_melt], axis=0)

In [79]:
## line chart of the first comparison only (first_melt): one line per patk_label across datasets
## the legend is placed at fixed pixel coordinates inside the plotting area
chart = (
    alt.Chart(first_melt)
    .mark_line()
    .encode(
        x=alt.X("Dataset"),
        y=alt.Y("patk:Q", title="Precision @ 10%"),
        color=alt.Color(
            "patk_label",
            title="",
            legend=alt.Legend(
                orient="none",
                legendX=245,
                legendY=2,
                direction="vertical",
                titleAnchor="middle",
            ),
        ),
    )
    .properties(width=400, height=350)
    .configure_axis(titleFontSize=16)
    .configure_legend(titleFontSize=18, labelFontSize=15)
)

chart