In [None]:
"""
This notebook aims to create the train/test pairs for the experiment.
==============================================
- Each train/test pair will have a feature network and a label network 
for the training data, and a feature network and a label network for the testing data.
- Nodes will be the nodes in the feature network in the training data.
- Negative links in the label network of the training data will be undersampled.
- Pre-processing: Nodes with degree more than 2000 are filtered. 

"""


In [1]:
from create_edge import create_yr_link
import json
from collections import defaultdict,Counter
import numpy as np
from imblearn.under_sampling import RandomUnderSampler 

In [2]:
# load the metadata from the json
meta_path = "/Users/yidesdo21/Projects/outputs/12_time_slicing/metadata/"

with open(meta_path+"articles_with_anno.json") as f:
    metadata = json.load(f)


In [3]:
year_edge,year_ln = create_yr_link(metadata)

In [4]:
## prepare the raw corpus
# 131 nodes have degrees more than 2000, these nodes are filtered in the graph
hubs_2000 = {'AlzheimerOntology:cerebrospinal_fluid', 'NDDUO:Enzymes', '10090', '54226', 'NDDUO:Cluster', 'obo:BFO_0000066', 'AlzheimerOntology:presenilin_1', 'MESH:D019636', 'NDDUO:Mouse_', 'AlzheimerOntology:Subtypes', 'AlzheimerOntology:protein', 'AlzheimerOntology:t_tau', 'NDDUO:Risk_factor', 'obo:RO_0000079', 'AlzheimerOntology:t_tau_Amyloid_beta_42', 'obo:BFO_0000015', 'NDDUO:Exposure', 'obo:ND_0003000', 'snap:Function', 'AlzheimerOntology:Brain_Imaging', 'NDDUO:male', 'obo:FMA_62493', 'AlzheimerOntology:vascular_dementia', 'MESH:D003072', 'NDDUO:Intervention', 'AlzheimerOntology:Memory_orientation_screening_test', 'obo:OGMS_0000023', 'AlzheimerOntology:Oxidative_stress', 'snap:Role', 'AlzheimerOntology:Microglia', 'MESH:D064420', 'AlzheimerOntology:Membrane', 'NDDUO:disorder', 'obo:UO_0000033', 'NDDUO:biomarker', 'AlzheimerOntology:brain', 'AlzheimerOntology:control_group_study_arm', 'NDDUO:Distribution', 'NDDUO:Clinical', 'AlzheimerOntology:presenilin', 'NDDUO:In_vivo_models', 'AlzheimerOntology:secretase', 'MESH:D000544', 'AlzheimerOntology:APP', 'MESH:D014867', 'AlzheimerOntology:cortex', 'AlzheimerOntology:disease_onset', 'obo:UO_0000036', 'obo:UO_0000035', 'AlzheimerOntology:Mild_cognitive_Impairment', 'obo:IAO_0000651', 'span:Process', 'obo:BFO_0000023', 'NDDUO:Cell_death', 'obo:ND_0000113', 'obo:OGMS_0000031', 'NDDUO:Age', 'obo:BFO_0000034', 'AlzheimerOntology:antibody', 'obo:IAO_0000230', '11820', 'NDDUO:disease', 'AlzheimerOntology:senile_dementia', 'NDDUO:gene', 'NDDUO:drug', 'AlzheimerOntology:Moderate_cognitive_Decline', 'AlzheimerOntology:ApoE_Protein', 'AlzheimerOntology:lipid', 'NDDUO:Rat', 'AlzheimerOntology:presence_of_amyloid_plaque', 'AlzheimerOntology:age_risk_factor', 'NDDUO:model', 'AlzheimerOntology:amyloid_precursor_protein', 'AlzheimerOntology:Phosphorylation', 'snap:Site', 'AlzheimerOntology:Cognitive_Tests', 'AlzheimerOntology:ratio', 'obo:FMA_50801', 'AlzheimerOntology:advanced_glycation_end_product', '9606', 'AlzheimerOntology:Receptors', 'obo:PATO_0000011', 'AlzheimerOntology:inflammation', 'AlzheimerOntology:frontotemporal_dementia', 'obo:GO_0007612', 'MESH:D009410', 'NDDUO:Mild', 'NDDUO:Sampling', 'AlzheimerOntology:senile_plaque', 'MESH:D003704', 'NDDUO:In_vivo', 'AlzheimerOntology:Hippocampus', '351', 'NDDUO:neurodegenerative_disease', 'obo:ND_0000152', 'obo:GO_0007613', 'NDDUO:Proportion', 'AlzheimerOntology:Late_Onset_Alzheimer_s_Disease', 'obo:NBO_0000215', 'NDDUO:In_vitro', 'NDDUO:Study_type', 'MESH:D008569', 'obo:OBI_0000753', 'obo:NBO_0000282', 'NDDUO:Neurons', 'AlzheimerOntology:amyloid_beta_protein', 'AlzheimerOntology:Fibrils', 'MESH:D020258', '10116', 'AlzheimerOntology:amyloid_beta_deposition', 'obo:OBI_0000070', 'AlzheimerOntology:Gama_secretase', 'NDDUO:rate', 'AlzheimerOntology:enzyme', 'AlzheimerOntology:Apoptosis', 'obo:IDO_0000666', 'AlzheimerOntology:mutation', 'obo:FMA_61109', '4137', 'obo:IAO_0000101', 'obo:RO_0000081', 'AlzheimerOntology:Abeta_42', 'MESH:D003643', 'obo:NPT_0015009', 'NDDUO:inhibitor', 'obo:RO_0002211', 'NDDUO:Stages', 'AlzheimerOntology:mental_disorder', 'NDDUO:Pathogenesis', 'AlzheimerOntology:Intervention', 'AlzheimerOntology:neurotoxic'}



In [5]:
def create_indices(year_edge):
    """create a uniform indices for all nodes in all networks, use this globally.
    indices starting with 0, to fit with CSVDataset in DGL library,
    input -- year_edge: a dictionary with {year:(set_of_links)},
    output -- a dictionary with {entity_name1:entity_index1: ...}"""
    
    vertices = {} # lookup table for entity name and entity index
    node_index = -1  # index range [0,len_node-1]
    
    for yr, es in year_edge.items():   
        for e in es:
            entity1,entity2 = e[0],e[1]

            if entity1 not in vertices:
                node_index += 1
                vertices[entity1] = node_index  

            if entity2 not in vertices:
                node_index += 1
                vertices[entity2] = node_index

            
                
    print("\n%s vertices in the whole network read." % (len(vertices)))      
    return vertices


In [18]:
def fix_nodes(year_edge,g_0, g_t, hubs=hubs_2000):
    """node control when constructing the training data and the testing data,
        the links for both feature network and label network have to come from nodes in the feature network only
        otherwise the features can't be extracted,
        under this setting, g_t will be the end of feature network from the training data
        --- update: we add one experiment with new nodes considered, 
                    under this setting, g_t will be the end of label network from the testing data
        use this function as a fitler
      input -- g_0: the start year, g_t: end of the feature network,
               year_edge: a dictionary with {year:(set_of_links)}
      output -- a set of entities only appear in the feature network
    """
    entities = set()
    
    for yr, es in year_edge.items():
        if g_0 <= yr <= g_t: 
            for e in es:
                entity1,entity2 = e[0],e[1]   
                
#                 if entity1 == "MESH:D018698" or entity2 == "MESH:D018698":
#                     print(e)
                
                if entity1 in hubs or entity2 in hubs:
                    continue
                    
                else:
                    entities.add(entity1)
                    entities.add(entity2)
        
    return entities

In [50]:
g_0 = 1997
g_t = 2001

for yr, es in year_edge.items():
    if g_0 <= yr <= g_t: 
        for e in es:
            entity1,entity2 = e[0],e[1]   

            if entity1 == "MESH:D003329" or entity2 == "MESH:D003329":
                print(e)

In [7]:
## ignore the indices for easy investigation for the high AUROC
## note that the adjacency matrix doubles the links as it includes the duplications (a,b) and (b,a)
def create_network(year_edge, g_0, g_t, fixed_nodes, ver_idx=""):
    """create the feature network or the label network, 
        the links can only from the nodes in the feature network,
       input -- g_0: start of the network, g_t: end of the network,
             -- year_edge: links in the network
             -- fixed_nodes: a set of entities only appear in the feature network
             -- ver_idx: a dictionary with {entity_name1:entity_index1, ...},
       output -- a dictionary {entity_index1:(entity_index3, ...),...}
    """
    
    adj_mat = defaultdict(set)
    
    for yr, es in year_edge.items():
        if g_0 <= yr <= g_t: 
            for e in es:
                entity1,entity2 = e[0],e[1]
#                 print(entity1,entity2)


                if entity1 in fixed_nodes and entity2 in fixed_nodes:
                    adj_mat[entity1].add(entity2)
                    adj_mat[entity2].add(entity1)
#                 ent1_vertex_ind,ent2_vertex_ind = ver_idx.get(entity1),ver_idx.get(entity2)
#                 adj_mat[ent1_vertex_ind].add(str(ent2_vertex_ind))  # node pairs can be duplicated in year_edge.items()
#                 adj_mat[ent2_vertex_ind].add(str(ent1_vertex_ind))    
    
    return adj_mat


In [8]:
## the negative links have to obey the fixed nodes rule
def create_negatives(year_edge, fea_pos_adj_mat, lab_pos_adj_mat, fixed_nodes, ver_idx=""):
    """return all negatives in the label network, only the links with the nodes in the feature network are included
        all negative cases in the feature network minus all new positive cases in the label network,
       only the nodes from the feature network can be included,
           fea_pos_adj_mat -- all positive cases in the feature network,
           lab_pos_adj_mat -- all new positive cases in the label network,
           fixed_nodes -- a set of entities only appear in the feature network,
           ver_idx -- a dictionary {entity_name:entity_index},
       output -- neg_label: a dictionary {entity1:("entity2",...),...}
                 all negative cases in the label network (excluding new nodes)
       """
    
    ## all negative cases in the feature network = all possible cases in the feature network - all positive cases in the feature network
    # all possible cases in the feature network 
#     all_possible = set([str(ver_idx.get(n)) for n in fixed_nodes])
    neg_feature = {}
    neg_label = defaultdict(set)  # all negative cases in the label network (excluding new nodes) 
    node_pairs = set()  # for removing duplications (a,b) and (b,a)
    
    # all negative cases in the feature network
    for idx1,neighs in fea_pos_adj_mat.items():
        neighs_set = set(neighs)
        neighs_set.add(idx1)
        neg_feature[idx1] = fixed_nodes.difference(neighs_set)   # exclude itself     
    
    
    ## all negative cases in the label network = all negative cases in the feature network - all new positive cases in the label network
    # all keys/nodes are already filtered to be in the feature network in the create_label_network function
    lab_pos_idxs = lab_pos_adj_mat.keys()

    for idx2,negs in neg_feature.items():
        if idx2 not in lab_pos_idxs:
            for neg in negs:    ## check duplications (a,b) (b,a)
                if (idx2,neg) in node_pairs:
                    continue
                if (neg,idx2) in node_pairs:
                    continue

                neg_label[idx2].add(neg)
                node_pairs.add((idx2,neg))   

        else:
            idx2_pos_label = lab_pos_adj_mat.get(idx2)
            negs_set = set(negs).difference(set(idx2_pos_label))
            for neg2 in negs_set:    ## check duplications (a,b) (b,a)
                if (idx2,neg2) in node_pairs:
                    continue
                if (neg2,idx2) in node_pairs:
                    continue

                neg_label[idx2].add(neg2)
                node_pairs.add((idx2,neg2))             
    
    return neg_label


In [9]:
def remove_dup(lab_pos_adj_mat):
    """remove (b,a) if (a,b) exists for the positive links in feature networks,
    input -- lab_pos_adj_mat, a dictionary with {entity_index1:("entity_index2",...),...},
    output -- a filterd label positive adjacency matrix, also a dictionary in the same format"""
    node_pairs = set()
    lab_pos_fil = defaultdict(set)   

    for k,vs in lab_pos_adj_mat.items():
        key = str(k)
        for v in vs:
            if (key,v) in node_pairs:
    #             print(key,v)
    #             print("---------")
                continue
            if (v,key) in node_pairs:
    #             print(key,v)
    #             print("---------")
                continue

            lab_pos_fil[key].add(v)
            node_pairs.add((key,v))
        
    return lab_pos_fil

In [10]:
# turn all the pairs to a dictionary 
# Dict not really needed here but indexing in a dict is much faster than a list.
# from: {entity_index1:(set_of_entity_indices),...}
# to. : ('58920::snap:Function': 1, '58920::MESH:D012694': 1, ...) 
#.    1 for positive links, 0 for negative links

def adj_mat_to_dict(adj_mat,label):
    """turn the adjacency matrix to a dictionary
        input -- adj_mat: a dictionary,
              -- label: 0 for negative labels, 1 for postiive labels,
        output -- a dictionary {"index1::index2":positive_or_negative_link}"""
    pair_dict = dict()

    for k,v in adj_mat.items():
#         entity1 = vertices_inv.get(k)
#         entities = [vertices_inv[int(node_ind)] for node_ind in v]

        for idx in v:
            pair_dict["%s::%s" % (k, idx)] = label
    
    return pair_dict

In [11]:
# careful for the random state in sampling
def sample_neg(lab_pos_adj_mat,lab_neg_adj_mat,sampling=True):
    """sample from lab_neg_adj_mat, making the negative link number equal to 
        the links in lab_pos_adj_mat,
        input -- lab_pos_adj_mat: positive node pairs in the label network,  
                 lab_neg_adj_mat: negative node pairs in the label network,
        output -- X_res: sampled node pairs names, with 1:1 positive and negative labels
                  y_res: labels for the node pairs"""

#     lab_pos_dict = adj_mat_to_dict(adj_mat=remove_dup(lab_pos_adj_mat,label=1) )
    lab_pos_dict = adj_mat_to_dict(adj_mat=remove_dup(lab_pos_adj_mat),label=1)  # already removed duplications
    lab_neg_dict = adj_mat_to_dict(adj_mat=lab_neg_adj_mat,label=0)  # duplications should have been removed
    lab_dict = {**lab_pos_dict,**lab_neg_dict}

    X = np.array(list(lab_dict.keys())).reshape(-1, 1)
    y = list(lab_dict.values())
    print('Original dataset shape %s' % Counter(y))

    if sampling is True:
        undersam_neg = RandomUnderSampler()
        X_res, y_res = undersam_neg.fit_resample(X, y)
        print('Resampled dataset shape %s' % Counter(y_res))

        return X_res, y_res

    else:
        return X,y

In [39]:
# a pipeline to create a training or testing data
def data_pipeline(year_edge, year_ln, 
                   g_f0, g_ft, g_l0, g_lt, fixed_nodes,       
                   hubs = hubs_2000,sampling=True,
                   training=True):
    """create feature networks and label networks for both the training and testing network
        input -- year_edge: edges in each year, a dictionary with {year:(set_of_links)},, 
                 year_ln: new edges in each year, 
                 fixed_nodes: fixed nodes for all networks in the training and testing data,
                 g_f0: start of the feature network for both the training and testing networks,
                 g_ft: end of the feature network, 
                 g_l0: start of the label network,
                 g_lt: end of the label network,
                 training: if training is True, then the edges to induce node embeddings will be returned,"""
 
    ## step 1: create fixed nodes for the network
    print("start of the feature network:", g_f0)
    print("end of the feature network:", g_ft)
    print("start of the label network:", g_l0)
    print("end of the label network:", g_lt)    
    print("-----------")
    
#     fixed_test = fix_nodes(year_edge,g_f0,g_ft,hubs)
    print("number of fixed nodes:", len(fixed_nodes))
    print("-----------")
    
    ## step 2: create feature network
    test_fea_net = create_network(year_edge,g_f0,g_ft,
                                  fixed_nodes=fixed_nodes)
    test_fea_cnt = dict()

    for k,v in test_fea_net.items():
        test_fea_cnt[k] = len(v)

    # print(sum(test_fea_cnt.values()))   # duplications
    print("positive links in the feature network:", sum(test_fea_cnt.values())/2)   # without duplications
    print("-----------")
    
    ## step 3: create edges to achieve the similarity measures
    test_fea_dict = adj_mat_to_dict(adj_mat=remove_dup(test_fea_net),label=1)
    print("number of links to achieve the similarity measures:", len(test_fea_dict))
    print("-----------")
    
    ## step 4: create label network
    # positive cases in the label network are the new links in the label network
    # need to exclude the links with new nodes
    test_lab_net = create_network(year_ln,g_l0,g_lt, 
                                  fixed_nodes=fixed_nodes, ver_idx="")
    test_lab_cnt = dict()
    
#     print(fixed_test)
#     print(test_lab_net)
    for k,v in test_lab_net.items():
        test_lab_cnt[k] = len(v)

    # print(sum(test_lab_cnt.values()))   # this is doubled
    print("positive links in the label network:", sum(test_lab_cnt.values())/2)
    print("-----------")
    
    ## step 5: create feature+label network to induce the node embeddings 
    if training == True:
        test_fea_lab_net = create_network(year_edge,g_f0,g_lt,
                                          fixed_nodes=fixed_nodes)  
        
        debug1 = remove_dup(test_fea_lab_net)
        debug2 = test_fea_lab_net
        
        test_fea_lab_dict = adj_mat_to_dict(adj_mat=remove_dup(test_fea_lab_net),label=1)
        print("number of links to induce the node embeddings:", len(test_fea_lab_dict))    
        print("-----------")
    
    ## step 6: create negative links
    neg_label = create_negatives(year_edge, fea_pos_adj_mat=test_fea_net, 
                                 lab_pos_adj_mat=test_lab_net, fixed_nodes=fixed_nodes)
    neg_label_cnt = dict()
    
    for k,v in neg_label.items():
        neg_label_cnt[k] = len(v)

    print("negative links in the feature network before undersampling:", sum(neg_label_cnt.values()))   
    # duplications removed, links from label network removed
    print("-----------")
    
    ## step 7: sample the negative links to maintain 1:1 for the positive links
    X_res_test, y_res_test = sample_neg(test_lab_net,neg_label,sampling)
    
#     ## step 6: calculate model scores
# #     print(test_fea_net)
#     cn_score,jc_score,aa_score,pa_score = calculate_feature(X_res=X_res_test, 
#                                                                     adj_matrix=test_fea_net, 
#                                                                     y_res=y_res_test, vertices="")
    
#     ## step 7: evaluate the models
#     model_prc,model_roc = evaluate_models(cn_score,jc_score,aa_score,pa_score,y_res_test)
#     print("----------------------")
    
    
#     records.append((len(fixed_test), sum(test_fea_cnt.values())/2,
#                      sum(test_lab_cnt.values())/2, sum(neg_label_cnt.values()),
#                      (model_prc,model_roc)))
    if training == True:
        return X_res_test, y_res_test, test_fea_dict, test_fea_lab_dict, debug1, debug2

    else:
        return X_res_test, y_res_test, test_fea_dict


In [20]:
# ## for test
g_0,g_t=1977,2002

fixed_test = fix_nodes(year_edge,g_0,g_t,set())
print("number of fixed nodes:", len(fixed_test))
print("-----------")

number of fixed nodes: 2807
-----------


In [45]:
"MESH:D003329" in fixed_test

True

In [41]:
g_f0, g_ft, g_l0, g_lt = 1977,2000,2001,2001
X_train, y_train, train_sim, train_ne, ne_bfdup, ne_dup = data_pipeline(year_edge=year_edge, year_ln=year_ln, 
                   g_f0=g_f0, g_ft=g_ft, g_l0=g_l0, g_lt=g_lt,                                                      
                   fixed_nodes=fixed_test,       
#                    hubs = set(),
                                                      sampling=True,
                   training=True)

start of the feature network: 1977
end of the feature network: 2000
start of the label network: 2001
end of the label network: 2001
-----------
number of fixed nodes: 2807
-----------
positive links in the feature network: 26000.0
-----------
number of links to achieve the similarity measures: 26000
-----------
positive links in the label network: 67751.0
-----------
number of links to induce the node embeddings: 93751
-----------
negative links in the feature network before undersampling: 1712534
-----------
Original dataset shape Counter({0: 1712534, 1: 67751})
Resampled dataset shape Counter({0: 67751, 1: 67751})


In [46]:
"MESH:D003329" in ne_bfdup.keys()

False

In [47]:
"MESH:D003329" in ne_dup.keys()

False

In [None]:
ne_dup.get("")

In [42]:
for i,j in ne_bfdup.items():
    

defaultdict(set,
            {'10116': {'100034700',
              '10090',
              '10141',
              '103694380',
              '10508',
              '1137',
              '11491',
              '11820',
              '1385',
              '140908',
              '14149',
              '171045',
              '17708',
              '17709',
              '18125',
              '19164',
              '19225',
              '24185',
              '24221',
              '24225',
              '24261',
              '24318',
              '24596',
              '24599',
              '24604',
              '24680',
              '24804',
              '24835',
              '24890',
              '25054',
              '25290',
              '25402',
              '25467',
              '25470',
              '25496',
              '25595',
              '25603',
              '25661',
              '25673',
              '25700',
              '25728',
              '2670',
 

In [37]:
for k in train_ne.keys():
    n1,n2 = k.split("::")[0],k.split("::")[1]
    if n1 == "MESH:D003329" or n2 == "MESH:D003329":
#     if n1 == '10116' or n2 == 'AlzheimerOntology:NOS2':
        print(k)

In [None]:
print("Generating Training data:")
# edge to train classifiers,label to train classifiers, edge to get similarity measures, edge to induce embeddings
X_train, y_train, train_sim, train_ne = data_pipeline(year_edge, year_ln, g_f0=g_f0_tr, g_ft=g_ft_tr, 
                                 g_l0=g_l0_tr, g_lt=g_lt_tr, fixed_nodes=fixed_nodes,sampling=True,training=True)

In [23]:
# ## step 2: create feature network
# test_fea_net = create_network(year_edge,g_f0,g_ft,
#                               fixed_nodes=fixed_test)

# test_fea_dict = adj_mat_to_dict(adj_mat=remove_dup(test_fea_net),label=True)
# test_fea_cnt = dict()

# for k,v in test_fea_net.items():
#     test_fea_cnt[k] = len(v)

# # print(sum(test_fea_cnt.values()))   # duplications
# print("positive links in the feature network:", sum(test_fea_cnt.values())/2)   # without duplications
# print("-----------")

In [34]:
# ## step 3: create label network
# # positive cases in the label network are the new links in the label network
# # need to exclude the links with new nodes
# test_lab_net = create_network(year_ln,g_l0,g_lt, 
#                               fixed_nodes=fixed_test, ver_idx="")
# test_lab_cnt = dict()

# #     print(fixed_test)
# #     print(test_lab_net)
# for k,v in test_lab_net.items():
#     test_lab_cnt[k] = len(v)

# # print(sum(test_lab_cnt.values()))   # this is doubled
# print("positive links in the label network:", sum(test_lab_cnt.values())/2)
# print("-----------")

positive links in the label network: 4309.0
-----------


In [35]:
# ## step 4: create negative links
# neg_label = create_negatives(year_edge, fea_pos_adj_mat=test_fea_net, 
#                              lab_pos_adj_mat=test_lab_net, fixed_nodes=fixed_test)
# neg_label_cnt = dict()

# for k,v in neg_label.items():
#     neg_label_cnt[k] = len(v)

# print("negative links in the feature network before undersampling:", sum(neg_label_cnt.values()))   
# # duplications removed, links from label network removed
# print("-----------")

negative links in the feature network before undersampling: 173436
-----------


In [42]:
# ## step 5: sample the negative links to maintain 1:1 for the positive links
# X_res_test, y_res_test = sample_neg(test_lab_net,neg_label,sampling=True)

Original dataset shape Counter({0: 173436, 1: 4309})
Resampled dataset shape Counter({0: 4309, 1: 4309})
