In [1]:
"""
Turn the training and testing datasets to a csv file that suits the DGL library
==============================================
Under the folder of ./16_dgl_csv/, we will have 20 train/test datasets, named with year, e.g. "./1977_2002/..."
    - ./16_dgl_csv/ filters hubs with more than 2000 degrees
Under the folder of ./18_dgl_sample_csv/, we will have 20 train/test datasets, named with year, e.g. "./1977_2002/..."
    - ./18_dgl_sample_csv/ undersamples the negative links in the testing data 
    - ./18_dgl_sample_csv/ filters hubs with more than 2000 degrees
Under the folder of ./20_dgl_hubs_csv/, we will have 20 train/test datasets, named with year, e.g. "./1977_2002/..."
    - ./20_dgl_hubs_csv/ undersamples the negative links in the testing data 
    - ./20_dgl_hubs_csv/ keeps hubs with more than 2000 degrees    
Under the folder of ./22_dgl_new_nodes_csv/, we will have 20 train/test datasets, named with year, e.g. "./1977_2002/..."
    - ./22_dgl_new_nodes_csv/ undersamples the negative links in the testing data 
    - ./22_dgl_new_nodes_csv/ keeps hubs with more than 2000 degrees   
    - ./22_dgl_new_nodes_csv/ keeps new nodes in the testing data in the training data

The first graph is used to train a link prediction binary classifier with features:
    ./1977_2002/clf/
    |-- meta.yaml     # metadata of the dataset
    |-- nodes.csv     # node_id    
    |-- edges.csv     # src_id, dst_id, label, train_mask, test_mask   # edges to train the node embeddings, the edges come from feature networks of either the training and testing data 
for meta.yaml:
    dataset_name: clf
    edge_data:
    - file_name: edges.csv
    node_data:
    - file_name: nodes.csv
    
The second graph is used to obtain the features, including similarity measures and node embeddings 
    ./1977_2002/ne/
    |-- meta.yaml     # metadata of the dataset
    |-- nodes.csv     # node_id
    |-- edges.csv     # src_id, dst_id, train_fea_mask, test_fea_mask, induce_mask  
                      ### train_fea_mask: edges to achieve similarity measures for the training data
                      ### test_fea_mask: edges to achieve similarity measures for the testing data
                      ### induce_mask: edges to induce node embeddings
for meta.yaml:
    dataset_name: ne
    edge_data:
    - file_name: edges.csv
    node_data:
    - file_name: nodes.csv
"""



In [1]:
from create_edge import create_yr_link
from create_train_test_data import fix_nodes,data_pipeline,create_indices
import json
import torch
import csv
import pandas as pd
from collections import Counter
import yaml
import os

In [6]:
# Root output directory for the generated DGL csv datasets.
# Dataset variants written so far (pick one as the default below, or point the
# DGL_CSV_SAVE_PATH environment variable at another directory):
#   "/Users/yidesdo21/Projects/outputs/16_dgl_csv/"
#   "/Users/yidesdo21/Projects/outputs/18_dgl_sample_csv/"
#   "/Users/yidesdo21/Projects/outputs/20_dgl_hubs_csv/"
# Code further down appends to save_path by plain string concatenation, so the
# value has to end with a path separator; os.path.join(..., "") guarantees that
# without changing a value that already ends with one.
save_path = os.path.join(
    os.environ.get("DGL_CSV_SAVE_PATH",
                   "/Users/yidesdo21/Projects/outputs/22_dgl_new_nodes_csv/"),
    "",
)


In [7]:
# Load the article metadata from json.
# The directory can be overridden with the DGL_META_PATH environment variable;
# os.path.join(..., "") keeps the trailing separator of the original value.
meta_path = os.path.join(
    os.environ.get("DGL_META_PATH",
                   "/Users/yidesdo21/Projects/outputs/12_time_slicing/metadata/"),
    "",
)

# JSON text is UTF-8; pass the encoding explicitly instead of relying on the
# platform default.
with open(os.path.join(meta_path, "articles_with_anno.json"), encoding="utf-8") as f:
    metadata = json.load(f)

# create_yr_link is imported from the project-local create_edge module; its two
# results are passed on to fix_nodes / data_pipeline below.
year_edge,year_ln = create_yr_link(metadata)

In [8]:
# Step 01: year boundaries for the training (tr) and testing (tt) splits.
# In each name, "f" refers to the feature network and "l" to the label network;
# "0" is the start year and "t" the end year.
g_f0_tr = 1977
g_ft_tr = 2000
g_l0_tr = 2001
g_lt_tr = 2001

g_f0_tt = 1977
g_ft_tt = 2001
g_l0_tt = 2002
g_lt_tt = 2002

# Node set shared by both splits. g_t is the end of the testing label network,
# so nodes that first appear in the testing data are included.
# hubs=set() keeps the hubs; without it fix_nodes applies its default hub filter
# (hubs with degree above 2000 are removed).
# Variant without new nodes: fix_nodes(year_edge, g_f0_tr, g_ft_tr, hubs=set())
fixed_nodes = fix_nodes(
    year_edge,
    g_0=g_f0_tr,
    g_t=g_lt_tt,
    hubs=set(),
)


In [9]:
# Sanity check: display how many nodes are in the fixed node set.
len(fixed_nodes)

2807

In [10]:

print("Generating Training data:")
# With training=True the pipeline returns four objects:
#   X_train / y_train - node pairs and labels used to train the classifier
#   train_sim         - edges used to compute the similarity measures
#   train_ne          - edges used to induce the node embeddings
X_train, y_train, train_sim, train_ne = data_pipeline(
    year_edge,
    year_ln,
    g_f0=g_f0_tr,
    g_ft=g_ft_tr,
    g_l0=g_l0_tr,
    g_lt=g_lt_tr,
    fixed_nodes=fixed_nodes,
    sampling=True,
    training=True,
)

print("------------------------------------------------")
print("Generating Testing data:")
# With training=False only three objects come back (no embedding edges);
# sampling=True is passed for the testing split as well.
X_test, y_test, test_sim = data_pipeline(
    year_edge,
    year_ln,
    g_f0=g_f0_tt,
    g_ft=g_ft_tt,
    g_l0=g_l0_tt,
    g_lt=g_lt_tt,
    fixed_nodes=fixed_nodes,
    sampling=True,
    training=False,
)

Generating Training data:
start of the feature network: 1977
end of the feature network: 2000
start of the label network: 2001
end of the label network: 2001
-----------
number of fixed nodes: 2807
-----------
positive links in the feature network: 26000.0
-----------
number of links to achieve the similarity measures: 26000
-----------
positive links in the label network: 67751.0
-----------
number of links to induce the node embeddings: 93751
-----------
negative links in the feature network before undersampling: 1712534
-----------
Original dataset shape Counter({0: 1712534, 1: 67751})
Resampled dataset shape Counter({0: 67751, 1: 67751})
------------------------------------------------
Generating Testing data:
start of the feature network: 1977
end of the feature network: 2001
start of the label network: 2002
end of the label network: 2002
-----------
number of fixed nodes: 2807
-----------
positive links in the feature network: 93751.0
-----------
number of links to achieve the si

In [25]:
"""  
targeted format example:
    /path/to/dataset/
    |-- meta.yaml     # metadata of the dataset
    |-- edges_0.csv   # edge data including src_id, dst_id, feature, label and so on
    |-- ...           # you can have as many CSVs for edge data as you want
    |-- nodes_0.csv   # node data including node_id, feature, label and so on
    |-- ...           # you can have as many CSVs for node data as you want
    |-- graphs.csv    # graph-level features

the first graph is used to train a link prediction binary classifier with features:
    ./1977_2002/clf/
    |-- meta.yaml     # metadata of the dataset
    |-- nodes.csv     # node_id    
    |-- edges.csv      # src_id, dst_id, label, train_mask, test_mask   # edges to train the node embeddings, the edges come from feature networks of either the training and testing data 
for meta.yaml:
    dataset_name: clf
    edge_data:
    - file_name: edges.csv
    node_data:
    - file_name: nodes.csv
    
the second graph is used to train the embeddings with DeepWalk from the feature network:
    ./1977_2002/ne/
    |-- meta.yaml     # metadata of the dataset
    |-- nodes.csv     # node_id
    |-- edges.csv     # src_id, dst_id, train_fea_mask, test_fea_mask, induce_mask  
                      ### train_fea_mask: edges to achieve similarity measures for the training data
                      ### test_fea_mask: edges to achieve similarity measures for the testing data
                      ### induce_mask: edges to induce node embeddings
for meta.yaml:
    dataset_name: ne
    edge_data:
    - file_name: edges.csv
    node_data:
    - file_name: nodes.csv
"""



'  \ntargeted format example:\n    /path/to/dataset/\n    |-- meta.yaml     # metadata of the dataset\n    |-- edges_0.csv   # edge data including src_id, dst_id, feature, label and so on\n    |-- ...           # you can have as many CSVs for edge data as you want\n    |-- nodes_0.csv   # node data including node_id, feature, label and so on\n    |-- ...           # you can have as many CSVs for node data as you want\n    |-- graphs.csv    # graph-level features\n\nthe first graph is used to train the embeddings with DeepWalk from the feature network:\n    ./1977_2002/clf/\n    |-- meta.yaml     # metadata of the dataset\n    |-- nodes.csv     # node_id    \n    |-- edges.csv      # src_id, dst_id, label   # edges to train the node embeddings, the edges come from feature networks of either the training and testing data \nfor meta.yaml:\n    dataset_name: clf\n    edge_data:\n    - file_name: edges.csv\n    node_data:\n    - file_name: nodes.csv\n    \nthe second graph is used to train 

In [8]:
# Create the output folder of each graph ("ne" and "clf") for this year range.
# exist_ok=True makes the call a no-op when the folder is already there, which
# replaces the separate os.path.exists() check and its check-then-create race.
for folder in ["ne","clf"]:
    PATH = save_path+str(g_f0_tr)+"_"+str(g_lt_tt)+"/"+folder
    os.makedirs(PATH, exist_ok=True)

def create_nodes_csv(save_path,fixed_nodes):
    """Write ``nodes.csv`` with a single ``node_id`` column into ``save_path``.

    Parameters
    ----------
    save_path : str
        Directory that receives ``nodes.csv``. A trailing path separator is
        optional.
    fixed_nodes : list-like accepted by ``pandas.DataFrame``
        Node identifiers; one csv row per element, in iteration order.
    """
    nodes_df = pd.DataFrame(fixed_nodes, columns=["node_id"])

    # os.path.join handles directories with or without a trailing separator;
    # plain concatenation wrote "<dir>nodes.csv" beside the directory when the
    # separator was missing.
    nodes_df.to_csv(os.path.join(save_path, "nodes.csv"), index=False)


 
# Both graphs use the same node set, so the same nodes.csv goes into each folder.
create_nodes_csv(
    save_path="{}{}_{}/clf/".format(save_path, g_f0_tr, g_lt_tt),
    fixed_nodes=fixed_nodes,
)
create_nodes_csv(
    save_path="{}{}_{}/ne/".format(save_path, g_f0_tr, g_lt_tt),
    fixed_nodes=fixed_nodes,
)

In [9]:
# create edges.csv for the clf 
def create_edge_clf_csv(save_path,X_train,y_train,X_test,y_test):
    """Write ``edges.csv`` for the classifier (clf) graph into ``save_path``.

    Every sample becomes one row with the columns ``src_id``, ``dst_id``,
    ``label``, ``train_mask`` and ``test_mask``. Training rows are written
    first (``train_mask=True, test_mask=False``), testing rows follow
    (``train_mask=False, test_mask=True``).

    Parameters
    ----------
    save_path : str
        Directory that receives ``edges.csv``. A trailing path separator is
        optional.
    X_train, X_test : sequence
        Samples whose first element is a node pair encoded as
        ``"<src>::<dst>"``.
    y_train, y_test : sequence
        Labels aligned with ``X_train`` / ``X_test``.
    """
    src_id,dst_id,label,train_mask,test_mask = [],[],[],[],[]

    def _add_rows(samples, labels, is_train):
        # Append one csv row per (sample, label) pair; zip stops at the shorter input.
        for sample, lab in zip(samples, labels):
            # Split the "<src>::<dst>" string once and reuse both endpoints.
            endpoints = sample[0].split("::")
            src_id.append(endpoints[0])
            dst_id.append(endpoints[1])
            label.append(lab)
            train_mask.append(is_train)
            test_mask.append(not is_train)

    _add_rows(X_train, y_train, True)
    _add_rows(X_test, y_test, False)

    edges_clf_df = pd.DataFrame({
        "src_id": src_id,
        "dst_id": dst_id,
        "label": label,
        "train_mask": train_mask,
        "test_mask": test_mask,
    })
    # os.path.join handles directories with or without a trailing separator.
    edges_clf_df.to_csv(os.path.join(save_path, "edges.csv"), index=False)
    

# Write the classifier edge list for the current year range.
create_edge_clf_csv(
    save_path="{}{}_{}/clf/".format(save_path, g_f0_tr, g_lt_tt),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)


In [10]:
# create edges.csv for the ne
def create_edge_ne_csv(save_path,train_sim,train_ne,test_sim):
    """Write ``edges.csv`` for the feature (ne) graph into ``save_path``.

    The csv has the columns ``src_id``, ``dst_id``, ``train_fea_mask``,
    ``test_fea_mask`` and ``induce_mask``. Rows are written in three groups,
    each with exactly one mask set to ``True``:

    1. keys of ``train_sim`` -> ``train_fea_mask``
    2. keys of ``test_sim``  -> ``test_fea_mask``
    3. keys of ``train_ne``  -> ``induce_mask``

    Parameters
    ----------
    save_path : str
        Directory that receives ``edges.csv``. A trailing path separator is
        optional.
    train_sim, train_ne, test_sim : mapping
        Only the keys are used; each key is a node pair encoded as
        ``"<src>::<dst>"``.
    """
    src_id,dst_id,train_fea_mask,test_fea_mask,induce_mask = [],[],[],[],[]

    def _add_rows(node_pairs, in_train_fea, in_test_fea, in_induce):
        # Append one csv row per node pair with the given mask values.
        for node_pair in node_pairs:
            # Split the "<src>::<dst>" string once and reuse both endpoints.
            endpoints = node_pair.split("::")
            src_id.append(endpoints[0])
            dst_id.append(endpoints[1])
            train_fea_mask.append(in_train_fea)
            test_fea_mask.append(in_test_fea)
            induce_mask.append(in_induce)

    # Group order matters for the row order of the csv: train_sim, test_sim, train_ne.
    _add_rows(train_sim.keys(), True, False, False)
    _add_rows(test_sim.keys(), False, True, False)
    _add_rows(train_ne.keys(), False, False, True)

    edges_ne_df = pd.DataFrame({
        "src_id": src_id,
        "dst_id": dst_id,
        "train_fea_mask": train_fea_mask,
        "test_fea_mask": test_fea_mask,
        "induce_mask": induce_mask,
    })
    # os.path.join handles directories with or without a trailing separator.
    edges_ne_df.to_csv(os.path.join(save_path, "edges.csv"), index=False)
    
    
# Write the feature-graph edge list (similarity + embedding edges) for the current year range.
create_edge_ne_csv(
    save_path="{}{}_{}/ne/".format(save_path, g_f0_tr, g_lt_tt),
    train_sim=train_sim,
    train_ne=train_ne,
    test_sim=test_sim,
)


In [11]:
# create a yaml file
def create_yaml(save_path,dataset_name):
    """Write ``meta.yaml`` describing a csv dataset into ``save_path``.

    The file records the dataset name, points to ``edges.csv`` (source and
    destination columns ``src_id`` / ``dst_id``) and to ``nodes.csv`` (id
    column ``node_id``).

    Parameters
    ----------
    save_path : str
        Directory that receives ``meta.yaml``. A trailing path separator is
        optional.
    dataset_name : str
        Value stored under the ``dataset_name`` key.
    """
    yaml_data = {
        "dataset_name": dataset_name,
        "edge_data": [{
            "file_name": "edges.csv",
            "src_id_field": "src_id",
            "dst_id_field": "dst_id",
        }],
        "node_data": [{
            "file_name": "nodes.csv",
            "node_id_field": "node_id",
        }],
    }

    # os.path.join avoids the doubled separator that save_path+'/meta.yaml'
    # produced for directories ending in "/", and keeps an empty save_path
    # relative instead of pointing at the filesystem root.
    with open(os.path.join(save_path, "meta.yaml"), "w") as file:
        yaml.dump(yaml_data, file)

# One meta.yaml per graph folder; dataset_name matches the folder name.
create_yaml(
    save_path="{}{}_{}/ne/".format(save_path, g_f0_tr, g_lt_tt),
    dataset_name="ne",
)
create_yaml(
    save_path="{}{}_{}/clf/".format(save_path, g_f0_tr, g_lt_tt),
    dataset_name="clf",
)

In [12]:
def to_csv_pipeline(year_edge,year_ln,
                    g_f0_tr,g_ft_tr,g_l0_tr,g_lt_tr,
                    g_f0_tt,g_ft_tt,g_l0_tt,g_lt_tt,
                    save_path=save_path):
    """Generate one train/test dataset and write it as csv/yaml files.

    Output layout, created below ``save_path``::

        <g_f0_tr>_<g_lt_tt>/clf/{nodes.csv, edges.csv, meta.yaml}
        <g_f0_tr>_<g_lt_tt>/ne/{nodes.csv, edges.csv, meta.yaml}

    Parameters
    ----------
    year_edge, year_ln
        Passed through unchanged to ``fix_nodes`` and ``data_pipeline``.
    g_f0_tr, g_ft_tr, g_l0_tr, g_lt_tr
        Start/end of the feature network and start/end of the label network
        for the training data.
    g_f0_tt, g_ft_tt, g_l0_tt, g_lt_tt
        The same four boundaries for the testing data.
    save_path : str
        Root output directory. The default is whatever the notebook-level
        ``save_path`` held when this function was defined.

    Returns
    -------
    None
        The function only writes files and prints progress.
    """
    ## Step 01: fix the node set shared by the training and testing data
    # fix_nodes removes hubs with degree more than 2000 by default;
    # hubs=set() keeps them.
    # The end year is g_lt_tt (end of the testing label network) so that nodes
    # which are new in the testing data are considered as well.
    # Variant without new nodes: fix_nodes(year_edge, g_f0_tr, g_ft_tr, hubs=set())
    fixed_nodes = fix_nodes(year_edge,g_f0_tr, g_lt_tt, hubs=set())

    ## Step 02: create training and testing data for similarity measures and node embeddings
    print("Generating Training data:")
    # edge to train classifiers, label to train classifiers,
    # edge to get similarity measures, edge to induce embeddings
    X_train, y_train, train_sim, train_ne = data_pipeline(year_edge, year_ln, g_f0=g_f0_tr, g_ft=g_ft_tr,
                                     g_l0=g_l0_tr, g_lt=g_lt_tr, fixed_nodes=fixed_nodes,sampling=True,training=True)

    print("------------------------------------------------")
    print("Generating Testing data:")
    ### use sampling
    X_test, y_test, test_sim = data_pipeline(year_edge, year_ln, g_f0=g_f0_tt, g_ft=g_ft_tt,
                                     g_l0=g_l0_tt, g_lt=g_lt_tt, fixed_nodes=fixed_nodes,sampling=True,training=False)

    ## Step 03: create folders for csv files
    # Build the dataset directory once instead of repeating the concatenation
    # at every call site.
    dataset_dir = os.path.join(save_path, "{}_{}".format(g_f0_tr, g_lt_tt))
    # The trailing "" component makes both paths end with a separator, so the
    # writer functions receive a directory path ending in a separator, as before.
    clf_dir = os.path.join(dataset_dir, "clf", "")
    ne_dir = os.path.join(dataset_dir, "ne", "")
    for folder in (ne_dir, clf_dir):
        # exist_ok=True: no separate existence check needed, safe on re-runs.
        os.makedirs(folder, exist_ok=True)

    ## Step 04: create csv file for nodes
    create_nodes_csv(save_path=clf_dir, fixed_nodes=fixed_nodes)
    create_nodes_csv(save_path=ne_dir, fixed_nodes=fixed_nodes)

    ## Step 05: create csv file for edges to train the classifier
    create_edge_clf_csv(save_path=clf_dir,
                        X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test)

    ## Step 06: create csv file for edges to achieve similarity measures and to induce node embeddings
    create_edge_ne_csv(save_path=ne_dir,
                       train_sim=train_sim,train_ne=train_ne,test_sim=test_sim)

    ## Step 07: create yaml file for dgl graph to parse
    create_yaml(save_path=ne_dir, dataset_name="ne")
    create_yaml(save_path=clf_dir, dataset_name="clf")
    print("########################################################")
    
    

In [13]:
# Both feature networks always start in 1977.
g_f0_tr = 1977
g_f0_tt = 1977

# One dataset per t in 2000..2019 (20 in total): the training feature network
# ends at t and is labelled with year t+1; the testing feature network ends at
# t+1 and is labelled with year t+2.
for t in range(2000, 2020):
    g_ft_tr = t
    g_l0_tr = t + 1
    g_lt_tr = t + 1
    g_ft_tt = t + 1
    g_l0_tt = t + 2
    g_lt_tt = t + 2

    print(g_f0_tr, g_ft_tr, g_l0_tr, g_lt_tr,
          g_f0_tt, g_ft_tt, g_l0_tt, g_lt_tt)
    print("-------")

    to_csv_pipeline(
        year_edge, year_ln,
        g_f0_tr, g_ft_tr, g_l0_tr, g_lt_tr,
        g_f0_tt, g_ft_tt, g_l0_tt, g_lt_tt,
        save_path=save_path,
    )

1977 2000 2001 2001 1977 2001 2002 2002
-------
Generating Training data:
start of the feature network: 1977
end of the feature network: 2000
start of the label network: 2001
end of the label network: 2001
-----------
number of fixed nodes: 2807
-----------
positive links in the feature network: 26000.0
-----------
number of links to achieve the similarity measures: 26000
-----------
positive links in the label network: 67751.0
-----------
number of links to induce the node embeddings: 93751
-----------
negative links in the feature network before undersampling: 1712534
-----------
Original dataset shape Counter({0: 1712534, 1: 67751})
Resampled dataset shape Counter({0: 67751, 1: 67751})
------------------------------------------------
Generating Testing data:
start of the feature network: 1977
end of the feature network: 2001
start of the label network: 2002
end of the label network: 2002
-----------
number of fixed nodes: 2807
-----------
positive links in the feature network: 93751

Original dataset shape Counter({0: 14551497, 1: 14734})
Resampled dataset shape Counter({0: 14734, 1: 14734})
########################################################
1977 2006 2007 2007 1977 2007 2008 2008
-------
Generating Training data:
start of the feature network: 1977
end of the feature network: 2006
start of the label network: 2007
end of the label network: 2007
-----------
number of fixed nodes: 6035
-----------
positive links in the feature network: 345893.0
-----------
number of links to achieve the similarity measures: 345893
-----------
positive links in the label network: 14734.0
-----------
number of links to induce the node embeddings: 360627
-----------
negative links in the feature network before undersampling: 17579510
-----------
Original dataset shape Counter({0: 17579510, 1: 14734})
Resampled dataset shape Counter({0: 14734, 1: 14734})
------------------------------------------------
Generating Testing data:
start of the feature network: 1977
end of the feature ne

negative links in the feature network before undersampling: 39379077
-----------
Original dataset shape Counter({0: 39379077, 1: 49077})
Resampled dataset shape Counter({0: 49077, 1: 49077})
########################################################
1977 2012 2013 2013 1977 2013 2014 2014
-------
Generating Training data:
start of the feature network: 1977
end of the feature network: 2012
start of the label network: 2013
end of the label network: 2013
-----------
number of fixed nodes: 9667
-----------
positive links in the feature network: 593984.0
-----------
number of links to achieve the similarity measures: 593984
-----------
positive links in the label network: 49077.0
-----------
number of links to induce the node embeddings: 643061
-----------
negative links in the feature network before undersampling: 45256962
-----------
Original dataset shape Counter({0: 45256962, 1: 49077})
Resampled dataset shape Counter({0: 49077, 1: 49077})
------------------------------------------------


positive links in the feature network: 845226.0
-----------
number of links to achieve the similarity measures: 845226
-----------
positive links in the label network: 20334.0
-----------
negative links in the feature network before undersampling: 65150994
-----------
Original dataset shape Counter({0: 65150994, 1: 20334})
Resampled dataset shape Counter({0: 20334, 1: 20334})
########################################################
1977 2018 2019 2019 1977 2019 2020 2020
-------
Generating Training data:
start of the feature network: 1977
end of the feature network: 2018
start of the label network: 2019
end of the label network: 2019
-----------
number of fixed nodes: 11712
-----------
positive links in the feature network: 845226.0
-----------
number of links to achieve the similarity measures: 845226
-----------
positive links in the label network: 20334.0
-----------
number of links to induce the node embeddings: 865560
-----------
negative links in the feature network before unders