In [2]:
"""
This notebook is used to create a table for the ground truth data.

Overall, we have 20 pairs of train/test dataset. Each train/test pair will have two 
corresponding feature networks and label networks. The training and testing data
have been generated and saved in ./outputs/16_dgl_csv/

We use the csv files to create a table for the paper. 

"""

'\nThis notebook is used to create a table for the ground truth data.\n\nOverall, we have 20 pairs of train/test dataset. Each train/test pair will have two \ncorresponding feature networks and label networks. The training and testing data\nhave been generated and saved in ./outputs/16_dgl_csv/\n\nWe use the csv files to create a table for the paper. \n\n'

In [2]:
import csv
import pandas as pd
from collections import Counter

In [3]:
path = "../../outputs/16_dgl_csv/1977_"   

In [4]:
## first, we retrieve the training and testing data for the clf
# number of positive links in the training data, number of sampled negative links in the training data
# number of positive links in the testing data, number of negative links in the testing data
train_pos,train_neg,test_pos,test_neg = list(),list(),list(),list()

for t in range(2002,2022):
    
    edges_df = pd.read_csv(path+str(t)+"/clf/edges.csv")
    train_df, test_df = edges_df[edges_df["train_mask"] == True],edges_df[edges_df["train_mask"] == False]
    train_dict, test_dict = Counter(train_df["label"]),Counter(test_df["label"])
    train_pos.append(train_dict.get(1))
    train_neg.append(train_dict.get(0))
    test_pos.append(test_dict.get(1))
    test_neg.append(test_dict.get(0))
    

In [5]:
## next, we retrieve the metadata used to build the training and testing networks
# .  i.e., the feature networks and the label networks
# number of link use in the feature network of the training data, number of link use in the feature network of the testing data, 
# number of fixed nodes in each train/test pair
train_link,test_link,nodes = list(),list(),list()

for t in range(2002,2022):
    # get the link use of feature networks of training and testing data
    network_df = pd.read_csv(path+str(t)+"/ne/edges.csv")
    train_fea_df, test_fea_df = network_df[network_df["train_fea_mask"] == True],network_df[network_df["test_fea_mask"] == True]
    train_fea_link, test_fea_link = train_fea_df.shape[0],test_fea_df.shape[0]
    # display(network_df)

    # get the fixed node numbers for each train/test dataset 
    node_df = pd.read_csv(path+str(t)+"/ne/nodes.csv")
    fixed_nodes = node_df.shape[0]

    train_link.append(train_fea_link)
    test_link.append(test_fea_link)
    nodes.append(fixed_nodes)
    # break




In [6]:
## create a dataframe for the training and testing data that train a classifier
year_idx = [t for t in range(2002,2022)]
data = ["train_data","test_data"]
cols = pd.MultiIndex.from_product([data,["pos_links","neg_links"]])
train_clf = pd.DataFrame(
    zip(train_pos,train_neg,test_pos,test_neg),
    columns=cols,
    index=year_idx
)

In [7]:
train_clf

Unnamed: 0_level_0,train_data,train_data,test_data,test_data
Unnamed: 0_level_1,pos_links,neg_links,pos_links,neg_links
2002,4309,4309,4238,169198
2003,12661,12661,9511,1512606
2004,11868,11868,12060,3503711
2005,13082,13082,14309,5345284
2006,15214,15214,14169,7506135
2007,14957,14957,5117,10219633
2008,5289,5289,15584,13199906
2009,15810,15810,16195,14026886
2010,16979,16979,14741,17210827
2011,15413,15413,11725,21073898


In [8]:
print(train_clf.to_latex(index=True))

\begin{tabular}{lrrrr}
\toprule
{} & \multicolumn{2}{l}{train\_data} & \multicolumn{2}{l}{test\_data} \\
{} &  pos\_links & neg\_links & pos\_links & neg\_links \\
\midrule
2002 &       4309 &      4309 &      4238 &    169198 \\
2003 &      12661 &     12661 &      9511 &   1512606 \\
2004 &      11868 &     11868 &     12060 &   3503711 \\
2005 &      13082 &     13082 &     14309 &   5345284 \\
2006 &      15214 &     15214 &     14169 &   7506135 \\
2007 &      14957 &     14957 &      5117 &  10219633 \\
2008 &       5289 &      5289 &     15584 &  13199906 \\
2009 &      15810 &     15810 &     16195 &  14026886 \\
2010 &      16979 &     16979 &     14741 &  17210827 \\
2011 &      15413 &     15413 &     11725 &  21073898 \\
2012 &      12206 &     12206 &     18473 &  25399895 \\
2013 &      18924 &     18924 &     16618 &  28876787 \\
2014 &      17129 &     17129 &     18627 &  33725997 \\
2015 &      19164 &     19164 &     16030 &  38654749 \\
2016 &      16720 &     16720

In [9]:
def get_density(nodes,links):
    """get density for a network,
        ns: nodes, ls: links, both are lists"""
    density = list()
    len_nodes = len(nodes)
    # train_link

    for i in range(len_nodes):
        num_nodes = nodes[i]
        num_links = links[i]
        d = 2*num_links/(num_nodes*num_nodes)
        density.append(str(round(d*100,4))+"%")
    
    return density

In [10]:
## create a dataframe for the training and testing networks

networks = ["training network"]*6+["testing network"]*6
fea_label = ["feature network"]*3+["label network"]*3+["feature network"]*3+["label network"]*3
range_use = ["time range", "link use", "density"]+["time range", "link use", "density"]+["time range", "link use", "density"]+["time range", "link use", "density"]
# net_cols = pd.MultiIndex.from_product([networks,fea_label,range_use])
net_cols = pd.MultiIndex.from_arrays([networks,fea_label,range_use])

year_idx = [t for t in range(2002,2022)]
tr_fea_idx = ["1977-"+str(t) for t in range(2000,2020)]
tr_lab_idx = [t for t in range(2001,2021)]
tt_fea_idx = ["1977-"+str(t) for t in range(2001,2021)]
tt_lab_idx = [t for t in range(2002,2022)]

train_density = get_density(nodes,train_link)
train_lab_density = get_density(nodes,train_pos)
test_density = get_density(nodes,test_link)
test_lab_density = get_density(nodes,test_pos)

network_data = pd.DataFrame(
    zip(tr_fea_idx,train_link,train_density,tr_lab_idx,train_pos,train_lab_density,tt_fea_idx,test_link,test_density,tt_lab_idx,test_pos,test_lab_density),
    columns=net_cols,
    index=nodes
)

network_data.rename_axis("Nodes",inplace=True)

In [11]:
network_data

Unnamed: 0_level_0,training network,training network,training network,training network,training network,training network,testing network,testing network,testing network,testing network,testing network,testing network
Unnamed: 0_level_1,feature network,feature network,feature network,label network,label network,label network,feature network,feature network,feature network,label network,label network,label network
Unnamed: 0_level_2,time range,link use,density,time range,link use,density,time range,link use,density,time range,link use,density
Nodes,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
607,1977-2000,6176,3.3524%,2001,4309,2.339%,1977-2001,10485,5.6914%,2002,4238,2.3005%
1767,1977-2001,25483,1.6323%,2002,12661,0.811%,1977-2002,38144,2.4433%,2003,9511,0.6092%
2675,1977-2002,48836,1.365%,2003,11868,0.3317%,1977-2003,60704,1.6967%,2004,12060,0.3371%
3299,1977-2003,67376,1.2381%,2004,13082,0.2404%,1977-2004,80458,1.4785%,2005,14309,0.263%
3905,1977-2004,87042,1.1416%,2005,15214,0.1995%,1977-2005,102256,1.3411%,2006,14169,0.1858%
4550,1977-2005,109268,1.0556%,2006,14957,0.1445%,1977-2006,124225,1.2001%,2007,5117,0.0494%
5168,1977-2006,130749,0.9791%,2007,5289,0.0396%,1977-2007,136038,1.0187%,2008,15584,0.1167%
5329,1977-2007,137565,0.9688%,2008,15810,0.1113%,1977-2008,153375,1.0802%,2009,16195,0.1141%
5900,1977-2008,159503,0.9164%,2009,16979,0.0976%,1977-2009,176482,1.014%,2010,14741,0.0847%
6525,1977-2009,183514,0.8621%,2010,15413,0.0724%,1977-2010,198927,0.9345%,2011,11725,0.0551%


In [12]:
network_data.iloc[[-1]]

Unnamed: 0_level_0,training network,training network,training network,training network,training network,training network,testing network,testing network,testing network,testing network,testing network,testing network
Unnamed: 0_level_1,feature network,feature network,feature network,label network,label network,label network,feature network,feature network,feature network,label network,label network,label network
Unnamed: 0_level_2,time range,link use,density,time range,link use,density,time range,link use,density,time range,link use,density
Nodes,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
11352,1977-2019,385558,0.5984%,2020,7820,0.0121%,1977-2020,393378,0.6105%,2021,430,0.0007%


In [13]:
print(network_data.to_latex(index=True))

\begin{tabular}{llrlrrllrlrrl}
\toprule
{} & \multicolumn{6}{l}{training network} & \multicolumn{6}{l}{testing network} \\
{} & \multicolumn{3}{l}{feature network} & \multicolumn{3}{l}{label network} & \multicolumn{3}{l}{feature network} & \multicolumn{3}{l}{label network} \\
{} &       time range & link use &  density &    time range & link use &  density &      time range & link use &  density &    time range & link use &  density \\
Nodes &                  &          &          &               &          &          &                 &          &          &               &          &          \\
\midrule
607   &        1977-2000 &     6176 &  3.3524\% &          2001 &     4309 &   2.339\% &       1977-2001 &    10485 &  5.6914\% &          2002 &     4238 &  2.3005\% \\
1767  &        1977-2001 &    25483 &  1.6323\% &          2002 &    12661 &   0.811\% &       1977-2002 &    38144 &  2.4433\% &          2003 &     9511 &  0.6092\% \\
2675  &        1977-2002 &    48836 &   1.365