In [2]:
"""
This notebook is used to create a table for the ground truth data.

Overall, we have 20 pairs of train/test datasets. Each train/test pair has two
corresponding feature networks and label networks. The training and testing data
have been generated and saved in ./outputs/16_dgl_csv/

We use the csv files to create a table for the paper. 

"""

'\nThis notebook is used to create a table for the ground truth data.\n\nOverall, we have 20 pairs of train/test dataset. Each train/test pair will have two \ncorresponding feature networks and label networks. The training and testing data\nhave been generated and saved in ./outputs/16_dgl_csv/\n\nWe use the csv files to create a table for the paper. \n\n'

In [14]:
import csv
import pandas as pd
from collections import Counter

In [15]:
# Path prefix of the per-year dataset directories; the end year and the sub-path are
# appended when the files are read (e.g. path + "2002" + "/clf/edges.csv").
# Note: "../" is the parent of the working directory; "./" is the working directory itself.
# Earlier dataset variants, kept for reference:
# path = "../../outputs/16_dgl_csv/1977_"
# path = "../../outputs/22_dgl_new_nodes_csv/1977_"   ## to calculate the networks with no fixed nodes
path = "../../outputs/25_dgl_csv_long/1977_"


In [16]:
## First, retrieve the training and testing data for the classifier (clf).
# For every end year t, four counts are read from <path><t>/clf/edges.csv:
#   train_pos / train_neg: positive links / sampled negative links in the training data
#   test_pos  / test_neg : positive links / negative links in the testing data
train_pos, train_neg, test_pos, test_neg = [], [], [], []
# t_end = 2008
t_end = 2022  # exclusive upper bound of the year loop; the last year processed is t_end - 1

for t in range(2002, t_end):
    edges_df = pd.read_csv(path + str(t) + "/clf/edges.csv")

    # train_mask == True marks training rows, train_mask == False marks testing rows.
    train_mask = edges_df["train_mask"]
    train_df, test_df = edges_df[train_mask == True], edges_df[train_mask == False]
    train_dict, test_dict = Counter(train_df["label"]), Counter(test_df["label"])

    # Index the Counter rather than calling .get(): a class that does not occur is then
    # recorded as 0 instead of None, so the table and the density computation never see None.
    train_pos.append(train_dict[1])
    train_neg.append(train_dict[0])
    test_pos.append(test_dict[1])
    test_neg.append(test_dict[0])
    

In [17]:
## Next, collect the metadata used to build the training and testing networks
#   (i.e., the feature networks and the label networks). Per train/test pair:
#   - number of links used in the feature network of the training data
#   - number of links used in the feature network of the testing data
#   - number of fixed nodes
train_link, test_link, nodes = [], [], []

for t in range(2002, t_end):
    year_dir = path + str(t)

    # Count the rows flagged by each feature-network mask.
    network_df = pd.read_csv(year_dir + "/ne/edges.csv")
    train_link.append(int((network_df["train_fea_mask"] == True).sum()))
    test_link.append(int((network_df["test_fea_mask"] == True).sum()))

    # The number of rows in nodes.csv is taken as the number of fixed nodes.
    node_df = pd.read_csv(year_dir + "/ne/nodes.csv")
    nodes.append(len(node_df))




In [43]:
## Assemble the classifier train/test link counts into a single table:
## one row per year range, columns grouped as (dataset, link type).
year_idx = [f"1977-{t}" for t in range(2002, t_end)]
data = ["train_data", "test_data"]
cols = pd.MultiIndex.from_product([data, ["pos_links", "neg_links"]])
clf_rows = list(zip(train_pos, train_neg, test_pos, test_neg))
train_clf = pd.DataFrame(clf_rows, index=year_idx, columns=cols)

In [44]:
# Display the classifier train/test link-count table.
train_clf

Unnamed: 0_level_0,train_data,train_data,test_data,test_data
Unnamed: 0_level_1,pos_links,neg_links,pos_links,neg_links
1977-2002,4309,4309,33273,140163
1977-2003,12661,12661,113896,1408221
1977-2004,11868,11868,146638,3369133
1977-2005,13082,13082,152364,5207229
1977-2006,15214,15214,151189,7369115
1977-2007,14957,14957,148155,10076595
1977-2008,5289,5289,152835,13062655
1977-2009,15810,15810,139480,13903601
1977-2010,16979,16979,130228,17095340
1977-2011,15413,15413,121927,20963696


In [45]:
# Print the classifier link-count table as LaTeX source; index=True keeps the year-range row labels.
print(train_clf.to_latex(index=True))

\begin{tabular}{lrrrr}
\toprule
{} & \multicolumn{2}{l}{train\_data} & \multicolumn{2}{l}{test\_data} \\
{} &  pos\_links & neg\_links & pos\_links & neg\_links \\
\midrule
1977-2002 &       4309 &      4309 &     33273 &    140163 \\
1977-2003 &      12661 &     12661 &    113896 &   1408221 \\
1977-2004 &      11868 &     11868 &    146638 &   3369133 \\
1977-2005 &      13082 &     13082 &    152364 &   5207229 \\
1977-2006 &      15214 &     15214 &    151189 &   7369115 \\
1977-2007 &      14957 &     14957 &    148155 &  10076595 \\
1977-2008 &       5289 &      5289 &    152835 &  13062655 \\
1977-2009 &      15810 &     15810 &    139480 &  13903601 \\
1977-2010 &      16979 &     16979 &    130228 &  17095340 \\
1977-2011 &      15413 &     15413 &    121927 &  20963696 \\
1977-2012 &      12206 &     12206 &    114362 &  25304006 \\
1977-2013 &      18924 &     18924 &     99046 &  28794359 \\
1977-2014 &      17129 &     17129 &     85330 &  33659294 \\
1977-2015 &      1916

In [39]:
def get_density(nodes, links, label=False):
    """Return the density of each network as a percentage string.

    For a network with n nodes and m links the density is
    2*m / (n*(n-1)); it is reported in percent, rounded to two decimals.

    Args:
        nodes: list of node counts, one per network.
        links: list of link counts aligned with ``nodes``; it needs at least
            as many entries as ``nodes`` (extra entries are ignored).
        label: if true, every value is prefixed with "+".

    Returns:
        A list of strings such as "3.36" (or "+2.34" when ``label`` is set),
        one per entry of ``nodes``.

    Raises:
        IndexError: if ``links`` has fewer entries than ``nodes``.
    """
    if len(links) < len(nodes):
        raise IndexError("links must have at least as many entries as nodes")

    prefix = "+" if label else ""
    density = []

    for num_nodes, num_links in zip(nodes, links):
        possible_pairs = num_nodes * (num_nodes - 1)
        # With fewer than two nodes no link is possible: report 0 instead of dividing by zero.
        d = 2 * num_links / possible_pairs if possible_pairs else 0.0
        x = round(d * 100, 2)
        density.append(f"{prefix}{x:.2f}")

    return density

In [40]:
## Create a dataframe describing the training and testing networks.

# Three-level column header: network -> feature/label part -> reported quantity.
networks = ["Training network"]*6 + ["Testing network"]*6
fea_label = ["Feature network"]*3 + ["Label network"]*3 + ["Feature network"]*3 + ["Label network"]*3
# Raw strings keep the backslash of "\delta" literal; in a normal string "\d" is an
# invalid escape sequence that newer Python versions warn about.
range_use = (
    ["Years", "Links", "D (%)"] + ["Year", "New links", r"\delta D (%)"]
    + ["Years", "Links", "D (%)"] + ["Years", "New links", r"\delta D (%)"]
)
# from_arrays (not from_product): the three lists are already aligned column by column.
net_cols = pd.MultiIndex.from_arrays([networks, fea_label, range_use])

# Year labels, one entry per train/test pair (end year t runs over 2002 .. t_end-1).
year_idx = [t for t in range(2002, t_end)]                       # not used further in this cell
tr_fea_idx = ["1977-"+str(t) for t in range(2000, t_end-2)]      # training feature network: 1977 .. t-2
tr_lab_idx = [t for t in range(2001, t_end-1)]                   # training label network: year t-1
tt_fea_idx = ["1977-"+str(t) for t in range(2001, t_end-1)]      # testing feature network: 1977 .. t-1
tt_lab_idx = [str(t)+"-2021" for t in range(2002, t_end)]        # testing label network: t .. 2021

# Densities in percent; the label-network values carry a leading "+".
train_density = get_density(nodes, train_link)
train_lab_density = get_density(nodes, train_pos, label=True)
test_density = get_density(nodes, test_link)
test_lab_density = get_density(nodes, test_pos, label=True)

# One row per train/test pair, indexed by its number of nodes.
# rename_axis is chained (no inplace=True) so re-running the cell rebuilds the frame in one expression.
network_data = pd.DataFrame(
    list(zip(
        tr_fea_idx, train_link, train_density,
        tr_lab_idx, train_pos, train_lab_density,
        tt_fea_idx, test_link, test_density,
        tt_lab_idx, test_pos, test_lab_density,
    )),
    columns=net_cols,
    index=nodes,
).rename_axis("Nodes")

In [41]:
# Display the network summary table.
network_data

## Note: if a delta D value shows as 0.00, it is to be written as "<0.01";
## this cell only displays the table and does not apply that substitution.

Unnamed: 0_level_0,Training network,Training network,Training network,Training network,Training network,Training network,Testing network,Testing network,Testing network,Testing network,Testing network,Testing network
Unnamed: 0_level_1,Feature network,Feature network,Feature network,Label network,Label network,Label network,Feature network,Feature network,Feature network,Label network,Label network,Label network
Unnamed: 0_level_2,Years,Links,D (%),Year,New links,\delta D (%),Years,Links,D (%),Years,New links,\delta D (%)
Nodes,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
607,1977-2000,6176,3.36,2001,4309,2.34,1977-2001,10485,5.7,2002-2021,33273,18.09
1767,1977-2001,25483,1.63,2002,12661,0.81,1977-2002,38144,2.44,2003-2021,113896,7.3
2675,1977-2002,48836,1.37,2003,11868,0.33,1977-2003,60704,1.7,2004-2021,146638,4.1
3299,1977-2003,67376,1.24,2004,13082,0.24,1977-2004,80458,1.48,2005-2021,152364,2.8
3905,1977-2004,87042,1.14,2005,15214,0.2,1977-2005,102256,1.34,2006-2021,151189,1.98
4550,1977-2005,109268,1.06,2006,14957,0.14,1977-2006,124225,1.2,2007-2021,148155,1.43
5168,1977-2006,130749,0.98,2007,5289,0.04,1977-2007,136038,1.02,2008-2021,152835,1.14
5329,1977-2007,137565,0.97,2008,15810,0.11,1977-2008,153375,1.08,2009-2021,139480,0.98
5900,1977-2008,159503,0.92,2009,16979,0.1,1977-2009,176482,1.01,2010-2021,130228,0.75
6525,1977-2009,183514,0.86,2010,15413,0.07,1977-2010,198927,0.93,2011-2021,121927,0.57


In [12]:
# Show only the last row of the network table; the list index [[-1]] keeps the result a DataFrame.
# NOTE(review): the saved output of this cell uses different column labels ("time range",
# "link use", "density") than the current table, so it appears to be stale - re-run to confirm.
network_data.iloc[[-1]]

Unnamed: 0_level_0,training network,training network,training network,training network,training network,training network,testing network,testing network,testing network,testing network,testing network,testing network
Unnamed: 0_level_1,feature network,feature network,feature network,label network,label network,label network,feature network,feature network,feature network,label network,label network,label network
Unnamed: 0_level_2,time range,link use,density,time range,link use,density,time range,link use,density,time range,link use,density
Nodes,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
11352,1977-2019,385558,0.5984%,2020,7820,0.0121%,1977-2020,393378,0.6105%,2021,430,0.0007%


In [42]:
# Print the network summary table as LaTeX source; index=True keeps the "Nodes" index column.
# NOTE(review): in the saved output the "\delta" header appears as "\textbackslash delta",
# so the header presumably needs a manual fix before it goes into the paper - confirm.
print(network_data.to_latex(index=True))

\begin{tabular}{llrlrrllrllrl}
\toprule
{} & \multicolumn{6}{l}{Training network} & \multicolumn{6}{l}{Testing network} \\
{} & \multicolumn{3}{l}{Feature network} & \multicolumn{3}{l}{Label network} & \multicolumn{3}{l}{Feature network} & \multicolumn{3}{l}{Label network} \\
{} &            Years &   Links & D (\%) &          Year & New links & \textbackslash delta D (\%) &           Years &   Links & D (\%) &         Years & New links & \textbackslash delta D (\%) \\
Nodes &                  &         &       &               &           &              &                 &         &       &               &           &              \\
\midrule
607   &        1977-2000 &    6176 &  3.36 &          2001 &      4309 &        +2.34 &       1977-2001 &   10485 &  5.70 &     2002-2021 &     33273 &       +18.09 \\
1767  &        1977-2001 &   25483 &  1.63 &          2002 &     12661 &        +0.81 &       1977-2002 &   38144 &  2.44 &     2003-2021 &    113896 &        +7.30 \\
2675  &      