In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Datasets to process. Each name is used both as a directory name and as the
# stem of the per-dataset file names in the templates below.
dataset_list = ['reddit', 'enron', 'lastfm', 'mooc', 'uci', 'wikipedia', 'SocialEvo']

# Bucket width -> human-readable label (also used as an output directory name).
# Widths are presumably in seconds, matching the raw `ts` column -- TODO confirm.
granularity2text = {
    1: 'second',
    60: 'minute',
    60 * 60: 'hour',
    24 * 60 * 60: 'day',
    30 * 24 * 60 * 60: 'month',      # a "month" is a fixed 30 days
    12 * 30 * 24 * 60 * 60: 'year',  # a "year" is 12 such months (360 days)
}

# Granularities in processing order (finest to coarsest); derived from the
# mapping above so the two can never get out of sync.
granularity_list = list(granularity2text)

# Path templates for the source data; both placeholders take the dataset name.
data_csv = '../DG_data/TG_network_datasets/{}/ml_{}.csv'
data_node_npy = '../DG_data/TG_network_datasets/{}/ml_{}_node.npy'
data_npy = '../DG_data/TG_network_datasets/{}/ml_{}.npy'


## Data Generation

In [3]:
# For every dataset, write one copy of its interaction CSV per granularity,
# with each timestamp rounded UP to the nearest multiple of that granularity.
for dataset in dataset_list:
    out_root = '../DG_data_coarse/{}'.format(dataset)
    os.makedirs(out_root, exist_ok=True)

    # Load the original table and discard the index column that was saved with it.
    source_csv = data_csv.format(dataset, dataset)
    print('Data loaded from {}'.format(source_csv))
    source_df = pd.read_csv(source_csv).drop(columns=['Unnamed: 0'])

    for granularity in granularity_list:
        out_dir = os.path.join(out_root, granularity2text[granularity])
        os.makedirs(out_dir, exist_ok=True)

        # `assign` returns a new frame, so `source_df` stays untouched for the
        # next granularity.
        coarse_ts = np.ceil(source_df['ts'] / granularity) * granularity
        coarsened_df = source_df.assign(ts=coarse_ts)

        # The index is written on purpose: it recreates the leading unnamed
        # column, so the output has the same layout as the input file.
        out_csv = os.path.join(out_dir, 'ml_{}.csv'.format(dataset))
        coarsened_df.to_csv(out_csv)
        print('Data saved to {}'.format(out_csv))

Data loaded from ../DG_data/TG_network_datasets/reddit/ml_reddit.csv
Data saved to ../DG_data_coarse/reddit/second/ml_reddit.csv
Data saved to ../DG_data_coarse/reddit/minute/ml_reddit.csv
Data saved to ../DG_data_coarse/reddit/hour/ml_reddit.csv
Data saved to ../DG_data_coarse/reddit/day/ml_reddit.csv
Data saved to ../DG_data_coarse/reddit/month/ml_reddit.csv
Data saved to ../DG_data_coarse/reddit/year/ml_reddit.csv
Data loaded from ../DG_data/TG_network_datasets/enron/ml_enron.csv
Data saved to ../DG_data_coarse/enron/second/ml_enron.csv
Data saved to ../DG_data_coarse/enron/minute/ml_enron.csv
Data saved to ../DG_data_coarse/enron/hour/ml_enron.csv
Data saved to ../DG_data_coarse/enron/day/ml_enron.csv
Data saved to ../DG_data_coarse/enron/month/ml_enron.csv
Data saved to ../DG_data_coarse/enron/year/ml_enron.csv
Data loaded from ../DG_data/TG_network_datasets/lastfm/ml_lastfm.csv
Data saved to ../DG_data_coarse/lastfm/second/ml_lastfm.csv
Data saved to ../DG_data_coarse/lastfm/minu

## Split Verification

In [4]:
def get_split(data_df, val_ratio=0.15, test_ratio=0.15):
    """Count the rows that fall into a chronological train/val/test split.

    The two cut-off times are the ``1 - val_ratio - test_ratio`` and
    ``1 - test_ratio`` quantiles of ``data_df['ts']``. Rows are assigned by
    timestamp value, not by position, so when many rows share a timestamp the
    partitions can differ from the nominal ratios and may even be empty.

    Args:
        data_df: DataFrame with a numeric ``ts`` column.
        val_ratio: Nominal fraction of rows for validation.
        test_ratio: Nominal fraction of rows for testing.

    Returns:
        Tuple ``(num_train, num_val, num_test)`` of row counts.
    """
    ts = data_df['ts']
    val_time, test_time = np.quantile(ts, [1 - val_ratio - test_ratio, 1 - test_ratio])

    # train: ts <= val_time; val: val_time < ts <= test_time; test: ts > test_time
    in_train = ts <= val_time
    in_val = (ts > val_time) & (ts <= test_time)
    in_test = ts > test_time

    num_train = int(in_train.sum())
    num_val = int(in_val.sum())
    num_test = int(in_test.sum())
    return num_train, num_val, num_test

In [5]:
# Print the train/val/test row counts for every (dataset, granularity) pair so
# that degenerate splits (e.g. an empty validation set at coarse granularities)
# are easy to spot.
for dataset in dataset_list:
    dataset_path = '../DG_data_coarse/{}'.format(dataset)

    for granularity in granularity_list:
        granularity_name = granularity2text[granularity]
        granularity_path = os.path.join(dataset_path, granularity_name)

        # Deliberately NOT named `data_csv`: that global holds the source-path
        # template used by the data-generation cell. Rebinding it here would
        # make a re-run of that cell silently load this one file for every
        # dataset (str.format on a string without placeholders is a no-op).
        coarse_csv_path = os.path.join(granularity_path, 'ml_{}.csv'.format(dataset))
        coarse_df = pd.read_csv(coarse_csv_path)

        num_train, num_val, num_test = get_split(coarse_df)
        print('{:10}|{:8}|{:8}|{:8}|{:8}'.format(dataset, granularity_name, num_train, num_val, num_test))

reddit    |second  |  470713|  100867|  100867
reddit    |minute  |  470722|  100858|  100867
reddit    |hour    |  470815|  101296|  100336
reddit    |day     |  475299|  114325|   82823
reddit    |month   |  650442|       0|   22005
reddit    |year    |  672447|       0|       0
enron     |second  |   87664|   18786|   18785
enron     |minute  |   87664|   18786|   18785
enron     |hour    |   87665|   18808|   18762
enron     |day     |   87743|   18971|   18521
enron     |month   |   89523|   24162|   11550
enron     |year    |  105631|   19604|       0
lastfm    |second  |  905172|  193965|  193966
lastfm    |minute  |  905174|  193963|  193966
lastfm    |hour    |  905232|  193919|  193952
lastfm    |day     |  905232|  194399|  193472
lastfm    |month   |  936951|  182053|  174099
lastfm    |year    | 1159991|       0|  133112
mooc      |second  |  288224|   61762|   61763
mooc      |minute  |  288224|   61770|   61755
mooc      |hour    |  288480|   62271|   60998
mooc      |da

In [None]:
1469665|  314930|  314924
SocialEvo |minute  | 1469671|  314924|  314924
SocialEvo |hour    | 1470682|  314396|  314441
SocialEvo |day     | 1474123|  317858|  307538
SocialEvo |month   | 1668041|  424508|    6970
SocialEvo |year    | 2099519