In [1]:
import pandas as pd
import numpy as np

In [2]:
# Bucket widths used to discretise the `ts` column. The unit labels below
# assume `ts` is measured in seconds -- TODO confirm against the datasets.
granularity_list = [
    1,                       # 1 second
    60,                      # 1 minute
    60 * 60,                 # 1 hour
    24 * 60 * 60,            # 1 day
    30 * 24 * 60 * 60,       # 30 days ("month")
    12 * 30 * 24 * 60 * 60,  # 12 x 30 days ("year")
]

In [3]:
def get_split_edges(data_df, split_timestamp_list):
    """Count the edges (rows) that land in the train / validation / test splits.

    Rows are assigned by their ``ts`` value:

    * train:      ``ts <= split_timestamp_list[0]``
    * validation: ``split_timestamp_list[0] < ts <= split_timestamp_list[1]``
    * test:       ``ts > split_timestamp_list[1]``

    Args:
        data_df (pd.DataFrame): dataset; must contain a ``ts`` column
        split_timestamp_list (list): boundary timestamps; element 0 closes the
            train split and element 1 closes the validation split

    Returns:
        list: ``[num_train_edges, num_valid_edges, num_test_edges]``
    """
    timestamps = data_df['ts']
    train_end = split_timestamp_list[0]
    valid_end = split_timestamp_list[1]

    # One boolean mask per split, in train / validation / test order.
    split_masks = (
        timestamps <= train_end,
        (timestamps > train_end) & (timestamps <= valid_end),
        timestamps > valid_end,
    )

    # The number of rows selected by each mask is the edge count of that split.
    return [len(timestamps[mask]) for mask in split_masks]

In [4]:
def get_split_steps(data_df, split_timestamp_list):
    """Count the distinct ``ts`` values in the train / validation / test splits.

    Rows are assigned by their ``ts`` value:

    * train:      ``ts <= split_timestamp_list[0]``
    * validation: ``split_timestamp_list[0] < ts <= split_timestamp_list[1]``
    * test:       ``ts > split_timestamp_list[1]``

    Args:
        data_df (pd.DataFrame): dataset; must contain a ``ts`` column
        split_timestamp_list (list): boundary timestamps; element 0 closes the
            train split and element 1 closes the validation split

    Returns:
        list: ``[num_train_steps, num_valid_steps, num_test_steps]``, where a
        "step" is one unique timestamp inside the split
    """
    timestamps = data_df['ts']
    train_end = split_timestamp_list[0]
    valid_end = split_timestamp_list[1]

    # One boolean mask per split, in train / validation / test order.
    split_masks = (
        timestamps <= train_end,
        (timestamps > train_end) & (timestamps <= valid_end),
        timestamps > valid_end,
    )

    # Several edges can share a timestamp, so count unique values, not rows.
    return [timestamps[mask].nunique() for mask in split_masks]

## Wikipedia

In [5]:
# Load the Wikipedia edge list and discard the 'Unnamed: 0' column
# (presumably the index column written when the CSV was saved).
data_file = '../DG_data/TG_network_datasets/wikipedia/ml_wikipedia.csv'
data_df = pd.read_csv(data_file).drop(columns=['Unnamed: 0'])

# For every granularity, map each timestamp to the bucket ceil(ts / granularity)
# and record how many distinct buckets the dataset occupies.
num_step_list = []
for granularity in granularity_list:
    num_step_list.append(np.ceil(data_df['ts'] / granularity).nunique())
num_step_list

[152757, 42569, 745, 32, 3, 2]

In [6]:
# Split boundaries as multiples of `day` (one day if `ts` is in seconds --
# TODO confirm units): train ends at 20 * day, validation at 25 * day.
day = 24 * 60 * 60
split_timestamp_list = [day * 20, day * 25]
(
    get_split_edges(data_df, split_timestamp_list),
    get_split_steps(data_df, split_timestamp_list),
)

([102799, 27574, 27101], [99701, 26697, 26359])

## Reddit

In [7]:
# Load the Reddit edge list and discard the 'Unnamed: 0' column
# (presumably the index column written when the CSV was saved).
data_file = '../DG_data/TG_network_datasets/reddit/ml_reddit.csv'
data_df = pd.read_csv(data_file).drop(columns=['Unnamed: 0'])

# For every granularity, map each timestamp to the bucket ceil(ts / granularity)
# and record how many distinct buckets the dataset occupies.
num_step_list = []
for granularity in granularity_list:
    num_step_list.append(np.ceil(data_df['ts'] / granularity).nunique())
num_step_list

[588918, 44640, 745, 32, 3, 2]

In [8]:
# Split boundaries as multiples of `day` (one day if `ts` is in seconds --
# TODO confirm units): train ends at 20 * day, validation at 25 * day.
day = 24 * 60 * 60
split_timestamp_list = [day * 20, day * 25]
(
    get_split_edges(data_df, split_timestamp_list),
    get_split_steps(data_df, split_timestamp_list),
)

([434680, 110586, 127181], [432543, 110004, 126518])

## MOOC

In [9]:
# Load the MOOC edge list and discard the 'Unnamed: 0' column
# (presumably the index column written when the CSV was saved).
data_file = '../DG_data/TG_network_datasets/mooc/ml_mooc.csv'
data_df = pd.read_csv(data_file).drop(columns=['Unnamed: 0'])

# For every granularity, map each timestamp to the bucket ceil(ts / granularity)
# and record how many distinct buckets the dataset occupies.
num_step_list = []
for granularity in granularity_list:
    num_step_list.append(np.ceil(data_df['ts'] / granularity).nunique())
num_step_list

[345600, 33276, 707, 31, 2, 2]

In [10]:
# Split boundaries as multiples of `day` (one day if `ts` is in seconds --
# TODO confirm units): train ends at 20 * day, validation at 25 * day.
day = 24 * 60 * 60
split_timestamp_list = [day * 20, day * 25]
(
    get_split_edges(data_df, split_timestamp_list),
    get_split_steps(data_df, split_timestamp_list),
)

([255776, 78732, 77241], [216364, 65815, 63421])

## LastFM

In [11]:
# Load the LastFM edge list and discard the 'Unnamed: 0' column
# (presumably the index column written when the CSV was saved).
data_file = '../DG_data/TG_network_datasets/lastfm/ml_lastfm.csv'
data_df = pd.read_csv(data_file).drop(columns=['Unnamed: 0'])

# For every granularity, map each timestamp to the bucket ceil(ts / granularity)
# and record how many distinct buckets the dataset occupies.
num_step_list = []
for granularity in granularity_list:
    num_step_list.append(np.ceil(data_df['ts'] / granularity).nunique())
num_step_list

[1283614, 922509, 37542, 1587, 54, 6]

In [12]:
# Split boundaries as multiples of `day` (one day if `ts` is in seconds --
# TODO confirm units): train ends at 1216 * day, validation at 1520 * day.
day = 24 * 60 * 60
split_timestamp_list = [day * 1216, day * 1520]
(
    get_split_edges(data_df, split_timestamp_list),
    get_split_steps(data_df, split_timestamp_list),
)

([921756, 344470, 26877], [916312, 340736, 26566])

## Enron

In [13]:
# Load the Enron edge list and discard the 'Unnamed: 0' column
# (presumably the index column written when the CSV was saved).
data_file = '../DG_data/TG_network_datasets/enron/ml_enron.csv'
data_df = pd.read_csv(data_file).drop(columns=['Unnamed: 0'])

# For every granularity, map each timestamp to the bucket ceil(ts / granularity)
# and record how many distinct buckets the dataset occupies.
num_step_list = []
for granularity in granularity_list:
    num_step_list.append(np.ceil(data_df['ts'] / granularity).nunique())
num_step_list

[22632, 22189, 8079, 1032, 45, 5]

In [14]:
# Split boundaries as multiples of `day` (one day if `ts` is in seconds --
# TODO confirm units): train ends at 730 * day, validation at 912 * day.
day = 24 * 60 * 60
split_timestamp_list = [day * 730, day * 912]
(
    get_split_edges(data_df, split_timestamp_list),
    get_split_steps(data_df, split_timestamp_list),
)

([37440, 41866, 45929], [6224, 6357, 10051])

## Social Evo.

In [15]:
# Load the SocialEvo edge list and discard the 'Unnamed: 0' column
# (presumably the index column written when the CSV was saved).
data_file = '../DG_data/TG_network_datasets/SocialEvo/ml_SocialEvo.csv'
data_df = pd.read_csv(data_file).drop(columns=['Unnamed: 0'])

# For every granularity, map each timestamp to the bucket ceil(ts / granularity)
# and record how many distinct buckets the dataset occupies.
num_step_list = []
for granularity in granularity_list:
    num_step_list.append(np.ceil(data_df['ts'] / granularity).nunique())
num_step_list

[565932, 242886, 5685, 244, 10, 2]

In [16]:
# Split boundaries as multiples of `day` (one day if `ts` is in seconds --
# TODO confirm units): train ends at 160 * day, validation at 200 * day.
day = 24 * 60 * 60
split_timestamp_list = [day * 160, day * 200]
(
    get_split_edges(data_df, split_timestamp_list),
    get_split_steps(data_df, split_timestamp_list),
)

([945394, 568867, 585258], [268758, 136849, 160325])

## UCI

In [17]:
# Load the UCI edge list and discard the 'Unnamed: 0' column
# (presumably the index column written when the CSV was saved).
data_file = '../DG_data/TG_network_datasets/uci/ml_uci.csv'
data_df = pd.read_csv(data_file).drop(columns=['Unnamed: 0'])

# For every granularity, map each timestamp to the bucket ceil(ts / granularity)
# and record how many distinct buckets the dataset occupies.
num_step_list = []
for granularity in granularity_list:
    num_step_list.append(np.ceil(data_df['ts'] / granularity).nunique())
num_step_list

[58911, 35908, 3313, 192, 8, 2]

In [18]:
# Split boundaries as multiples of `day` (one day if `ts` is in seconds --
# TODO confirm units): train ends at 130 * day, validation at 162 * day.
day = 24 * 60 * 60
split_timestamp_list = [day * 130, day * 162]
(
    get_split_edges(data_df, split_timestamp_list),
    get_split_steps(data_df, split_timestamp_list),
)

([56080, 2404, 1351], [55202, 2402, 1307])