In [1]:
import sys
import torch
# Add a path to the module search path
sys.path.append('../')
from nodeood.data import TEGenerator, ADSGenerator
from nodeood.data import RandomSampler
from nodeood.datasets import OGBNArxiv, Planetoid
from nodeood.methods import Naive
from nodeood.backbones import GCN
from nodeood.utils import set_random_seed

  from .autonotebook import tqdm as notebook_tqdm


## Fundamentals: DRData and DRDataset

DRData describes a homogeneous graph in DeepRobust2.0. <br>
DRData inherits from torch_geometric.data.Data, so we can use various useful functionalities for analyzing graph structures.

In [2]:
from nodeood.data import DRData

In [3]:
# Construct a toy graph with three nodes joined in a chain (0 - 1 - 2).
# Each undirected link is listed as two directed (source, target) pairs; the
# pair list is then transposed into the 2 x num_edges layout.
edge_index = torch.tensor(
    [[0, 1], [1, 0], [1, 2], [2, 1]],
    dtype=torch.long,
).t().contiguous()
# One scalar feature per node.
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)
data = DRData(x=x, edge_index=edge_index)

In [4]:
# Inspect the graph structure: report the node count and whether the graph
# is directed.
print(f'# Nodes: {data.num_nodes}')
print(f'Is directed: {data.is_directed()}')

# Nodes: 3
Is directed: False


DRDataset is an abstract class representing a graph dataset. A DRDataset contains one or more DRData objects. <br>
DRDataset is responsible for downloading, (customized) processing, and persistence of graphs. <br>
In the following sections, we will provide several examples to load and process datasets under different settings.

## Setting 1: Load Cora with public split (single graph)

In [5]:
# Root directory passed as the first argument to the dataset constructors below.
data_dir = 'data'

In [6]:
# Load Cora with mode='public', i.e. the public split named in this section's heading.
dataset = Planetoid(data_dir, name='cora', mode='public')

Downloading https://ghproxy.com/https://raw.githubusercontent.com/SongYYYY/DeepRobustData/master/Planetoid/cora/public/dataset.zip
Processing...


File extracted successfully.


Done!


In [7]:
# Show the dataset's string representation.
print(dataset)

Planetoid()


In [8]:
# Take the first graph stored in the dataset.
data = dataset[0]

In [9]:
# Print the graph summary; the output lists each stored attribute with its shape.
print(data)

DRData(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])


In [10]:
# Sum each split mask to report how many nodes it selects
# (presumably boolean masks with one entry per node).
print(f'# train nodes: {data.train_mask.sum()}')
print(f'# val nodes: {data.val_mask.sum()}')
print(f'# test nodes: {data.test_mask.sum()}')

# train nodes: 140
# val nodes: 500
# test nodes: 1000


## Setting 2: Generate node splits for Cora (single graph)

A sampler is used to generate masks for the input graph to simulate a certain distribution shift between training and testing NODES. <br>
We now define a simple sampler that selects train, val and test nodes at random.

In [11]:
# RandomSampler is already imported in the first cell of the notebook, so it
# is not imported again here.
# The seed is fixed, presumably so that the sampled masks are reproducible.
sampler = RandomSampler(n_node_per_class=20, n_val=500, n_test=1000, seed=123)
# mode='sampler' hands the sampler to the dataset, which uses it to produce
# the train/val/test masks (sizes are printed in the next cell).
dataset = Planetoid(data_dir, name='cora', mode='sampler', sampler=sampler)

Downloading https://ghproxy.com/https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://ghproxy.com/https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://ghproxy.com/https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://ghproxy.com/https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://ghproxy.com/https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://ghproxy.com/https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://ghproxy.com/https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://ghproxy.com/https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


In [12]:
# Take the single graph of the sampled dataset and report how many nodes each
# generated mask selects.
data = dataset[0]
print(f'# train nodes: {data.train_mask.sum()}')
print(f'# val nodes: {data.val_mask.sum()}')
print(f'# test nodes: {data.test_mask.sum()}')

# train nodes: 140
# val nodes: 500
# test nodes: 1000


## Setting 3: Load Cora with Artificial Distribution Shift (ADS) from EERM (multiple graphs)

In [13]:
# Build one dataset per split of the 'eerm-gcn' variant of Cora. The three
# constructor calls differ only in `split`, so they are produced in order
# (train, val, test) and unpacked into their respective names.
dataset_train, dataset_val, dataset_test = (
    Planetoid(data_dir, name='cora', mode='eerm-gcn', split=split_name)
    for split_name in ('train', 'val', 'test')
)

Downloading https://ghproxy.com/https://raw.githubusercontent.com/SongYYYY/DeepRobustData/master/Planetoid/cora/eerm-gcn/dataset.zip
Processing...


File extracted successfully.


Done!


In [14]:
# Report how many graphs each 'eerm-gcn' split contains.
print(f'Train set: # graphs: {len(dataset_train)}')
print(f'Valid set: # graphs: {len(dataset_val)}')
print(f'Test set: # graphs: {len(dataset_test)}')

Train set: # graphs: 1
Valid set: # graphs: 1
Test set: # graphs: 8


In [15]:
# We can also load other variants of Cora from EERM (e.g., EERM-GAT)

In [16]:
# Same three-way load as for 'eerm-gcn', this time for the 'eerm-gat' variant.
# The datasets are created in train, val, test order and unpacked by position.
dataset_train, dataset_val, dataset_test = [
    Planetoid(data_dir, name='cora', mode='eerm-gat', split=split_name)
    for split_name in ('train', 'val', 'test')
]

Downloading https://ghproxy.com/https://raw.githubusercontent.com/SongYYYY/DeepRobustData/master/Planetoid/cora/eerm-gat/dataset.zip
Processing...


File extracted successfully.


Done!


In [17]:
# Report how many graphs each 'eerm-gat' split contains.
print(f'Train set: # graphs: {len(dataset_train)}')
print(f'Valid set: # graphs: {len(dataset_val)}')
print(f'Test set: # graphs: {len(dataset_test)}')

Train set: # graphs: 1
Valid set: # graphs: 1
Test set: # graphs: 8


## Setting 4: Generate new variants for Cora using ADS generator (multiple graphs)

A generator is used to generate MULTIPLE graphs from ONE given graph to simulate a certain distribution shift between GRAPHS. <br>
We now define an ADSGenerator to create the Artificial Distribution Shift (ADS) introduced in the EERM paper.

In [18]:
# ADSGenerator is already imported in the first cell of the notebook, so it
# is not imported again here.
# We need to define a Dict to specify the desired splits of the generated graphs.
# Presumably each list holds indices into the n_graph=10 generated graphs:
# graph 0 -> train, graph 1 -> val, graphs 2..9 -> test.
split = {'train': [0], 'val': [1], 'test': list(range(2, 10))}
generator = ADSGenerator(n_graph=10, n_class=10, n_feat=10, n_hid=10, model='gcn', seed=12345, split=split)
# Now we can get datasets containing the graphs generated by ADSGenerator.
# All three datasets share the same generator and differ only in which split they expose.
dataset_train = Planetoid(data_dir, name='cora', mode='generator', split='train', generator=generator)
dataset_val = Planetoid(data_dir, name='cora', mode='generator', split='val', generator=generator)
dataset_test = Planetoid(data_dir, name='cora', mode='generator', split='test', generator=generator)

Processing...
# processed graphs: [10<10]
Done!


In [19]:
# Report how many generated graphs landed in each split.
print(f'Train set: # graphs: {len(dataset_train)}')
print(f'Valid set: # graphs: {len(dataset_val)}')
print(f'Test set: # graphs: {len(dataset_test)}')

Train set: # graphs: 1
Valid set: # graphs: 1
Test set: # graphs: 8


Note that the generated graphs are saved to disk the first time they are generated, under a name derived from a hash string that identifies the generator. <br> 
After that, processing is skipped whenever the same generator is used again.

## Setting 5: Split temporal graphs by time: take OGBN-Arxiv for example 

We can use a TEGenerator to split a temporal graph to create Temporal Evolution shift. <br>
In this case, the input graph will be split by the time nodes were added.

In [20]:
# OGBNArxiv and TEGenerator are already imported in the first cell of the
# notebook, so they are not imported again here.
# We need to define a Dict to specify the time spans to split the input graph.
# Each split maps to a list of [start, end] year spans; one graph is produced per span.
# NOTE(review): judging by the spans printed two cells below, a span appears to
# exclude its start year and include its end year - confirm against TEGenerator.
split = {'train': [[1950, 2011]], 'val': [[2011, 2014]], 'test': [[2014, 2016], [2016, 2018], [2018, 2020]]}
generator = TEGenerator(split=split)
# All three datasets share the same generator and differ only in which split they expose.
dataset_train = OGBNArxiv(data_dir, mode='generator', split='train', generator=generator)
dataset_val = OGBNArxiv(data_dir, mode='generator', split='val', generator=generator)
dataset_test = OGBNArxiv(data_dir, mode='generator', split='test', generator=generator)

Downloading https://ghproxy.com/https://raw.githubusercontent.com/SongYYYY/DeepRobustData/master/OGBNArxiv/edge.csv.gz


Downloaded 0.00 GB: 100%|██████████| 6/6 [00:01<00:00,  4.76it/s]


Downloading https://ghproxy.com/https://raw.githubusercontent.com/SongYYYY/DeepRobustData/master/OGBNArxiv/node_year.csv.gz


Downloaded 0.00 GB: 100%|██████████| 2/2 [00:00<00:00,  6.78it/s]


Downloading https://ghproxy.com/https://raw.githubusercontent.com/SongYYYY/DeepRobustData/master/OGBNArxiv/node-feat.csv.gz


Downloaded 0.07 GB: 100%|██████████| 74/74 [00:05<00:00, 14.59it/s]


Downloading https://ghproxy.com/https://raw.githubusercontent.com/SongYYYY/DeepRobustData/master/OGBNArxiv/node-label.csv.gz


Downloaded 0.00 GB: 100%|██████████| 2/2 [00:00<00:00,  3.90it/s]


Downloading https://ghproxy.com/https://raw.githubusercontent.com/SongYYYY/DeepRobustData/master/OGBNArxiv/num-edge-list.csv.gz


Downloaded 0.00 GB: 100%|██████████| 2/2 [00:00<00:00, 415.03it/s]


Downloading https://ghproxy.com/https://raw.githubusercontent.com/SongYYYY/DeepRobustData/master/OGBNArxiv/num-node-list.csv.gz


Downloaded 0.00 GB: 100%|██████████| 2/2 [00:00<00:00, 451.75it/s]
Processing...


Loading necessary files...
This might take a while.
Processing graphs...


# processed graphs: [5<5]
Done!


In [21]:
# Report how many time-span graphs each split contains.
print(f'Train set: # graphs: {len(dataset_train)}')
print(f'Valid set: # graphs: {len(dataset_val)}')
print(f'Test set: # graphs: {len(dataset_test)}')

Train set: # graphs: 1
Valid set: # graphs: 1
Test set: # graphs: 3


In [22]:
# Walk through every graph of every split and print its total node count
# together with the earliest and latest year among the nodes selected by
# `data.mask`.
for name, dataset in (('train', dataset_train), ('val', dataset_val), ('test', dataset_test)):
    for i in range(len(dataset)):
        data = dataset[i]
        # Years of the masked nodes only, computed once and reused for min and max.
        masked_years = data.node_year[data.mask]
        span = [masked_years.min().item(), masked_years.max().item()]
        print(f'Dataset: {name}:{i}, # Nodes: {data.num_nodes}, Test Time Span: {span}.')

Dataset: train:0, # Nodes: 17401, Test Time Span: [1971, 2011].
Dataset: val:0, # Nodes: 41125, Test Time Span: [2012, 2014].
Dataset: test:0, # Nodes: 69499, Test Time Span: [2015, 2016].
Dataset: test:1, # Nodes: 120740, Test Time Span: [2017, 2018].
Dataset: test:2, # Nodes: 169343, Test Time Span: [2019, 2020].


## Setting 6: Load Twitch - A set of graphs from multiple domains 

In [23]:
from nodeood.datasets import Twitch

In [24]:
# List the domain names that Twitch.get_names() reports.
names = Twitch.get_names()
print(names)

['DE', 'ENGB', 'ES', 'FR', 'PTBR', 'RU', 'TW']


In [25]:
# We can load one or more graphs from the above domains by passing a list of
# domain names to Twitch.
domains = ['ENGB', 'FR', 'TW']
dataset = Twitch(data_dir, domains)
print(dataset)
print('Dataset Details:')
print('Number of graphs: ', len(dataset))
# Each graph carries the name of the domain it came from in `data.name`.
for i in range(len(dataset)):
    data = dataset[i]
    print('Graph name: {}'.format(data.name))

Downloading https://ghproxy.com/https://raw.githubusercontent.com/SongYYYY/DeepRobustData/master/Twitch/ENGB/musae_ENGB_edges.csv
Downloading https://ghproxy.com/https://raw.githubusercontent.com/SongYYYY/DeepRobustData/master/Twitch/ENGB/musae_ENGB_features.json
Downloading https://ghproxy.com/https://raw.githubusercontent.com/SongYYYY/DeepRobustData/master/Twitch/ENGB/musae_ENGB_target.csv
Downloading https://ghproxy.com/https://raw.githubusercontent.com/SongYYYY/DeepRobustData/master/Twitch/FR/musae_FR_edges.csv
Downloading https://ghproxy.com/https://raw.githubusercontent.com/SongYYYY/DeepRobustData/master/Twitch/FR/musae_FR_features.json
Downloading https://ghproxy.com/https://raw.githubusercontent.com/SongYYYY/DeepRobustData/master/Twitch/FR/musae_FR_target.csv
Downloading https://ghproxy.com/https://raw.githubusercontent.com/SongYYYY/DeepRobustData/master/Twitch/TW/musae_TW_edges.csv
Downloading https://ghproxy.com/https://raw.githubusercontent.com/SongYYYY/DeepRobustData/master

Twitch(3)
Dataset Details:
Number of graphs:  3
Graph name: ENGB
Graph name: FR
Graph name: TW
