# Create the dataset
    Notes on creating your own graph dataset with PyTorch Geometric

## import some packages

In [1]:
import numpy as np
import os
import torch
from torch_geometric.data import InMemoryDataset
from torch_geometric.data import Data
import torch_geometric.utils as ut
# Seed NumPy's global RNG so the np.random draws in this notebook are repeatable.
np.random.seed(42)

## Create two simulation datasets
- 10 graphs with 50 nodes per graph and random edge connections
- number of node features = 3
- number of edge features = 1
- node classification and graph classification
        Adj [num_graph, num_node, num_node]: the adjacency matrices (sparse)
        node_feature [num_graph, num_node, num_node_features]
        edge_feature [num_graph, num_node, num_node] (sparse)

In [2]:
# Sizes of the simulated data: graph count, nodes per graph, feature widths.
num_graph, num_node = 10, 50
num_node_features, num_edge_features = 3, 1

# Adjacency: an entry is 1.0 where its uniform draw is >= 0.8, otherwise 0.0.
Adj = (np.random.rand(num_graph, num_node, num_node) >= 0.8).astype(float)
node_feature = np.random.rand(num_graph, num_node, num_node_features)
# Edge features are uniform draws, zeroed wherever there is no edge.
edge_feature = Adj * np.random.rand(num_graph, num_node, num_node)

# Binary labels: 1 where the uniform draw exceeds 0.5, otherwise 0.
graph_label = (np.random.rand(num_graph) > 0.5).astype(int)
node_label = (np.random.rand(num_graph, num_node) > 0.5).astype(int)

# Show the first graph's adjacency, edge features and node features.
print(Adj[0], edge_feature[0], node_feature[0])

[[0. 1. 0. ... 0. 0. 0.]
 [1. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 1. 1. 0.]] [[0.         0.72864219 0.         ... 0.         0.         0.        ]
 [0.79184669 0.         0.02333868 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.3352479  0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.697748   0.         ... 0.81314479 0.12877876 0.        ]] [[0.09237084 0.06075373 0.60419195]
 [0.96611632 0.50272131 0.05151523]
 [0.22121866 0.24479291 0.33785713]
 [0.04360338 0.9048927  0.51556394]
 [0.67093079 0.06717284 0.05411547]
 [0.58646777 0.88729601 0.528482  ]
 [0.87781917 0.81099127 0.07585379]
 [0.20195747 0.65771067 0.44350138]
 [0.29069981 0.41863386 0.44519965]
 [0.91055225 0.10345828 0.27973075]
 [0.53756482 0.69828693 0.63063729]
 [0.

## Example of node classification task InMemoryDataset

In [56]:
class NodeDatasetInMem(InMemoryDataset):
    """
    Node classification on a single graph, kept in memory.

    The graph is built from graph 0 of the module-level arrays ``Adj``,
    ``edge_feature``, ``node_feature`` and ``node_label``. Boolean masks for
    the training, validation and test nodes are stored on the graph.

    Args:
        root: dataset root directory, forwarded to ``InMemoryDataset``.
        num_train_per_class: at most this many nodes of each class (0 and 1)
            are drawn at random for ``train_mask``.
        num_val: number of non-training nodes put in ``val_mask``.
        num_test: number of the following non-training nodes put in
            ``test_mask``.
        transform: forwarded to ``InMemoryDataset``.
        pre_transform: forwarded to ``InMemoryDataset``.
    """

    def __init__(self, root, num_train_per_class=15, num_val=10, num_test=10, transform=None, pre_transform=None):
        # Stored before calling the base constructor because process() reads
        # them (instantiating the dataset is what printed "Processing...").
        self.num_train_per_class = num_train_per_class
        self.num_val = num_val
        self.num_test = num_test
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        # Nothing is downloaded; the data comes from in-memory arrays.
        return []

    @property
    def processed_file_names(self):
        # Plain file name without the Windows-only ".\" prefix, so the same
        # cache file is used on every platform.
        return ['NodeDatasetInMem.dataset']

    def download(self):
        pass

    def process(self):
        """Build the single graph with its split masks and save it collated."""
        graph_idx = 0  # one graph only, so no data_list is needed

        source_nodes, target_nodes = np.nonzero(Adj[graph_idx])
        # edge_index must be a long tensor of shape [2, num_edges]
        edge_index = torch.tensor(np.stack((source_nodes, target_nodes)), dtype=torch.long)

        # edge_attr must be a float tensor of shape [num_edges, num_edge_features]
        edge_weight = edge_feature[graph_idx, source_nodes, target_nodes]
        edge_weight = torch.tensor(edge_weight.reshape((-1, num_edge_features)), dtype=torch.float)

        # Training nodes: a random sample of up to num_train_per_class per class.
        label = node_label[graph_idx]
        train_mask = np.zeros((num_node,), dtype=bool)
        for cls in (0, 1):
            cls_ind = np.flatnonzero(label == cls)
            cls_ind = cls_ind[np.random.permutation(cls_ind.shape[0])]
            train_mask[cls_ind[:self.num_train_per_class]] = True

        # Validation and test nodes are taken, in index order, from the nodes
        # that were not selected for training.
        remaining = np.flatnonzero(~train_mask)
        val_mask = np.zeros((num_node,), dtype=bool)
        test_mask = np.zeros((num_node,), dtype=bool)
        val_mask[remaining[:self.num_val]] = True
        test_mask[remaining[self.num_val:self.num_val + self.num_test]] = True

        data = Data(
            x=torch.tensor(node_feature[graph_idx], dtype=torch.float),
            edge_index=edge_index,
            y=torch.tensor(label, dtype=torch.long),  # y should be long type
            edge_attr=edge_weight,
            # masks are boolean tensors
            train_mask=torch.tensor(train_mask, dtype=torch.bool),
            val_mask=torch.tensor(val_mask, dtype=torch.bool),
            test_mask=torch.tensor(test_mask, dtype=torch.bool),
        )

        data, slices = self.collate([data])
        torch.save((data, slices), self.processed_paths[0])

In [57]:
# Build the in-memory node dataset with the current directory as root.
dataset_node_InMem = NodeDatasetInMem(root='./')

Processing...
Done!


In [59]:
# Inspect the node labels of the stored graph and their shape.
print(dataset_node_InMem[0].y)
print(dataset_node_InMem[0].y.shape)

tensor([1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1,
        0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
        1, 1])
torch.Size([50])


## Example of graph classification task InMemoryDataset

In [7]:
class GraphDatasetInMem(InMemoryDataset):
    """
    Graph classification over several graphs, kept in memory.

    One ``Data`` object is built per graph from the module-level arrays
    ``Adj``, ``edge_feature``, ``node_feature`` and ``graph_label``; all of
    them are collated and saved into a single processed file.

    Args:
        root: dataset root directory, forwarded to ``InMemoryDataset``.
        transform: forwarded to ``InMemoryDataset``.
        pre_transform: forwarded to ``InMemoryDataset``.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        # Nothing is downloaded; the data comes from in-memory arrays.
        return []

    @property
    def processed_file_names(self):
        # Plain file name without the Windows-only ".\" prefix, so the same
        # cache file is used on every platform.
        return ['GraphDatasetInMem.dataset']

    def download(self):
        pass

    def process(self):
        """Build one graph per entry of ``Adj`` and save them collated."""
        data_list = []  # graph classification needs a list with one Data per graph
        for i in range(num_graph):
            source_nodes, target_nodes = np.nonzero(Adj[i])
            # edge_index must be a long tensor of shape [2, num_edges]
            edge_index = torch.tensor(np.stack((source_nodes, target_nodes)), dtype=torch.long)

            # edge_attr must be a float tensor of shape [num_edges, num_edge_features]
            edge_weight = edge_feature[i, source_nodes, target_nodes]
            edge_weight = torch.tensor(edge_weight.reshape((-1, num_edge_features)), dtype=torch.float)

            x = torch.tensor(node_feature[i], dtype=torch.float)

            # y should be long type, and the graph label must not be a
            # 0-dimensional tensor: use [graph_label[i]] rather than graph_label[i]
            y = torch.tensor([graph_label[i]], dtype=torch.long)

            data_list.append(Data(x=x, edge_index=edge_index, y=y, edge_attr=edge_weight))

        # For a single graph this would be self.collate([data])
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

In [8]:
# Build the in-memory graph dataset with the current directory as root.
dataset_graph_InMem = GraphDatasetInMem(root='./')

Processing...
Done!


In [44]:
# Show the first two graphs of the dataset.
print(dataset_graph_InMem[0])
print(dataset_graph_InMem[1])

Data(edge_attr=[504, 1], edge_index=[2, 504], x=[50, 3], y=[1])
Data(edge_attr=[495, 1], edge_index=[2, 495], x=[50, 3], y=[1])


## Example of node classification task Dataset

In [27]:
import os.path as osp
from torch_geometric.data import Dataset

In [31]:
class NodeDataset(Dataset):
    """
    Node classification on a single graph, loaded from disk on access.

    The graph is built from graph 0 of the module-level arrays ``Adj``,
    ``edge_feature``, ``node_feature`` and ``node_label`` and saved as its own
    ``.pt`` file. Boolean masks for the training, validation and test nodes
    are stored on the graph. Unlike the in-memory variant, nothing is loaded
    in ``__init__``; ``get`` reads the file each time.

    Args:
        root: dataset root directory, forwarded to ``Dataset``.
        num_train_per_class: at most this many nodes of each class (0 and 1)
            are drawn at random for ``train_mask``.
        num_val: number of non-training nodes put in ``val_mask``.
        num_test: number of the following non-training nodes put in
            ``test_mask``.
        transform: forwarded to ``Dataset``.
        pre_transform: forwarded to ``Dataset``.
    """

    # Single source of truth for the per-graph file name, shared by
    # processed_file_names, process() and get().
    _FILE_TEMPLATE = 'NodeDataset_{}.pt'

    def __init__(self, root, num_train_per_class=15, num_val=10, num_test=10, transform=None, pre_transform=None):
        # Stored before calling the base constructor because process() reads
        # them (instantiating the dataset is what printed "Processing...").
        self.num_train_per_class = num_train_per_class
        self.num_val = num_val
        self.num_test = num_test
        super().__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self):
        # Nothing is downloaded; the data comes from in-memory arrays.
        return []

    @property
    def processed_file_names(self):
        return [self._FILE_TEMPLATE.format(0)]

    def download(self):
        pass

    def process(self):
        """Build the single graph with its split masks and save it as a .pt file."""
        graph_idx = 0  # one graph only, so no data_list is needed

        source_nodes, target_nodes = np.nonzero(Adj[graph_idx])
        # edge_index must be a long tensor of shape [2, num_edges]
        edge_index = torch.tensor(np.stack((source_nodes, target_nodes)), dtype=torch.long)

        # edge_attr must be a float tensor of shape [num_edges, num_edge_features]
        edge_weight = edge_feature[graph_idx, source_nodes, target_nodes]
        edge_weight = torch.tensor(edge_weight.reshape((-1, num_edge_features)), dtype=torch.float)

        # Training nodes: a random sample of up to num_train_per_class per class.
        label = node_label[graph_idx]
        train_mask = np.zeros((num_node,), dtype=bool)
        for cls in (0, 1):
            cls_ind = np.flatnonzero(label == cls)
            cls_ind = cls_ind[np.random.permutation(cls_ind.shape[0])]
            train_mask[cls_ind[:self.num_train_per_class]] = True

        # Validation and test nodes are taken, in index order, from the nodes
        # that were not selected for training.
        remaining = np.flatnonzero(~train_mask)
        val_mask = np.zeros((num_node,), dtype=bool)
        test_mask = np.zeros((num_node,), dtype=bool)
        val_mask[remaining[:self.num_val]] = True
        test_mask[remaining[self.num_val:self.num_val + self.num_test]] = True

        data = Data(
            x=torch.tensor(node_feature[graph_idx], dtype=torch.float),
            edge_index=edge_index,
            y=torch.tensor(label, dtype=torch.long),  # y should be long type
            edge_attr=edge_weight,
            # masks are boolean tensors
            train_mask=torch.tensor(train_mask, dtype=torch.bool),
            val_mask=torch.tensor(val_mask, dtype=torch.bool),
            test_mask=torch.tensor(test_mask, dtype=torch.bool),
        )
        # Save the Data object directly, one .pt file per graph index.
        torch.save(data, osp.join(self.processed_dir, self._FILE_TEMPLATE.format(graph_idx)))

    def len(self):
        """Return the number of stored graphs (one per processed file)."""
        return len(self.processed_file_names)

    def get(self, idx):
        """Load and return the graph stored under index ``idx``."""
        return torch.load(osp.join(self.processed_dir, self._FILE_TEMPLATE.format(idx)))

In [32]:
# Build the on-disk node dataset with the current directory as root.
dataset_node = NodeDataset(root='./')

Processing...
Done!


In [33]:
# Fetch the first graph; as the last expression of the cell its repr is displayed.
dataset_node[0]

Data(edge_attr=[504, 1], edge_index=[2, 504], test_mask=[50], train_mask=[50], val_mask=[50], x=[50, 3], y=[50])

## Example of graph classification task Dataset
    save one graph per .pt file

In [60]:
class GraphDataset_1(Dataset):
    """
    Graph classification with one ``.pt`` file saved per graph.

    One ``Data`` object is built per graph from the module-level arrays
    ``Adj``, ``edge_feature``, ``node_feature`` and ``graph_label``. Graphs
    are loaded from disk one at a time in ``get``.

    Args:
        root: dataset root directory, forwarded to ``Dataset``.
        transform: forwarded to ``Dataset``.
        pre_transform: forwarded to ``Dataset``.
    """

    # Single source of truth for the per-graph file name. The spelling is the
    # one process() has always written to disk, so the names reported by
    # processed_file_names match the saved files on case-sensitive
    # filesystems as well.
    _FILE_TEMPLATE = 'graphDataset1_{}.pt'

    def __init__(self, root, transform=None, pre_transform=None):
        super().__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self):
        # Nothing is downloaded; the data comes from in-memory arrays.
        return []

    @property
    def processed_file_names(self):
        # One file per graph, derived from num_graph instead of a hard-coded list.
        return [self._FILE_TEMPLATE.format(i) for i in range(num_graph)]

    def download(self):
        pass

    def process(self):
        """Build one graph per entry of ``Adj`` and save each to its own file."""
        for i in range(num_graph):
            source_nodes, target_nodes = np.nonzero(Adj[i])
            # edge_index must be a long tensor of shape [2, num_edges]
            edge_index = torch.tensor(np.stack((source_nodes, target_nodes)), dtype=torch.long)

            # edge_attr must be a float tensor of shape [num_edges, num_edge_features]
            edge_weight = edge_feature[i, source_nodes, target_nodes]
            edge_weight = torch.tensor(edge_weight.reshape((-1, num_edge_features)), dtype=torch.float)

            x = torch.tensor(node_feature[i], dtype=torch.float)

            # y should be long type, and the graph label must not be a
            # 0-dimensional tensor: use [graph_label[i]] rather than graph_label[i]
            y = torch.tensor([graph_label[i]], dtype=torch.long)

            data = Data(x=x, edge_index=edge_index, y=y, edge_attr=edge_weight)
            # Save one graph at a time instead of collecting a data_list.
            torch.save(data, osp.join(self.processed_dir, self._FILE_TEMPLATE.format(i)))

    def len(self):
        """Return the number of stored graphs (one per processed file)."""
        return len(self.processed_file_names)

    def get(self, idx):
        """Load and return the graph stored under index ``idx``."""
        return torch.load(osp.join(self.processed_dir, self._FILE_TEMPLATE.format(idx)))
            

In [61]:
# Build the one-file-per-graph dataset with the current directory as root.
dataset_graph_1 = GraphDataset_1(root='./')

Processing...
Done!


In [62]:
# Show the first two graphs of the dataset.
print(dataset_graph_1[0])
print(dataset_graph_1[1])

Data(edge_attr=[504, 1], edge_index=[2, 504], x=[50, 3], y=[1])
Data(edge_attr=[495, 1], edge_index=[2, 495], x=[50, 3], y=[1])
