# PyG的Data

In [48]:
import torch
from torch_geometric.data import Data
# Directory prefix shared by every dataset root used in this notebook.
DATA_PATH = "../datas/"

# Connectivity of a 3-node path graph stored as a [2, num_edges] index tensor.
# Each undirected edge appears twice, once per direction.
edge_index = torch.tensor(
    [
        [0, 1, 1, 2],  # first endpoint of every edge
        [1, 0, 2, 1],  # second endpoint of every edge
    ],
    dtype=torch.long,
)
# One scalar feature per node -> shape [3, 1].
x = torch.tensor(
    [[-1], [0], [1]],
    dtype=torch.float,
)

data = Data(x=x, edge_index=edge_index)
data

Data(x=[3, 1], edge_index=[2, 4])

In [49]:
# Walk through the attributes and helper queries of the toy `data` object
# built in the previous cell; a dashed line separates each result.
print(data.keys)
print("-" * 50)
print(data['x'])
print("-" * 50)
# Iterating a Data object yields (key, value) pairs.
for key, item in data:
    print(f'{key} found in data')
print("-" * 50)
print('edge_attr' in data)
print("-" * 50)
print(data.num_nodes)
print("-" * 50)
print(data.num_edges)
print("-" * 50)
print(data.num_node_features)
print("-" * 50)
print(data.has_isolated_nodes())
print("-" * 50)
print(data.has_self_loops())
print("-" * 50)
print(data.is_directed())
print("-" * 50)

# Transfer data object to GPU when one is available. Falling back to the CPU
# keeps this cell runnable on machines without CUDA, matching the device
# selection used by the GCN training cell later in the notebook.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)

['x', 'edge_index']
--------------------------------------------------
tensor([[-1.],
        [ 0.],
        [ 1.]])
--------------------------------------------------
x found in data
edge_index found in data
--------------------------------------------------
False
--------------------------------------------------
3
--------------------------------------------------
4
--------------------------------------------------
1
--------------------------------------------------
False
--------------------------------------------------
False
--------------------------------------------------
False
--------------------------------------------------


# 通用基准数据集

In [50]:
from torch_geometric.datasets import TUDataset

# Load the ENZYMES dataset from a root directory located under DATA_PATH.
dataset = TUDataset(name='ENZYMES', root=DATA_PATH + 'ENZYMES')
print(f"length of ENZYMES:{len(dataset)}")
print(f"num classes:{dataset.num_classes}")
print(f"num node features:{dataset.num_node_features}")
print(dataset[0])
print(dataset[:20])  # slicing a dataset returns a dataset
print(dataset[0].is_undirected())
# print(dataset.__dict__)  # uncomment to inspect the concrete attributes

length of ENZYMES:600
num classes:6
num node features:3
Data(edge_index=[2, 168], x=[37, 3], y=[1])
ENZYMES(20)
True


In [51]:
dataset = dataset.shuffle() # shuffle the dataset
# The call above is equivalent to indexing with a random permutation, as below.
# NOTE: both statements are executed here, so the dataset is permuted twice.
perm = torch.randperm(len(dataset))
dataset = dataset[perm]

In [52]:
# Disabled example kept as a bare string literal: none of the code inside it is
# executed; the notebook only echoes the string back as this cell's output.
'''from torch_geometric.datasets import Planetoid

dataset = Planetoid(root=DATA_PATH + 'Cora', name='Cora')
print("length of Cora:{}".format(len(dataset)))
print("num classes:{}".format(dataset.num_classes))
print("num node features:{}".format(dataset.num_node_features))'''

'from torch_geometric.datasets import Planetoid\n\ndataset = Planetoid(root=DATA_PATH + \'Cora\', name=\'Cora\')\nprint("length of Cora:{}".format(len(dataset)))\nprint("num classes:{}".format(dataset.num_classes))\nprint("num node features:{}".format(dataset.num_node_features))'

# mini-batch

In [53]:
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader

# Reload ENZYMES with use_node_attr=True and wrap it in a shuffling loader
# that yields 32 graphs per mini-batch.
dataset = TUDataset(name='ENZYMES', root=DATA_PATH + 'ENZYMES', use_node_attr=True)
loader = DataLoader(dataset, shuffle=True, batch_size=32)

# Only the first mini-batch is needed for the demonstration.
for batch in loader:
    print(batch)
    break

print(dataset[0])
print(dataset[200])
# List the attribute names held in the batch's internal store.
print(list(vars(batch)["_store"].keys()))

DataBatch(edge_index=[2, 4414], x=[1115, 21], y=[32], batch=[1115], ptr=[33])
Data(edge_index=[2, 168], x=[37, 21], y=[1])
Data(edge_index=[2, 106], x=[29, 21], y=[1])
['edge_index', 'x', 'y', 'batch', 'ptr']


# Batch子图处理

In [54]:
from torch_scatter import scatter_mean

# Same ENZYMES loader as before: 32 graphs per shuffled mini-batch.
dataset = TUDataset(name='ENZYMES', root=DATA_PATH + 'ENZYMES', use_node_attr=True)
loader = DataLoader(dataset, shuffle=True, batch_size=32)

for data in loader:
    print(data)
    print(data.num_graphs)
    # Average the node feature rows along dim 0, grouped by the index vector
    # data.batch, which gives one row per graph in the mini-batch.
    x = scatter_mean(data.x, data.batch, dim=0)
    print(x.shape)
    break

DataBatch(edge_index=[2, 3930], x=[1052, 21], y=[32], batch=[1052], ptr=[33])
32
torch.Size([32, 21])


## Data Transform

In [55]:
import torch_geometric.transforms as T
from torch_geometric.datasets import ShapeNet

# Load the 'Airplane' category of ShapeNet, passing KNNGraph(k=6) as the
# pre_transform and RandomTranslate(0.01) as the transform.
# NOTE(review): the transform is presumably re-applied on every dataset[i]
# access, so the three reads below may each see a different random offset -
# confirm against the library documentation.
dataset = ShapeNet(root=DATA_PATH + 'ShapeNet', categories=['Airplane'],
                    pre_transform=T.KNNGraph(k=6),
                    transform=T.RandomTranslate(0.01))
# In the latest library version, RandomTranslate has been renamed to RandomJitter.

print(dataset[0])
print(dataset[0].edge_index)
print(dataset[0].y)

Data(x=[2518, 3], y=[2518], pos=[2518, 3], category=[1], edge_index=[2, 15108])
tensor([[ 927,  929, 2365,  ..., 2516,  798, 1644],
        [   0,    0,    0,  ..., 2517, 2517, 2517]])
tensor([0, 0, 3,  ..., 3, 1, 1])


## Learning Methods on Graphs

In [56]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.datasets import Planetoid


# Load Cora from the Planetoid collection and print its first graph.
dataset = Planetoid(name='Cora', root=DATA_PATH + 'Cora')
print(dataset[0])

class GCN(torch.nn.Module):
    """Two-layer graph convolutional network that outputs per-node log-probabilities.

    Args:
        in_channels: Size of each input node feature vector. When omitted, it is
            read from ``dataset.num_node_features`` of the notebook-level
            ``dataset`` at construction time.
        hidden_channels: Width of the hidden representation (16 by default).
        out_channels: Number of output classes. When omitted, it is read from
            ``dataset.num_classes`` of the notebook-level ``dataset``.
    """

    def __init__(self, in_channels=None, hidden_channels=16, out_channels=None):
        super().__init__()
        # `dataset` is a notebook global that other cells rebind to different
        # datasets; explicit sizes let callers avoid that hidden dependency,
        # while the fallbacks keep the original `GCN()` call working.
        if in_channels is None:
            in_channels = dataset.num_node_features
        if out_channels is None:
            out_channels = dataset.num_classes
        # GCNConv.forward first calls propagate() from the MessagePassing base
        # class, which then dispatches to the layer's overridden
        # message-passing hooks.
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        """Return log-softmax class scores for every node of ``data``.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is active only in training mode (self.training is True).
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

# Use the GPU when available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gcn = GCN().to(device)
data = dataset[0].to(device)
optimizer = torch.optim.Adam(gcn.parameters(), lr=0.01, weight_decay=5e-4)

# Full-batch training: every step runs the whole graph through the model, but
# the loss is computed only on the nodes selected by train_mask.
gcn.train()
for epoch in range(100):
    optimizer.zero_grad()
    out = gcn(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

# Evaluation: eval() switches dropout off, and no_grad() skips building the
# autograd graph, which is not needed for inference.
gcn.eval()
with torch.no_grad():
    pred = gcn(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
Accuracy: 0.8020


# Exercise

In [57]:
import numpy as np
import pandas as pd


# Load IMDB-BINARY, requesting node and edge attributes, then show the first
# graph together with the number of graphs in the dataset.
dataset = TUDataset(
    name='IMDB-BINARY',
    root=DATA_PATH + 'IMDB-BINARY',
    use_node_attr=True,
    use_edge_attr=True,
)
print(f"data info:{dataset[0]}, data len:{len(dataset)}")

data info:Data(edge_index=[2, 146], y=[1], num_nodes=20), data len:1000


In [58]:
# Display the raw instance dictionary of the first graph.
vars(dataset[0])

{'_store': {'edge_index': tensor([[ 0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,
           2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,
           3,  3,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,
           6,  6,  6,  6,  6,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,
           9,  9,  9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12,
          12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14,
          15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17,
          18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
          19, 19],
         [ 2,  4,  5,  9, 10,  2,  6,  8, 12, 14, 17, 18, 19,  0,  1,  3,  4,  5,
           6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,  2,  6, 11, 12,
          16, 19,  0,  2,  5,  9, 10,  0,  2,  4,  9, 10,  1,  2,  3,  8, 11, 12,
          14, 16, 17, 18, 19,  2, 13, 15,  1,  2,  6, 

In [59]:
def dataset_split(dataset, train_frac = 0.8, val_frac = 0.1, test_frac = 0.1, seed = None):
    """Randomly split ``dataset`` into train / validation / test subsets.

    The samples are permuted first and the permutation is then cut into three
    consecutive, non-overlapping pieces, so every subset is a random sample of
    the whole dataset rather than a contiguous slice of its stored order.

    Args:
        dataset: Sized container that supports indexing with an int64 NumPy
            index array.
        train_frac: Fraction of samples assigned to the training subset.
        val_frac: Fraction of samples assigned to the validation subset.
        test_frac: Fraction of samples assigned to the test subset.
        seed: Optional integer seed for a reproducible split. When ``None``,
            NumPy's global random state is used.

    Returns:
        Tuple ``(trn_data, val_data, tst_data)`` produced by indexing ``dataset``.

    Raises:
        ValueError: If a fraction is negative or the fractions sum to more than 1.
    """
    fracs = (train_frac, val_frac, test_frac)
    # Small tolerance so that e.g. 0.8 + 0.1 + 0.1 is not rejected by float rounding.
    if min(fracs) < 0 or sum(fracs) > 1 + 1e-8:
        raise ValueError(
            "train_frac, val_frac and test_frac must be non-negative and sum to at most 1"
        )

    sample_num = len(dataset)
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Cast explicitly to int64 so the index dtype does not depend on the platform default.
    idxs = rng.permutation(sample_num).astype(np.int64)
    trn_end = int(train_frac * sample_num)
    val_end = trn_end + int(val_frac * sample_num)
    tst_end = val_end + int(test_frac * sample_num)

    # Index with the shuffled positions; the original sliced the unshuffled
    # dataset and never used the permutation it had computed.
    trn_data = dataset[idxs[:trn_end]]
    val_data = dataset[idxs[trn_end: val_end]]
    tst_data = dataset[idxs[val_end: tst_end]]

    return trn_data, val_data, tst_data

# Split the dataset and wrap each subset in a mini-batch loader.
trn_data, val_data, tst_data = dataset_split(dataset)
# Reshuffle the training graphs every epoch so mini-batches do not repeat in a
# fixed order; validation and test loaders keep a deterministic order.
trn_loader = DataLoader(trn_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)
tst_loader = DataLoader(tst_data, batch_size=32)