## 数据

- torch_geometric.data
    - Data
    - DataLoader
- torch_geometric.datasets
- torch_geometric.transforms

In [None]:
'''
单个图由torch_geometric.data.Data的一个实例描述
具体属性：
    data.x: 结点特征矩阵，[num_nodes, num_node_features]
    data.edge_index: COO格式，[2, num_edges]，type为torch.long
'''

In [1]:
import torch
from torch_geometric.data import Data

In [2]:
# Build an unweighted, undirected graph with three nodes and one feature per
# node. Each undirected edge is stored as two directed entries, giving four
# columns in edge_index.
# The edges are written as (source, target) pairs and then transposed into the
# [2, num_edges] layout; this is equivalent to passing
# torch.tensor([[0, 1, 1, 2], [1, 0, 2, 1]], dtype=torch.long) directly.
edge_index = torch.tensor(
    [[0, 1], [1, 0], [1, 2], [2, 1]], dtype=torch.long
).t().contiguous()
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)

data = Data(edge_index=edge_index, x=x)
data

Data(edge_index=[2, 4], x=[3, 1])

In [3]:
# List the attribute names stored on the graph, then read one attribute by key.
# NOTE(review): `keys` is accessed as a property here; some PyG releases expose
# it as a method instead — confirm against the installed version.
print(data.keys)
print(data['x'])


['x', 'edge_index']
tensor([[-1.],
        [ 0.],
        [ 1.]])


In [4]:
# Summarise the graph: node count, edge count, per-node feature width, and
# three structural checks (isolated nodes, self-loops, directedness).
# NOTE(review): later PyG releases appear to rename the `contains_*` helpers to
# `has_isolated_nodes()` / `has_self_loops()` — confirm for the installed version.
print(data.num_nodes, data.num_edges, data.num_node_features, data.contains_isolated_nodes(), data.contains_self_loops(), data.is_directed())

3 4 1 False False False


In [5]:
# Use the GPU when one is available and otherwise stay on the CPU, so this cell
# also runs on CPU-only machines (the same selection the training cell uses).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)

In [None]:
'''
常见基准数据集
    所有Planetoid数据集: Cora, Citeseer, Pubmed
    来自 TU Dortmund 图核基准集的所有图分类数据集（即TUDataset，如下文的ENZYMES）
    QM7和QM9数据集
    一些3D mesh/point cloud数据集，如FAUST, ModelNet10/40和ShapeNet
初始化这些数据集时会自动下载原始文件，并将其处理成上述Data格式
'''

In [6]:
# 下载ENZYMES数据集，包含600个图，共6类
from torch_geometric.datasets import TUDataset

In [7]:
# Instantiate the ENZYMES dataset with data/ENZYMES as its root directory.
dataset = TUDataset(root='data/ENZYMES', name='ENZYMES')

In [8]:
# Display the dataset's repr (dataset name and number of graphs).
dataset

ENZYMES(600)

In [9]:
# Number of graph classes and the per-node feature width.
print(dataset.num_classes, dataset.num_node_features)

6 3


In [10]:
# Fetch the first graph. Its 168 edge_index columns correspond to
# 168 / 2 = 84 undirected edges, and the graph carries a single class label.
data = dataset[0]
data

Data(edge_index=[2, 168], x=[37, 3], y=[1])

In [11]:
# Check whether the graph is undirected.
data.is_undirected()

True

In [12]:
# Slice the dataset into the first 540 graphs (train) and the remainder (test).
# NOTE(review): the dataset is only shuffled in a later cell, so this split
# follows the stored order — confirm that is intended.
train_dataset, test_dataset = dataset[:540], dataset[540:]
print(train_dataset, test_dataset)

ENZYMES(540) ENZYMES(60)


In [13]:
# Randomly permute the dataset; equivalent to
#   perm = torch.randperm(len(dataset)); dataset = dataset[perm]
dataset = dataset.shuffle()

In [14]:
# 下载Cora数据集(用于半监督图结点分类的基准数据集)
from torch_geometric.datasets import Planetoid

In [15]:
# If this fails with "RemoteDisconnected: Remote end closed connection without
# response", download the raw files from GitHub manually into data/Cora.
dataset = Planetoid(root='data/Cora', name='Cora')

In [16]:
# Number of graphs in the dataset.
len(dataset)

1

In [17]:
# Number of node classes and the per-node feature width.
print(dataset.num_classes, dataset.num_node_features)

7 1433


In [18]:
# Take the graph at index 0 and print its attribute summary.
data = dataset[0]
print(data)

Data(edge_index=[2, 10556], test_mask=[2708], train_mask=[2708], val_mask=[2708], x=[2708, 1433], y=[2708])


In [19]:
# Each mask marks the nodes used by the corresponding split; summing a mask
# gives the number of nodes in that split.
print(data.is_undirected(), data.train_mask.sum().item(), data.val_mask.sum().item(), data.test_mask.sum().item())

True 140 500 1000


In [None]:
'''
Mini-batches和DataLoader
'''

In [20]:
from torch_geometric.datasets import TUDataset
from torch_geometric.data import DataLoader

In [21]:
# Reload ENZYMES with use_node_attr=True and wrap it in a shuffling loader
# that yields mini-batches of 32 graphs.
dataset = TUDataset(root='data/ENZYMES', name='ENZYMES', use_node_attr=True)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [22]:
# Each item yielded by the loader is a Batch object holding several graphs.
# Its `batch` attribute (i.e. `batch.batch` here) is the column vector that maps
# every node to the index of the graph it belongs to.
for batch in loader:
    print(batch)
    print(batch.num_graphs)

Batch(batch=[1116], edge_index=[2, 4172], ptr=[33], x=[1116, 21], y=[32])
32
Batch(batch=[1235], edge_index=[2, 4422], ptr=[33], x=[1235, 21], y=[32])
32
Batch(batch=[979], edge_index=[2, 3760], ptr=[33], x=[979, 21], y=[32])
32
Batch(batch=[1011], edge_index=[2, 3862], ptr=[33], x=[1011, 21], y=[32])
32
Batch(batch=[1047], edge_index=[2, 4166], ptr=[33], x=[1047, 21], y=[32])
32
Batch(batch=[1069], edge_index=[2, 4114], ptr=[33], x=[1069, 21], y=[32])
32
Batch(batch=[1009], edge_index=[2, 3916], ptr=[33], x=[1009, 21], y=[32])
32
Batch(batch=[1013], edge_index=[2, 3850], ptr=[33], x=[1013, 21], y=[32])
32
Batch(batch=[950], edge_index=[2, 3656], ptr=[33], x=[950, 21], y=[32])
32
Batch(batch=[977], edge_index=[2, 3894], ptr=[33], x=[977, 21], y=[32])
32
Batch(batch=[1026], edge_index=[2, 4004], ptr=[33], x=[1026, 21], y=[32])
32
Batch(batch=[1008], edge_index=[2, 3634], ptr=[33], x=[1008, 21], y=[32])
32
Batch(batch=[1000], edge_index=[2, 3892], ptr=[33], x=[1000, 21], y=[32])
32
Batch

In [23]:
from torch_scatter import scatter_mean
for data in loader:
    print(data)
    print(data.num_graphs)
    # Reduce node features along dim 0, grouped by each node's graph index in
    # data.batch; the printed size is [num_graphs, num_node_features].
    x = scatter_mean(data.x, data.batch, dim=0)
    print(x.size())

Batch(batch=[1026], edge_index=[2, 4048], ptr=[33], x=[1026, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[1187], edge_index=[2, 4080], ptr=[33], x=[1187, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[1087], edge_index=[2, 4158], ptr=[33], x=[1087, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[1027], edge_index=[2, 3880], ptr=[33], x=[1027, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[1084], edge_index=[2, 4310], ptr=[33], x=[1084, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[1085], edge_index=[2, 4028], ptr=[33], x=[1085, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[949], edge_index=[2, 3810], ptr=[33], x=[949, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[1000], edge_index=[2, 3998], ptr=[33], x=[1000, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[1038], edge_index=[2, 3878], ptr=[33], x=[1038, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[1066], edge_index=[2, 4162], ptr=[33], x=[1066, 21], y=[32])
32
torch.Size([32, 21])
Batch(batch=[1048], ed

In [None]:
'''
Data Transforms: 以Data为输入，返回一个变换后的Data
可用torch_geometric.transforms.Compose链接变换
'''

In [24]:
from torch_geometric.datasets import ShapeNet
# Point-cloud dataset; per the original note it holds about 17,000 3D shape
# point clouds across 16 shape categories. Only 'Airplane' is loaded here.
dataset = ShapeNet(root='data/ShapeNet', categories=['Airplane'])

Processing...
Done!


In [25]:
# Display the first sample.
dataset[0]

Data(category=[1], pos=[2518, 3], x=[2518, 3], y=[2518])

In [26]:
# Turn the point-cloud dataset into a graph dataset by generating a
# nearest-neighbour graph for each sample.
import torch_geometric.transforms as T
# The previously generated `processed` folder must be deleted first.
# A `transform=T.RandomTranslate(0.01)` argument can also be added.
dataset = ShapeNet(root='data/ShapeNet', categories=['Airplane'],
                    pre_transform=T.KNNGraph(k=6))
print(dataset[0])


Processing...
Done!
Data(category=[1], edge_index=[2, 15108], pos=[2518, 3], x=[2518, 3], y=[2518])


## 图上的学习方法

- torch_geometric.nn

In [27]:
# Rebind `dataset` to Cora; the Net class and the training cell read this global.
dataset = Planetoid(root='data/Cora', name='Cora')

In [28]:
# Print the dataset's repr.
print(dataset)

Cora()


In [29]:
## 执行两层GCN
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class Net(torch.nn.Module):
    """Two-layer GCN for node classification.

    Layer sizes come from the notebook-level ``dataset``: the input width is
    ``dataset.num_node_features``, the hidden width is 16, and the output
    width is ``dataset.num_classes``.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(dataset.num_node_features, 16)
        self.conv2 = GCNConv(16, dataset.num_classes)

    def forward(self, data):
        """Return per-node log-probabilities for the graph in ``data``.

        ``data`` must provide ``x`` (node features) and ``edge_index``.
        """
        edge_index = data.edge_index

        # First convolution, then ReLU; dropout is active only in training mode.
        hidden = F.relu(self.conv1(data.x, edge_index))
        hidden = F.dropout(hidden, training=self.training)

        # Second convolution produces one score per class for every node.
        logits = self.conv2(hidden, edge_index)
        return F.log_softmax(logits, dim=1)

In [30]:
# Pick the GPU when available, then move the model and the graph at index 0 to it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
data = dataset[0].to(device)
# Adam over all model parameters with learning rate 0.01 and weight decay 5e-4.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

model.train()  # training mode, so the dropout in Net.forward is active
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)  # forward pass over the whole graph; returns log-probabilities
    # Only the nodes selected by train_mask contribute to the loss.
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

In [31]:
# Evaluate on the test nodes.
model.eval()  # evaluation mode, so the dropout in Net.forward is disabled
# Inference only: disable autograd so no graph is built for this forward pass.
# argmax is used instead of `_, pred = ....max(dim=1)` so the cell does not
# overwrite `_`, which IPython uses for the last cell output.
with torch.no_grad():
    pred = model(data).argmax(dim=1)
# Count the test nodes whose predicted class matches the label.
correct = int((pred[data.test_mask] == data.y[data.test_mask]).sum())
acc = correct / int(data.test_mask.sum())
print('Accuracy: {:.4f}'.format(acc))

Accuracy: 0.7940
