# torch_geometric.data.Data
[documents](https://pytorch-geometric.readthedocs.io/en/latest/notes/introduction.html#data-handling-of-graphs)
### data.x 顶点的特征矩阵 dim:[num_nodes,num_node_features]
### data.edge_index 图的连通性矩阵 dim:[2,num_edges] torch.long
### data.edge_attr 边的特征矩阵 dim:[num_edges,num_edge_features]
### data.y 要训练的目标，dim随意，例如节点级别的target是[num_nodes,*]，图级别的target是[1,*]
### data.pos 节点的位置矩阵 dim[num_nodes,num_dimensions]

In [2]:
# Build an unweighted, undirected graph with 3 nodes and one scalar feature
# per node. Its 2 undirected edges are stored as 4 directed entries, one
# column per direction.

import torch as t
from torch_geometric.data import Data

edge_index = t.tensor(
    [
        [0, 1, 1, 2],  # start node of every directed edge
        [1, 0, 2, 1],  # end node of every directed edge
    ],
    dtype=t.long,
)
x = t.tensor([[-1], [0], [1]], dtype=t.float32)

data = Data(x=x, edge_index=edge_index)
data

Data(x=[3, 1], edge_index=[2, 4])

edge_index的第一行是每条边的起点，第二行是每条边的终点，而不是由(起点,终点)下标对组成的列表；如果想按下标对的形式来写，定义之后再调用transpose（即t()）和contiguous就可以

contiguous() → Tensor
        返回一个内存连续的有相同数据的 tensor，如果原 tensor 内存连续则返回原 tensor

In [3]:
# Edges written as a list of (start, end) index pairs, one pair per row.
edge_index2 = t.tensor(
    [
        [0, 1],
        [1, 0],
        [1, 2],
        [2, 1],
    ],
    dtype=t.long,
)
# Transpose into the [2, num_edges] layout and make the result contiguous in
# memory. The stand-alone `edge_index2.t()` call was dropped because its
# result was discarded and it had no effect.
print(edge_index2.t().contiguous())


tensor([[0, 1, 1, 2],
        [1, 0, 2, 1]])


**虽然一个图只有两个边，我们要定义4个下标对来表示边的两个方向**


In [4]:
# Access the attributes stored on `data`

# NOTE(review): `keys` is read as a property here; newer PyG releases may
# expose it as a method instead - confirm against the installed version.
print(data.keys)
print(data["x"])

# Iterating over `data` yields (attribute name, value) pairs.
for key,item in data:
    print(f'{key} found in data , it is {item}')

# Membership test by attribute name.
print("edge_attr" in data)

['edge_index', 'x']
tensor([[-1.],
        [ 0.],
        [ 1.]])
x found in data , it is tensor([[-1.],
        [ 0.],
        [ 1.]])
edge_index found in data , it is tensor([[0, 1, 1, 2],
        [1, 0, 2, 1]])
False


In [5]:
# Print summary properties of the graph held in `data`: the size counters
# first, then the structural checks, then whether it lives on a CUDA device.
print(data.num_nodes)
print(data.num_node_features)
print(data.num_edges)
print(data.num_edge_features)
print(data.has_isolated_nodes())
print(data.has_self_loops())
print(data.is_directed())
print(data.is_undirected())
print(data.is_cuda)


3
1
4
0
False
False
False
True
False


# Benchmark Datasets
PyG内置了许多常用的基准数据集：Planetoid 数据集、图分类数据集及其清洗后的版本、QM7 和 QM9 数据集，以及少数 3D 网格/点云数据集

In [6]:
from torch_geometric.datasets import TUDataset

# Load the ENZYMES dataset, using ./datasets/ENZYMES as its root directory.
dataset = TUDataset(root="./datasets/ENZYMES",name="ENZYMES")
dataset


ENZYMES(600)

In [7]:
# Dataset-level statistics: number of graphs, number of classes, and number
# of features per node.
print(len(dataset))
print(dataset.num_classes)
print(dataset.num_node_features)

600
6
3


we now have access to all 600 graphs in the dataset

In [8]:
# Indexing the dataset returns a single graph as a `Data` object.
data = dataset[0]
print(data)
print(data.is_undirected())
# `y` holds one label for the whole graph.
print(data.y)

Data(edge_index=[2, 168], x=[37, 3], y=[1])
True
tensor([5])


第一张图含有37个节点，一个节点有3个特征，有168/2 = 84 条边，然后这张图有一个类别，此外，data对象包含有一个图级别的target

使用切片 long或者bool张量来分割数据集

In [9]:
# Split by slicing: the first 540 graphs for training, the rest for testing.
train_data = dataset[:540]
test_data = dataset[540:]
print(train_data,test_data)

ENZYMES(540) ENZYMES(60)


In [10]:
# Randomly permute the order of the graphs in the dataset.
dataset = dataset.shuffle()
""" 
EQUALS to : 
perm = t.randperm(len(dataset))
dataset = dataset [perm]
"""

' \nEQUALS to : \nperm = t.randperm(len(dataset))\ndataset = dataset [perm]\n'

# CORA
半监督学习的标准数据集，用于节点分类

In [11]:
from torch_geometric.datasets import Planetoid
# Load Cora from the Planetoid collection, using ./datasets/Cora as root.
dataset = Planetoid(root="./datasets/Cora",name="Cora")
print(dataset)
# The dataset consists of a single graph, so its length is 1.
print(len(dataset))
print(dataset.num_classes)
print(dataset.num_node_features)

Cora()
1
7
1433


In [12]:
# Take the only graph of the dataset and display its summary.
data = dataset[0]
data

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

In [13]:
print(data.is_undirected())
# Each mask is a boolean tensor with one entry per node; summing it counts
# how many nodes belong to that split.
print(data.train_mask.sum().item())
print(data.train_mask.shape)
print(data.train_mask)
print(data.val_mask.sum().item())
print(data.test_mask.sum().item())


True
140
torch.Size([2708])
tensor([ True,  True,  True,  ..., False, False, False])
500
1000


这次，data对象中存储着每个节点的标签，和额外的顶点级别的属性：train_mask,val_mask,test_mask

# mini - batches
神经网络经常使用batch来加快速度，PYG也有类似功能，PyG通过创建稀疏的块状对角线邻接矩阵来实现小批量的并行化。这种组合允许在一个批次中对实例进行不同数量的节点和边的训练

pyg有自己的dataloader： torch_geometric.loader.DataLoader


In [14]:
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader

# With `use_node_attr=True` the printed batches show 21 node features instead
# of the 3 reported when the dataset was loaded without it.
dataset = TUDataset(root="./datasets/ENZYMES",name = "ENZYMES",use_node_attr=True)
# Iterate over the dataset in shuffled mini-batches of 32 graphs.
loader=DataLoader(dataset,batch_size=32,shuffle=True)
for batch in loader : 
    print(batch)
print("-------------------")
print(dataset[2])
# After the loop, `batch` still refers to the last mini-batch produced.
print(f"batch.num_graphs is {batch.num_graphs}")

DataBatch(edge_index=[2, 3120], x=[839, 21], y=[32], batch=[839], ptr=[33])
DataBatch(edge_index=[2, 3968], x=[1022, 21], y=[32], batch=[1022], ptr=[33])
DataBatch(edge_index=[2, 4190], x=[1081, 21], y=[32], batch=[1081], ptr=[33])
DataBatch(edge_index=[2, 3948], x=[994, 21], y=[32], batch=[994], ptr=[33])
DataBatch(edge_index=[2, 4632], x=[1344, 21], y=[32], batch=[1344], ptr=[33])
DataBatch(edge_index=[2, 4002], x=[1083, 21], y=[32], batch=[1083], ptr=[33])
DataBatch(edge_index=[2, 4162], x=[1049, 21], y=[32], batch=[1049], ptr=[33])
DataBatch(edge_index=[2, 3832], x=[1000, 21], y=[32], batch=[1000], ptr=[33])
DataBatch(edge_index=[2, 3874], x=[996, 21], y=[32], batch=[996], ptr=[33])
DataBatch(edge_index=[2, 3884], x=[1053, 21], y=[32], batch=[1053], ptr=[33])
DataBatch(edge_index=[2, 4536], x=[1146, 21], y=[32], batch=[1146], ptr=[33])
DataBatch(edge_index=[2, 3924], x=[1130, 21], y=[32], batch=[1130], ptr=[33])
DataBatch(edge_index=[2, 3830], x=[998, 21], y=[32], batch=[998], ptr=

其中tg.data.Batch 是从 tg.data.Data继承过来的，它包含了batch中所有的信息

batch是一个向量，它把每个节点映射到该节点所属的图（即该图在当前批次中的编号）

In [15]:
# One entry per node: the index, within the mini-batch, of the graph that
# the node belongs to.
batch.batch

tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
         2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
         2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,
         5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
         5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
         5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
         6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
         6,  6,  6,  6,  6,  6,  6,  6, 

可以对它们进行处理

In [17]:
dataset = TUDataset(root="./datasets/ENZYMES",
                    name="ENZYMES", use_node_attr=True)
loader = DataLoader(dataset, batch_size=32, shuffle=True)
# Only the first mini-batch is needed for this demonstration.
for batch_data in loader:
    print(batch_data)
    break

batch_data.num_graphs

from torch_scatter import scatter_mean
# Average the node features separately for each graph, grouping rows of `x`
# by the graph index stored in `batch_data.batch`; the result has one row
# per graph in the mini-batch.
x = scatter_mean(batch_data.x, batch_data.batch, dim=0)
x.size()



DataBatch(edge_index=[2, 3752], x=[1008, 21], y=[32], batch=[1008], ptr=[33])


torch.Size([32, 21])

# DATA TRANSFORMS
转化是torchvision里面一个很常见的操作，它被用于处理图片

可以像torchvision里面一样，用Compose把多个变换串成一条流水线
```python
# `Compose` is a class inside the transforms module, not a submodule, so it
# has to be imported with `from ... import`.
from torch_geometric.transforms import Compose
```

In [2]:
from torch_geometric.datasets import ShapeNet
# Load only the "Airplane" category of ShapeNet, using ./datasets/ShapeNet
# as its root directory.
dataset = ShapeNet(root="./datasets/ShapeNet",categories=["Airplane"])
dataset[0]

Downloading https://shapenet.cs.stanford.edu/media/shapenetcore_partanno_segmentation_benchmark_v0_normal.zip
Extracting datasets\ShapeNet\shapenetcore_partanno_segmentation_benchmark_v0_normal.zip
Processing...
Done!


Data(x=[2518, 3], y=[2518], pos=[2518, 3], category=[1])

In [None]:
import torch_geometric.transforms as T
# Same dataset as before, now with a `pre_transform`.
# NOTE(review): KNNGraph(k=6) presumably connects each point to its 6 nearest
# neighbours, which would add an `edge_index` to the samples - confirm
# against the PyG transforms documentation.
dataset = ShapeNet( root="./datasets/ShapeNet",
                    categories=["Airplane"],
                    pre_transform=T.KNNGraph(k=6),
)
dataset[0]

pre_transform会在数据保存到磁盘之前对其进行处理，这样下次载入这个数据集时就不需要再做预处理；如果传入的pre_transform与磁盘上已处理数据所用的pre_transform不一致，将会得到警告

# 使用GNN处理数据
学习了 dataset dataloader之后，来进行一个实战

In [23]:
from torch_geometric.datasets import Planetoid
import torch as t
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

# Cora is the node-classification dataset used for the GCN example.
dataset = Planetoid(root="./datasets/Cora",name="Cora")



In [24]:
class GCN(t.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Args:
        in_channels: Number of input features per node. When omitted, it is
            taken from ``dataset.num_node_features`` of the module-level
            ``dataset``.
        hidden_channels: Output width of the first convolution.
        out_channels: Number of classes to predict. When omitted, it is taken
            from ``dataset.num_classes`` of the module-level ``dataset``.
        dropout: Dropout probability applied between the two convolutions
            (0.5 is the default of ``F.dropout``).
    """

    def __init__(self, in_channels=None, hidden_channels=16,
                 out_channels=None, dropout=0.5) -> None:
        super().__init__()
        # Fall back to the notebook's global dataset so `GCN()` keeps working.
        if in_channels is None:
            in_channels = dataset.num_node_features
        if out_channels is None:
            out_channels = dataset.num_classes
        self.dropout = dropout
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        """Return per-node log-probabilities over the classes.

        Args:
            data: Graph object providing ``x`` and ``edge_index``.
        """
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = t.device("cuda" if t.cuda.is_available() else "cpu")
model = GCN().to(device=device)
data = dataset[0].to(device)
optimizer = t.optim.Adam(
    model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)

# Full-graph training for 200 epochs; training mode keeps dropout active.
model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    # Only the nodes selected by `train_mask` contribute to the loss.
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()



In [28]:
# Evaluation mode turns off the dropout applied in `GCN.forward`.
model.eval()
# Gradients are not needed for inference, so skip building the autograd graph.
with t.no_grad():
    pred = model(data).argmax(dim=1)
# Accuracy is measured only on the nodes selected by `test_mask`.
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')


Accuracy: 0.7990


# EXERCISES
1. What does edge_index.t().contiguous() do?

首先转置一下，然后调用contiguous进行深拷贝，保证数据是连续的，而不仅仅是改变数组的映射，在这里我觉得是出于性能考虑

2. Load the "IMDB-BINARY" dataset from the TUDataset benchmark suite and randomly split it into 80%/10%/10% training, validation and test graphs.

使用上面单元格里的切片就可以做到
```python
from torch_geometric.datasets import TUDataset

# Shuffle first so that the split is random, then slice by proportion.
dataset = TUDataset(root="./datasets/IMDB-BINARY", name="IMDB-BINARY").shuffle()

num_train = int(0.8 * len(dataset))
num_val = int(0.1 * len(dataset))

train_data = dataset[:num_train]
val_data = dataset[num_train:num_train + num_val]
# Everything left over after the train and validation slices is the test set.
test_data = dataset[num_train + num_val:]
print(train_data, val_data, test_data)
```

3. What does each number of the following output mean?
```python
print(batch)
>>> DataBatch(batch=[1082], edge_index=[2, 4066], x=[1082, 21], y=[32])
```

batch=[1082]：batch向量的长度为1082，即这个批次里一共有1082个节点，每个元素记录对应节点所属的图
edge_index=[2,4066]：一共有4066条有向边（无向边的两个方向各算一条），每条边用(起点,终点)两个下标表示
x=[1082,21]：有1082个节点，每个节点有21个features
y=[32]：这个批次有32个标签，标签是图级别的，也就是一个图一个标签，即批次里有32张图
