In [None]:
#https://github.com/dmlc/dgl/blob/master/examples/pytorch/gat/train.py
#节点分类：临近节点属性替换该节点
#边 ： 不能用临近节点属性替换
#所有值都是数值型：字符转成哑变量

In [2]:
import os
dir = "../lz-graph/"
print(os.listdir(dir))

['data_preprocessing.ipynb', 'pyg_autoencoder.ipynb', 'dgl_gcn_example.ipynb', 'DGL_GCN.ipynb', 'DGL_GAT_demo.ipynb', 'NOTE.txt', 'DGL_GAT.ipynb', '.ipynb_checkpoints', 'data', 'pyg_autoencoder_example.py']


In [3]:
"""
Graph Attention Networks in DGL using SPMV optimization.
References
----------
Paper: https://arxiv.org/abs/1710.10903
Author's code: https://github.com/PetarV-/GAT
Pytorch implementation: https://github.com/Diego999/pyGAT
"""
import dgl
import pandas as pd
import torch
import torch.nn as nn
import dgl.function as fn
from dgl.nn.pytorch import EdgeSoftmax

## 数据准备

In [4]:
nodes_data = '../lz-graph/data/nodes.csv'
nodes_data = pd.read_csv(nodes_data,header = 0 ,index_col=0)
nodes_data.head(2)

  mask |= (ar1 == a)


Unnamed: 0,id,ACCTNO,CUSTOMTYPE,NAME
0,0,0,0,MISSING
1,1,6214664260258704,0,


In [5]:
links_data = '../lz-graph/data/links.csv'
links = pd.read_csv(links_data,header = 0 ,index_col=0)
links.head(2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,source,target,TIME,TRANSOURCE,ACCTNO,ACCTNO1,TRANAMT,CDFLAG
0,327880,0,2015-07-01-00.09.48.687448,88,101661000111853,,0.0,C
1,338506,316022,2015-07-01-00.00.34.024563,61,78652380011,6.214968250550032e+18,40000.0,D


In [6]:
import time
links["timestamp"] = (links["TIME"].str[0:19])
#links["timestamp"] = pd.to_datetime(links["timestamp"],format = '%Y-%m-%d-%H.%M.%S' )
links["timestamp"]  = links["timestamp"].apply(lambda x : time.mktime(time.strptime(x,'%Y-%m-%d-%H.%M.%S')   )) 
links["ACCTNO"] = links["ACCTNO"].astype(str)  #
links["ACCTNO1"] = links["ACCTNO1"].astype(str) 
links.head(2)

Unnamed: 0,source,target,TIME,TRANSOURCE,ACCTNO,ACCTNO1,TRANAMT,CDFLAG,timestamp
0,327880,0,2015-07-01-00.09.48.687448,88,101661000111853,,0.0,C,1435681000.0
1,338506,316022,2015-07-01-00.00.34.024563,61,78652380011,6.214968250550032e+18,40000.0,D,1435680000.0


## 选择异常或正常点做标签

In [7]:
acctno = "102561000017549"
links.dtypes

source          int64
target          int64
TIME           object
TRANSOURCE     object
ACCTNO         object
ACCTNO1        object
TRANAMT       float64
CDFLAG         object
timestamp     float64
dtype: object

In [8]:
links.loc[(links["ACCTNO1"]== acctno),:]

Unnamed: 0,source,target,TIME,TRANSOURCE,ACCTNO,ACCTNO1,TRANAMT,CDFLAG,timestamp


In [None]:
links.loc[(links["ACCTNO"]==acctno) | (links["ACCTNO1"]==acctno),:]

## Graph  data

In [10]:
cleans = links.loc[links["TRANAMT"]>1000,:]
import torch as th
#nodes_id = th.tensor(nodes_data["id"].astype("int").values)
def build_graph(nodes,links):
    g = dgl.DGLGraph()
    # add 34 nodes into the graph; nodes are labeled from 0~33
    g.add_nodes(len(nodes))
    edge_list = [tuple(x) for x in links[["source","target"]].values]
    src, dst = tuple(zip(*edge_list))
    print(type(src))
    g.add_edges(src,dst)
    #g.edata["y"] = links.as_matrix()
    return g
G = build_graph(nodes = nodes_data,links = cleans)
print('We have %d nodes.' % G.number_of_nodes())
print('We have %d edges.' % G.number_of_edges())
print(type(G))

<class 'tuple'>
We have 1631199 nodes.
We have 2564961 edges.
<class 'dgl.graph.DGLGraph'>


In [11]:
import torch
customtype = pd.get_dummies(nodes_data["CUSTOMTYPE"].astype(str)).as_matrix()   #convert pandas to pytorch tensor
inputs = th.tensor(customtype,dtype = torch.float)   #哑变量  ,,dtype=torch.long  #https://wsonh.com/article/6.html   #nodes的features
id0 = nodes_data.loc[nodes_data["ACCTNO"]=="102561000017549","id"].values[0]    #来源于数据源，任意取
id1 = nodes_data.loc[nodes_data["ACCTNO"]=="101091000178241","id"].values[0]    #来源于异常报告，挑选的几个典型的异常点  ，#2015/7/16
id2 = nodes_data.loc[nodes_data["ACCTNO"]=="102192000158598","id"].values[0]  
id3 = nodes_data.loc[nodes_data["ACCTNO"]=="6214968210550305540","id"].values[0] #2015/7/16
#id3 = nodes_data.loc[nodes_data["ACCTNO"]=="102191000178651","id"].values[0]  #暂时，找不到这个异常点

labeled_nodes = th.tensor([id0 ,id1, id2 ,id3])  # only the instructor and the president nodes are labeled
labels = th.tensor([0,1,1,0])  # their labels are different

  


## GAT

In [12]:
# 所有问题都要转换成矩阵，并且行数要是node的个数。因此边的属性，可能是多个dataframe，也就是需要tensor的结果来组织，进行计算。3维怎么计算？
#多维张量 tensor的计算规则。
#基于多维张量的清洗，运算方式。矩阵计算
import torch
import torch.nn as nn
import torch.nn.functional as F

class GATLayer(nn.Module):
    def __init__(self, g, in_dim, out_dim):
        super(GATLayer, self).__init__()
        self.g = g
        # 公式 (1)
        self.fc = nn.Linear(in_dim, out_dim, bias=False)
        # 公式 (2)
        self.attn_fc = nn.Linear(2 * out_dim, 1, bias=False)

    def edge_attention(self, edges):
        # 公式 (2) 所需，边上的用户定义函数
        z2 = torch.cat([edges.src['z'], edges.dst['z']], dim=1)
        a = self.attn_fc(z2)
        return {'e' : F.leaky_relu(a)}
    
    #消息传递函数
    def message_func(self, edges):                             #属性定义成变量
        # 公式 (3), (4)所需，传递消息用的用户定义函数
        return {'z' : edges.src['z'], 'e' : edges.data['e']}   #把边属性放进来了,e的结构不确定，但边里面知道来源节点和目标节点信息
    #消息累计函数
    def reduce_func(self, nodes):                       #可以看作是：数据清理函数，定义*****，打印验证
        # 公式 (3), (4)所需, 归约用的用户定义函数         #softmax:标准化，mailbox：邻近节点特征
        # 公式 (3)
        alpha = F.softmax(nodes.mailbox['e'], dim=1)     #边和节点进行了计算，得到一个规范矩阵==》进模型
        # 公式 (4)                                       #使用 Softmax 计算每个类别的概率，使用向后传播更新参数。?
        h = torch.sum(alpha * nodes.mailbox['z'], dim=1)      #对所有邻节点节点特征求平均并覆盖原本的节点特征===》有点像时间窗口？
        return {'h' : h}

    def forward(self, h):
        # 公式 (1)
        z = self.fc(h)
        self.g.ndata['z'] = z
        # 公式 (2)
        self.g.apply_edges(self.edge_attention)
        # 公式 (3) & (4)
        self.g.update_all(self.message_func, self.reduce_func)
        return self.g.ndata.pop('h')

## 多头注意力 (Multi-head attention)

神似卷积神经网络里的多通道，GAT 引入了多头注意力来丰富模型的能力和稳定训练的过程。每一个注意力的头都有它自己的参数。如何整合多个注意力机制的输出结果一般有两种方式：

In [13]:
class MultiHeadGATLayer(nn.Module):
    def __init__(self, g, in_dim, out_dim, num_heads, merge='cat'):
        super(MultiHeadGATLayer, self).__init__()
        self.heads = nn.ModuleList()
        for i in range(num_heads):
            self.heads.append(GATLayer(g, in_dim, out_dim))
        self.merge = merge

    def forward(self, h):
        head_outs = [attn_head(h) for attn_head in self.heads]
        if self.merge == 'cat':
            # 对输出特征维度（第1维）做拼接
            return torch.cat(head_outs, dim=1)
        else:
            # 用求平均整合多头结果
            return torch.mean(torch.stack(head_outs))

In [14]:
# 定义一个两层的 GAT 模型：
class GAT(nn.Module):
    def __init__(self, g, in_dim, hidden_dim, out_dim, num_heads):
        super(GAT, self).__init__()
        self.layer1 = MultiHeadGATLayer(g, in_dim, hidden_dim, num_heads)
        # 注意输入的维度是 hidden_dim * num_heads 因为多头的结果都被拼接在了
        # 一起。 此外输出层只有一个头。
        self.layer2 = MultiHeadGATLayer(g, hidden_dim * num_heads, out_dim, 1)

    def forward(self, h):
        h = self.layer1(h)
        h = F.elu(h)
        h = self.layer2(h)
        return h

In [15]:
# 我们使用 DGL 自带的数据模块加载 Cora 数据集。
from dgl import DGLGraph
from dgl.data import citation_graph as citegrh

def load_cora_data():
    data = citegrh.load_cora()                     # num_nodes=2708, num_edges=10556
    features = torch.FloatTensor(data.features)    #2708 , 1433
    labels = torch.LongTensor(data.labels)         #2708  [int]==》目标变量的取值
    mask = torch.ByteTensor(data.train_mask)       #2708  [1,0]===>标签？
    g = DGLGraph(data.graph)
    return g, features, labels, mask

In [16]:
# 新加的 准确率 函数、训练集、测试集、验证集
# 参考：https://github.com/dmlc/dgl/blob/master/examples/pytorch/gat/train.py
def accuracy(logits, labels):
    _, indices = torch.max(logits, dim=1)
    correct = torch.sum(indices == labels)     # indices 预测值  == label便签。correct个数
    return correct.item() * 1.0 / len(labels)  #预测正确的个数/总标签个数

def evaluate(model, features, labels):
    model.eval()
    with torch.no_grad():                      #再次验证
        logits = model(features)
        logits = logits[labels]
        return accuracy(logits, labels)

In [17]:
import requests
import time
import numpy as np

# 创建模型
net = GAT(G, 
          in_dim=4, 
          hidden_dim=3, 
          out_dim=2, 
          num_heads=3)
#print(net)

# 创建优化器
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)

# 主流程
dur = []
for epoch in range(30):
    if epoch >=3:
        t0 = time.time()

    logits = net(inputs)   #预测值
    logp = F.log_softmax(logits, 1)  #标准化 ==>值域0,1==》概率
    loss = F.nll_loss(logp[labeled_nodes], labels)

    optimizer.zero_grad()  # 清零所有参数（parameter）的梯度缓存
    loss.backward()        #loss.backward()来反向传播权重
    optimizer.step()       # 更新参数
    train_acc = accuracy(logp[labeled_nodes], labels)   #===mask
    
    if epoch >=3:
        dur.append(time.time() - t0)

    print("Epoch {:05d} | Loss {:.4f} | Time(s) {:.4f}| TrainAcc {:.4f}".format(
            epoch, loss.item(), np.mean(dur),train_acc))


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Epoch 00000 | Loss 0.6624 | Time(s) nan| TrainAcc 0.2500
Epoch 00001 | Loss 0.6614 | Time(s) nan| TrainAcc 0.2500
Epoch 00002 | Loss 0.6604 | Time(s) nan| TrainAcc 0.5000
Epoch 00003 | Loss 0.6594 | Time(s) 40.0679| TrainAcc 0.5000
Epoch 00004 | Loss 0.6585 | Time(s) 40.1332| TrainAcc 0.5000
Epoch 00005 | Loss 0.6575 | Time(s) 40.0968| TrainAcc 0.5000
Epoch 00006 | Loss 0.6565 | Time(s) 40.0323| TrainAcc 0.7500
Epoch 00007 | Loss 0.6555 | Time(s) 40.0267| TrainAcc 0.7500
Epoch 00008 | Loss 0.6545 | Time(s) 40.0211| TrainAcc 0.7500
Epoch 00009 | Loss 0.6536 | Time(s) 40.0029| TrainAcc 0.7500
Epoch 00010 | Loss 0.6526 | Time(s) 40.0304| TrainAcc 0.7500
Epoch 00011 | Loss 0.6516 | Time(s) 40.0118| TrainAcc 0.7500
Epoch 00012 | Loss 0.6506 | Time(s) 40.0165| TrainAcc 0.7500
Epoch 00013 | Loss 0.6497 | Time(s) 40.0725| TrainAcc 0.7500
Epoch 00014 | Loss 0.6487 | Time(s) 40.0956| TrainAcc 0.7500
Epoch 00015 | Loss 0.6477 | Time(s) 40.1261| TrainAcc 0.7500
Epoch 00016 | Loss 0.6467 | Time(s) 

In [146]:
acc = evaluate(net, inputs, labels)
print("Test Accuracy {:.4f}".format(acc))

Test Accuracy 1.0000


In [143]:
_, indices = torch.max(logp, dim=1)   #  预测值
indices

tensor([0, 1, 1,  ..., 1, 1, 1])

In [144]:
sum(indices)

tensor(1451686)

In [147]:
logp[labeled_nodes]
_, indices = torch.max(logp[labeled_nodes], dim=1)   #  预测值
indices

tensor([1, 1, 1, 0])