## RGCN模型

In [None]:

class RGCN(nn.Module):
    """Learned node embeddings followed by two relational graph convolutions.

    Args:
        num_nodes: Number of rows in the node-embedding table.
        h_dim: Embedding width; also the input and output width of the first
            convolution and the input width of the second.
        out_dim: Output width of the second convolution.
        num_rels: Number of relation (edge) types, forwarded to both
            ``RelGraphConv`` layers.
        regularizer: Weight regularizer forwarded to ``RelGraphConv``:
            "basis" (basis decomposition) or "bdd" (block-diagonal
            decomposition).
        num_bases: Number of bases used by the regularizer (B in the
            decomposition formula, per the original notes). Only the
            sentinel ``-1`` is rewritten, to ``num_rels``; any other value is
            forwarded untouched.
        dropout: Probability handed to ``nn.Dropout``.
        self_loop: Forwarded to both ``RelGraphConv`` layers. Author's note:
            on entity classification the test accuracy was 0.9167 with it
            enabled and 0.8333 without.
        ns_mode: Stored as ``self.ns_mode``. Author's note: switching it to
            ``True`` raised an error in their runs.
    """

    def __init__(self, num_nodes, h_dim, out_dim, num_rels,
                 regularizer="basis", num_bases=-1, dropout=0.,
                 self_loop=False,
                 ns_mode=False):
        super().__init__()

        # -1 is the "not specified" sentinel: fall back to one basis per
        # relation type.
        bases = num_rels if num_bases == -1 else num_bases

        # Options shared by both convolutions.
        # RelGraphConv(in_feat, out_feat, num_rels, regularizer, num_bases, ...)
        conv_options = dict(regularizer=regularizer,
                            num_bases=bases,
                            self_loop=self_loop)

        # One trainable h_dim-sized vector per node serves as the input feature.
        self.emb = nn.Embedding(num_nodes, h_dim)
        # h_dim -> h_dim
        self.conv1 = RelGraphConv(h_dim, h_dim, num_rels, **conv_options)
        # h_dim -> out_dim
        self.conv2 = RelGraphConv(h_dim, out_dim, num_rels, **conv_options)

        self.dropout = nn.Dropout(dropout)
        self.ns_mode = ns_mode



![formula_of_rgcn.jpg](https://s2.loli.net/2022/07/10/Gn9VWSRiMywT8oF.jpg)



![屏幕截图 2022-07-10 182331.jpg](https://s2.loli.net/2022/07/10/Rbo4d7qhp1azQC6.jpg)


![屏幕截图 2022-07-10 182353.jpg](https://s2.loli.net/2022/07/10/kmYropATFH9JCx6.jpg)

In [1]:
import dgl
import numpy as np
import torch as th
from dgl.nn import RelGraphConv


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Toy graph: six directed edges src_nodes[i] -> dst_nodes[i].
src_nodes = [0, 1, 2, 3, 2, 5]
dst_nodes = [1, 2, 3, 4, 0, 3]
g = dgl.graph((src_nodes, dst_nodes))

# All-ones input features, one 10-dimensional row per node.
feat = th.ones(6, 10)
# Relation type of each edge, in edge order (3 relation types: 0, 1, 2).
etype = th.tensor([0, 1, 2, 0, 1, 2])

# in_feat=10, out_feat=2, num_rels=3, basis decomposition with 2 bases.
conv = RelGraphConv(10, 2, 3, regularizer='basis', num_bases=2)
res = conv(g, feat, etype)

res


tensor([[-0.7617,  2.6243],
        [ 0.2812,  1.5211],
        [-0.7617,  2.6243],
        [-2.0322,  0.6789],
        [ 0.2812,  1.5211],
        [-0.5258,  2.1150]], grad_fn=<AddBackward0>)

In [15]:
# Print, in order: the node IDs, the edge end nodes as a (src, dst) pair,
# and the edge end nodes together with the edge IDs.
for view in (g.nodes(), g.edges(), g.edges(form='all')):
    print(view)

tensor([0, 1, 2, 3, 4, 5])
(tensor([0, 1, 2, 3, 2, 5]), tensor([1, 2, 3, 4, 0, 3]))
(tensor([0, 1, 2, 3, 2, 5]), tensor([1, 2, 3, 4, 0, 3]), tensor([0, 1, 2, 3, 4, 5]))


In [19]:
import torch as th
import torch.nn as nn

# nn.Embedding keeps its lookup table in `.weight`: a learnable tensor of
# shape (num_embeddings, embedding_dim), initialised from N(0, 1).
emb = nn.Embedding(num_embeddings=5, embedding_dim=2)
x = emb.weight

print(x)

Parameter containing:
tensor([[ 0.1645,  1.4833],
        [-0.9181, -0.3760],
        [-0.5819, -0.2450],
        [-1.4265, -1.2943],
        [ 0.0770, -0.3770]], requires_grad=True)


In [None]:
# 原始的节点或边的类型和对应的ID被存储在 ndata 和 edata 中
def forward(self, g, nids=None):
        """Apply conv1 -> ReLU -> dropout -> conv2 and return the result.

        Args:
            g: With ``self.ns_mode`` off, a single graph whose ``edata``
                holds ``dgl.ETYPE`` and ``'norm'`` entries. With
                ``self.ns_mode`` on, an indexable pair: ``g[0]`` is fed to
                the first layer and ``g[1]`` to the second (presumably the
                per-layer blocks from a DGL neighbor sampler -- confirm
                against the caller).
            nids: Node IDs to look up in ``self.emb``. Ignored when
                ``self.ns_mode`` is on; ``None`` uses the whole embedding
                table.

        Returns:
            The output of ``self.conv2``.
        """
        if not self.ns_mode:
            # Entity classification & link prediction: one graph feeds both
            # layers.
            feats = self.emb.weight if nids is None else self.emb(nids)
            hidden = self.conv1(g, feats, g.edata[dgl.ETYPE], g.edata['norm'])
            hidden = self.dropout(F.relu(hidden))
            return self.conv2(g, hidden, g.edata[dgl.ETYPE], g.edata['norm'])

        # Neighbor-sampling path: one graph per layer.
        first = g[0]
        # Look up embeddings by the IDs stored under dgl.NID on the first
        # graph's source nodes.
        feats = self.emb(first.srcdata[dgl.NID])
        hidden = self.conv1(first, feats,
                            first.edata[dgl.ETYPE], first.edata['norm'])
        hidden = self.dropout(F.relu(hidden))

        second = g[1]
        return self.conv2(second, hidden,
                          second.edata[dgl.ETYPE], second.edata['norm'])




## 实体分类(Entity Classification)
assign types and categorical properties to entities \
实体分类是通过在实体(节点)的最后嵌入处附加softmax分类器来完成的。训练是通过标准交叉熵的损失函数进行的。

## AIFB数据集介绍
用于预测数据集中人员的从属关系（即研究小组）
数据集由：staff, research groups, publications 组成
#### AIFB数据集 
- num_edges: 29,043
- Number of Classes: 4 i.e. four types of research groups
- labeled: 176 (train: 140, test: 36)
- num_nodes: 8285
- relations: 45

In [None]:
def main(args):
    """Train an RGCN entity classifier for 100 epochs and print test accuracy.

    Args:
        args: Parsed command-line options. Reads ``dataset``, ``n_hidden``,
            ``n_bases``, ``gpu`` and ``wd``.
    """
    (g, num_rels, num_classes, labels,
     train_idx, test_idx, target_idx) = load_data(args.dataset, get_norm=True)
    # No validation split is carved out here: all of train_idx is used for
    # training.

    model = RGCN(g.num_nodes(),
                 args.n_hidden,
                 num_classes,
                 num_rels,
                 num_bases=args.n_bases)

    # Use the requested GPU only if CUDA is actually available.
    use_gpu = args.gpu >= 0 and th.cuda.is_available()
    device = th.device(args.gpu) if use_gpu else th.device('cpu')

    labels = labels.to(device)
    model = model.to(device)
    g = g.int().to(device)

    optimizer = th.optim.Adam(model.parameters(), lr=1e-2, weight_decay=args.wd)

    model.train()
    for epoch in range(100):
        # Full-graph forward pass; keep only the target nodes, then the
        # training subset of those.
        train_logits = model(g)[target_idx][train_idx]
        train_labels = labels[train_idx]

        loss = F.cross_entropy(train_logits, train_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accuracy is measured on the logits computed before this step.
        train_acc = accuracy(train_logits.argmax(dim=1), train_labels).item()
        print("Epoch {:05d} | Train Accuracy: {:.4f} | Train Loss: {:.4f}".format(
            epoch, train_acc, loss.item()))
    print()

    model.eval()
    with th.no_grad():
        eval_logits = model(g)
    eval_logits = eval_logits[target_idx]
    test_pred = eval_logits[test_idx].argmax(dim=1)
    test_acc = accuracy(test_pred, labels[test_idx]).item()
    print("Test Accuracy: {:.4f}".format(test_acc))

In [None]:
if __name__ == '__main__':
    # Command-line entry point for the entity-classification experiment.
    parser = argparse.ArgumentParser(
        description='RGCN for entity classification')

    # Model size.
    parser.add_argument(
        "--n-hidden", type=int, default=16,
        help="number of hidden units")
    # Device: a negative value selects the CPU in main().
    parser.add_argument(
        "--gpu", type=int, default=-1,
        help="gpu")
    parser.add_argument(
        "--n-bases", type=int, default=-1,
        help="number of filter weight matrices, default: -1 [use all]")
    # Dataset choice is mandatory.
    parser.add_argument(
        "-d", "--dataset", type=str, required=True,
        choices=['aifb', 'mutag', 'bgs', 'am'],
        help="dataset to use")
    # Optimizer regularization.
    parser.add_argument(
        "--wd", type=float, default=5e-4,
        help="weight decay")

    args = parser.parse_args()
    print(args)
    main(args)
    

## 关系预测(Link Prediction)
recover missing triples: (subject, relation,object) \
链路预测是通过使用参数化评分函数，使用自动编码器架构重建边缘来完成的。训练使用负采样。

## FB15k-237 数据集介绍：知识图谱Freebase的子集
15k表示知识库中约有15,000个主题词（实体），237表示共有237种关系
- 主题词总数：14505
- triple三元组总数: 544230
- 关系的种类数：474 
    - FB15k存在着正反关系，他们意义相同，指向相反。rgcn将一对正反关系视为不同的关系，因此统计数为237*2=474
- 每个主题词的平均三元组数: 37.5
- 每个主题词的平均关系数：10.3
- 每个关系连接的平均实体数：3.57 


In [None]:

class LinkPredict(nn.Module):
    """RGCN encoder plus one learnable vector per relation for scoring triples.

    Args:
        in_dim: Number of nodes; passed to ``RGCN`` as its ``num_nodes``.
        num_rels: Number of relation types. The encoder is built with
            ``2 * num_rels`` relations, while ``w_relation`` has ``num_rels``
            rows.
        h_dim: Width of the encoder's hidden and output layers and of each
            relation vector.
        num_bases: Forwarded to ``RGCN`` (used with the "bdd" regularizer).
        dropout: Dropout probability for both the encoder and this module's
            own ``nn.Dropout``.
        reg_param: Regularization weight, stored as ``self.reg_param``
            (author's note: L2 penalty on the decoder, 0.01).
    """

    def __init__(self, in_dim, num_rels, h_dim=500, num_bases=100, dropout=0.2, reg_param=0.01):
        super().__init__()

        # Encoder: block-diagonal-decomposed RGCN with self loops, over twice
        # as many relation types as the decoder has vectors.
        self.rgcn = RGCN(
            in_dim, h_dim, h_dim, num_rels * 2,
            regularizer="bdd",
            num_bases=num_bases,
            dropout=dropout,
            self_loop=True,
        )
        self.dropout = nn.Dropout(dropout)
        self.reg_param = reg_param

        # Decoder parameters: one h_dim vector per relation. The storage is
        # allocated uninitialised and filled by Xavier-uniform init with the
        # ReLU gain.
        relation_weights = th.Tensor(num_rels, h_dim)
        self.w_relation = nn.Parameter(relation_weights)
        relu_gain = nn.init.calculate_gain('relu')
        nn.init.xavier_uniform_(self.w_relation, gain=relu_gain)

![score.jpg](https://s2.loli.net/2022/07/10/2U9OEWNiprl4RYG.jpg)

![loss.jpg](https://s2.loli.net/2022/07/10/Qx4cRq9svzPpoS6.jpg)
- l: logistic sigmoid function
- y = 0 for negative triples
- y = 1 for positive triples

In [None]:
def calc_score(self, embedding, triplets):
        """Score each triple as sum(e_s * w_r * e_o) over the feature axis.

        Elementwise multiplication by the relation vector is equivalent to
        using a diagonal relation matrix R_r.

        Args:
            embedding: Node embeddings, indexed by node ID along dim 0.
            triplets: 2-D integer tensor whose columns 0, 1 and 2 hold the
                subject ID, relation ID and object ID of each triple.

        Returns:
            One score per row of ``triplets``.
        """
        subj_ids = triplets[:, 0]
        rel_ids = triplets[:, 1]
        obj_ids = triplets[:, 2]

        subj_emb = embedding[subj_ids]
        rel_vec = self.w_relation[rel_ids]
        obj_emb = embedding[obj_ids]
        return (subj_emb * rel_vec * obj_emb).sum(dim=1)

def forward(self, g, nids):
        """Encode the nodes with ``self.rgcn`` and apply dropout to the result.

        Args:
            g: Graph passed to the encoder.
            nids: Node IDs passed to the encoder as its ``nids`` keyword.

        Returns:
            The encoder output after ``self.dropout``.
        """
        encoded = self.rgcn(g, nids=nids)
        return self.dropout(encoded)

def regularization_loss(self, embedding):
        """Return mean(embedding ** 2) + mean(w_relation ** 2).

        Args:
            embedding: Tensor of node embeddings to penalise.

        Returns:
            Scalar tensor: the sum of the two mean-squared terms.
        """
        embedding_term = embedding.pow(2).mean()
        relation_term = self.w_relation.pow(2).mean()
        return embedding_term + relation_term

def get_loss(self, embed, triplets, labels):
        """Binary cross-entropy on the triple scores plus a weighted penalty.

        Args:
            embed: Node embeddings used for scoring and for the penalty.
            triplets: One (source, relation, destination) 3-tuple per row.
            labels: Target for each triple, same shape as the scores
                (1 for positive triples, 0 for negative ones).

        Returns:
            ``bce(score, labels) + self.reg_param * regularization_loss(embed)``.
        """
        logits = self.calc_score(embed, triplets)
        bce = F.binary_cross_entropy_with_logits(logits, labels)
        penalty = self.regularization_loss(embed)
        return bce + self.reg_param * penalty

In [None]:
def main(args):
    """Train the RGCN link predictor on FB15k-237, checkpointing the best MRR.

    Each item from the data loader is one sampled training subgraph. Every
    500 iterations the model is evaluated on the full test graph on CPU, and
    the state dict is saved to ``model_state.pth`` when the MRR improves.

    Args:
        args: Parsed command-line options. Reads ``edge_sampler``, ``gpu``
            and ``eval_protocol``.
    """
    # reverse=False, so num_rels counts forward relations only
    # (author's note: 237).
    data = FB15k237Dataset(reverse=False)
    graph = data[0]
    num_nodes = graph.num_nodes()  # author's note: 14541
    num_rels = data.num_rels  # author's note: 237

    # Author's notes on the resulting graphs:
    #   test_g  = Graph(num_nodes=14541, num_edges=544230)
    #   train_g = Graph(num_nodes=14541, num_edges=272115)
    train_g, test_g = preprocess(graph, num_rels)
    test_nids = th.arange(0, num_nodes)  # every node ID
    test_mask = graph.edata['test_mask']

    # Training runs on sampled subgraphs. For reference,
    # SubgraphIterator.__init__(g, num_rels, pos_sampler, sample_size=30000,
    # num_epochs=6000) uses NeighborExpand(g, sample_size) as the positive
    # sampler when pos_sampler == 'neighbor' and GlobalUniform(g, sample_size)
    # otherwise; negatives always come from NegativeSampler().
    subg_iter = SubgraphIterator(train_g, num_rels, args.edge_sampler)
    # The iterator already yields whole batches, so batch_size is 1 and the
    # collate function just unwraps the single-element list.
    dataloader = GraphDataLoader(subg_iter, batch_size=1, collate_fn=lambda x: x[0])

    # Prepare data for metric computation: all (src, etype, dst) triples of
    # the full graph, one per row.
    src, dst = graph.edges()
    triplets = th.stack([src, graph.edata['etype'], dst], dim=1)

    model = LinkPredict(num_nodes, num_rels)
    optimizer = th.optim.Adam(model.parameters(), lr=1e-2)

    if args.gpu >= 0 and th.cuda.is_available():
        device = th.device(args.gpu)
    else:
        device = th.device('cpu')
    model = model.to(device)

    # Evaluation metric is the mean reciprocal rank (MRR): the reciprocal
    # rank of a query is 1 / (rank of the first correct answer), and MRR
    # averages it over all queries.
    best_mrr = 0
    model_state_file = 'model_state.pth'
    for epoch, batch_data in enumerate(dataloader):
        model.train()
        g, train_nids, edges, labels = batch_data
        g = g.to(device)
        train_nids = train_nids.to(device)
        edges = edges.to(device)
        labels = labels.to(device)

        embed = model(g, train_nids)
        loss = model.get_loss(embed, edges, labels)
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # clip gradients
        optimizer.step()

        print("Epoch {:04d} | Loss {:.4f} | Best MRR {:.4f}".format(epoch, loss.item(), best_mrr))

        if (epoch + 1) % 500 == 0:
            # perform validation on CPU because full graph is too large
            model = model.cpu()
            model.eval()
            print("start eval")
            # Evaluation never back-propagates, so skip building an autograd
            # graph over the full test graph.
            with th.no_grad():
                embed = model(test_g, test_nids)
                mrr = calc_mrr(embed, model.w_relation, test_mask, triplets,
                               batch_size=500, eval_p=args.eval_protocol)
            # save best model
            if best_mrr < mrr:
                best_mrr = mrr
                th.save({'state_dict': model.state_dict(), 'epoch': epoch}, model_state_file)

            # Move back to the training device; train mode is restored at
            # the top of the next iteration.
            model = model.to(device)

In [None]:
if __name__ == '__main__':
    # Command-line entry point for the link-prediction experiment.
    parser = argparse.ArgumentParser(description='RGCN for link prediction')
    parser.add_argument("--gpu", type=int, default=-1,
                        help="gpu")

    # main() reads args.eval_protocol and args.edge_sampler; without these
    # two options it fails with AttributeError.
    # NOTE(review): 'filtered' is assumed to be a value calc_mrr accepts for
    # eval_p -- confirm against calc_mrr.
    parser.add_argument("--eval-protocol", type=str, default='filtered',
                        help="evaluation protocol passed to calc_mrr")
    # SubgraphIterator uses the neighbor-expansion sampler for 'neighbor' and
    # the global uniform sampler for any other value.
    parser.add_argument("--edge-sampler", type=str, default='uniform',
                        choices=['uniform', 'neighbor'],
                        help="type of edge sampler: 'uniform' or 'neighbor'")

    # Options inherited from the entity-classification script. The
    # link-prediction main() does not read them; they stay accepted so
    # existing command lines still parse. --dataset is no longer required
    # because the dataset is fixed to FB15k-237.
    parser.add_argument("--n-hidden", type=int, default=16,
                        help="number of hidden units")
    parser.add_argument("--n-bases", type=int, default=-1,
                        help="number of filter weight matrices, default: -1 [use all]")
    parser.add_argument("-d", "--dataset", type=str, default=None,
                        choices=['aifb', 'mutag', 'bgs', 'am'],
                        help="dataset to use (ignored for link prediction)")
    parser.add_argument("--wd", type=float, default=5e-4,
                        help="weight decay")

    args = parser.parse_args()
    print(args)
    main(args)
    