In [2]:
import os
import pickle
import networkx as nx
import tensorflow as tf
import numpy as np
from sklearn.preprocessing import normalize
from graph_utils import read_graph, cal_gcn_matrix
from sampling import EdgeSampler
from tqdm import trange
import pandas as pd
import random

In [4]:
#权重矩阵的初始化
def xavier_init(fan_in, fan_out, constant=1):
    high = constant * np.sqrt(6.0 / (fan_in + fan_out))
    low = -high
    # 均匀分布
    return tf.random_uniform((fan_in, fan_out), minval=low, maxval=high, dtype=tf.float32)

class GraRep:
    """
    This is the base class of graph representation network.
    """
    def __init__(self, graph, node_features, embed_dim=64, batch_size=8, learning_rate=1e-4, regularization=0.1):
        self.graph = graph
        # 计算当前图的节点数量和边的数量
        self.node_num = self.graph.number_of_nodes()
        self.edge_num = self.graph.number_of_edges()
        print("EdgeSampler:nodes=%s, edges=%s"
              % (self.node_num, self.edge_num))

        self.node_features = node_features#(m*n,理论上m表示节点数，n表示特征维度)
        self.feature_dim = self.node_features.shape[1]

        self.embed_dim = embed_dim
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.regularization = regularization

    def _init_params(self):
        pass

    def _construct_network(self):
        pass


In [5]:

class GCN(GraRep):
    def __init__(self, completed_graph, node_features, first_layer_dim=64, embed_dim=64
                 , neg_num=5, batch_size=100):
        super().__init__(completed_graph, node_features, embed_dim=embed_dim
                         , learning_rate=1e-4, batch_size=batch_size)
        self.neg_num = neg_num
        self.node_features = node_features
        self.sample_num = self.batch_size * (1 + self.neg_num)
        # 正负样本数*batch(每次抓取的样本量)=检验数
        self.batch_size = batch_size
        self.node_size = self.graph.number_of_nodes()

        tf.reset_default_graph()
        self.first_layer_dim = first_layer_dim

        self._init_params()  # 初始化参数
        self._construct_network()  # 构造网络
        self._optimize_line()  # 开始进行优化，

        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())

    # 构造模型参数
    def _init_params(self):
        '''
        注意层层之间的权重和卷积层权重的区别，GCN的卷积层权重是共享的，针对节点级别；
        而需要训练的权重参数每层是不一样的，针对特征级别
        '''
        # 初始化GCN的权重参数和偏置项
        self.w0 = tf.Variable(xavier_init(self.feature_dim, self.first_layer_dim))
        self.b0 = tf.Variable(tf.zeros([self.first_layer_dim], dtype=tf.float32))
        self.w1 = tf.Variable(xavier_init(self.first_layer_dim, self.embed_dim))
        self.b1 = tf.Variable(tf.zeros([self.embed_dim], dtype=tf.float32))

        # 构造图的邻接矩阵和GCN卷积部分的权重矩阵
        self.adj_matrix = self.node_features#直接输入拓扑结构
        self.gcn_matrix = np.array(cal_gcn_matrix(self.adj_matrix), np.float32)#权值共享

    # 构造网络结构
    def _construct_network(self):
        print("GCN 第一层， gcn_matrix*node_features")
        # 第一层，初始化H为特征向量
        print(self.gcn_matrix.dtype)
        print(self.node_features.dtype)
        #ax = np.matmul(self.gcn_matrix, self.node_features)，将处理过的拉普拉斯矩阵作为特征输入
        ax = np.matmul(self.gcn_matrix, self.gcn_matrix)
        self.hidden = tf.nn.leaky_relu(tf.add(tf.matmul(ax, self.w0), self.b0))
        print("GCN 第二层")
        self.embed = tf.nn.leaky_relu(tf.add(tf.matmul(tf.matmul(self.gcn_matrix, self.hidden), self.w1), self.b1))
#训练出节点表示
    def _optimize_line(self):
        """
        Unsupervised traininig in LINE manner.
        """
        self.u_i = tf.placeholder(name='u_i', dtype=tf.int32, shape=[self.sample_num])
        self.u_j = tf.placeholder(name='u_j', dtype=tf.int32, shape=[self.sample_num])
        self.label = tf.placeholder(name='label', dtype=tf.float32, shape=[self.sample_num])

        self.u_i_embedding = tf.matmul(tf.one_hot(self.u_i, depth=self.node_num, dtype=tf.float32)
                                       , self.embed)
        #可以推导出one-hot:n*1；embed:n*k(k表示embed的最终维度)，因此计算得来的是u_i的特征表示：k*1
        self.u_j_embedding = tf.matmul(tf.one_hot(self.u_j, depth=self.node_num, dtype=tf.float32)
                                       , self.embed)
        self.inner_product = tf.reduce_sum(self.u_i_embedding * self.u_j_embedding, axis=1)
        #理解为向量之间的相似度
        self.loss = -tf.reduce_mean(tf.log_sigmoid(self.label * self.inner_product))
        #交叉熵损失函数，有边时label=1，minimize loss=maximize 相似度
        
        # 定义优化器
        self.line_optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def train_line(self, u_i, u_j, label):
        """
        Train one minibatch.
        """
        feed_dict = {self.u_i: u_i, self.u_j: u_j, self.label: label}
        _, loss = self.sess.run((self.line_optimizer, self.loss), feed_dict=feed_dict)
        return loss

    def cal_encoder_embed(self):
        return self.sess.run(self.embed)
    

In [39]:
# 获取邻接矩阵
F = nx.karate_club_graph()
adj = np.array(nx.adjacency_matrix(F).todense())
adj

array([[0, 1, 1, ..., 1, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 1, 0],
       ...,
       [1, 0, 0, ..., 0, 1, 1],
       [0, 0, 1, ..., 1, 0, 1],
       [0, 0, 0, ..., 1, 1, 0]], dtype=int64)

In [40]:
# 为节点添加权重
for i in list(F.nodes):
    F.nodes[i]['weight']=1 # 注意版本，g.node属性已经被弃用

In [41]:
# 为边添加权重
for edge in list(F.edges):
    edge_1 = edge[0]
    edge_2 = edge[1]
    #print(edge_1)
    #g[1][2]['weight'] = 4.7
    F[edge_1][edge_2]['weight'] = 1

In [42]:
node_size = F.number_of_nodes()
node_attr = np.array(adj, np.float32) # 邻接矩阵
neg_num = 5 # 负采样的个数
batch_size = 100
display_batch = 100

In [43]:
model = GCN(F, node_attr, neg_num=neg_num, batch_size=batch_size)

# 采样的结果是用来计算LOSS
print("进行子图采样")
sampler_view_graph = EdgeSampler(F, batch_size, neg_num) 

EdgeSampler:nodes=34, edges=78
GCN 第一层， gcn_matrix*node_features
float32
float32
GCN 第二层
进行子图采样
EdgeSampler:nodes=34, edges=78
Finished edge sampler initialization.




### training model 

In [44]:
avg_loss = 0.
total_batch = node_attr.shape[0]

with trange(total_batch, desc='loss %8.6f' % 0.) as pbar:
    for i in pbar:
        u_i, u_j, label = sampler_view_graph.next_batch()
        loss = model.train_line(u_i, u_j, label)
        avg_loss += loss / display_batch
        if i % display_batch == 0 and i > 0:
            pbar.set_description('loss %8.6f' % avg_loss)
            avg_loss = 0.


loss 0.000000: 100%|██████████| 34/34 [00:00<00:00, 76.53it/s]


In [45]:
# node repr 
base_embed = model.cal_encoder_embed()
base_embed

array([[-0.0293884 ,  0.23871067, -0.00122043, ...,  0.37645763,
        -0.10584487,  0.01860837],
       [-0.00992001,  0.28612608, -0.01248355, ..., -0.0256904 ,
        -0.07387978, -0.02090638],
       [-0.02338236,  0.11746843, -0.00154851, ...,  0.18492165,
        -0.06972755,  0.17885882],
       ...,
       [ 0.01529879, -0.04324222,  0.02093286, ...,  0.12498906,
        -0.03342825, -0.01526836],
       [ 0.11964564, -0.00680267,  0.22470786, ...,  0.00816328,
        -0.02263976, -0.00905914],
       [ 0.20402251, -0.00266688,  0.23170935, ..., -0.06255551,
        -0.06614263,  0.10069647]], dtype=float32)

In [46]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, random_state=0).fit(base_embed)

In [56]:
Hi=[]
officer = []
for i in range(34):
    print(F.nodes[i]['club'])
    if F.nodes[i]['club'] == 'Officer':
        officer.append(i)
    else:
        Hi.append(i)
l=[]
l.append(set(Hi))
l.append(set(officer))
l

Mr. Hi
Mr. Hi
Mr. Hi
Mr. Hi
Mr. Hi
Mr. Hi
Mr. Hi
Mr. Hi
Mr. Hi
Officer
Mr. Hi
Mr. Hi
Mr. Hi
Mr. Hi
Officer
Officer
Mr. Hi
Mr. Hi
Officer
Mr. Hi
Officer
Mr. Hi
Officer
Officer
Officer
Officer
Officer
Officer
Officer
Officer
Officer
Officer
Officer
Officer


[{0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 16, 17, 19, 21},
 {9, 14, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33}]

In [48]:
def similarity(l1,l2):
    sum_1 = 0
    sum_2 = 0
    #max_ = 0
    for i in l1:
        max_ = 0
        for j in l2:
            if len(i&j) > max_:
                max_ = len(i&j)
                # print(i&j)
                flag_ = l2.index(j)
        # print(max_,flag_)(测试使用)
        sum_1 += max_
        sum_2 += len(i)
    return sum_1/sum_2

In [58]:
f = 0
for i in list(kmeans.labels_):
    l_2[i].add(f)
    f += 1
l_2

[{8, 9, 14, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33},
 {0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 16, 17, 19, 21}]

In [59]:
print(similarity(l,l_2))
print(similarity(l_2,l))
print((similarity(l,l_2)+similarity(l_2,l))/2)

0.9705882352941176
0.9705882352941176
0.9705882352941176
