In [1]:
import pandas as pd
import numpy as np
import torch
from scipy.sparse import coo_matrix
from torch_geometric.nn import Node2Vec
from torch_geometric.data import Data

In [2]:
def load_graph(graph_path):
  df = pd.read_csv(graph_path,index_col=0)
  node_list = df.index
  A = df.values
  A=torch.LongTensor(A)
  ed = coo_matrix(A)
  edge_index = torch.LongTensor(np.array([ed.row, ed.col]))
  return edge_index,node_list

In [3]:
edge_index,node_list = load_graph('../data/processed data/adj.csv')
edge_index

tensor([[ 0,  0,  0,  ..., 65, 65, 65],
        [ 0,  7, 13,  ..., 56, 64, 65]])

In [4]:
node_list

Index([  4,  12,  13,  24,  41,  42,  43,  45,  48,  50,  68,  74,  75,  79,
        87,  88,  90, 100, 107, 113, 114, 116, 120, 125, 127, 128, 137, 140,
       141, 142, 143, 144, 148, 151, 152, 153, 158, 161, 162, 163, 164, 166,
       170, 186, 194, 202, 209, 211, 224, 229, 230, 231, 232, 233, 234, 236,
       237, 238, 239, 243, 244, 246, 249, 261, 262, 263],
      dtype='int64', name='id')

In [5]:
data = Data(edge_index=edge_index)

In [6]:
edge_index.size()

torch.Size([2, 964])

In [7]:
data.edge_index

tensor([[ 0,  0,  0,  ..., 65, 65, 65],
        [ 0,  7, 13,  ..., 56, 64, 65]])

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [44]:


# 初始化Node2Vec模型
model = Node2Vec(data.edge_index, embedding_dim=128, walk_length=80,
                 context_size=10, walks_per_node=10, num_negative_samples=1,
                 p=1, q=1, sparse=True)
model.to(device)

# 训练模型
loader = model.loader(batch_size=1, shuffle=True, num_workers=8)
optimizer = torch.optim.SparseAdam(model.parameters(), lr=0.01)
model.train()
for epoch in range(1, 100):
    total_loss = 0
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        # loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss = torch.norm(model.loss(pos_rw.to(device), neg_rw.to(device)), p=2)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print('Epoch: {:02d}, Loss: {:.4f}'.format(epoch, total_loss / len(loader)))


Epoch: 01, Loss: 5.2307
Epoch: 02, Loss: 2.9323
Epoch: 03, Loss: 2.7628
Epoch: 04, Loss: 2.7198
Epoch: 05, Loss: 2.7105
Epoch: 06, Loss: 2.6991
Epoch: 07, Loss: 2.6125
Epoch: 08, Loss: 2.6146
Epoch: 09, Loss: 2.5998
Epoch: 10, Loss: 2.5869
Epoch: 11, Loss: 2.5839
Epoch: 12, Loss: 2.5296
Epoch: 13, Loss: 2.4984
Epoch: 14, Loss: 2.4889
Epoch: 15, Loss: 2.4773
Epoch: 16, Loss: 2.4416
Epoch: 17, Loss: 2.4012
Epoch: 18, Loss: 2.4004
Epoch: 19, Loss: 2.3797
Epoch: 20, Loss: 2.3399
Epoch: 21, Loss: 2.3292
Epoch: 22, Loss: 2.2649
Epoch: 23, Loss: 2.2774
Epoch: 24, Loss: 2.2589
Epoch: 25, Loss: 2.2249
Epoch: 26, Loss: 2.2078
Epoch: 27, Loss: 2.1760
Epoch: 28, Loss: 2.1902
Epoch: 29, Loss: 2.1827
Epoch: 30, Loss: 2.1322
Epoch: 31, Loss: 2.1532
Epoch: 32, Loss: 2.1330
Epoch: 33, Loss: 2.1177
Epoch: 34, Loss: 2.0651
Epoch: 35, Loss: 2.0147
Epoch: 36, Loss: 1.9927
Epoch: 37, Loss: 1.9652
Epoch: 38, Loss: 1.9656
Epoch: 39, Loss: 1.9829
Epoch: 40, Loss: 1.9737
Epoch: 41, Loss: 1.9705
Epoch: 42, Loss:

In [45]:
embeddings = model()

In [46]:
zz = model.embedding.weight.data.cpu()

In [47]:
zz = embeddings.cpu().detach().numpy()

In [48]:
zz

array([[ 0.35409838, -0.61919445, -1.08318   , ..., -0.726736  ,
        -0.5961575 ,  1.141425  ],
       [-0.958784  , -0.68215865, -0.5997079 , ..., -0.16268732,
        -0.2105388 , -0.5981028 ],
       [-0.38101852, -0.73999757, -0.52766097, ...,  0.54665494,
        -0.6742828 , -0.40396854],
       ...,
       [-0.17435788,  1.1435635 ,  0.54444146, ..., -0.24937537,
         0.385152  ,  0.08164018],
       [ 0.33032802, -0.01506186, -0.32034525, ..., -0.75602883,
         0.12225862, -0.9323784 ],
       [-0.552987  ,  0.11646862, -0.2547368 , ...,  0.40270752,
        -0.37536028,  0.37689447]], dtype=float32)

In [49]:
from sklearn.metrics.pairwise import cosine_similarity

# 计算余弦相似度矩阵
similarity_matrix = cosine_similarity(zz,zz)

# 将相似度矩阵转换为 DataFrame
similarity_df = pd.DataFrame(similarity_matrix, index=node_list, columns=node_list)

# 打印生成的 DataFrame
similarity_df

id,4,12,13,24,41,42,43,45,48,50,...,237,238,239,243,244,246,249,261,262,263
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,1.000000,-0.021099,0.002089,-0.032321,-0.034517,-0.051844,-0.022581,0.024571,0.000413,-0.028761,...,-0.024387,-0.031913,-0.032622,-0.052771,-0.056220,-0.016827,0.013431,0.000291,-0.039281,-0.031833
12,-0.021099,1.000000,0.020099,-0.048294,-0.064654,-0.051184,-0.039837,-0.004095,-0.031756,-0.046657,...,-0.042743,-0.046275,-0.043731,-0.038973,-0.040923,-0.029119,-0.014029,0.007434,-0.052115,-0.053798
13,0.002089,0.020099,1.000000,-0.055276,-0.063219,-0.058232,-0.034250,0.019306,-0.027035,-0.029659,...,-0.032239,-0.042424,-0.046975,-0.048963,-0.048234,-0.022360,0.009280,0.018956,-0.054695,-0.059950
24,-0.032321,-0.048294,-0.055276,1.000000,0.023151,0.016718,0.024999,-0.035912,0.003757,0.000619,...,0.023861,0.003140,-0.003360,0.001655,-0.001728,-0.004371,-0.026072,-0.048157,0.013731,0.002196
41,-0.034517,-0.064654,-0.063219,0.023151,1.000000,0.012386,0.030018,-0.056110,0.005061,0.003503,...,0.026243,0.031224,0.029184,0.008678,0.005845,-0.013634,-0.026187,-0.061372,0.003728,0.012164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,-0.016827,-0.029119,-0.022360,-0.004371,-0.013634,-0.039177,0.014783,-0.005074,0.021583,0.011422,...,0.007555,0.001493,0.009130,-0.047147,-0.042235,1.000000,0.026019,-0.013107,-0.022663,-0.021619
249,0.013431,-0.014029,0.009280,-0.026072,-0.026187,-0.049239,0.014298,0.012600,0.018224,-0.002192,...,0.004575,-0.014225,-0.013615,-0.061367,-0.057292,0.026019,1.000000,0.023365,-0.018327,-0.029026
261,0.000291,0.007434,0.018956,-0.048157,-0.061372,-0.059484,-0.042092,0.019635,-0.018219,-0.034779,...,-0.022934,-0.052288,-0.042521,-0.050435,-0.051735,-0.013107,0.023365,1.000000,-0.060004,-0.055828
262,-0.039281,-0.052115,-0.054695,0.013731,0.003728,-0.003738,0.023142,-0.042285,0.018049,-0.017236,...,0.026756,0.002662,0.002758,-0.015381,-0.019965,-0.022663,-0.018327,-0.060004,1.000000,0.022213


In [50]:
x_202 = similarity_df[202].sort_values(ascending=False)
x_202.head(10)

id
202    1.000000
148    0.373624
140    0.020918
263    0.013083
229    0.010762
141    0.009998
262    0.008449
162    0.005155
236    0.005141
237    0.004470
Name: 202, dtype: float32

In [36]:
data

Data(edge_index=[2, 964])

In [37]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GATConv
from torch_geometric.data import Data
import numpy as np


# 定义 GAT 模型
class GATModel(nn.Module):
    def __init__(self, in_channels, out_channels, num_heads):
        super(GATModel, self).__init__()
        self.conv1 = GATConv(in_channels, out_channels, heads=num_heads)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        return x

# 创建 GAT 模型实例
in_channels = 1  # 输入特征的维度（这里只是一个示例）
out_channels = 64  # 输出特征的维度
num_heads = 2  # 注意力头的数量
model = GATModel(in_channels, out_channels, num_heads)

# 定义优化器
optimizer = optim.Adam(model.parameters(), lr=0.01)

# 训练节点嵌入
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    x = torch.randn(data.num_nodes, in_channels)  # 随机初始化节点特征
    embeddings = model(x, data.edge_index)
    loss = torch.norm(embeddings, p=2)  # 正则化项，可以根据需要调整
    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# 提取节点嵌入
node_embeddings = embeddings




Epoch [1/100], Loss: 4.3501
Epoch [2/100], Loss: 2.8067
Epoch [3/100], Loss: 3.7738
Epoch [4/100], Loss: 3.8037
Epoch [5/100], Loss: 4.2475
Epoch [6/100], Loss: 2.4245
Epoch [7/100], Loss: 2.9730
Epoch [8/100], Loss: 2.2537
Epoch [9/100], Loss: 1.8963
Epoch [10/100], Loss: 2.8573
Epoch [11/100], Loss: 2.6687
Epoch [12/100], Loss: 1.7681
Epoch [13/100], Loss: 1.5913
Epoch [14/100], Loss: 1.5176
Epoch [15/100], Loss: 2.2416
Epoch [16/100], Loss: 2.5784
Epoch [17/100], Loss: 1.8521
Epoch [18/100], Loss: 1.8669
Epoch [19/100], Loss: 1.1048
Epoch [20/100], Loss: 1.2107
Epoch [21/100], Loss: 1.3495
Epoch [22/100], Loss: 1.2848
Epoch [23/100], Loss: 0.8335
Epoch [24/100], Loss: 0.7571
Epoch [25/100], Loss: 0.7786
Epoch [26/100], Loss: 0.8695
Epoch [27/100], Loss: 0.6369
Epoch [28/100], Loss: 0.6634
Epoch [29/100], Loss: 0.5770
Epoch [30/100], Loss: 0.4395
Epoch [31/100], Loss: 0.3734
Epoch [32/100], Loss: 0.3710
Epoch [33/100], Loss: 0.2502
Epoch [34/100], Loss: 0.3174
Epoch [35/100], Loss: 0

In [38]:
node_embeddings

tensor([[ 1.6144e-03, -2.1190e-04,  5.0573e-04,  ...,  1.1055e-03,
          3.9618e-05, -2.1765e-04],
        [ 2.3835e-03, -7.6595e-04,  1.2127e-03,  ...,  6.0867e-03,
          2.0788e-04,  2.7742e-03],
        [ 1.9169e-03, -4.2981e-04,  7.8377e-04,  ...,  3.0525e-03,
          1.0539e-04,  9.5177e-04],
        ...,
        [ 1.9165e-03, -4.2956e-04,  7.8345e-04,  ...,  3.0532e-03,
          1.0541e-04,  9.5221e-04],
        [ 1.7421e-03, -3.0395e-04,  6.2318e-04,  ...,  1.9354e-03,
          6.7650e-05,  2.8079e-04],
        [ 1.7423e-03, -3.0409e-04,  6.2336e-04,  ...,  1.9349e-03,
          6.7636e-05,  2.8053e-04]], grad_fn=<AddBackward0>)

In [40]:
node_embeddings = node_embeddings.detach().numpy()

In [41]:
from sklearn.metrics.pairwise import cosine_similarity

# 计算余弦相似度矩阵
similarity_matrix = cosine_similarity(node_embeddings,node_embeddings)

# 将相似度矩阵转换为 DataFrame
similarity_df = pd.DataFrame(similarity_matrix, index=node_list, columns=node_list)

# 打印生成的 DataFrame
similarity_df

id,4,12,13,24,41,42,43,45,48,50,...,237,238,239,243,244,246,249,261,262,263
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,1.000000,0.581424,0.788388,0.923314,0.986958,0.999945,0.979693,0.880062,0.965461,0.478708,...,0.963842,0.919966,0.949896,0.645437,0.689049,0.903453,0.929914,0.788501,0.930898,0.930839
12,0.581424,1.000000,0.958894,0.849295,0.704803,0.572940,0.732742,0.898034,0.349366,-0.435985,...,0.777202,0.853819,0.806592,-0.246164,-0.188997,0.176510,0.839895,0.958843,0.838438,0.838525
13,0.788388,0.958894,1.000000,0.964181,0.877125,0.781951,0.895717,0.985953,0.600877,-0.162703,...,0.923805,0.966434,0.941163,0.038974,0.097411,0.448552,0.959376,1.000000,0.958618,0.958665
24,0.923314,0.849295,0.964181,1.000000,0.973093,0.919276,0.981566,0.994938,0.791368,0.104823,...,0.992267,0.999963,0.997092,0.302608,0.357890,0.669539,0.999846,0.964231,0.999795,0.999797
41,0.986958,0.704803,0.877125,0.973093,1.000000,0.985235,0.999192,0.945019,0.910935,0.331146,...,0.994167,0.971068,0.987823,0.514076,0.563412,0.822669,0.976988,0.877215,0.977556,0.977521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,0.903453,0.176510,0.448552,0.669539,0.822669,0.907859,0.799158,0.591528,0.983941,0.808867,...,0.756555,0.663105,0.724199,0.910559,0.933200,1.000000,0.682477,0.448717,0.684433,0.684315
249,0.929914,0.839895,0.959376,0.999846,0.976988,0.926045,0.984770,0.993021,0.801979,0.122266,...,0.994294,0.999657,0.998276,0.319295,0.374229,0.682477,1.000000,0.959429,0.999996,0.999996
261,0.788501,0.958843,1.000000,0.964231,0.877215,0.782066,0.895800,0.985984,0.601024,-0.162521,...,0.923877,0.966482,0.941227,0.039158,0.097594,0.448717,0.959429,1.000000,0.958672,0.958718
262,0.930898,0.838438,0.958618,0.999795,0.977556,0.927052,0.985233,0.992703,0.803577,0.124925,...,0.994576,0.999583,0.998429,0.321833,0.376713,0.684433,0.999996,0.958672,1.000000,1.000000


In [43]:
x_202 = similarity_df[202].sort_values(ascending=False)
x_202.head(60)

id
202    1.000000
100    0.999994
114    0.999954
68     0.999467
142    0.998220
42     0.997760
4      0.997008
166    0.993794
161    0.982933
48     0.982702
158    0.978605
74     0.978299
41     0.971578
230    0.969477
153    0.963100
148    0.963091
43     0.961280
125    0.960682
75     0.959710
163    0.953340
162    0.951712
232    0.948751
116    0.943861
237    0.940380
186    0.939487
236    0.934753
246    0.933865
239    0.922917
151    0.921212
90     0.917961
152    0.914592
79     0.901414
262    0.899900
263    0.899828
249    0.898730
24     0.890892
234    0.887774
113    0.887345
238    0.886940
229    0.884473
164    0.858102
233    0.851459
45     0.840747
141    0.820433
120    0.807397
211    0.802789
140    0.800400
107    0.796000
209    0.793910
224    0.790837
170    0.776967
231    0.771330
144    0.757731
128    0.752418
244    0.742969
261    0.738633
13     0.738507
127    0.702510
243    0.702509
137    0.701675
Name: 202, dtype: float32