In [None]:
import numpy as np
import pandas as pd

# 0. 准备工作

In [None]:
# 加载数据
data = pd.read_csv('/workspace/Node2vec_Dataset.csv')
num_nodes = max(data.max())
# 显示数据的前几行以了解格式
print(data.head())
# 显示点的数量
print(num_nodes)

   node_1  node_2
0       0     747
1       1    4257
2       1    2194
3       1     580
4       1    6478
7623


In [2]:
import networkx as nx
from node2vec import Node2Vec

# 使用数据构建图
G = nx.from_pandas_edgelist(data, 'node_1', 'node_2')

# 配置 Node2vec 参数
node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200, workers=4)

# 训练模型
model = node2vec.fit(window=10, min_count=1, batch_words=4)

  from .autonotebook import tqdm as notebook_tqdm
Computing transition probabilities: 100%|██████████| 7624/7624 [00:08<00:00, 914.28it/s] 
Generating walks (CPU: 1): 100%|██████████| 50/50 [02:13<00:00,  2.66s/it]
Generating walks (CPU: 2): 100%|██████████| 50/50 [02:11<00:00,  2.63s/it]
Generating walks (CPU: 4): 100%|██████████| 50/50 [02:10<00:00,  2.61s/it]
Generating walks (CPU: 3): 100%|██████████| 50/50 [02:17<00:00,  2.75s/it]


# 1. 每个节点的 embedding 值列表（csv 文件）

In [None]:
# 获取节点嵌入
embeddings = model.wv
node_embeddings = [embeddings[str(i)] for i in range(num_nodes)]

# 将嵌入转换为numpy数组
node_embeddings_array = np.array(node_embeddings)

# 保存为csv文件
np.savetxt('/workspace/node_embeddings.csv', node_embeddings_array, delimiter=',')

# 2. 随机挑选 10 个 node pair，对⽐他们在 embedding 上的相似度和在 betweenness centrality 上的相似度（使⽤ Jaccard similarity）

In [22]:
# 计算图的betweenness centrality
betweenness_centrality = nx.betweenness_centrality(G)
print(betweenness_centrality)

{0: 0.0, 747: 0.0003672774363444215, 1: 0.0023061399557024686, 4257: 0.0042099704285937225, 2194: 0.001318568250167888, 580: 0.0, 6478: 0.009596805345788419, 1222: 0.0035137762392582557, 5735: 0.0003494628506040224, 7146: 7.385226060965302e-05, 2204: 0.0009525810160413849, 126: 0.0008825590358086654, 2639: 0.0012551251948347635, 2: 7.169570839335663e-05, 562: 0.00023760693586995135, 1492: 1.6108365970256753e-05, 6: 0.002139799818418406, 5303: 0.0015870991390064262, 7128: 0.00042158238648855686, 4154: 0.00015300506234827206, 5179: 6.005007030194674e-06, 3: 0.0007096114518852534, 1728: 4.0272232260976365e-05, 4560: 0.0011519731512118297, 5060: 0.004168111234317579, 1351: 3.4808931407649435e-05, 6503: 6.733919454682841e-05, 7089: 0.0037355595950981325, 4319: 0.008315319517066736, 6095: 0.0012018292877394472, 272: 6.95894306653162e-05, 4433: 0.0008870420000775504, 2900: 0.0002672765190299345, 3381: 0.007582072131063456, 5943: 0.0005687078174407154, 2107: 4.3763123239886755e-05, 6940: 0.000

In [24]:
print(betweenness_centrality[747])

0.0003672774363444215


In [42]:
# 从data中随机选取10个点对
random_pairs = data.sample(n=10)
print(random_pairs)

from sklearn.metrics import jaccard_score

# 计算embedding的点积相似度
embedding_similarities = []
for _, row in random_pairs.iterrows():
    node1, node2 = str(row['node_1']), str(row['node_2'])
    embedding_sim = np.dot(embeddings[node1], embeddings[node2]) / (np.linalg.norm(embeddings[node1]) * np.linalg.norm(embeddings[node2]))
    embedding_similarities.append(embedding_sim)

# 计算betweenness centrality相似度
betweenness_centralities = []
for _, row in random_pairs.iterrows():
    node1, node2 = row['node_1'], row['node_2']
    betweenness_centralities.append(min(betweenness_centrality[node1],betweenness_centrality[node2])/max(betweenness_centrality[node1],betweenness_centrality[node2]))

# 对 embedding_similarities 进行归一化
embedding_similarities = np.array(embedding_similarities)
embedding_similarities = embedding_similarities / sum(embedding_similarities)

# 对 betweenness_centralities 进行归一化
betweenness_centralities = np.array(betweenness_centralities)
betweenness_centralities = betweenness_centralities / sum(betweenness_centralities)

embedding_similarities = np.round(embedding_similarities * 100).astype(int)
betweenness_centralities = np.round(betweenness_centralities * 100).astype(int)

# 打印结果
print(f"Embedding Similarities :\t\t\t{embedding_similarities}")
print(f"Betweenness Centrality Similarities :\t\t{betweenness_centralities}")

# 计算embedding_similarities和betweenness_centralities的jaccard相似度
jaccard_similarity = jaccard_score(embedding_similarities, betweenness_centralities, average='macro')
# 打印结果
print(f"Jaccard Similarity between embedding similarities and betweenness centralities: {jaccard_similarity}")

       node_1  node_2
4093      591    1721
11873    1890    3666
18205    3131    4738
9254     1471    6231
2294      321    7281
18617    3240    6695
13051    2083    3387
4676      686    5725
11101    1784    2719
17703    3038    4955
Embedding Similarities :			[10 10 11 11 12  9  9  9 10  9]
Betweenness Centrality Similarities :		[10 12  2  0  0  5  4 19 19 29]
Jaccard Similarity between embedding similarities and betweenness centralities: 0.03333333333333333
