https://medium.com/@pytorch_geometric/link-prediction-on-heterogeneous-graphs-with-pyg-6d5c29677c70

https://aitechtogether.com/article/48759.html

In [None]:
## https://aitechtogether.com/article/48759.html

import pandas as pd
import matplotlib.pyplot as plt
import torch, tqdm
from sklearn.manifold import TSNE
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import Node2Vec

## 设置参数：
import argparse
import sys
from pathlib import Path

# Command-line options: the p and q values that are later handed to Node2Vec.
# They are stored exactly as supplied (a string when given on the command
# line, the float default otherwise) because they are interpolated verbatim
# into the output file name; conversion to float happens where the model is
# constructed.
parser = argparse.ArgumentParser(description='Test for argparse', formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('--P', '-p', help='p值', default=1.0)
parser.add_argument('--Q', '-q', help='q值', default=1.0)
args = parser.parse_args()

# Tag used in the result file name: the part of the script's base name after
# the last "-" (e.g. "node2vec-exp1.py" -> "exp1"). Taking the stem of the
# path, instead of a blanket str.replace(".py", ""), keeps directory
# components out of the tag when the script is started through a path
# (e.g. "./scripts/run.py") and strips only the final extension.
result_identity_file_name = Path(sys.argv[0]).stem.split("-")[-1]
print(sys.argv[0], args.P, args.Q)

# 1. Load the edge list and re-index the node ids.
# Only the two endpoint columns are read. Any other column in the CSV (for
# example a saved index or an edge attribute) would otherwise be pushed
# through the id mapping below and turn into an extra row of edge_index.
# usecols keeps the selected columns in the order they have in the file.
# For a quick trial run, "sample.csv" truncated with .head(1000) can be used
# in place of the full file.
df_ori = pd.read_csv("fromNode_toNode.csv", usecols=["company_id", "outcompany_id"])

# Every distinct id that appears at either end of an edge, sorted; the
# position of an id in this list is its node index.
node_order = sorted(set(df_ori.company_id.to_list() + df_ori.outcompany_id.to_list()))
mapping = {ci: idx for idx, ci in enumerate(node_order)}

# Replace the raw ids in both endpoint columns with their 0..N-1 indices.
for col in df_ori:
    df_ori[col] = df_ori[col].map(mapping)

from torch_geometric.data import Data

# Wrap the re-indexed edge list in a PyG Data object. Transposing the frame's
# values turns each DataFrame column into one row of edge_index; num_nodes is
# the number of distinct ids in the mapping.
data = Data(
    edge_index=torch.tensor(df_ori.to_numpy().T, dtype=torch.long),
    num_nodes=len(mapping),
)

# 2. Define the model.
# Use the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

# node2vec model built on the graph's edge_index; p and q come from the
# command-line arguments and are converted to float here.
model = Node2Vec(
    edge_index=data.edge_index,
    embedding_dim=128,    # length of each node's embedding vector
    walk_length=5,        # length of each random walk
    context_size=4,       # context window size
    walks_per_node=1,     # one walk is sampled per node
    p=float(args.P),
    q=float(args.Q),
    sparse=True,          # sparse embedding weights; paired with SparseAdam below
)
model = model.to(device)

# Batch iterator over the walks, shuffled, 64 per batch.
loader = model.loader(shuffle=True, batch_size=64)

# Optimizer over the model's parameters.
optimizer = torch.optim.SparseAdam(model.parameters(), lr=0.01)

# 3. Training: 200 epochs over the random-walk loader.
# No per-epoch evaluation is performed; only the training loss is reported.
for epoch in tqdm.tqdm(range(1, 201)):
    # Enter training mode at the start of every epoch so that no epoch runs
    # in a mode left behind by evaluation code.
    model.train()
    total_loss = 0  # sum of the per-batch losses of this epoch

    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        # Loss computed from the positive and negative walks of this batch.
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report on the first epoch and then every 50 epochs.
    if (epoch == 1) or ((epoch % 50) == 0):
        print(f'Epoch: {epoch:02d}, Loss: {total_loss:.4f}')

from pathlib import Path

# Export the learned node embeddings to a CSV file.
model.eval()  # evaluation mode
with torch.no_grad():
    # Embedding vectors for all nodes, shape [num_nodes, embedding_dim].
    z = model(torch.arange(data.num_nodes, device=device))
weights = z.detach().cpu().numpy()

# One "col_<i>" column per embedding dimension.
df_rst = pd.DataFrame(weights, columns=[f"col_{i}" for i in range(weights.shape[1])])
# Row i holds the embedding of node index i, i.e. the i-th id in node_order.
df_rst["company_id"] = node_order

# Make sure the target directory exists before writing, so that a missing
# folder does not throw away the result of the whole training run.
output_path = f"./embedding/{result_identity_file_name}_p_{args.P}__q_{args.Q}.csv"
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df_rst.to_csv(output_path, index=False)
    
    
    
