https://medium.com/@pytorch_geometric/link-prediction-on-heterogeneous-graphs-with-pyg-6d5c29677c70

这里，我们尝试，不要初始特征，让初始特征变成embedding。

In [1]:
import os.path as osp
import torch
from torch_geometric.utils import negative_sampling
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.utils import train_test_split_edges
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

In [2]:
import pandas as pd
import numpy as np

In [11]:
df_ori = pd.read_csv("sample__company_equity_info__df_h_h.csv")
df_ori

Unnamed: 0,id,equity_amount,pledgee_id,pledgee_type,pledgor_id,pledgor_type,state,up_state
0,1051362,180.9000,338.0,1,2001172,1,有效,2.0
1,1595323,750.0000,10593224.0,1,10593224,1,有效,2.0
2,1633055,620.0000,290764.0,1,4705687,1,有效,2.0
3,1636400,1120.0000,140300.0,1,0,1,有效,2.0
4,1670399,1473.8400,969458.0,1,290764,1,有效,2.0
...,...,...,...,...,...,...,...,...
16320,24541349,1996.8387,48072.0,1,325930,1,有效,1.0
16321,24541463,1920.0000,752045.0,1,13487055,1,有效,1.0
16322,24541464,2000.0000,752045.0,1,12503,1,有效,1.0
16323,24541482,108.3780,68027.0,1,268650,1,有效,1.0


In [12]:
node_order = sorted(list(set(df_ori.pledgee_id.to_list() + df_ori.pledgor_type.to_list())))

mapping = {
    ci: idx for idx, ci in enumerate(node_order)
}
for col in ["pledgee_id", "pledgor_type"]:
    df_ori[col] = df_ori[col].map(mapping)

In [13]:
from torch_geometric.data import Data

data = Data(
    num_nodes = len(mapping),
    edge_index=torch.tensor(
        df_ori[["pledgee_id", "pledgor_type"]].T.to_numpy(), 
        dtype = torch.long
    ),
    edge_weight=torch.tensor(
        df_ori[["equity_amount"]].T.to_numpy(), 
        dtype = torch.long
    )
)

In [14]:
n_nodes = data.num_nodes
hidden_channels = 64

In [15]:
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    add_negative_train_samples=False,    
)
train_data, val_data, test_data = transform(data)

In [16]:
from torch_geometric.loader import LinkNeighborLoader
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[
        3, 
        2
    ],
    neg_sampling_ratio=2.0,
    batch_size=128,
    shuffle=True,
)

In [17]:
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
class GNN(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, 128)
        self.conv2 = SAGEConv(128, out_channels)
    def forward(self, x, edge_index) :
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x

class Classifier(torch.nn.Module):
    def forward(self, x_from, x_to,):
        return (x_from * x_to).sum(dim=-1)

class Model(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.emb = torch.nn.Embedding(n_nodes, in_channels)
        self.gnn = GNN(in_channels, out_channels)
        self.classifier = Classifier()
    def forward(self, data):
        x_out = self.gnn(
            self.emb(data.n_id), 
            data.edge_label_index
        )
        pred = self.classifier(
            x_out[data.edge_label_index[0]], ## 边的起始点。
            x_out[data.edge_label_index[-1]] ## 边的终结点。
        )
        return pred
        
model = Model(in_channels=hidden_channels, out_channels=64)

weights = model.emb.weight.detach().numpy()
pd.DataFrame(weights, columns = [f"col_{i}" for i in range(weights.shape[1])])

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_54,col_55,col_56,col_57,col_58,col_59,col_60,col_61,col_62,col_63
0,0.185975,0.939711,-0.128709,0.370708,-0.333293,0.059012,0.371074,0.044043,-0.917792,-0.168400,...,0.887624,0.329911,0.917005,-0.102132,0.634849,-1.571522,1.209813,0.409095,-0.464225,-1.188572
1,0.895257,-0.038805,1.088342,-0.047328,0.063770,0.194396,0.355997,-2.449052,-0.721503,-1.094366,...,-0.064475,-0.940847,-0.666645,0.576844,1.236218,0.763035,0.890970,0.272288,0.032394,-0.150049
2,1.441656,-1.146114,0.968549,-1.380775,-0.499855,0.802476,1.055095,-0.551164,-1.603164,0.985574,...,-1.466041,0.495226,-0.739850,-1.080410,0.261932,-1.879478,0.705469,-0.043974,-0.191139,0.360290
3,-0.967363,-0.975899,-1.691451,-0.171353,-0.375233,0.202316,-1.123232,-1.204270,1.262273,-0.690604,...,-1.445261,0.098234,-0.426552,-0.096978,1.076930,0.372386,1.122425,-1.618461,-0.864361,0.040027
4,-0.171415,-0.175755,0.298851,1.319848,0.420865,0.134479,-1.066459,-0.756031,-1.390961,0.251846,...,-0.174711,0.737901,-0.305916,-0.077763,-0.648565,-0.685477,-0.461003,2.633189,-0.231009,0.514315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10765,0.219807,-0.088833,-0.389428,0.582907,-0.563306,0.534210,-0.539288,-0.567570,-0.636689,0.501028,...,1.130351,0.281616,-0.660049,-1.780592,-0.038243,-0.090336,0.515840,0.991672,-1.734841,-1.135793
10766,0.769219,0.432233,0.902775,-0.163657,0.406947,-0.821821,0.752766,1.482346,-0.181109,-1.534694,...,0.274924,0.355893,-1.375442,0.350609,0.787092,-1.610009,-0.226000,-0.825250,-0.127884,0.444532
10767,-0.213047,0.636174,-0.672236,-0.724225,-0.956726,-0.726587,-0.148193,-0.231243,0.424235,-1.202856,...,-1.172578,1.532292,-0.477644,1.473400,-1.402732,1.499483,-0.013882,1.662085,-0.327482,-0.788191
10768,-0.347305,0.034951,0.924751,0.720853,0.597888,-0.513045,-0.159958,0.822029,-0.623273,-1.105767,...,-1.459180,0.008741,-0.210554,-0.239894,0.748212,-0.011773,2.530394,-0.043769,-0.460207,-0.951734


In [18]:
# !pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.1.0+cu111.html
# import torch_geometric
# torch_geometric.typing.WITH_TORCH_SPARSE

In [19]:
import tqdm
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: '{device}'")
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
for epoch in range(1, 10):
    total_loss = total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):        
        optimizer.zero_grad()
        sampled_data.to(device)
        pred = model(sampled_data)
        ground_truth = sampled_data.edge_label
        loss = F.binary_cross_entropy_with_logits(pred, ground_truth)
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()
    
#     break
    
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")
    

Device: 'cpu'


100%|██████████████████████████████████████████| 72/72 [00:00<00:00, 192.39it/s]


Epoch: 001, Loss: 0.1989


100%|██████████████████████████████████████████| 72/72 [00:00<00:00, 246.49it/s]


Epoch: 002, Loss: 0.0183


100%|██████████████████████████████████████████| 72/72 [00:00<00:00, 235.80it/s]


Epoch: 003, Loss: 0.0090


100%|██████████████████████████████████████████| 72/72 [00:00<00:00, 211.18it/s]


Epoch: 004, Loss: 0.0062


100%|██████████████████████████████████████████| 72/72 [00:00<00:00, 237.50it/s]


Epoch: 005, Loss: 0.0038


100%|██████████████████████████████████████████| 72/72 [00:00<00:00, 244.89it/s]


Epoch: 006, Loss: 0.0037


100%|██████████████████████████████████████████| 72/72 [00:00<00:00, 241.66it/s]


Epoch: 007, Loss: 0.0034


100%|██████████████████████████████████████████| 72/72 [00:00<00:00, 241.20it/s]


Epoch: 008, Loss: 0.0020


100%|██████████████████████████████████████████| 72/72 [00:00<00:00, 235.91it/s]

Epoch: 009, Loss: 0.0031





In [20]:
# !pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric

In [23]:
test_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[3, 2],
    neg_sampling_ratio=2.0,
    batch_size=128,
    shuffle=True,
)

In [24]:
from sklearn.metrics import roc_auc_score
preds = []
ground_truths = []
for sampled_data in tqdm.tqdm(test_loader):
    with torch.no_grad():
        sampled_data.to(device)
        preds.append(model(sampled_data))
        ground_truths.append(sampled_data.edge_label)
pred = torch.cat(preds, dim=0).cpu().numpy()
ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
auc = roc_auc_score(ground_truth, pred)
print()
print(f"Validation AUC: {auc:.4f}")

100%|████████████████████████████████████████| 103/103 [00:00<00:00, 759.03it/s]


Validation AUC: 0.9999





In [25]:
model.emb

Embedding(10770, 64)

In [26]:
weights = model.emb.weight.detach().numpy()
df_rst = pd.DataFrame(weights, columns = [f"col_{i}" for i in range(weights.shape[1])])
df_rst["company_id"] = node_order
df_rst.to_csv("embedding.csv", index=False)