https://medium.com/@pytorch_geometric/link-prediction-on-heterogeneous-graphs-with-pyg-6d5c29677c70

这里，我们尝试，不要初始特征，让初始特征变成embedding。

In [1]:
import os.path as osp
import torch
from torch_geometric.utils import negative_sampling
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.utils import train_test_split_edges
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

In [2]:
import pandas as pd
import numpy as np

In [3]:
# import torch_scatter, torch_sparse, torch_cluster, torch_spline_conv, torch_geometric

In [4]:
# pip install torch==2.1.1 torch-scatter==2.1.2 torch-sparse==0.6.18 torch-cluster==1.6.3 torch-spline-conv==1.2.2 torch-geometric==2.4.0 --target=./

In [5]:
df_ori = pd.read_csv("sample.csv").head(10000).drop(columns = ["report_year"])

node_order = sorted(list(set(df_ori.credit_code.to_list() + df_ori.outcredit_code.to_list())))

mapping = {
    ci: idx for idx, ci in enumerate(node_order)
}
for col in df_ori:
    df_ori[col] = df_ori[col].map(mapping)

In [6]:
from torch_geometric.data import Data

data = Data(
    num_nodes = len(mapping),
    edge_index=torch.tensor(
        df_ori.T.to_numpy(), 
        dtype = torch.long
    )
)

In [7]:
data.is_directed()

True

In [8]:
n_nodes = data.num_nodes
hidden_channels = 64

In [9]:
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    add_negative_train_samples=False,    
)
train_data, val_data, test_data = transform(data)

In [10]:
from torch_geometric.loader import LinkNeighborLoader
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[
        -1,# 10, 
        -1, # 5
    ],
    neg_sampling_ratio=2.0,
    batch_size=128,
    shuffle=True,
)

In [11]:
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
class GNN(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, 128)
        self.conv2 = SAGEConv(128, out_channels)
    def forward(self, x, edge_index) :
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x

class Classifier(torch.nn.Module):
    def forward(self, x_from, x_to,):
        return (x_from * x_to).sum(dim=-1)

class Model(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.emb = torch.nn.Embedding(n_nodes, in_channels)
        self.gnn = GNN(in_channels, out_channels)
        self.classifier = Classifier()
    def forward(self, data):
        x_out = self.gnn(
            self.emb(data.n_id), 
            data.edge_label_index
        )
        pred = self.classifier(
            x_out[data.edge_label_index[0]], ## 边的起始点。
            x_out[data.edge_label_index[-1]] ## 边的终结点。
        )
        return pred
        
model = Model(in_channels=hidden_channels, out_channels=64)

weights = model.emb.weight.detach().numpy()
pd.DataFrame(weights, columns = [f"col_{i}" for i in range(weights.shape[1])])

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_54,col_55,col_56,col_57,col_58,col_59,col_60,col_61,col_62,col_63
0,0.204166,-1.913129,0.224228,-0.798225,2.311201,-0.734143,-0.890415,2.116769,0.367596,0.745665,...,-0.932779,0.261779,-0.184092,0.589380,0.075864,1.148578,-1.244520,-1.590388,-0.749513,0.494159
1,-0.213372,1.305606,-1.012688,0.645469,-1.282616,-0.434324,-1.887497,-1.587650,0.387500,-0.392872,...,0.438843,-1.588360,2.654335,0.668084,0.635473,-0.326247,-0.343727,0.970685,-0.365338,-0.759091
2,-0.625264,-2.347703,-1.274370,-0.675747,-0.079451,0.771739,-0.031718,-1.899585,-0.215473,1.388394,...,-0.464952,-0.567257,0.549657,-0.690892,-0.913559,-0.637127,-1.107562,-0.776902,-0.323069,-2.251067
3,-0.454974,0.771563,-0.169056,0.671824,0.625613,0.636769,-1.380048,0.127225,0.915895,-1.412789,...,-0.618219,0.665512,-0.049737,-0.096232,0.808627,-0.845300,-2.135978,-0.529514,0.482361,-1.698244
4,-1.815911,-0.360211,-0.820130,-0.984728,-0.906629,1.167074,-0.149966,-1.214211,-0.305907,-0.194863,...,-0.417437,2.206745,-0.103622,-0.939314,0.134920,0.378280,0.385108,1.428602,-1.430276,-1.149042
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10267,0.272824,-1.647135,0.213967,-0.534543,-0.419478,-0.547089,0.557933,-1.523871,0.265893,-0.057150,...,0.759211,-0.390345,1.357873,-1.888997,-2.419361,0.382690,0.532714,-0.316429,-2.431507,-1.294023
10268,-0.920462,0.869821,0.366213,0.600005,-0.803093,1.714525,-0.464417,-0.901931,0.164131,-0.013455,...,0.814906,-0.869550,0.693782,-1.796077,-0.344530,-1.417625,0.754893,0.429027,0.163029,0.081340
10269,0.843906,1.481964,0.667393,-0.876184,1.009999,-0.296200,-1.767485,0.388130,0.151410,0.232196,...,1.391059,-0.405664,0.583687,1.340488,-0.233576,0.604424,0.726584,2.412103,-0.505816,-0.276736
10270,-0.041046,-0.788669,-0.207257,1.542667,0.908246,-2.995111,-0.451971,-0.303380,0.504326,1.612450,...,2.596516,-0.365549,-1.624916,-0.740118,-0.695535,0.925779,-1.271072,-0.153407,1.005416,-0.803567


In [12]:
# !pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.1.0+cu111.html
# import torch_geometric
# torch_geometric.typing.WITH_TORCH_SPARSE

In [13]:
import tqdm
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: '{device}'")
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
for epoch in range(1, 10):
    total_loss = total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):        
        optimizer.zero_grad()
        sampled_data.to(device)
        pred = model(sampled_data)
        ground_truth = sampled_data.edge_label
        loss = F.binary_cross_entropy_with_logits(pred, ground_truth)
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()
    
#     break
    
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")
    

Device: 'cpu'


100%|█████████████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 206.03it/s]


Epoch: 001, Loss: 0.6267


100%|█████████████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 283.42it/s]


Epoch: 002, Loss: 0.4416


100%|█████████████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 259.05it/s]


Epoch: 003, Loss: 0.3564


100%|█████████████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 264.26it/s]


Epoch: 004, Loss: 0.2915


100%|█████████████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 214.93it/s]


Epoch: 005, Loss: 0.2605


100%|█████████████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 265.72it/s]


Epoch: 006, Loss: 0.2346


100%|█████████████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 256.10it/s]


Epoch: 007, Loss: 0.2140


100%|█████████████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 270.14it/s]


Epoch: 008, Loss: 0.1983


100%|█████████████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 275.58it/s]

Epoch: 009, Loss: 0.1846





In [14]:
# !pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric

In [15]:
test_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[10, 5],
    neg_sampling_ratio=2.0,
    batch_size=128,
    shuffle=True,
)

In [16]:
from sklearn.metrics import roc_auc_score
preds = []
ground_truths = []
for sampled_data in tqdm.tqdm(test_loader):
    with torch.no_grad():
        sampled_data.to(device)
        preds.append(model(sampled_data))
        ground_truths.append(sampled_data.edge_label)
pred = torch.cat(preds, dim=0).cpu().numpy()
ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
auc = roc_auc_score(ground_truth, pred)
print()
print(f"Validation AUC: {auc:.4f}")

100%|██████████████████████████████████████████| 63/63 [00:00<00:00, 401.29it/s]


Validation AUC: 0.7791





In [17]:
model.emb

Embedding(15810, 64)

In [18]:
weights = model.emb.weight.detach().numpy()
df_rst = pd.DataFrame(weights, columns = [f"col_{i}" for i in range(weights.shape[1])])
df_rst["company_id"] = node_order
df_rst.to_csv("embedding.csv", index=False)