In [13]:
import os.path as osp
import torch, tqdm
from torch_geometric.utils import negative_sampling
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.utils import train_test_split_edges
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

import pandas as pd
import numpy as np

In [2]:
df_ori = pd.read_csv("tst.csv").drop(columns=["amount"])
df_ori

Unnamed: 0,company_id,investor_id
0,4708640,4708666
1,5310348,5310352
2,5835331,1468298
3,6339545,2901375
4,7140015,7140222
...,...,...
99995,71211319,12986379
99996,121778646,121778647
99997,36615212,4773902
99998,78441325,123692


In [3]:
## 涉及到的所有的节点：
node_order = sorted(
    list(
        set(
            df_ori.company_id.to_list() + df_ori.investor_id.to_list()
        )
    )
)

In [4]:
## 节点转为映射值：
mapping = {
    ci: idx for idx, ci in enumerate(node_order)
}
for col in ["investor_id", "company_id"]:
    df_ori[col] = df_ori[col].map(mapping)

In [5]:
from torch_geometric.data import Data
data = Data(
    num_nodes = len(mapping),
    edge_index=torch.tensor(
        df_ori.T.to_numpy(), 
        dtype = torch.long
    )
)

In [6]:
n_nodes = data.num_nodes
hidden_channels = 64

In [7]:
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    add_negative_train_samples=False,    
)
train_data, val_data, test_data = transform(data)

In [10]:
from torch_geometric.loader import LinkNeighborLoader
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[
        -1, # 10, 
        -1, # 5
    ],
    neg_sampling_ratio=2.0,
    batch_size=128,
    shuffle=True,
)

In [14]:
## 我们这边看一下这些数据里面具体都是什么样子的。
for sampled_data in tqdm.tqdm(train_loader):
    break

  0%|                                                   | 0/438 [00:00<?, ?it/s]


In [15]:
sampled_data
## 我们进去仔细看了一下，发现了一些华点。

Data(edge_index=[2, 603], num_nodes=1239, edge_label=[384], edge_label_index=[2, 384], n_id=[1239], e_id=[603], input_id=[128])

In [16]:
sampled_data.edge_index
## 这些就是抽出来的边。

EdgeIndex([[ 767,  768,  769,  ..., 1237,  764, 1238],
           [   4,    5,    5,  ...,  747,  761,  798]],
          sparse_size=(1239, 1239), nnz=603, sort_order=col)

In [17]:
from_node, to_node = sampled_data.edge_index

In [25]:
sampled_data.node_stores

[{'edge_index': EdgeIndex([[ 767,  768,  769,  ..., 1237,  764, 1238],
            [   4,    5,    5,  ...,  747,  761,  798]],
           sparse_size=(1239, 1239), nnz=603, sort_order=col), 'num_nodes': 1239, 'edge_label': tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [None]:
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
class GNN(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, 128)
        self.conv2 = SAGEConv(128, out_channels)
    def forward(self, x, edge_index) :
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x

class Classifier(torch.nn.Module):
    def forward(self, x_from, x_to,):
        return (x_from * x_to).sum(dim=-1)

class Model(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.emb = torch.nn.Embedding(n_nodes, in_channels)
        self.gnn = GNN(in_channels, out_channels)
        self.classifier = Classifier()
    def forward(self, data):
        x_out = self.gnn(
            self.emb(data.n_id), 
            data.edge_label_index
        )
        pred = self.classifier(
            x_out[data.edge_label_index[0]], ## 边的起始点。
            x_out[data.edge_label_index[-1]] ## 边的终结点。
        )
        return pred
        
model = Model(in_channels=hidden_channels, out_channels=64)

weights = model.emb.weight.detach().numpy()
# pd.DataFrame(weights, columns = [f"col_{i}" for i in range(weights.shape[1])])

In [None]:
import tqdm
import torch.nn.functional as F
device = torch.device("mps")#('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: '{device}'")
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
for epoch in range(1, 10):
    total_loss = total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):        
        optimizer.zero_grad()
        sampled_data.to(device)
        pred = model(sampled_data)
        ground_truth = sampled_data.edge_label
        loss = F.binary_cross_entropy_with_logits(pred, ground_truth)
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()
    
#     break
    
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")
    

In [None]:
!pip3 install pyg_lib

In [None]:
import torch_geometric

In [None]:
torch_geometric.__version__

In [None]:
!pip3 install pyg-lib -f https://data.pyg.org/whl/torch-2.2.2+cpu.html


In [None]:
import pip3._internal

In [None]:
pip._internal#.pep425tags.get_supported()

In [None]:
pip._internal.Optional

In [None]:
pip._internal.pep425tags

In [None]:
!pwd

In [None]:
!pip3 install ../../../../../Downloads/torch_scatter-2.1.2-cp39-cp39-macosx_11_0_x86_64.whl

In [None]:
!pip3 install torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.2.0+cpu.html