In [1]:
import argparse
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn import metrics

from util import load_data
from models.clf_model import Classifier
from models.dci import DCI
from sklearn.cluster import KMeans

### Define Functions

In [2]:
# Helper that seeds the NumPy and PyTorch random number generators.
def setup_seed(seed):
    """Seed NumPy and PyTorch and put cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to NumPy's global RNG, PyTorch's CPU RNG
            and the RNG of every CUDA device.

    Side effects:
        Sets ``torch.backends.cudnn.deterministic = True`` and
        ``torch.backends.cudnn.benchmark = False`` for the whole process.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner (benchmark=True) may select different convolution
    # algorithms from run to run, which defeats the purpose of seeding, so it
    # has to be switched off together with deterministic=True.
    torch.backends.cudnn.benchmark = False

In [3]:
# Helper that builds the sparse adjacency matrix from an edge list.
def preprocess_neighbors_sumavepool(edge_index, nb_nodes, device):
    """Build a sparse adjacency matrix with reverse edges and self-loops.

    Args:
        edge_index: Integer tensor whose row 0 holds source node ids and whose
            row 1 holds target node ids (one column per edge).
        nb_nodes: Number of nodes; the result has size (nb_nodes, nb_nodes).
        device: Device the returned tensor is moved to.

    Returns:
        An uncoalesced float32 sparse COO tensor. Every stored entry has
        value 1, so an edge that is listed more than once (for example in
        both directions) sums to a larger value once the tensor is coalesced
        or densified.
    """
    adj_idx = edge_index

    # The graph is treated as undirected, so every edge is also added in the
    # opposite direction (rows swapped).
    reversed_idx = torch.stack([adj_idx[1], adj_idx[0]], dim=0)

    # Self-loops: one (i, i) entry per node. Created on the same device as the
    # incoming indices so the concatenation below never mixes devices.
    node_ids = torch.arange(nb_nodes, dtype=torch.long, device=adj_idx.device)
    self_loop_edge = torch.stack([node_ids, node_ids], dim=0)

    # Column order: original edges, reversed edges, self-loops.
    adj_idx = torch.cat([adj_idx, reversed_idx, self_loop_edge], dim=1)

    # Every stored entry has weight 1.
    adj_elem = torch.ones(adj_idx.shape[1], dtype=torch.float32, device=adj_idx.device)

    # torch.sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor
    # constructor; the sparse layout keeps large graphs cheap to store.
    adj = torch.sparse_coo_tensor(adj_idx, adj_elem, (nb_nodes, nb_nodes))

    return adj.to(device)

### Set Configuration

In [4]:
# This cell declares the run options (dataset, model and training settings) on an argparse parser.
# Defaults: wiki dataset, GPU 0, epochs = 50, layers = 2, hidden_dim = 128, finetune_epochs = 100, lr = 0.01,
# num_cluster = 2, recluster_interval = 20, final_dropout = 0.5, neighbor pooling type = sum, training scheme = decoupled
parser = argparse.ArgumentParser(description='PyTorch deep cluster infomax')
parser.add_argument('--dataset', type=str, default="wiki",
                    help='name of dataset (default: wiki)')
parser.add_argument('--device', type=int, default=0,
                    help='which gpu to use if any (default: 0)')
parser.add_argument('--epochs', type=int, default=50,
                    help='number of epochs to train (default: 50)')
parser.add_argument('--num_layers', type=int, default=2,
                    help='number of layers (default: 2)')
parser.add_argument('--num_mlp_layers', type=int, default=2,
                    help='number of layers for MLP EXCLUDING the input one (default: 2). 1 means linear model.')
parser.add_argument('--hidden_dim', type=int, default=128,
                    help='number of hidden units (default: 128)')
parser.add_argument('--finetune_epochs', type=int, default=100,
                    help='number of finetune epochs (default: 100)')
parser.add_argument('--num_folds', type=int, default=10,
                    help='number of folds (default: 10)')
parser.add_argument('--lr', type=float, default=0.01,
                    help='learning rate (default: 0.01)')
parser.add_argument('--num_cluster', type=int, default=2,
                    help='number of clusters (default: 2)')
parser.add_argument('--recluster_interval', type=int, default=20,   # interval between re-clusterings; presumably re-cluster once every 20 training epochs (confirm in the training loop)
                    help='the interval of reclustering (default: 20)')
parser.add_argument('--final_dropout', type=float, default=0.5,     # dropout rate of the final layer (0-1); presumably the fraction of units zeroed to limit overfitting (confirm in the model code)
                    help='final layer dropout (default: 0.5)')
parser.add_argument('--neighbor_pooling_type', type=str, default="sum", choices=["sum", "average"], # presumably whether each node's neighbor features are summed or averaged (confirm in the model code)
                    help='Pooling for over neighboring nodes: sum or average')
parser.add_argument('--training_scheme', type=str, default="decoupled", choices=["decoupled", "joint"], # decoupled vs. joint training; author's guess: joint DGI is not offered because the comparison is DCI vs. other methods
                    help='Training schemes: decoupled or joint')

# Choose the module that produces the feature matrix; per the original note, DCI is the original feature-loading path and U-GCN is the new one.
parser.add_argument('--module', type=str, default='U-GCN', choices=['DCI', 'U-GCN'],
                    help='module to generate feature matrix: DCI or U-GCN')


_StoreAction(option_strings=['--training_scheme'], dest='training_scheme', nargs=None, const=None, default='decoupled', type=<class 'str'>, choices=['decoupled', 'joint'], help='Training schemes: decoupled or joint', metavar=None)

### Set Parameters

In [5]:
# Resolve the run configuration: parse the options, seed the RNGs and pick the compute device.
# An explicit empty argv makes every option take its declared default instead of
# reading the command line of the process that hosts the notebook.
args = parser.parse_args([])
setup_seed(0)
sig = nn.Sigmoid()

# Use the GPU selected by --device when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device(f"cuda:{args.device}")
else:
    device = torch.device("cpu")

print(f'Device: {device}')

Device: cuda:0


### Load Data
- edge_index: 2*18257，所有边的二维矩阵形式
- feats: 特征矩阵，每个node有64个特征
- label: 第一列是node第二列是label
- split_idx: k-fold的划分结果，这里是10个fold；每个fold由两个数组组成，前一个是train的索引，后一个是test的索引
- nb_nodes: 一共9227个node

In [6]:

# Load the graph for the configured dataset, fold count and feature module.
# load_data returns five values, unpacked here as: the edge index, the node feature matrix,
# the k-fold (train, test) index splits, the labels and the number of nodes.
# NOTE(review): the exact shapes and types come from util.load_data, which is not shown here — confirm there.
edge_index, feats, split_idx, label, nb_nodes = load_data(args.dataset, args.num_folds, args.module)
input_dim = feats.shape[1]  # size of the second axis of feats (the per-node feature width)

Load the edge_index done!
Ratio of fraudsters:  0.026376564969004496
Number of edges:  18257
Number of users:  8227
Number of objects:  1000
Number of nodes:  9227


### Set Cluster Model

In [11]:
# Cluster the raw features into args.num_cluster groups with KMeans
# (the original note speculates the two default clusters may correspond to two kinds of behavior).
kmeans = KMeans(n_clusters=args.num_cluster, random_state=0)
kmeans.fit(feats)

# Cluster assignment of every row of feats.
ss_label = kmeans.labels_

# For each cluster id, the list of row indices assigned to that cluster.
cluster_info = [
    list(np.flatnonzero(ss_label == cluster_id))
    for cluster_id in range(args.num_cluster)
]

# Row-shuffled copy of the features, using a random permutation of the node ids.
idx = np.random.permutation(nb_nodes)
shuf_feats = feats[idx, :]

In [9]:
# Display the fold splits returned by load_data for inspection.
# NOTE(review): this renders every index array of every fold; consider showing only
# the fold sizes or a single fold to keep the saved output short.
split_idx

[(array([   0,    1,    2, ..., 8224, 8225, 8226]),
  array([   3,   17,   20,   23,   39,   45,   49,   51,   57,   60,   64,
           73,   82,   88,   91,   97,  126,  135,  159,  160,  161,  186,
          187,  189,  207,  212,  218,  224,  226,  231,  233,  247,  249,
          257,  260,  264,  277,  302,  307,  330,  333,  334,  335,  337,
          341,  366,  368,  372,  373,  375,  389,  409,  422,  423,  439,
          444,  445,  450,  463,  465,  472,  474,  490,  494,  496,  497,
          506,  512,  521,  529,  537,  538,  539,  540,  546,  581,  589,
          594,  595,  599,  604,  633,  638,  641,  642,  647,  654,  662,
          683,  700,  717,  720,  734,  739,  740,  764,  768,  781,  793,
          794,  796,  815,  824,  831,  844,  859,  868,  882,  888,  894,
          900,  971,  973,  979,  999, 1005, 1014, 1031, 1037, 1038, 1045,
         1058, 1060, 1063, 1064, 1075, 1079, 1089, 1106, 1116, 1117, 1142,
         1150, 1155, 1156, 1174, 1185, 1189, 119