# Signed Graph Convolutional Network(SGCN)を用いたFraud User Detection

In [1]:
from sgcn_master import SignedGCNTrainer, SignedGCNPredictor
from parser import parameter_parser
from utils import tab_printer, read_graph, score_printer, save_logs
import easydict
import argparse
import pandas as pd
import numpy as np
import torch
import json
import networkx as nx
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline

  (fname, cnt))
  (fname, cnt))


## まとめて学習

In [None]:
def train_all(data_name_list,l1_lambda=0.0,l2_lambda=10e-4,iter_num=30):
    """Train ``iter_num`` SGCN models for every dataset name in ``data_name_list``.

    For each (dataset, run index) pair a fresh configuration is built, the
    graph is read with ``read_graph``, the label distribution is printed and a
    ``SignedGCNTrainer`` is set up and trained.

    Parameters
    ----------
    data_name_list : iterable of str
        Dataset names; each is interpolated into the ``../input/{name}/`` paths.
    l1_lambda : float
        Stored under the ``l1_lambda`` config key and embedded in the
        ``inductive_model_path`` file name.
    l2_lambda : float
        Stored under the ``weight_decay`` config key and embedded in the
        ``inductive_model_path`` file name (the default ``10e-4`` equals 1e-3).
    iter_num : int
        Number of runs per dataset; the run index is part of the model path.

    Returns
    -------
    None
    """

    def build_run_args(dataset, run_idx):
        """Assemble the trainer configuration for one (dataset, run) pair."""
        return easydict.EasyDict({
            "edge_path": f'../input/{dataset}/{dataset}_network.csv',
            "features_path": f'../input/{dataset}/{dataset}_node_feature.csv',
            "nodes_path": f'../input/{dataset}/{dataset}_gt.csv',
            "embedding_path": f'../tmp/embedding/{dataset}_sgcn_feature05.pkl',  # tmp folder for cross-validation
            "regression_weights_path": f'../tmp/weights/{dataset}_sgcn_feature05.pkl',
            # presumably where the trainer stores the model; may also be None
            "inductive_model_path": f'../output/inductive_master_tmp/{dataset}_model_l1_{l1_lambda}_l2_{l2_lambda}_{run_idx}th',
            "log_path": f'../logs/{dataset}_logs_feature05.json',
            "epochs": 300,
            "test_size": 0.2,
            "reduction_iterations": 128,
            "reduction_dimensions": 30,
            # NOTE(review): the seed is identical for every run; if the trainer
            # seeds its RNG from it, all runs would coincide -- confirm.
            "seed": 42,
            "lamb": 0.0,
            "learning_rate": 0.001,
            "weight_decay": l2_lambda,
            "layers": [32, 16],  # a deeper [64, 32, 16, 8] variant was tried earlier
            "spectral_features": False,
            "general_features": True,
            "sample_num": None,
            "class_weights": False,
            "node_under_sampling": False,
            "hidden_residual": False,
            "eval_freq": 1,
            "subgraph_training": False,
            "l1_lambda": l1_lambda,
        })

    for dataset in data_name_list:
        for run_idx in range(iter_num):
            print(f'{run_idx}-th iteration')
            run_args = build_run_args(dataset, run_idx)
            # node_info['indice'] holds node ids, node_info['label'] the labels
            graph_edges, node_info = read_graph(run_args)
            label_histogram = np.unique(node_info['label'], return_counts=True)
            print(label_histogram)
            sgcn_trainer = SignedGCNTrainer(run_args, graph_edges, node_info)
            sgcn_trainer.setup_dataset()
            sgcn_trainer.create_and_train_model()

In [None]:
# OTC dataset names to train on; each one selects a ../input/<name>/ folder.
otc_list = [
    'otc_early0.03',
    'otc_early0.04',
    'otc_early0.05',
    'otc_early0.07',
    'otc_early0.1',
    'otc_early0.15',
    'otc_early0.2',
]

In [None]:
# Train every OTC dataset with an L1 coefficient of 10.0 (other arguments keep their defaults).
train_all(data_name_list=otc_list, l1_lambda=10.0)

In [None]:
# Alpha dataset names to train on; each one selects a ../input/<name>/ folder.
alpha_list = [
    'alpha_early0.04',
    'alpha_early0.05',
    'alpha_early0.07',
    'alpha_early0.1',
    'alpha_early0.15',
    'alpha_early0.2',
    'alpha_early0.3',
]

In [None]:
# Train every Alpha dataset with an L1 coefficient of 10.0 (other arguments keep their defaults).
train_all(data_name_list=alpha_list, l1_lambda=10.0)

In [2]:
# Epinions dataset names, ordered by increasing rate suffix.
# Earlier experiments used two subsets of these names (0.0005/0.001/0.005/0.01
# and 0.0007/0.003/0.007); the list below is their union.
epinions_list = [
    'epinions_early0.0005',
    'epinions_early0.0007',
    'epinions_early0.001',
    'epinions_early0.003',
    'epinions_early0.005',
    'epinions_early0.007',
    'epinions_early0.01',
]

In [None]:
# Train every Epinions dataset with an L1 coefficient of 10.0 (other arguments keep their defaults).
train_all(data_name_list=epinions_list, l1_lambda=10.0)

## まとめて評価

### data loading

In [3]:
# Ask for the dataset to run inference on; the answer is interpolated into the
# ../input/{data_name}/ paths of the evaluation configuration.
data_name = input('推定対象データセット：')

推定対象データセット：epinions


In [4]:
# Configuration for the target dataset; it is handed to read_graph and to
# SignedGCNPredictor in the cells that follow.
new_args = easydict.EasyDict({
    # --- input files of the target dataset ---
    "edge_path": f'../input/{data_name}/{data_name}_network.csv',
    "features_path": f'../input/{data_name}/{data_name}_node_feature.csv',
    "nodes_path": f'../input/{data_name}/{data_name}_gt.csv',
    # --- scratch / output locations ---
    "embedding_path": f'../tmp/embedding/{data_name}_sgcn_feature05.pkl',  # tmp folder for cross-validation
    "regression_weights_path": f'../tmp/weights/{data_name}_sgcn_feature05.pkl',
    "inductive_model_path": f'../output/inductive/{data_name}_model_master',  # or None
    "log_path": f'../logs/{data_name}_logs_feature05.json',
    # --- optimisation settings ---
    "epochs": 300,
    "test_size": 0.33,
    "reduction_iterations": 128,
    "reduction_dimensions": 30,
    "seed": 42,
    "lamb": 0.0,
    "learning_rate": 0.001,
    "weight_decay": 10e-4,  # 10e-4 == 1e-3
    # --- model / feature switches ---
    "layers": [32, 16],  # a deeper [64, 32, 16, 8] variant was tried earlier
    "spectral_features": False,
    "general_features": True,
    "sample_num": None,
    "class_weights": False,
    "node_under_sampling": False,
    "hidden_residual": False,
    "eval_freq": 1,
    "subgraph_training": False,
    "l1_lambda": 0.0,
})

In [5]:
# Read the target graph described by new_args.
new_edges, new_nodes_dict = read_graph(new_args)
if data_name == 'amazon':
    # The amazon graph is handled as undirected: every edge is followed by
    # its reversed copy, separately for each sign.
    new_edges['positive_edges'] = [
        *new_edges['positive_edges'],
        *([dst, src] for src, dst in new_edges['positive_edges']),
    ]
    new_edges['negative_edges'] = [
        *new_edges['negative_edges'],
        *([dst, src] for src, dst in new_edges['negative_edges']),
    ]
# General node features of the target dataset as a plain array.
X = np.array(pd.read_csv(f'../input/{data_name}/{data_name}_node_feature.csv'))

In [6]:
def evaluate_all(trained_model_list,l1_lambda=0.0,l2_lambda=10e-4,iter_num=30):
    """Score every checkpoint of each training dataset on the loaded target graph.

    Relies on the notebook globals ``new_args``, ``X``, ``new_edges`` and
    ``new_nodes_dict`` prepared by the data-loading cells.

    For each name in ``trained_model_list`` the checkpoints
    ``../output/inductive_master_tmp/{name}_model_l1_{l1_lambda}_l2_{l2_lambda}_{i}th``
    (``i`` in ``range(iter_num)``) are loaded through ``SignedGCNPredictor``.
    ROC-AUC is computed only on target nodes whose id does not appear in
    ``../input/{name}/{name}_label_encoder.npy``; label ``-1`` is treated as
    the positive class.

    Parameters
    ----------
    trained_model_list : iterable of str
        Names of the datasets the checkpoints were trained on.
    l1_lambda, l2_lambda : float
        Values embedded in the checkpoint file names.
    iter_num : int
        Number of checkpoints per training dataset (must be >= 1).

    Returns
    -------
    (result_df, bagging_df) : tuple of pandas.DataFrame
        Both are indexed by training dataset name and have a single column
        ``0``. ``result_df`` holds the mean of the per-checkpoint AUCs,
        ``bagging_df`` the AUC of the predictions averaged over checkpoints.

    Raises
    ------
    ValueError
        If ``iter_num`` is smaller than 1.
    """
    if iter_num < 1:
        raise ValueError(f'iter_num must be a positive integer, got {iter_num!r}')

    dataset_names = []
    averaged_aucs = []
    bagging_aucs = []
    for training_dataset in trained_model_list:
        # The evaluation subset depends only on the training dataset, so it is
        # computed once here instead of once per checkpoint.
        trained_node_raw = np.load(f'../input/{training_dataset}/{training_dataset}_label_encoder.npy')
        newly_added_node_judger = ~np.isin(new_nodes_dict['indice'], trained_node_raw)
        y_score_indice = new_nodes_dict['indice'][newly_added_node_judger]
        # Binarise the labels: -1 becomes the positive class.
        y_true = [1 if label == -1 else 0 for label in new_nodes_dict['label'][newly_added_node_judger]]

        auc_scores = []
        bagging_pred_scores = []
        for i in range(iter_num):
            predictor = SignedGCNPredictor(new_args, f'../output/inductive_master_tmp/{training_dataset}_model_l1_{l1_lambda}_l2_{l2_lambda}_{i}th',
                                           X, new_edges, new_nodes_dict)
            predictions = predictor.predict()
            y_score = predictions[y_score_indice]
            bagging_pred_scores.append(y_score)
            auc_scores.append(roc_auc_score(y_true=y_true, y_score=y_score))

        dataset_names.append(training_dataset)
        averaged_aucs.append(np.mean(auc_scores))
        # Bagging: average the checkpoint predictions, then score once.
        bagging_aucs.append(roc_auc_score(y_true=y_true, y_score=sum(bagging_pred_scores)/iter_num))

    # Build each frame in one go; DataFrame.append and pd.io.json.json_normalize
    # are no longer available in pandas 2.x. A list of scalars yields the same
    # single column named 0 that the old per-row construction produced.
    result_df = pd.DataFrame(averaged_aucs, index=dataset_names)
    bagging_df = pd.DataFrame(bagging_aucs, index=dataset_names)
    return result_df, bagging_df

In [7]:
# Evaluate the Epinions checkpoints that were trained with l1_lambda=10.0.
result_df, bagging_df = evaluate_all(epinions_list, l1_lambda=10.0)

# Recover the rate from each dataset name: take the second '.'-separated piece
# of the index label and put the leading '0.' back in front of it.
result_df['rate'] = (
    result_df.reset_index()['index']
    .str.split('.', expand=True)[1]
    .map(lambda fraction_digits: '0.' + fraction_digits)
    .values
)

bagging_df['rate'] = (
    bagging_df.reset_index()['index']
    .str.split('.', expand=True)[1]
    .map(lambda fraction_digits: '0.' + fraction_digits)
    .values
)

In [9]:
# One column per rate, one row of averaged AUC values.
result_df.set_index('rate').transpose()

rate,0.0005,0.0007,0.001,0.003,0.005,0.007,0.01
0,0.716503,0.737821,0.776753,0.839291,0.865601,0.908839,0.933292


In [None]:
# bagging_df.set_index('rate').T

## training

In [None]:
# Number of training runs for the manual training loop; the run index becomes
# part of each checkpoint file name.
iter_num = 30

In [None]:
# Ask for the dataset to train on; the answer is interpolated into the
# ../input/{data_name}/ paths of the training configuration.
data_name = input('データセット：')

In [None]:
# L1 coefficient for training; float() raises ValueError on non-numeric input.
l1_lambda = float(input('l1_lambda:'))

In [None]:
# Manual training of `iter_num` models for a single dataset.
#
# The weight decay is named once so that the value used for training and the
# value embedded in the checkpoint file name cannot drift apart. The file name
# now carries the `_l2_{...}` segment that the evaluation cells of this
# notebook expect when they load `..._model_l1_{l1}_l2_{l2}_{i}th`; without it
# the checkpoints written here could never be found by those cells.
l2_lambda = 10e-4  # == 1e-3, formatted as '0.001' in the file name
for i in range(iter_num):
    print(f'{i}-th iteration')
    args = easydict.EasyDict({
            "edge_path": f'../input/{data_name}/{data_name}_network.csv',
            "features_path":  f'../input/{data_name}/{data_name}_node_feature.csv',
            "nodes_path": f'../input/{data_name}/{data_name}_gt.csv',
            "embedding_path": f'../tmp/embedding/{data_name}_sgcn_feature05.pkl', # tmp folder for cross-validation
            "regression_weights_path": f'../tmp/weights/{data_name}_sgcn_feature05.pkl',
            "inductive_model_path": f'../output/inductive_master_tmp/{data_name}_model_l1_{l1_lambda}_l2_{l2_lambda}_{i}th', # or None
            "log_path": f'../logs/{data_name}_logs_feature05.json',
            # NOTE(review): train_all uses 300 epochs; confirm that 50 is intended here.
            "epochs":50,
            "test_size":0.2,
            "reduction_iterations": 128,
            "reduction_dimensions": 30,
            # NOTE(review): the seed is identical for every run; if the trainer
            # seeds its RNG from it, all runs would coincide -- confirm.
            "seed": 42,
            "lamb": 0.0,
            "learning_rate": 0.001,
            "weight_decay": l2_lambda,
            "layers": [32, 16, ],  # a deeper [64, 32, 16, 8] variant was tried earlier
            "spectral_features":False,
            "general_features": True,
            "sample_num":None,
            "class_weights":False,
            "node_under_sampling":False,
            "hidden_residual":False,
            "eval_freq":1,
            "subgraph_training":False,
            "l1_lambda":l1_lambda,
    })
    edges, nodes_dict = read_graph(args) # nodes_dict['indice']: node ids, nodes_dict['label']: labels
    # Show the label distribution before training.
    print(np.unique(nodes_dict['label'],return_counts=True))
    trainer = SignedGCNTrainer(args, edges, nodes_dict)
    trainer.setup_dataset()
    trainer.create_and_train_model()

## evaluation

### data loading

In [None]:
# Ask for the dataset to run inference on; the answer is interpolated into the
# ../input/{data_name}/ paths of the evaluation configuration.
data_name = input('推定対象データセット：')

In [None]:
# Configuration for the target dataset; it is handed to read_graph and to
# SignedGCNPredictor in the cells that follow.
new_args = easydict.EasyDict({
    # --- input files of the target dataset ---
    "edge_path": f'../input/{data_name}/{data_name}_network.csv',
    "features_path": f'../input/{data_name}/{data_name}_node_feature.csv',
    "nodes_path": f'../input/{data_name}/{data_name}_gt.csv',
    # --- scratch / output locations ---
    "embedding_path": f'../tmp/embedding/{data_name}_sgcn_feature05.pkl',  # tmp folder for cross-validation
    "regression_weights_path": f'../tmp/weights/{data_name}_sgcn_feature05.pkl',
    "inductive_model_path": f'../output/inductive/{data_name}_model_master',  # or None
    "log_path": f'../logs/{data_name}_logs_feature05.json',
    # --- optimisation settings ---
    "epochs": 300,
    "test_size": 0.33,
    "reduction_iterations": 128,
    "reduction_dimensions": 30,
    "seed": 42,
    "lamb": 0.0,
    "learning_rate": 0.001,
    "weight_decay": 10e-4,  # 10e-4 == 1e-3
    # --- model / feature switches ---
    "layers": [32, 16],  # a deeper [64, 32, 16, 8] variant was tried earlier
    "spectral_features": False,
    "general_features": True,
    "sample_num": None,
    "class_weights": False,
    "node_under_sampling": False,
    "hidden_residual": False,
    "eval_freq": 1,
    "subgraph_training": False,
    "l1_lambda": 0.0,
})

In [None]:
# Read the target graph described by new_args.
new_edges, new_nodes_dict = read_graph(new_args)
if data_name == 'amazon':
    # The amazon graph is handled as undirected: every edge is followed by
    # its reversed copy, separately for each sign.
    new_edges['positive_edges'] = [
        *new_edges['positive_edges'],
        *([dst, src] for src, dst in new_edges['positive_edges']),
    ]
    new_edges['negative_edges'] = [
        *new_edges['negative_edges'],
        *([dst, src] for src, dst in new_edges['negative_edges']),
    ]
# General node features of the target dataset as a plain array.
X = np.array(pd.read_csv(f'../input/{data_name}/{data_name}_node_feature.csv'))

### evaluation and averaging

In [None]:
# Number of checkpoints to evaluate; indexes 0..iter_num-1 are substituted
# into the checkpoint file name.
iter_num = 30

In [None]:
# Ask which dataset the checkpoints were trained on; the answer selects both
# the checkpoint files and the {training_dataset}_label_encoder.npy file.
training_dataset = input('学習に使ったデータセット：')

In [None]:
# L1 coefficient embedded in the checkpoint file names to load;
# float() raises ValueError on non-numeric input.
l1_lambda = float(input('l1_lambda:'))

In [None]:
# L2 coefficient embedded in the checkpoint file names to load;
# float() raises ValueError on non-numeric input.
l2_lambda = float(input('l2_lambda:'))

In [None]:
# Evaluate each checkpoint of `training_dataset` on the target graph.
#
# The evaluation subset does not depend on the checkpoint, so the encoder
# file, the mask and the binarised labels are prepared once instead of once
# per iteration.
trained_node_raw = np.load(f'../input/{training_dataset}/{training_dataset}_label_encoder.npy')
# Keep only target nodes whose id is absent from the training encoder file.
newly_added_node_judger = ~np.isin(new_nodes_dict['indice'],trained_node_raw)
y_true = new_nodes_dict['label'][newly_added_node_judger]
y_score_indice = new_nodes_dict['indice'][newly_added_node_judger]
# Label -1 is treated as the positive class for ROC-AUC.
y_true_binary = [1 if label==-1 else 0 for label in y_true]

auc_scores = []
for i in range(iter_num):
    predictor = SignedGCNPredictor(new_args, f'../output/inductive_master_tmp/{training_dataset}_model_l1_{l1_lambda}_l2_{l2_lambda}_{i}th',
                                   X, new_edges,new_nodes_dict)
    predictions = predictor.predict()

    current_auc = roc_auc_score(y_true=y_true_binary,y_score=predictions[y_score_indice])
    auc_scores.append(current_auc)

In [None]:
# Show the per-checkpoint ROC-AUC values collected by the evaluation loop.
auc_scores

In [None]:
# Mean and standard deviation of the per-checkpoint AUCs
# (np.std uses its default ddof=0, i.e. the population standard deviation).
np.mean(auc_scores), np.std(auc_scores)