# Signed Graph Convolutional Network(SGCN)を用いたFraud User Detection

```sh
pip install -r /home/ubuntu/SGCN/requirements.txt
pip install torch_scatter
pip install torch_sparse
pip install easydict
```


In [1]:
from sgcn import SignedGCNTrainer, SignedGCNPredictor
from parser import parameter_parser
from utils import tab_printer, read_graph, score_printer, save_logs
import easydict
import argparse
import pandas as pd
import numpy as np
import torch
import json
from sklearn.metrics import roc_auc_score
%matplotlib inline

  (fname, cnt))
  (fname, cnt))


In [2]:
# Run configuration for the Amazon dataset. Wrapping it in an EasyDict makes
# every entry reachable as an attribute as well (e.g. `args.test_size`).
args = easydict.EasyDict(dict(
    # --- input files ---
    edge_path='../input/amazon/amazon_network.csv',  # alternative: '../input/amazon/user_network.csv'
    features_path='../input/amazon/amazon_node_feature.csv',  # alternative: '../input/amazon/user_network.csv'
    nodes_path='../input/amazon/amazon_gt.csv',
    # --- output files ---
    embedding_path='../tmp/embedding/amazon_sgcn_feature05.csv',  # tmp folder for cross-validation
    regression_weights_path='../tmp/weights/amazon_sgcn_feature05.csv',
    inductive_model_path='../output/inductive/amazon_model',  # or None
    log_path='../logs/amazon_logs_feature05.json',
    # --- training hyper-parameters ---
    epochs=300,
    test_size=0.33,
    reduction_iterations=128,
    reduction_dimensions=30,
    seed=42,
    lamb=1.0,
    learning_rate=0.005,
    # NOTE(review): 10e-5 evaluates to 1e-4 (not 10**-5) - confirm this is the intended value.
    weight_decay=10e-5,
    # Deeper alternative that was tried: layers=[64, 32, 16, 8]
    layers=[32, 16],
    spectral_features=False,
    general_features=True,
))

In [3]:
# Display the run configuration as a one-column table so it is recorded next
# to the results. `pd.json_normalize` is the public entry point; the
# `pd.io.json.json_normalize` alias used before is deprecated.
pd.json_normalize(args).T

Unnamed: 0,0
edge_path,../input/amazon/amazon_network.csv
embedding_path,../tmp/embedding/amazon_sgcn_feature05.csv
epochs,300
features_path,../input/amazon/amazon_node_feature.csv
general_features,True
inductive_model_path,../output/inductive/amazon_model
lamb,1
layers,"[32, 16]"
learning_rate,0.005
log_path,../logs/amazon_logs_feature05.json


## 10-fold cross-validation
- train : validation : test = 6:3:1

In [4]:
# Read the graph described by `args`. The cells below use the keys
# 'indice', 'label' and 'all_ncount' of `nodes_dict`.
# Uncomment the next line to print the configuration first.
#tab_printer(args)
edges, nodes_dict = read_graph(args) # nodes_dict['indice']: node ids, nodes_dict['label']: labels

In [5]:
from sklearn.model_selection import StratifiedKFold

In [6]:
# Stratified 10-fold splitter. No shuffle/random_state is passed, so
# scikit-learn's defaults apply.
kf = StratifiedKFold(n_splits=10)
# Node ids and labels of all labelled nodes; the folds index into these.
all_indice, all_labels = nodes_dict['indice'], nodes_dict['label']
# One AUC value per fold, appended by the cross-validation loop.
auc_scores = list()

In [7]:
# 10-fold cross-validation. For every fold a fresh trainer is fitted on the
# fold's training nodes; the held-out nodes are then scored from the embedding
# and regression-weight files (presumably written by the trainer - confirm)
# and the fold's AUC is recorded.
auc_scores = []  # reset here so re-running this cell never mixes in scores from an earlier run
for fold, (train_index, test_index) in enumerate(kf.split(X=all_indice, y=all_labels)):
    print("==== Training Phase ====")
    print(f'{fold}-th fold')
    # Training: restrict the labelled nodes to this fold's training split while
    # passing the node count of the full graph through unchanged.
    train_node_indice = all_indice[train_index]
    train_node_labels = all_labels[train_index]
    tmp_nodes_dict = {
        'all_ncount': nodes_dict['all_ncount'],
        'indice': train_node_indice,
        'label': train_node_labels,
    }
    trainer = SignedGCNTrainer(args, edges, tmp_nodes_dict)
    trainer.setup_dataset()
    trainer.create_and_train_model()

    if args.test_size > 0:
        # No trainer.save_model() here: per the original author's note, the best
        # model is already saved inside trainer.create_and_train_model().
        score_printer(trainer.logs)
        save_logs(args, trainer.logs)

    # Test: score the held-out nodes of this fold.
    print("==== Test Phase ====")
    test_node_indice = all_indice[test_index]
    test_node_labels = all_labels[test_index]
    # NOTE(review): rows are selected by position, which presumes that row k of
    # the embedding file belongs to node index k - confirm.
    feature = pd.read_csv(args.embedding_path, index_col='id').values
    test_feature = feature[test_node_indice]
    weight = pd.read_csv(args.regression_weights_path)
    predictions = np.dot(test_feature, weight.values.T)
    # dim=1 normalises over the class columns of each row. This is what the
    # implicit-dim call did for a 2-D input, but without the deprecation warning.
    probabilities = torch.nn.functional.softmax(torch.from_numpy(predictions), dim=1).numpy()
    # Map labels to {0, 1}: -1 -> 0, anything else -> 1.
    test_binary_labels = [0 if label == -1 else 1 for label in test_node_labels]
    auc_score = roc_auc_score(y_true=test_binary_labels, y_score=probabilities[:, 1])
    auc_scores.append(auc_score)
    print(f"{fold}-th fold's auc_score:{auc_score}")
    print()
    

==== Training Phase ====
0-th fold


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



SGCN (Loss=0.3027):   4%|▎         | 11/300 [01:37<1:07:48, 14.08s/it]

KeyboardInterrupt: 

In [None]:
# Mean AUC over the folds recorded in `auc_scores`.
np.asarray(auc_scores).mean()

結果
- amazon : 
- alpha : 
- epinions : 
- otc : 0.987

#### inductive settings

In [None]:
# Configuration for the inductive experiment on the OTC dataset. Same layout
# as `args`, with OTC paths and a different `layers` setting.
new_args = easydict.EasyDict(dict(
    # --- input files ---
    edge_path='../input/otc/otc_network.csv',  # alternative: '../input/otc/user_network.csv'
    features_path='../input/otc/otc_node_feature.csv',  # alternative: '../input/otc/user_network.csv'
    nodes_path='../input/otc/otc_gt.csv',
    # --- output files ---
    embedding_path='../tmp/embedding/otc_sgcn_feature05.csv',  # tmp folder for cross-validation
    regression_weights_path='../tmp/weights/otc_sgcn_feature05.csv',
    inductive_model_path='../output/inductive/otc_model',  # or None
    log_path='../logs/otc_logs_feature05.json',
    # --- training hyper-parameters ---
    epochs=300,
    test_size=0.33,
    reduction_iterations=128,
    reduction_dimensions=30,
    seed=42,
    lamb=1.0,
    learning_rate=0.005,
    # NOTE(review): 10e-5 evaluates to 1e-4 (not 10**-5) - confirm this is the intended value.
    weight_decay=10e-5,
    # Deeper alternative that was tried: layers=[64, 32, 16, 8]
    layers=[64, 32],
    spectral_features=False,
    general_features=True,
))

In [None]:
# Read the graph described by `new_args`.
new_edges, new_nodes_dict = read_graph(new_args)

# General node features. The path comes from the configuration instead of a
# second hard-coded copy, so the features always match the graph that was
# just loaded from the same `new_args`.
X = np.array(pd.read_csv(new_args.features_path))

In [None]:
# Build the predictor from the new graph, its node features and a model path.
# NOTE(review): the model path is hard-coded to the *alpha* model while
# new_args.inductive_model_path points at the otc model - presumably an
# intentional alpha -> otc transfer for the inductive setting; confirm.
predictor = SignedGCNPredictor(new_args, '../output/inductive/alpha_model', X, new_edges,new_nodes_dict)

In [None]:
# Run the predictor, then take the index of the largest value along axis 1
# of its output as the predicted class.
predictions = predictor.predict()
predict_labels = predictions.argmax(1)

In [None]:
# Raw labels of the labelled nodes of the new graph.
# NOTE(review): no later cell in this notebook reads `y_true`; the metric
# cells rebuild the binary labels from new_nodes_dict['label'] themselves.
y_true = new_nodes_dict['label']

In [None]:
# AUC of the inductive predictions on the labelled nodes.
# Labels are mapped to {0, 1}: -1 -> 0, anything else -> 1.
roc_auc_score(
    y_true=[int(label != -1) for label in new_nodes_dict['label']],
    y_score=predictions[:,1][new_nodes_dict['indice']],
)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix of the inductive predictions on the labelled nodes.
# Labels are mapped to {0, 1}: -1 -> 0, anything else -> 1.
confusion_matrix(
    y_true=[int(label != -1) for label in new_nodes_dict['label']],
    y_pred=predict_labels[new_nodes_dict['indice']],
)

## single-validation

In [None]:
# Single run without cross-validation folds: the trainer receives the complete
# `nodes_dict` instead of a per-fold subset.
trainer = SignedGCNTrainer(args, edges, nodes_dict)
trainer.setup_dataset()
trainer.create_and_train_model()

In [None]:
# Only when args.test_size is positive: save the model, print the scores
# collected in trainer.logs and pass them to save_logs together with `args`.
if args.test_size > 0:
    trainer.save_model()
    score_printer(trainer.logs)
    save_logs(args, trainer.logs)

In [None]:
import json

In [None]:
# Load the performance table from the log file of the run configured in
# `args`. The path comes from args.log_path (the same `args` that was handed
# to save_logs above) instead of a hard-coded otc file, so the plot belongs
# to the model that was just trained. The `with` block closes the file handle.
with open(args.log_path, 'r') as log_file:
    performance = pd.DataFrame(json.load(log_file)['performance'])

# The first logged row holds the column names: promote it to the header ...
performance.columns = performance.iloc[0,:]

# ... and keep only the remaining rows as data.
performance = performance.iloc[1:,:]

In [None]:
# Line plot of the logged 'AUC' column.
performance.loc[:, 'AUC'].plot()

In [None]:
# Embedding table read from args.embedding_path, indexed by its 'id' column.
feature = pd.read_csv(args.embedding_path,index_col='id')

# Regression weights read from args.regression_weights_path (default integer index).
weight = pd.read_csv(args.regression_weights_path)

In [None]:
# Raw class scores: every embedding row multiplied with the transposed weight matrix.
predictions = feature.values @ weight.values.T

In [None]:
# Turn the raw scores into per-row class probabilities. `dim=1` normalises
# over the class columns of each row; passing it explicitly avoids the
# deprecated implicit-dimension behaviour of softmax.
probabilities = torch.nn.functional.softmax(torch.from_numpy(predictions), dim=1).numpy()

In [None]:
# Predicted class per row = column index of the largest probability.
predict_labels = np.argmax(probabilities, axis=1)

In [None]:
# AUC on the labelled nodes, using the probability in column 1 as the score.
# Labels are mapped to {0, 1}: -1 -> 0, anything else -> 1.
roc_auc_score(
    y_true=[int(label != -1) for label in nodes_dict['label']],
    y_score=probabilities[:,1][nodes_dict['indice']],
)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix on the labelled nodes.
# Labels are mapped to {0, 1}: -1 -> 0, anything else -> 1.
confusion_matrix(
    y_true=[int(label != -1) for label in nodes_dict['label']],
    y_pred=predict_labels[nodes_dict['indice']],
)

# TODO