# Signed Graph Convolutional Network(SGCN)を用いたFraud User Detection

```sh
pip install -r /home/ubuntu/SGCN/requirements.txt
pip install torch_scatter
pip install torch_sparse
pip install easydict
```


In [1]:
from sgcn import SignedGCNTrainer, SignedGCNPredictor
from parser import parameter_parser
from utils import tab_printer, read_graph, score_printer, save_logs
import easydict
import argparse
import pandas as pd
import numpy as np
import torch
import json
import networkx as nx
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import accuracy_score,confusion_matrix
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline

  (fname, cnt))
  (fname, cnt))


In [2]:
# Run configuration for the cross-validation experiment on the amazon graph.
# The keys are consumed by read_graph / SignedGCNTrainer / save_logs below.
args = easydict.EasyDict(dict(
    # input files (alternative edge/feature file: '../input/amazon/user_network.csv')
    edge_path='../input/amazon/amazon_network.csv',
    features_path='../input/amazon/amazon_node_feature.csv',
    nodes_path='../input/amazon/amazon_gt.csv',
    # artifacts; the tmp folder is reused by every cross-validation fold
    embedding_path='../tmp/embedding/amazon_sgcn_feature05.pkl',
    regression_weights_path='../tmp/weights/amazon_sgcn_feature05.pkl',
    inductive_model_path=None,  # e.g. '../output/inductive/amazon_model', or None
    log_path='../logs/amazon_logs_feature05.json',
    # training settings
    epochs=300,
    test_size=0.33,
    reduction_iterations=128,
    reduction_dimensions=30,
    seed=42,
    lamb=0.0,
    learning_rate=0.005,
    weight_decay=10e-5,  # note: 10e-5 == 1e-4
    layers=[32, 16, 8],  # previously tried: [64, 32, 16, 8]
    # feature / sampling switches
    spectral_features=False,
    general_features=True,
    sample_num=None,
    class_weights=False,
    node_under_sampling=False,
))

In [3]:
# Show the configuration as a key/value table (one row per setting).
# `pd.io.json.json_normalize` is deprecated in favour of the top-level
# `pd.json_normalize`; fall back to the old location only on old pandas.
display((pd.json_normalize if hasattr(pd, "json_normalize") else pd.io.json.json_normalize)(args).T)

Unnamed: 0,0
class_weights,False
edge_path,../input/amazon/amazon_network.csv
embedding_path,../tmp/embedding/amazon_sgcn_feature05.pkl
epochs,300
features_path,../input/amazon/amazon_node_feature.csv
general_features,True
inductive_model_path,
lamb,0
layers,"[32, 16, 8]"
learning_rate,0.005


## 10-fold cross-validation
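- train : validation : test ≈ 6:3:1 (each fold holds out 10% of the labelled nodes for testing; `test_size=0.33` presumably splits the remaining 90% into train/validation inside the trainer)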

In [4]:
# Load the graph described by `args`.
# tab_printer(args)  # (disabled) optional printout of the configuration
edges, nodes_dict = read_graph(args) # per the original note: nodes_dict['indice'] holds node ids, nodes_dict['label'] their labels

In [5]:
from sklearn.model_selection import StratifiedKFold

In [6]:
# Stratified 10-fold splitter plus the arrays it will index into.
kf = StratifiedKFold(n_splits=10)
all_indice, all_labels = nodes_dict['indice'], nodes_dict['label']
auc_scores = list()  # one test AUC per fold, filled by the loop below

In [7]:
# Stratified k-fold evaluation.
# For every fold: train an SGCN on the training nodes, then score the held-out
# nodes with the embedding and regression weights stored at
# args.embedding_path / args.regression_weights_path and record the test AUC.
# Variables from the last iteration (e.g. `probabilities`, `test_node_labels`)
# stay in the namespace and are used by the histogram cell further down.
for i, (train_index, test_index) in enumerate(kf.split(X=all_indice, y=all_labels)):
    print("==== Training Phase ====")
    print(f'{i}-th fold')

    # --- training ---
    train_node_indice = all_indice[train_index]
    train_node_labels = all_labels[train_index]
    print(f'labels:{np.unique(train_node_labels,return_counts=True)}')
    # The trainer only sees the training nodes of this fold.
    tmp_nodes_dict = {
        'all_ncount': nodes_dict['all_ncount'],
        'indice': train_node_indice,
        'label': train_node_labels,
    }
    trainer = SignedGCNTrainer(args, edges, tmp_nodes_dict)
    trainer.setup_dataset()
    trainer.create_and_train_model()

    if args.test_size > 0:
        # No explicit trainer.save_model() here: per the original author's note,
        # create_and_train_model() has already saved the best model.
        display(pd.DataFrame(trainer.logs['performance']))
        save_logs(args, trainer.logs)

    # --- test ---
    print("==== Test Phase ====")
    test_node_indice = all_indice[test_index]
    test_node_labels = all_labels[test_index]
    # NOTE(review): these pickles are presumably written by the trainer above;
    # only unpickle files from a trusted location.
    # `axis=1` is passed by keyword: the positional form was removed in pandas 2.0.
    feature = pd.read_pickle(args.embedding_path).drop('id', axis=1).values
    # assumes row k of the embedding belongs to node id k -- TODO confirm
    test_feature = feature[test_node_indice]
    weight = pd.read_pickle(args.regression_weights_path)
    predictions = np.dot(test_feature, weight.values.T)
    # Row-wise softmax; `dim=1` makes the previously implicit (deprecated) choice explicit.
    probabilities = torch.nn.functional.softmax(torch.from_numpy(predictions), dim=1).numpy()
    predict_labels = probabilities.argmax(1)

    # Label -1 is the positive class (1); every other label maps to 0.
    test_targets = [1 if label == -1 else 0 for label in test_node_labels]
    auc_score = roc_auc_score(y_true=test_targets, y_score=probabilities[:, 1])
    auc_scores.append(auc_score)
    cmx = confusion_matrix(y_true=test_targets, y_pred=predict_labels)
    print(f"{i}-th fold's auc_score:{auc_score}")
    print(cmx)
    print()
    

==== Training Phase ====
0-th fold
labels:(array([-1,  1]), array([ 216, 2122]))


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
SGCN (Loss=0.1586): 100%|██████████| 300/300 [05:35<00:00,  1.06s/it]


Unnamed: 0,0,1,2
0,Epoch,AUC,F1
1,1,0.813074,0
2,11,0.876303,0
3,21,0.878936,0
4,31,0.885606,0
5,41,0.898425,0
6,51,0.900474,0
7,61,0.905839,0
8,71,0.906823,0
9,81,0.903347,0




==== Test Phase ====
0-th fold's auc_score:0.8254237288135594
[[226  10]
 [ 18   7]]

==== Training Phase ====
1-th fold
labels:(array([-1,  1]), array([ 217, 2122]))


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
SGCN (Loss=0.1588): 100%|██████████| 300/300 [05:35<00:00,  1.07s/it]


Unnamed: 0,0,1,2
0,Epoch,AUC,F1
1,1,0.797401,0
2,11,0.844881,0
3,21,0.858254,0
4,31,0.872996,0
5,41,0.870139,0
6,51,0.869087,0
7,61,0.881845,0
8,71,0.882679,0
9,81,0.873948,0




==== Test Phase ====
1-th fold's auc_score:0.910840395480226
[[232   4]
 [ 16   8]]

==== Training Phase ====
2-th fold
labels:(array([-1,  1]), array([ 217, 2122]))


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
SGCN (Loss=0.1389): 100%|██████████| 300/300 [05:35<00:00,  1.06s/it]


Unnamed: 0,0,1,2
0,Epoch,AUC,F1
1,1,0.769742,0
2,11,0.832877,0
3,21,0.845476,0
4,31,0.860417,0
5,41,0.847837,0
6,51,0.850873,0
7,61,0.85371,0
8,71,0.858452,0
9,81,0.854663,0




==== Test Phase ====
2-th fold's auc_score:0.879590395480226
[[236   0]
 [ 24   0]]

==== Training Phase ====
3-th fold
labels:(array([-1,  1]), array([ 217, 2122]))


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
SGCN (Loss=0.1507): 100%|██████████| 300/300 [05:35<00:00,  1.06s/it]


Unnamed: 0,0,1,2
0,Epoch,AUC,F1
1,1,0.786071,0
2,11,0.853492,0
3,21,0.873333,0
4,31,0.884206,0
5,41,0.879266,0
6,51,0.885655,0
7,61,0.891925,0
8,71,0.891508,0
9,81,0.893016,0




==== Test Phase ====
3-th fold's auc_score:0.8012005649717514
[[231   5]
 [ 21   3]]

==== Training Phase ====
4-th fold
labels:(array([-1,  1]), array([ 217, 2122]))


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
SGCN (Loss=0.1535): 100%|██████████| 300/300 [05:36<00:00,  1.06s/it]


Unnamed: 0,0,1,2
0,Epoch,AUC,F1
1,1,0.809296,0
2,11,0.889673,0
3,21,0.879345,0
4,31,0.895089,0
5,41,0.893065,0
6,51,0.8937,0
7,61,0.898542,0
8,71,0.897887,0
9,81,0.900347,0




==== Test Phase ====
4-th fold's auc_score:0.7925494350282487
[[230   6]
 [ 17   7]]

==== Training Phase ====
5-th fold
labels:(array([-1,  1]), array([ 217, 2122]))


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
SGCN (Loss=0.1751): 100%|██████████| 300/300 [05:35<00:00,  1.06s/it]


Unnamed: 0,0,1,2
0,Epoch,AUC,F1
1,1,0.771835,0
2,11,0.846181,0
3,21,0.850526,0
4,31,0.855585,0
5,41,0.859514,0
6,51,0.870228,0
7,61,0.874395,0
8,71,0.869335,0
9,81,0.862688,0




==== Test Phase ====
5-th fold's auc_score:0.9230225988700566
[[230   6]
 [ 13  11]]

==== Training Phase ====
6-th fold
labels:(array([-1,  1]), array([ 217, 2122]))


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
SGCN (Loss=0.1481): 100%|██████████| 300/300 [05:35<00:00,  1.06s/it]


Unnamed: 0,0,1,2
0,Epoch,AUC,F1
1,1,0.757569,0
2,11,0.812589,0
3,21,0.81626,0
4,31,0.835109,0
5,41,0.837649,0
6,51,0.83503,0
7,61,0.842986,0
8,71,0.844633,0
9,81,0.841002,0




==== Test Phase ====
6-th fold's auc_score:0.8938912429378532
[[232   4]
 [ 16   8]]

==== Training Phase ====
7-th fold
labels:(array([-1,  1]), array([ 217, 2122]))


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
SGCN (Loss=0.1723): 100%|██████████| 300/300 [05:35<00:00,  1.06s/it]


Unnamed: 0,0,1,2
0,Epoch,AUC,F1
1,1,0.792401,0
2,11,0.849484,0
3,21,0.857004,0
4,31,0.877659,0
5,41,0.873512,0
6,51,0.875675,0
7,61,0.881984,0
8,71,0.878968,0
9,81,0.882083,0




==== Test Phase ====
7-th fold's auc_score:0.8504590395480225
[[229   7]
 [ 19   5]]

==== Training Phase ====
8-th fold
labels:(array([-1,  1]), array([ 217, 2123]))


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
SGCN (Loss=0.1552): 100%|██████████| 300/300 [05:36<00:00,  1.06s/it]


Unnamed: 0,0,1,2
0,Epoch,AUC,F1
1,1,0.771903,0
2,11,0.831164,0
3,21,0.838613,0
4,31,0.839802,0
5,41,0.85169,0
6,51,0.842021,0
7,61,0.854028,0
8,71,0.861418,0
9,81,0.859536,0




==== Test Phase ====
8-th fold's auc_score:0.9146276595744681
[[233   2]
 [ 18   6]]

==== Training Phase ====
9-th fold
labels:(array([-1,  1]), array([ 217, 2123]))


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
SGCN (Loss=0.1495): 100%|██████████| 300/300 [05:35<00:00,  1.06s/it]


Unnamed: 0,0,1,2
0,Epoch,AUC,F1
1,1,0.789854,0
2,11,0.834334,0
3,21,0.834591,0
4,31,0.85171,0
5,41,0.868392,0
6,51,0.854979,0
7,61,0.857396,0
8,71,0.8739,0
9,81,0.877982,0


==== Test Phase ====
9-th fold's auc_score:0.9278368794326242
[[232   3]
 [ 17   7]]





In [8]:
# Average of the per-fold test AUC values collected in `auc_scores`.
np.mean(auc_scores)

0.8719441940137036

In [None]:
# Distribution of the column-1 softmax score, split by true label.
# NOTE: `probabilities` and `test_node_labels` are whatever the cross-validation
# loop left behind, i.e. this shows the LAST fold only.
_ = plt.hist(probabilities[:,1][test_node_labels==1],alpha=0.5,bins=10,label='label = 1')
_ = plt.hist(probabilities[:,1][test_node_labels==-1],alpha=0.5,bins=10,label='label = -1')
_ = plt.xlabel('softmax score (column 1)')
_ = plt.ylabel('number of test nodes')
_ = plt.title('Test-node score distribution (last CV fold)')
_ = plt.legend()
# plt.xlim(0,0.03)  # optional zoom on the low-score region

結果 (32,16) (class-weights)
- amazon : 0.867
- alpha : 
- otc : 

---

結果 (32,) (no-class-weights, 0.33)
- amazon : 0.856184081
- alpha : 0.983
- otc : 0.998

結果 (32,16) (no-class-weights,test_size=0.33)
- amazon : 0.871
- alpha : 0.9828
- otc : 0.997

結果 (32,16,8) (no-class-weights,test_size=0.33)
- amazon : 0.867
- alpha : 0.97123376
- otc : 0.9883699633699635

結果 (32,16,8) (no-class-weights,test_size=0.33,lamb=0)
- amazon : 0.871944
- alpha : 0.96454545
- otc : 0.994810744810

---

結果 (32,16) (no-class-weights,test_size=0.66)
- amazon : 0.8635876758
- alpha : 0.9831
- otc : 0.993

結果 (64,32)
- amazon : 0.8597141032576031
- alpha : 
- otc : 

結果 (64,32,16,8)
- amazon : 0.8516649987979326
- alpha : 
- otc : 

結果
- amazon : 0.8704, (weighted classes loss : 0.871)
- alpha : (sampled: 0.9804), (normal: 0.9857)
- epinions : 
- otc : (0.9947), (normal:0.996)

## single-validation

In [None]:
# Run configuration for the single train/validation run on the amazon graph.
# NOTE: this rebinds `args` defined for the cross-validation section above.
args = easydict.EasyDict(dict(
    # input files (alternative edge/feature file: '../input/amazon/user_network.csv')
    edge_path='../input/amazon/amazon_network.csv',
    features_path='../input/amazon/amazon_node_feature.csv',
    nodes_path='../input/amazon/amazon_gt.csv',
    # artifacts (same tmp locations as the cross-validation run)
    embedding_path='../tmp/embedding/amazon_sgcn_feature05.pkl',
    regression_weights_path='../tmp/weights/amazon_sgcn_feature05.pkl',
    inductive_model_path='../output/inductive/amazon_model',  # or None
    log_path='../logs/amazon_logs_feature05.json',
    # training settings
    epochs=500,
    test_size=0.33,
    reduction_iterations=128,
    reduction_dimensions=30,
    seed=42,
    lamb=1.0,
    learning_rate=0.005,
    weight_decay=10e-5,  # note: 10e-5 == 1e-4
    layers=[32, 16],  # previously tried: [64, 32, 16, 8]
    # feature / sampling switches
    spectral_features=False,
    general_features=True,
    sample_num=None,
    class_weights=False,
    node_under_sampling=False,
))

In [None]:
# Reload the graph with the single-validation configuration.
# tab_printer(args)  # (disabled) optional printout of the configuration
edges, nodes_dict = read_graph(args) # per the original note: nodes_dict['indice'] holds node ids, nodes_dict['label'] their labels

In [None]:
# Train one model on the complete `nodes_dict` (no outer cross-validation folds here).
trainer = SignedGCNTrainer(args, edges, nodes_dict)
trainer.setup_dataset()
trainer.create_and_train_model()

In [None]:
# When args.test_size > 0: save the model, show the logged performance table
# and write the logs via save_logs.
if args.test_size > 0:
    trainer.save_model()
    # score_printer(trainer.logs)  # (disabled) alternative text printout of the logs
    display(pd.DataFrame(trainer.logs['performance']))
    save_logs(args, trainer.logs)

#### inductive settings

In [None]:
# Configuration of the unseen graph (amazon_music) used for the inductive test.
new_args = easydict.EasyDict(dict(
    # input files (alternative edge/feature file: '../input/amazon_music/user_network.csv')
    edge_path='../input/amazon_music/amazon_music_network.csv',
    features_path='../input/amazon_music/amazon_music_node_feature.csv',
    nodes_path='../input/amazon_music/amazon_music_gt.csv',
    # artifacts; tmp folder as in the cross-validation run
    embedding_path='../tmp/embedding/amazon_music_sgcn_feature05.csv',
    regression_weights_path='../tmp/weights/amazon_music_sgcn_feature05.csv',
    inductive_model_path=None,  # e.g. '../output/inductive/amazon_music_model', or None
    log_path='../logs/amazon_music_logs_feature05.json',
    # training settings
    epochs=300,
    test_size=0.33,
    reduction_iterations=128,
    reduction_dimensions=30,
    seed=42,
    lamb=1.0,
    learning_rate=0.005,
    weight_decay=10e-5,  # note: 10e-5 == 1e-4
    layers=[32, 16],  # previously tried: [64, 32, 16, 8]
    # feature / sampling switches
    spectral_features=False,
    general_features=True,
    class_weights=False,
    sample_num=None,
    node_under_sampling=False,
))

In [None]:
# Load the unseen graph and its general node features.
new_edges, new_nodes_dict = read_graph(new_args)

# Read the features from the same dataset as the graph. The path used to be
# hard-coded to '../input/amazon_elec/amazon_elec_node_feature.csv', which does
# not match the amazon_music graph configured in `new_args`.
X = np.array(pd.read_csv(new_args.features_path)) # general node features

In [None]:
# Build a predictor for the new graph from the model stored at the given path.
# NOTE(review): the literal path equals `args.inductive_model_path` of the
# single-validation config above -- consider passing that instead; confirm.
predictor = SignedGCNPredictor(new_args, '../output/inductive/amazon_model', X, new_edges,new_nodes_dict)

In [None]:
# Run the predictor and take the highest-scoring column of each row as the predicted class.
# NOTE(review): presumably one row per node of the new graph -- confirm against SignedGCNPredictor.
predictions = predictor.predict()
predict_labels = predictions.argmax(1)

In [None]:
# Ground-truth labels of the new graph (raw values; -1 / 1 are the ones compared against below).
y_true = new_nodes_dict['label']

In [None]:
# AUC on the new graph: label -1 is the positive class, scored by column 1.
roc_auc_score(
    y_true=[int(label == -1) for label in new_nodes_dict['label']],
    y_score=predictions[:,1][new_nodes_dict['indice']],
)

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix
# Confusion matrix on the new graph (rows: true class with -1 mapped to 1, columns: predicted class).
confusion_matrix(
    [int(label == -1) for label in new_nodes_dict['label']],
    predict_labels[new_nodes_dict['indice']],
)

In [None]:
# Sanity check: shape of the label container for the new graph.
y_true.shape

In [None]:
# Distribution of the column-1 score on the new graph, split by true label.
_ = plt.hist(predictions[:,1][new_nodes_dict['indice']][y_true==1],alpha=0.5,bins=10,label='label = 1')
_ = plt.hist(predictions[:,1][new_nodes_dict['indice']][y_true==-1],alpha=0.5,bins=10,label='label = -1')
_ = plt.xlabel('predicted score (column 1)')
_ = plt.ylabel('number of nodes')
_ = plt.title('Score distribution on the new graph (inductive setting)')
_ = plt.legend()

結果
- amazon
    - elec : 0.80787063118
    - movie : 0.55
    - music : 0.76
    

In [None]:
import json

In [None]:
# Load the logged performance table; its first row holds the column names.
# A context manager closes the log file (the bare open() call leaked the handle).
with open('../logs/otc_logs_feature05.json','r') as log_file:
    performance = pd.DataFrame(json.load(log_file)['performance'])

# Promote the first row to the header ...
performance.columns = performance.iloc[0,:]

# ... and keep only the data rows.
performance = performance.iloc[1:,:]

In [None]:
# Plot the logged AUC column over the logged rows.
performance['AUC'].plot()

In [None]:
# The configured paths end in .pkl and the cross-validation loop reads them with
# pd.read_pickle, so read_csv cannot parse them. Load the pickles instead and
# move the 'id' column to the index so that `feature.values` holds only the
# embedding columns (same effect as the former index_col='id').
# NOTE(review): only unpickle files from a trusted location.
feature = pd.read_pickle(args.embedding_path).set_index('id')

weight = pd.read_pickle(args.regression_weights_path)

In [None]:
# Raw class scores: embedding rows times the transposed regression weights.
predictions = np.dot(feature.values,weight.values.T)

In [None]:
# Row-wise softmax over the class scores. `dim=1` is given explicitly: calling
# softmax without `dim` is deprecated, and for 2-D input the implicit choice was dim=1.
probabilities = torch.nn.functional.softmax(torch.from_numpy(predictions), dim=1).numpy()

In [None]:
# Predicted class = index of the largest probability in each row.
predict_labels = probabilities.argmax(1)

In [None]:
# AUC over all labelled nodes. Label -1 is the positive class (1), matching the
# cross-validation loop and the inductive evaluation, which both score it with
# column 1. The former mapping (0 for -1) was inverted relative to them and
# would report 1 - AUC.
roc_auc_score(y_true=[1 if label == -1 else 0 for label in nodes_dict['label']],y_score=probabilities[:,1][nodes_dict['indice']])

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix
# Confusion matrix over all labelled nodes. Label -1 maps to class 1, the same
# convention as the cross-validation loop, so that the true classes line up
# with the argmax column indices in `predict_labels` (the former mapping was inverted).
confusion_matrix([1 if label == -1 else 0 for label in nodes_dict['label']],predict_labels[nodes_dict['indice']])

# TODO