# Signed Graph Convolutional Network(SGCN)を用いたFraud User Detection

```sh
pip install -r /home/ubuntu/SGCN/requirements.txt
pip install torch_scatter
pip install torch_sparse
pip install easydict
```


In [1]:
from sgcn import SignedGCNTrainer, SignedGCNPredictor
from parser import parameter_parser
from utils import tab_printer, read_graph, score_printer, save_logs
import easydict
import argparse
import pandas as pd
import numpy as np
import torch
import json
import networkx as nx
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import accuracy_score,confusion_matrix
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline

  (fname, cnt))
  (fname, cnt))


In [2]:
# Ask which dataset to run on (the recorded run used "amazon"). The answer is
# interpolated into every input/output path of the configuration, so surrounding
# whitespace is stripped and an empty answer is rejected up front instead of
# surfacing later as a confusing "file not found".
data_name = input('データセット：').strip()
if not data_name:
    raise ValueError('data_name must not be empty: it is used to build every file path.')

データセット：amazon


In [3]:
# Run configuration for the cross-validation experiment, exposed with
# attribute access (args.epochs, args.edge_path, ...) through EasyDict.
args = easydict.EasyDict(dict(
    # --- per-dataset input files ---
    edge_path=f'../input/{data_name}/{data_name}_network.csv',
    features_path=f'../input/{data_name}/{data_name}_node_feature.csv',
    nodes_path=f'../input/{data_name}/{data_name}_gt.csv',
    # --- artefacts under ../tmp: scratch space reused by every CV fold ---
    embedding_path=f'../tmp/embedding/{data_name}_sgcn_feature05.pkl',
    regression_weights_path=f'../tmp/weights/{data_name}_sgcn_feature05.pkl',
    # None here; the alternative is a path such as f'../output/inductive/{data_name}_model'
    inductive_model_path=None,
    log_path=f'../logs/{data_name}_logs_feature05.json',
    # --- optimisation ---
    epochs=300,
    test_size=0.33,
    reduction_iterations=128,
    reduction_dimensions=30,
    seed=42,
    lamb=1.0,
    learning_rate=0.005,
    weight_decay=10e-4,  # note: 10e-4 == 1e-3
    # --- architecture / features (another layout tried: [64, 32, 16, 8]) ---
    layers=[32, 16],
    spectral_features=False,
    general_features=True,
    sample_num=None,
    class_weights=False,
    node_under_sampling=False,
    hidden_residual=False,
    eval_freq=1,
))

In [4]:
# Show the configuration as a table with one row per option.
# `pd.io.json.json_normalize` has been deprecated since pandas 1.0 in favour of
# the top-level `pd.json_normalize` (and is not exported from `pandas.io.json`
# in recent releases), so prefer the new name and fall back to the legacy
# location only when running on pandas < 1.0.
_json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize
display(_json_normalize(args).T)

Unnamed: 0,0
class_weights,False
edge_path,../input/amazon/amazon_network.csv
embedding_path,../tmp/embedding/amazon_sgcn_feature05.pkl
epochs,300
eval_freq,1
features_path,../input/amazon/amazon_node_feature.csv
general_features,True
hidden_residual,False
inductive_model_path,
lamb,1


## 10-fold cross-validation
- train : validation : test = 6:3:1

In [5]:
#tab_printer(args)
# Load the graph described by `args`. As used further down, `edges` is a dict
# (it has a "positive_edges" entry) and `nodes_dict` carries 'indice' (node ids),
# 'label' (their labels) and 'all_ncount'.
edges, nodes_dict = read_graph(args) # nodes_dict['indice']:node_id , nodes_dict['label'] : label

In [6]:
from sklearn.model_selection import StratifiedKFold

In [7]:
# State shared by the cross-validation loop: the labelled node ids and labels,
# an (initially empty) list of per-fold AUCs, and a 10-fold stratified splitter.
all_indice, all_labels = nodes_dict['indice'], nodes_dict['label']
auc_scores = list()
kf = StratifiedKFold(n_splits=10)

In [8]:
# 10-fold stratified cross-validation. For every fold: train SGCN on the fold's
# training nodes, then score the held-out nodes from the embedding and the
# regression weights found at args.embedding_path / args.regression_weights_path
# (presumably written there by the trainer during training).
auc_scores = []  # start clean so re-running this cell does not mix in scores from an earlier run
for i, (train_index, test_index) in enumerate(kf.split(X=nodes_dict['indice'],y=nodes_dict['label'])):
    print("==== Training Phase ====")
    print(f'{i}-th fold')
    # training: hand the trainer only this fold's labelled nodes
    train_node_indice = all_indice[train_index]
    train_node_labels = all_labels[train_index]
    print(f'labels:{np.unique(train_node_labels,return_counts=True)}')
    tmp_nodes_dict = {
        'all_ncount': nodes_dict['all_ncount'],
        'indice': train_node_indice,
        'label': train_node_labels,
    }
    trainer = SignedGCNTrainer(args, edges, tmp_nodes_dict)
    trainer.setup_dataset()
    trainer.create_and_train_model()

    if args.test_size > 0:
        # trainer.save_model() is not needed here: per the original author's note,
        # the best model is already saved inside trainer.create_and_train_model().
        # score_printer(trainer.logs)
        display(pd.DataFrame(trainer.logs['performance']))
        save_logs(args, trainer.logs)

    # test
    print("==== Test Phase ====")
    test_node_indice = all_indice[test_index]
    test_node_labels = all_labels[test_index]
    # `drop(columns=...)` replaces the positional `drop('id', 1)`, which pandas 2 rejects.
    feature = pd.read_pickle(args.embedding_path).drop(columns='id').values
    # Rows are selected by node id, i.e. this assumes row k of the embedding
    # belongs to node id k -- TODO confirm against the trainer's output format.
    test_feature = feature[test_node_indice]
    weight = pd.read_pickle(args.regression_weights_path)
    predictions = np.dot(test_feature,weight.values.T)
    # Normalise over the class axis explicitly; calling softmax without `dim`
    # is deprecated (for a 2-D input the implicit choice was also dim=1).
    probabilities = torch.nn.functional.softmax(torch.from_numpy(predictions), dim=1).numpy()
    predict_labels = probabilities.argmax(1)
    # Nodes labelled -1 are the positive class and are scored with column 1 of
    # `probabilities`. Computed once and without reusing `i`, the fold index.
    is_negative_label = (np.asarray(test_node_labels) == -1).astype(int)
    auc_score = roc_auc_score(y_true=is_negative_label,y_score=probabilities[:,1])
    auc_scores.append(auc_score)
    cmx = confusion_matrix(y_true=is_negative_label,y_pred=predict_labels)
    print(f"{i}-th fold's auc_score:{auc_score}")
    print(cmx)
    print()
    

==== Training Phase ====
0-th fold
labels:(array([-1,  1]), array([ 216, 2122]))


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



SGCN (Loss=0.183): 100%|██████████| 300/300 [08:27<00:00,  1.72s/it] 


Unnamed: 0,0,1,2
0,Epoch,AUC,F1
1,1,0.293936,0
2,2,0.213448,0
3,3,0.200589,0
4,4,0.201533,0
5,5,0.217426,0
6,6,0.224157,0
7,7,0.227954,0
8,8,0.23798,0
9,9,0.250678,0




==== Test Phase ====
0-th fold's auc_score:0.8145762711864406
[[225  11]
 [ 20   5]]

==== Training Phase ====
1-th fold
labels:(array([-1,  1]), array([ 217, 2122]))


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



SGCN (Loss=0.2155): 100%|██████████| 300/300 [08:27<00:00,  1.65s/it]


Unnamed: 0,0,1,2
0,Epoch,AUC,F1
1,1,0.325337,0
2,2,0.206726,0
3,3,0.184246,0
4,4,0.177937,0
5,5,0.190933,0
6,6,0.203353,0
7,7,0.213135,0
8,8,0.225853,0
9,9,0.23869,0




==== Test Phase ====
1-th fold's auc_score:0.9143714689265536
[[233   3]
 [ 17   7]]

==== Training Phase ====
2-th fold
labels:(array([-1,  1]), array([ 217, 2122]))


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



SGCN (Loss=0.2361): 100%|██████████| 300/300 [08:24<00:00,  1.65s/it]


Unnamed: 0,0,1,2
0,Epoch,AUC,F1
1,1,0.269028,0
2,2,0.182103,0
3,3,0.175,0
4,4,0.166548,0
5,5,0.179147,0
6,6,0.19871,0
7,7,0.210913,0
8,8,0.222103,0
9,9,0.240853,0




==== Test Phase ====
2-th fold's auc_score:0.8774717514124294
[[230   6]
 [ 21   3]]

==== Training Phase ====
3-th fold
labels:(array([-1,  1]), array([ 217, 2122]))


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



SGCN (Loss=0.174): 100%|██████████| 300/300 [08:23<00:00,  1.65s/it] 


Unnamed: 0,0,1,2
0,Epoch,AUC,F1
1,1,0.353284,0
2,2,0.257827,0
3,3,0.242986,0
4,4,0.233105,0
5,5,0.240169,0
6,6,0.248026,0
7,7,0.258006,0
8,8,0.266478,0
9,9,0.280565,0




==== Test Phase ====
3-th fold's auc_score:0.8200918079096046
[[231   5]
 [ 20   4]]

==== Training Phase ====
4-th fold
labels:(array([-1,  1]), array([ 217, 2122]))


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



SGCN (Loss=0.1898): 100%|██████████| 300/300 [08:20<00:00,  1.65s/it]


Unnamed: 0,0,1,2
0,Epoch,AUC,F1
1,1,0.324802,0
2,2,0.219802,0
3,3,0.202837,0
4,4,0.200694,0
5,5,0.220099,0
6,6,0.226091,0
7,7,0.23881,0
8,8,0.254425,0
9,9,0.270159,0




==== Test Phase ====
4-th fold's auc_score:0.7547669491525424
[[229   7]
 [ 20   4]]

==== Training Phase ====
5-th fold
labels:(array([-1,  1]), array([ 217, 2122]))


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



SGCN (Loss=0.1942): 100%|██████████| 300/300 [08:20<00:00,  1.66s/it]


Unnamed: 0,0,1,2
0,Epoch,AUC,F1
1,1,0.365139,0
2,2,0.24873,0
3,3,0.223948,0
4,4,0.214266,0
5,5,0.222857,0
6,6,0.227044,0
7,7,0.231687,0
8,8,0.241329,0
9,9,0.249881,0




==== Test Phase ====
5-th fold's auc_score:0.9336158192090396
[[235   1]
 [ 15   9]]

==== Training Phase ====
6-th fold
labels:(array([-1,  1]), array([ 217, 2122]))


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



SGCN (Loss=0.2065): 100%|██████████| 300/300 [08:26<00:00,  1.65s/it]


Unnamed: 0,0,1,2
0,Epoch,AUC,F1
1,1,0.355486,0
2,2,0.225685,0
3,3,0.218978,0
4,4,0.220387,0
5,5,0.238065,0
6,6,0.248998,0
7,7,0.259038,0
8,8,0.268284,0
9,9,0.280883,0




==== Test Phase ====
6-th fold's auc_score:0.891949152542373
[[236   0]
 [ 24   0]]

==== Training Phase ====
7-th fold
labels:(array([-1,  1]), array([ 217, 2122]))


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



SGCN (Loss=0.201): 100%|██████████| 300/300 [08:24<00:00,  1.65s/it] 


Unnamed: 0,0,1,2
0,Epoch,AUC,F1
1,1,0.321419,0
2,2,0.204613,0
3,3,0.192391,0
4,4,0.184851,0
5,5,0.200446,0
6,6,0.212093,0
7,7,0.221181,0
8,8,0.240655,0
9,9,0.259812,0




==== Test Phase ====
7-th fold's auc_score:0.8395127118644067
[[231   5]
 [ 20   4]]

==== Training Phase ====
8-th fold
labels:(array([-1,  1]), array([ 217, 2123]))


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



SGCN (Loss=0.2057): 100%|██████████| 300/300 [08:29<00:00,  1.65s/it]


Unnamed: 0,0,1,2
0,Epoch,AUC,F1
1,1,0.332699,0
2,2,0.188481,0
3,3,0.176652,0
4,4,0.178119,0
5,5,0.198962,0
6,6,0.210552,0
7,7,0.219686,0
8,8,0.234021,0
9,9,0.251248,0




==== Test Phase ====
8-th fold's auc_score:0.9188829787234043
[[234   1]
 [ 21   3]]

==== Training Phase ====
9-th fold
labels:(array([-1,  1]), array([ 217, 2123]))


Loss:   0%|          | 0/300 [00:00<?, ?it/s]


Training started.



SGCN (Loss=0.1985): 100%|██████████| 300/300 [08:24<00:00,  1.65s/it]


Unnamed: 0,0,1,2
0,Epoch,AUC,F1
1,1,0.305754,0
2,2,0.232624,0
3,3,0.209978,0
4,4,0.199616,0
5,5,0.213227,0
6,6,0.226026,0
7,7,0.231574,0
8,8,0.241441,0
9,9,0.254398,0


==== Test Phase ====
9-th fold's auc_score:0.925709219858156
[[233   2]
 [ 21   3]]





In [9]:
# Mean of the per-fold test AUCs collected in `auc_scores`.
np.mean(auc_scores)

0.869094813078495

In [None]:
# Distribution of the class-1 probability, split by ground-truth label.
# NOTE: `probabilities` and `test_node_labels` are whatever the cross-validation
# loop left behind, i.e. this shows the LAST fold only.
# Legend and axis labels are added so the two overlaid histograms can be told apart.
_ = plt.hist(probabilities[:,1][test_node_labels==1],alpha=0.5,bins=10,label='label = 1')
_ = plt.hist(probabilities[:,1][test_node_labels==-1],alpha=0.5,bins=10,label='label = -1')
plt.xlabel('probabilities[:, 1]')
plt.ylabel('number of test nodes')
_ = plt.legend()
# plt.xlim(0,0.03)

# new_balance_theoryの結果


結果 (32,16) (non-class-weights,10e-3, test_size=0.33)
- amazon : 0.8650228
- alpha : 0.984
- otc : 0.9979548

結果 (32,16) (non-class-weights,10e-4, test_size=0.33)
- amazon : 0.8690948130
- alpha : 0.97264
- otc : 0.996

結果 (32,16, 8) (non-class-weights,10e-4, test_size=0.33)
- amazon : 0.8722951
- alpha : 
- otc : 

結果 (32,16) (non-class-weights,10e-3, test_size=0.5)
- amazon : 
- alpha : 
- otc : 

結果 (32,32,32,32) (non-class-weights,10e-3, hidden_residual,) (100 epoch)
- amazon : 0.86009158252
- alpha : 
- otc : 

結果 (32,32,32,) (class-weights,10e-3,hidden_residual) (100 epoch)
- amazon : 0.864321831
- alpha : 
- otc : 

---

結果 (32,) (no-class-weights, 0.33)
- amazon : 0.856184081
- alpha : 0.983
- otc : 0.998

結果 (32,16) (no-class-weights,test_size=0.33,weight_decay=10e-5)
- amazon : 0.871
- alpha : 0.9828
- otc : 0.997

結果 (32,16,8) (no-class-weights,test_size=0.33)
- amazon : 0.867
- alpha : 0.97123376
- otc : 0.9883699633699635

結果 (32,16,8) (no-class-weights,test_size=0.33,lamb=0)
- amazon : 0.871944
- alpha : 0.96454545
- otc : 0.994810744810

結果 (32,32,32) (no-class-weights,test_size=0.33,lamb=1.0,residual)
- amazon : 0.869
- alpha : 0.
- otc : 0.

結果 (32,16) (no-class-weights,test_size=0.33,weight_decay=10e-3)
- amazon : 0.871668935569  -->  0.866652196778459
- alpha : 0.9868181818181819 --> 0.9778071928071927
- otc : 0.9963980463980464  --> 0.99557387057

結果 (32,16) (no-class-weights,test_size=0.33,weight_decay=10e-3, amazon_unsigned)
- amazon : 
- alpha : 0.9868181818181819 --> 0.9778071928071927
- otc : 0.9963980463980464  --> 0.99557387057

---

結果 (32,16) (no-class-weights,test_size=0.66)
- amazon : 0.8635876758
- alpha : 0.9831
- otc : 0.993

結果 (64,32)
- amazon : 0.8597141032576031
- alpha : 
- otc : 

結果 (64,32,16,8)
- amazon : 0.8516649987979326
- alpha : 
- otc : 

結果
- amazon : 0.8704, (weighted classes loss : 0.871)
- alpha : (sampled: 0.9804), (normal: 0.9857)
- epinions : 
- otc : (0.9947), (normal:0.996)

## single-validation

In [None]:
# Configuration for a single train/validation run on the amazon dataset.
# Unlike the cross-validation config, a model path is set so the trained model
# can be reused by the inductive experiment below.
args = easydict.EasyDict({
        "edge_path": '../input/amazon/amazon_network.csv',#'../input/amazon/user_network.csv',
        "features_path":  '../input/amazon/amazon_node_feature.csv',#'../input/amazon/user_network.csv',
        "nodes_path": '../input/amazon/amazon_gt.csv',
        "embedding_path": '../tmp/embedding/amazon_sgcn_feature05.pkl', # tmp folder for cross-validation
        "regression_weights_path": '../tmp/weights/amazon_sgcn_feature05.pkl',
        "inductive_model_path": '../output/inductive/amazon_model', # or None
        "log_path": '../logs/amazon_logs_feature05.json',
        "epochs":500,
        "test_size":0.33,
        "reduction_iterations": 128,
        "reduction_dimensions": 30,
        "seed": 42,
        "lamb": 1.0,
        "learning_rate": 0.005,
        "weight_decay": 10e-5,  # note: 10e-5 == 1e-4
        # "layers": [64, 32,16,8],
        "layers": [32,16],
        "spectral_features":False,
        "general_features": True,
        "sample_num":None,
        "class_weights":False,
        "node_under_sampling":False,
        # The cross-validation config passes these two options to the same
        # SignedGCNTrainer; they were missing here. Same values as in that config.
        "hidden_residual":False,
        "eval_freq":1,
})

In [None]:
#tab_printer(args)
# Reload the graph with the single-validation configuration. This rebinds
# `edges` / `nodes_dict`, replacing the objects loaded for cross-validation.
edges, nodes_dict = read_graph(args) # nodes_dict['indice']:node_id , nodes_dict['label'] : label

In [None]:
# Train once on all labelled nodes (no outer folds). The call order mirrors the
# cross-validation loop: construct, set up the dataset, then train.
trainer = SignedGCNTrainer(args, edges, nodes_dict)
trainer.setup_dataset()
trainer.create_and_train_model()

In [None]:
# Only when a validation split was requested: save the model explicitly, show
# the logged performance table, and write the logs via save_logs.
if args.test_size > 0:
    trainer.save_model()
    # score_printer(trainer.logs)
    display(pd.DataFrame(trainer.logs['performance']))
    save_logs(args, trainer.logs)

#### inductive settings

In [None]:
# Configuration describing the *target* dataset (amazon_music) for the
# inductive experiment: a model trained on amazon is applied to this graph.
new_args = easydict.EasyDict({
        "edge_path": '../input/amazon_music/amazon_music_network.csv',#'../input/amazon_music/user_network.csv',
        "features_path":  '../input/amazon_music/amazon_music_node_feature.csv',#'../input/amazon_music/user_network.csv',
        "nodes_path": '../input/amazon_music/amazon_music_gt.csv',
        "embedding_path": '../tmp/embedding/amazon_music_sgcn_feature05.csv', # tmp folder for cross-validation
        "regression_weights_path": '../tmp/weights/amazon_music_sgcn_feature05.csv',
        "inductive_model_path": None, # '../output/inductive/amazon_music_model', # or None
        "log_path": '../logs/amazon_music_logs_feature05.json',
        "epochs":300,
        "test_size":0.33,
        "reduction_iterations": 128,
        "reduction_dimensions": 30,
        "seed": 42,
        "lamb": 1.0,
        "learning_rate": 0.005,
        "weight_decay": 10e-5,  # note: 10e-5 == 1e-4
        # "layers": [64, 32,16,8],
        "layers": [32, 16],
        "spectral_features":False,
        "general_features": True,
        "class_weights":False,
        "sample_num":None,
        "node_under_sampling":False,
        # Kept in step with the cross-validation config, which also defines these
        # two options (presumably the predictor builds the same network -- TODO confirm).
        "hidden_residual":False,
        "eval_freq":1,
})

In [None]:
# Load the target graph and its node features for the inductive experiment.
new_edges, new_nodes_dict = read_graph(new_args)

# Read the features from the SAME dataset as the graph. The path used to be
# hard-coded to amazon_elec while `new_args` pointed at amazon_music, which
# pairs one dataset's graph with another dataset's feature rows.
X = np.array(pd.read_csv(new_args.features_path)) # general node features

In [None]:
# Build a predictor for the target graph from the saved model. The hard-coded
# model path equals `inductive_model_path` of the single-validation config above.
predictor = SignedGCNPredictor(new_args, '../output/inductive/amazon_model', X, new_edges,new_nodes_dict)

In [None]:
# Run the predictor and take, per row, the index of the largest score as the
# predicted class. (`predictions` is presumably one row per node of the target
# graph, since it is indexed by node id below -- TODO confirm.)
predictions = predictor.predict()
predict_labels = predictions.argmax(1)

In [None]:
# Ground-truth labels of the target graph's labelled nodes (values are
# compared against 1 and -1 below).
y_true = new_nodes_dict['label']

In [None]:
# AUC on the target graph: nodes labelled -1 are the positive class and are
# scored with column 1 of `predictions`, restricted to the labelled node ids.
roc_auc_score(y_true=[1 if i==-1 else 0 for i in new_nodes_dict['label']],y_score=predictions[:,1][new_nodes_dict['indice']])

In [None]:
# (Both names are already imported in the first cell; this re-import is redundant.)
from sklearn.metrics import accuracy_score,confusion_matrix
# Confusion matrix with the same encoding as the AUC cell: label -1 -> class 1.
confusion_matrix([1 if i==-1 else 0 for i in new_nodes_dict['label']],predict_labels[new_nodes_dict['indice']])

In [None]:
# Sanity check: number of labelled nodes in the target graph.
y_true.shape

In [None]:
# Distribution of the class-1 score on the target graph, split by ground-truth
# label. Legend and axis labels are added so the two overlaid histograms can be
# told apart.
_ = plt.hist(predictions[:,1][new_nodes_dict['indice']][y_true==1],alpha=0.5,bins=10,label='label = 1')
_ = plt.hist(predictions[:,1][new_nodes_dict['indice']][y_true==-1],alpha=0.5,bins=10,label='label = -1')
plt.xlabel('predictions[:, 1]')
plt.ylabel('number of labelled nodes')
_ = plt.legend()

結果
- amazon
    - elec : 0.80787063118
    - movie : 0.55
    - music : 0.76
    

In [None]:
import json

In [None]:
# Load the logged performance table of the otc run. Use a context manager so
# the log file is closed deterministically instead of leaking the handle.
with open('../logs/otc_logs_feature05.json','r') as log_file:
    performance = pd.DataFrame(json.load(log_file)['performance'])

# The first logged row holds the column names; promote it to the header ...
performance.columns = performance.iloc[0,:]

# ... and drop it from the data rows.
performance = performance.iloc[1:,:]

In [None]:
# Plot the logged 'AUC' column against the row index.
performance['AUC'].plot()

In [None]:
# Both paths in `args` end in .pkl and the cross-validation cell reads them with
# pd.read_pickle, so read_csv here was stale. `set_index('id')` reproduces what
# `index_col='id'` did for the CSV variant.
# NOTE: only unpickle files produced by your own runs (pickle executes code on load).
feature = pd.read_pickle(args.embedding_path).set_index('id')

weight = pd.read_pickle(args.regression_weights_path)

In [None]:
# Raw class scores for every embedded node: embedding rows times the transposed
# regression weights (same computation as in the cross-validation loop).
predictions = np.dot(feature.values,weight.values.T)

In [None]:
# Turn the raw scores into per-node class probabilities. `dim=1` (the class
# axis) is passed explicitly: calling softmax without `dim` is deprecated, and
# for this 2-D input the implicit choice was the same axis.
probabilities = torch.nn.functional.softmax(torch.from_numpy(predictions), dim=1).numpy()

In [None]:
# Predicted class per node: the column with the highest probability.
predict_labels = probabilities.argmax(1)

In [None]:
# AUC over all labelled nodes. Label -1 is encoded as the positive class (1) so
# that it matches the score in column 1 -- the same convention as the
# cross-validation loop and the inductive evaluation. The previous encoding
# (-1 -> 0) was the opposite one and would report 1 - AUC.
roc_auc_score(y_true=[1 if i==-1 else 0 for i in nodes_dict['label']],y_score=probabilities[:,1][nodes_dict['indice']])

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix
# Confusion matrix with label -1 encoded as class 1, the convention used by the
# cross-validation loop and the inductive evaluation (it was inverted here).
confusion_matrix([1 if i==-1 else 0 for i in nodes_dict['label']],predict_labels[nodes_dict['indice']])

# TODO

In [None]:
from torch_scatter import scatter_add, scatter_mean
from torch_geometric.utils import remove_self_loops, add_self_loops

In [None]:
# Scratch: take the positive edges out of the graph dict returned by read_graph.
positive_edges = edges["positive_edges"]

In [None]:
# Convert to an int64 tensor and transpose (presumably giving shape
# (2, num_edges) from a list of (source, target) pairs -- TODO confirm).
positive_edges = torch.from_numpy(np.array(positive_edges, dtype=np.int64).T).type(torch.long)

In [None]:
# Shape of the edge tensor right after conversion.
positive_edges.shape

In [None]:
# Drop self-loops; the second return value (edge attributes, None here) is discarded.
positive_edges, _ = remove_self_loops(positive_edges, None)

In [None]:
# Shape after removing self-loops (compare with the shape before).
positive_edges.shape

In [None]:
# Add one self-loop per node, for all `all_ncount` nodes.
# NOTE(review): newer torch_geometric releases return a tuple
# (edge_index, edge_attr) from add_self_loops; if that is the installed version,
# this must be unpacked like remove_self_loops above -- confirm the version in use.
positive_edges = add_self_loops(positive_edges, num_nodes=nodes_dict['all_ncount'])

In [None]:
# Shape after adding self-loops.
positive_edges.shape

In [None]:
# Split the edge tensor along its first dimension into two index tensors
# (this requires that dimension to have exactly two entries).
row, col = positive_edges

In [None]:
# Eyeball the two index tensors.
row,col

In [None]:
# Both index tensors should have the same length (one entry per edge).
row.shape, col.shape

In [None]:
# Node feature matrix read from args.features_path, as a tensor.
x = torch.from_numpy(pd.read_csv(args.features_path).values)

In [None]:
# Compare the edge index lengths with the feature matrix shape.
row.shape,col.shape,x.shape

In [None]:
# Feature rows gathered by `col`: one row per edge.
x[col].shape

In [None]:
# For each index in `row`, average the feature rows gathered via `col`; the
# result has x.size(0) rows (dim_size), i.e. one per row of `x`.
out = scatter_mean(x[col], row, dim=0, dim_size=x.size(0))

In [None]:
# Shape of the aggregated features.
out.shape

In [None]:
# Count the rows whose aggregated features equal the original features in
# every column.
(out.numpy()==x.numpy()).all(1).sum()

In [None]:
# Feature matrix shape, for comparison with the count above.
x.shape

In [None]:
# Number of distinct node indices that appear in `row`.
np.unique(row.numpy()).shape