In [None]:
# Shell escape: list the local git branches so the output records which branch this run used.
!git branch

# モデルのアウトプットの分析

In [None]:
from sgcn import SignedGCNTrainer, SignedGCNPredictor
from parser import parameter_parser
from utils import tab_printer, read_graph, score_printer, save_logs
import easydict
import argparse
import pandas as pd
import numpy as np
import torch
import json
import networkx as nx
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Name of the dataset to analyse. It is interpolated into the ../input,
# ../tmp, ../output and ../logs paths of the configuration below and compared
# with 'amazon' when the graph is loaded.
# A prompt is shown (the other input() in this notebook has one too) and the
# answer is stripped so stray whitespace cannot end up inside those paths.
data_name = input('分析対象のデータセット：').strip()

In [None]:
# Run configuration passed to read_graph and SignedGCNPredictor.
# Every path is derived from data_name; the meaning of the remaining settings
# is defined by the sgcn / utils modules, not by this notebook.
new_args = easydict.EasyDict(
    # input files
    edge_path=f'../input/{data_name}/{data_name}_network.csv',
    features_path=f'../input/{data_name}/{data_name}_node_feature.csv',
    nodes_path=f'../input/{data_name}/{data_name}_gt.csv',
    # artefacts
    embedding_path=f'../tmp/embedding/{data_name}_sgcn_feature05.pkl',  # tmp folder for cross-validation
    regression_weights_path=f'../tmp/weights/{data_name}_sgcn_feature05.pkl',
    inductive_model_path=f'../output/inductive/{data_name}_model',  # or None
    log_path=f'../logs/{data_name}_logs_feature05.json',
    # model / training settings
    epochs=300,
    test_size=0.33,
    reduction_iterations=128,
    reduction_dimensions=30,
    seed=42,
    lamb=0.0,
    learning_rate=0.001,
    # NOTE(review): 10e-4 is 1e-3, not 1e-4 -- confirm this is the intended value.
    weight_decay=10e-4,
    # an alternative that was tried: [64, 32, 16, 8]
    layers=[32, 2],
    spectral_features=False,
    general_features=True,
    sample_num=None,
    class_weights=False,
    node_under_sampling=False,
    hidden_residual=False,
    eval_freq=1,
    subgraph_training=False,
    l1_lambda=0.01,
)

In [None]:
# Load the edge lists and node information described by new_args.
new_edges, new_nodes_dict = read_graph(new_args)

# For the amazon dataset every edge is also added in the opposite direction,
# so each [s, d] pair is present as [d, s] as well.
if data_name == 'amazon':
    for edge_key in ('positive_edges', 'negative_edges'):
        one_way = new_edges[edge_key]
        new_edges[edge_key] = one_way + [[dst, src] for src, dst in one_way]

# general node features
X = np.array(pd.read_csv(f'../input/{data_name}/{data_name}_node_feature.csv'))

In [None]:
# Name of the dataset the saved model was trained on. It selects the
# ../output/inductive/{training_dataset}_* files loaded in the following cells
# and is asked for separately from data_name, so the two may differ.
training_dataset = input('学習に使ったデータセット：')

In [None]:
# Build a predictor from the model saved for training_dataset and give it the
# features, edges and node information loaded for data_name.
predictor = SignedGCNPredictor(new_args, f'../output/inductive/{training_dataset}_model', X, new_edges,new_nodes_dict)

# The prediction output is reduced with argmax over axis 1 to one label per row
# (presumably rows are nodes and columns are classes -- verify against sgcn.py).
predictions = predictor.predict()
predict_labels = predictions.argmax(1)

In [None]:
# Regression weights of the loaded model as a NumPy array.
weights = predictor.model.regression_weights.detach().cpu().numpy()

# The predictor's z as a NumPy array, then only the rows listed in
# new_nodes_dict['indice'].
all_z = predictor.z.detach().cpu().numpy()
Z = all_z[new_nodes_dict['indice']]

# Binary target: 1 where the original label equals -1, otherwise 0.
y = np.array([int(label == -1) for label in new_nodes_dict['label']])

In [None]:
# Arrays stored next to the trained model; the file names say train / test indices.
# NOTE(review): neither variable is referenced by the cells visible in this
# notebook, and "indcie" looks like a typo of "indice" -- left unchanged in case
# other code relies on the names.
used_train_indcie = np.load(f'../output/inductive/{training_dataset}_train_indice.npy')

used_test_indcie = np.load(f'../output/inductive/{training_dataset}_test_indice.npy')

In [None]:
# Histogram of column 1 of the regression weights. The return value is bound
# to _ so that it is not echoed as the cell output.
_ = plt.hist(weights[:,1])

## ロジスティック回帰で確認

In [None]:
from sklearn.linear_model import LogisticRegression,Lasso
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [None]:
# Stratified, shuffled 10-fold splitter with a fixed seed; it is reused by both
# the logistic-regression and the random-forest evaluation loops.
kf = StratifiedKFold(n_splits=10,shuffle=True,random_state=0)

In [None]:
# Cross-validated ROC AUC of a logistic regression fitted on the embeddings Z.
# After the loop, Z_train / Z_test / y_train / y_test / logistic hold the LAST
# fold's values; the LIME cells further down rely on them.
auc_scores = []
for train_idx, test_idx in kf.split(X=np.arange(len(y)), y=y):
    Z_train, y_train = Z[train_idx], y[train_idx]
    Z_test, y_test = Z[test_idx], y[test_idx]

    logistic = LogisticRegression()
    logistic.fit(Z_train, y_train)

    # Probability of class 1 for every held-out row.
    y_pred = logistic.predict_proba(Z_test)[:, 1]
    auc_score = roc_auc_score(y_true=y_test, y_score=y_pred)
    auc_scores.append(auc_score)

In [None]:
# Mean ROC AUC over the folds of the logistic-regression loop.
np.mean(auc_scores)

In [None]:
# Matrix product of the last fold's training embeddings with the model's
# regression weights (one column per column of weights).
# NOTE(review): presumably these are the model's raw class scores for those
# nodes -- confirm against the model's forward pass.
np.dot(Z_train,weights)

## 保存

In [None]:
# Persist the embeddings, the binary labels and the regression weights for
# later analysis; the file names carry the dataset name.
np.save(f'../for_analysis/Z_{data_name}_2.npy', Z)
np.save(f'../for_analysis/y_{data_name}_2.npy', y)
np.save(f'../for_analysis/weights_{data_name}_2.npy', weights)

## LIME

In [None]:
import itertools

import lime
import lime.lime_tabular

In [None]:
# Names for the columns of Z that are handed to LIME: every combination of two
# O/I tags, an F/E tag and a 0/1 index, with the index varying fastest
# (e.g. 'O-O-F0', 'O-O-F1', 'O-I-F0', ...).
# NOTE(review): presumably O/I are edge directions and F/E the two edge
# signs -- confirm against the model's aggregator order.
feature_names = [
    '-'.join((first, second, kind + str(index)))
    for kind, first, second, index in itertools.product(['F','E'], ['O','I'], ['O','I'], range(2))
]

In [None]:
# LIME tabular explainer built on Z_train, which at this point is the training
# split of the LAST fold of the logistic-regression loop.
# Class 0 is displayed as 'benign' and class 1 as 'fraud'.
explainer = lime.lime_tabular.LimeTabularExplainer(Z_train, 
                                                   feature_names=feature_names, 
                                                   class_names=['benign','fraud'], 
                                                   discretize_continuous=True)

In [None]:
# Explain one held-out row (index i of the last fold's test split) of the last
# fitted logistic model, asking for up to 16 features and the top 2 labels.
# The trailing comment keeps the random-index alternative.
i = 0 # np.random.randint(0, Z_test.shape[0])
exp = explainer.explain_instance(Z_test[i], logistic.predict_proba, num_features=16, top_labels=2)

In [None]:
# Binary label of the row that was just explained.
y_test[i]

In [None]:
# Render the explanation of the selected row inside the notebook.
exp.show_in_notebook(show_table=True, show_all=False)

In [None]:
# Explain every row of the last fold's test split and stack the per-row
# explanations into one frame: column 0 holds the entries' first element,
# column 1 the second (LIME's rule text and weight), and 'label' the row's
# binary label.
#
# The per-row frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration. As with the old append, each row keeps its own 0..n-1 index.
exp_frames = []
for i in range(len(y_test)):
    exp = explainer.explain_instance(Z_test[i], logistic.predict_proba, num_features=16, top_labels=2)
    current_exp_df = pd.DataFrame(exp.as_list())
    current_exp_df['label'] = y_test[i]
    exp_frames.append(current_exp_df)

# pd.concat refuses an empty list; fall back to the empty frame the original
# code produced when there was nothing to explain.
exp_df = pd.concat(exp_frames) if exp_frames else pd.DataFrame()

In [None]:
# Pull the feature name (shape 'X-X-Xd', as generated for feature_names) out of
# each rule string in column 0.
# The pattern is now a raw string: '\d' inside a normal string literal is an
# invalid escape sequence that Python warns about and will eventually reject.
# The regular expression itself is unchanged.
exp_df['relation'] = exp_df[0].str.extract(r'.*(.-.-.\d).*')

In [None]:
# Mean of column 1 (presumably the LIME weight -- confirm) per (relation, label),
# pivoted to one column per label value and with rows ordered as feature_names.
agg = exp_df.groupby(['relation','label'])[1].mean().unstack(1).loc[feature_names]

In [None]:
# Mean LIME weight per feature, one panel per true label of the explained rows.
# Titles are added because the two panels were otherwise indistinguishable;
# the label names follow the class_names given to the explainer.
fig,ax = plt.subplots(1,2,figsize=[20,5])
ax[0].barh(agg.index,agg[0])
ax[0].set_title('Mean LIME weight - rows with label 0 (benign)')
ax[1].barh(agg.index,agg[1])
ax[1].set_title('Mean LIME weight - rows with label 1 (fraud)')
# Ends the cell on a call returning None so no object repr is echoed.
fig.tight_layout()

In [None]:
# The model's regression weights per feature, one panel per weight column,
# drawn against the same feature order as the LIME aggregate for comparison.
# Titles are added because the two panels were otherwise indistinguishable.
fig,ax = plt.subplots(1,2,figsize=[20,5])
ax[0].barh(agg.index,weights[:,0])
ax[0].set_title('Regression weights - column 0')
ax[1].barh(agg.index,weights[:,1])
ax[1].set_title('Regression weights - column 1')
# Ends the cell on a call returning None so no object repr is echoed.
fig.tight_layout()

## Random Forestで特徴量の重要度をみる

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Cross-validated ROC AUC of a random forest fitted on the embeddings Z, keeping
# each fold's feature importances.
# This loop overwrites auc_scores and the Z_train / Z_test / y_train / y_test
# variables left behind by the logistic-regression loop.
auc_scores = []
feature_importances = []
for train_idx, test_idx in kf.split(X=np.arange(len(y)),y=y):
    Z_train, Z_test, y_train, y_test = Z[train_idx], Z[test_idx], y[train_idx], y[test_idx]
    # Seeded (same seed as the fold splitter) so the AUCs and importances are
    # reproducible across runs; an unseeded forest gives different numbers
    # every time the notebook is executed.
    rf = RandomForestClassifier(n_estimators=500, random_state=0)
    rf.fit(Z_train,y_train)
    # Probability of class 1 for every held-out row.
    y_pred = rf.predict_proba(Z_test)[:,1]
    auc_score = roc_auc_score(y_true=y_test,y_score=y_pred)
    auc_scores.append(auc_score)
    feature_importances.append(rf.feature_importances_)

In [None]:
# Mean ROC AUC over the folds of the random-forest loop.
np.mean(auc_scores)

In [None]:
# Element-wise mean of the per-fold importance vectors. The divisor is the
# number of folds actually collected rather than a hard-coded 10, so the
# average stays correct if the splitter's n_splits is changed.
averaged_importance = sum(feature_importances) / len(feature_importances)

In [None]:
# One row per embedding dimension: column 1 of the model's regression weights,
# its absolute value, a group id and the averaged random-forest importance.
weights_df = pd.DataFrame(weights[:,1],columns=['weight'])

# Consecutive dimensions are split into 8 equally sized groups. The group size
# is derived from the number of dimensions instead of the previous hard-coded
# 16, which only worked for 128-dimensional embeddings and raised a length
# mismatch for any other size (e.g. the 16 dimensions named in feature_names).
# NOTE(review): presumably the 8 groups are the 8 prefix combinations used in
# feature_names -- confirm.
n_groups = 8
group_size, leftover = divmod(len(weights_df), n_groups)
if leftover:
    raise ValueError(
        f'{len(weights_df)} embedding dimensions cannot be split into {n_groups} equal groups'
    )
weights_df['group'] = np.array([[i]*group_size for i in range(n_groups)]).flatten()

weights_df['weight_abs'] = weights_df.weight.abs()
weights_df['rf_importance'] = averaged_importance

In [None]:
# Mean random-forest importance within each group of dimensions.
weights_df.groupby('group')['rf_importance'].mean()

In [None]:
# Largest random-forest importance within each group of dimensions.
weights_df.groupby('group')['rf_importance'].max()

In [None]:
# Largest absolute regression weight within each group of dimensions.
weights_df.groupby('group')['weight_abs'].max()

In [None]:
# One histogram of random-forest importances per group, stacked vertically
# (panel g shows the rows whose group id is g).
fig, ax = plt.subplots(8, 1, figsize=[10, 20])
for g, panel in enumerate(ax):
    in_group = weights_df.group == g
    panel.hist(weights_df.loc[in_group, 'rf_importance'], bins=30)