In [4]:
from __future__ import division
import os
import shap
from tqdm import tqdm
import os.path
from scipy import linalg
import scipy.sparse as sparse
from scipy.sparse import csr_matrix
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from xgboost import XGBClassifier  
from sklearn.model_selection import StratifiedKFold 
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import train_test_split 
import warnings
warnings.filterwarnings('ignore')

In [5]:
def performance(X, y, Is_sort_feature=0):
    """Fit an XGBoost classifier on one stratified 50/50 split and score it.

    Args:
        X: Feature data of the samples.
        y: Sample labels; also used to stratify the train/test split.
        Is_sort_feature: When equal to 1, per-feature importance scores are
            derived from SHAP values on the test half (only needed when
            several features are fused). Any other value skips that step.

    Returns:
        A 5-tuple ``(feature_importances, auc, precision, recall, f_score)``.
        ``feature_importances`` is the mean absolute SHAP value per feature,
        or ``None`` when it was not requested. Precision, recall and F-score
        are the entries at index 1 of the per-class results.
    """
    # Stratify on y so both halves keep the label balance.
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=0.5, random_state=42, stratify=y
    )

    # Hyper-parameters come from the module-level `best_params` dict.
    model = XGBClassifier(**best_params)
    model.fit(train_X, train_y)

    # Predictions on the held-out half: discrete labels and class probabilities.
    hard_labels = model.predict(test_X)
    class_proba = model.predict_proba(test_X)

    # AUC is computed from the probabilities in column 1, not the hard labels.
    auc_measure = roc_auc_score(test_y, class_proba[:, 1])
    per_class_scores = precision_recall_fscore_support(
        test_y, hard_labels, average=None
    )
    precision_by_class, recall_by_class, fscore_by_class = per_class_scores[:3]

    feature_importances = None
    if Is_sort_feature == 1:
        shap_matrix = shap.TreeExplainer(model).shap_values(test_X)
        # Importance of a feature = mean |SHAP value| over the test samples.
        feature_importances = np.mean(np.abs(shap_matrix), axis=0)

    return (
        feature_importances,
        auc_measure,
        precision_by_class[1],
        recall_by_class[1],
        fscore_by_class[1],
    )

# performance() above computes the evaluation metrics and SHAP values for a single run (one train/test split).

In [None]:
# Hyper-parameters for XGBClassifier; performance() reads this module-level name.
# (A `global` statement is a no-op at module level, so none is needed here.)
best_params = {'eval_metric':'auc'}
node_features = ['N1', 'N2', 'N3', 'N4', 'N5', 'N6', 'N7', 'N8', 'N9', 'N10', 'N11', 'N12', 'N13', 'N14', 'N15']
edge_features = ['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'M10', 'M11', 'M12']
fused_features = node_features + edge_features
metric_names = ('auc', 'precision', 'recall', 'f-score')
A = defaultdict(list)  # performance store: one key per (network, metric), one entry per evaluated feature set
Features_importance_score = defaultdict(list)  # per network: (feature, score) pairs ranked by SHAP importance
n = 550  # number of networks


def _record_scores(net_idx, scores):
    """Append one (auc, precision, recall, f-score) result to network `net_idx`'s lists in A."""
    for metric, value in zip(metric_names, scores):
        A[f'n_{net_idx}_{metric}'].append(value)


for i in tqdm(range(n)):
    data = pd.read_csv(f'./features/n_{i}_features.csv')
    # Every column after the first two is treated as a feature column.
    features = data.columns.tolist()[2:]
    y = data['label']  # same labels for every run on this network

    # single feature: one run per feature column
    for f in features:
        X = data[f].values.reshape(-1, 1)
        _, *scores = performance(X, y)
        _record_scores(i, scores)

    # multi-feature fusing: node features only, then edge features only
    for group in (node_features, edge_features):
        _, *scores = performance(data[group].values, y, Is_sort_feature=0)
        _record_scores(i, scores)

    # all node + edge features fused; this is the only run that needs SHAP scores
    feature_importances, *scores = performance(data[fused_features].values, y, Is_sort_feature=1)
    _record_scores(i, scores)

    # Pair each score with the column it was actually computed for. The model
    # was trained on data[fused_features], so names must come from
    # fused_features, not from the CSV column order. Sort by score, descending.
    Features_importance_score[f'n_{i}'] = sorted(
        zip(fused_features, feature_importances),
        key=lambda pair: pair[1],
        reverse=True,
    )


df1 = pd.DataFrame(A)
auc_col = [f'n_{x}_auc' for x in range(n)]
precision_col = [f'n_{x}_precision' for x in range(n)]
recall_col = [f'n_{x}_recall' for x in range(n)]
fscore_col = [f'n_{x}_f-score' for x in range(n)]

# mean over networks
df1['auc_avg'] = df1[auc_col].mean(axis=1)
df1['precision_avg'] = df1[precision_col].mean(axis=1)
df1['recall_avg'] = df1[recall_col].mean(axis=1)
df1['f-score_avg'] = df1[fscore_col].mean(axis=1)

# standard deviation over networks
df1['auc_std'] = df1[auc_col].std(axis=1)
df1['precision_std'] = df1[precision_col].std(axis=1)
df1['recall_std'] = df1[recall_col].std(axis=1)
df1['f-score_std'] = df1[fscore_col].std(axis=1)
# Rows: the single features (as listed in the last file read), followed by the
# node-only, edge-only and all-features runs.
df1.index = features + ['ND','ED','ALL']


df2 = pd.DataFrame(Features_importance_score)
# Each column is sorted by score independently, so a row is a rank position,
# not a fixed feature; label the rows accordingly.
df2.index = [f'Top{rank}' for rank in range(1, len(fused_features) + 1)]


# Make sure the output directory exists before writing.
os.makedirs('./results', exist_ok=True)
df1.to_csv('./results/performance1.csv')
df2.to_csv('./results/shap_values1.csv')

In [None]:
# Add domain information to the SHAP-value results.
import pickle

# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load 'OLP_updated.pickle' if it comes from a trusted source.
# The context manager closes the file even if loading fails.
with open('OLP_updated.pickle', 'rb') as infile:
    df = pickle.load(infile)

# Not used below; kept at module level in case later cells rely on them.
df_edgelists = df['edges_id']
all_name = list(df['network_name'])

# One "<networkDomain>/<subDomain>" label per network, in row order.
# Iterating the columns is positional, so it does not depend on df's index labels.
domain_list = [
    domain + '/' + sub_domain
    for domain, sub_domain in zip(df['networkDomain'], df['subDomain'])
]

shap_values = pd.read_csv('./results/shap_values1.csv', index_col=0)
n_ranks = len(shap_values)  # number of rank rows before the domain row is appended

# The domain row is assigned column by column, so counts must match.
if len(domain_list) != shap_values.shape[1]:
    raise ValueError(
        f'{len(domain_list)} networks in OLP_updated.pickle but '
        f'{shap_values.shape[1]} columns in shap_values1.csv'
    )

shap_values.loc['domain'] = domain_list
# Rows are rank positions (Top1 = highest score), followed by the domain row.
indexss = ['Top' + str(rank) for rank in range(1, n_ranks + 1)]
indexss.append('domain')
shap_values.index = indexss
shap_values.to_csv('./results/shap_values1_domain.csv')