# Figure 3CDE
Global SHAP analysis

In [None]:
import numpy as np
import pandas as pd
import os,re,glob
import optuna
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import logging

In [None]:
# Global matplotlib settings for every figure saved below:
# fonttype 42 embeds TrueType fonts in PDF output, and Arial is the text font.
plt.rcParams.update({
    'pdf.fonttype': 42,
    'font.family': 'Arial',
})

In [None]:
from optuna.integration import lightgbm as lgb

# Import data

In [None]:
# Directory of the processed inputs read below: y.csv, X_scaled.csv and indices_5folds.pkl.
ipt_DIR = "../../../1_data_processing/processed_data/"

In [None]:
# Load the target table (y) and the scaled feature matrix (X).
# Both are indexed by the "group.cmp" column so rows can be matched by label.
y, X = (
    pd.read_csv(os.path.join(ipt_DIR, csv_name), index_col="group.cmp")
    for csv_name in ("y.csv", "X_scaled.csv")
)

# Import models

In [None]:
# Output directories of the models explained in this notebook.
# Judging by the paths: Lasso for the male and both-sex models, LightGBM for the
# female model. Each directory is expected to contain an "optuna/outer_<i>/"
# sub-folder per outer fold plus a feature table (read further below).
ipt_DIR_male = "../../../2_model_construction/lasso/out/MAPE/0.99/Lasso/male/tsfresh/"
ipt_DIR_female = "../../../2_model_construction/lightGBM/out/MAPE/0.99/LGBM/female/both/"
ipt_DIR_both = "../../../2_model_construction/lasso/out/MAPE/0.99/Lasso/both/tsfresh/"

In [None]:
# Collect the fitted male-model estimators of all 5 outer folds into one flat list.
# Each bestmodel.pkl is indexed with 'estimator', which yields the estimators
# saved for that outer fold (presumably a cross-validation result - confirm).
models_male = []
for outer_idx in range(5):
    model_path = f"{ipt_DIR_male}optuna/outer_{outer_idx}/bestmodel.pkl"
    print(model_path)
    cv_result = joblib.load(model_path)
    models_male.extend(cv_result['estimator'])

In [None]:
# Load the 5 x 5 saved LightGBM boosters (outer fold x model index) for the
# female model into a single CVBooster container, outer fold by outer fold.
models_female = lgb.CVBooster()
for outer_idx in range(5):
    for inner_idx in range(5):
        booster_path = f"{ipt_DIR_female}optuna/outer_{outer_idx}/model_{inner_idx}.txt"
        models_female.boosters.append(lgb.Booster(model_file=booster_path))

In [None]:
# Collect the fitted both-sex estimators of all 5 outer folds into one flat list.
# Each bestmodel.pkl is indexed with 'estimator', which yields the estimators
# saved for that outer fold (presumably a cross-validation result - confirm).
models_both = []
for outer_idx in range(5):
    model_path = f"{ipt_DIR_both}optuna/outer_{outer_idx}/bestmodel.pkl"
    print(model_path)
    cv_result = joblib.load(model_path)
    models_both.extend(cv_result['estimator'])

# Import features used

In [None]:
# The features used by each model are the column names of its saved
# coefficient / importance table; print how many there are per model.
coef_male = pd.read_csv(f"{ipt_DIR_male}feature_coefs.csv", index_col=0)
features_male = coef_male.columns.tolist()
print(len(features_male))

coef_female = pd.read_csv(f"{ipt_DIR_female}feature_importance.csv", index_col=0)
features_female = coef_female.columns.tolist()
print(len(features_female))

coef_both = pd.read_csv(f"{ipt_DIR_both}feature_coefs.csv", index_col=0)
features_both = coef_both.columns.tolist()
print(len(features_both))


In [None]:
# Restrict the scaled feature matrix to the feature list of each model.
X_male = X[features_male]
X_female = X[features_female]
# Fix: the both-sex subset previously reused the female feature list.
X_both = X[features_both]

# Import folds used

In [None]:
# Load the saved outer folds; entry k is indexed below as
# folds_out[k][0] (training row labels) and folds_out[k][1] (test row labels).
folds_out = joblib.load(f"{ipt_DIR}indices_5folds.pkl")

# Calculate SHAP values
## Calculate SHAP values for the best male model

In [None]:
# Out-of-fold SHAP values for the male model.
# For every outer fold, the fold's estimators are explained with KernelExplainer
# (background = male training rows), the SHAP values of the fold's male test rows
# are averaged over the estimators, and the y columns are merged in front.
is_male = y["SEX.男1.女0"] == 1  # loop-invariant sex mask
fold_frames = []  # one frame per fold, concatenated once after the loop

for k in range(len(folds_out)):
    # select test, train, model sets in a fold (male rows only)
    train_rows_index = y.loc[folds_out[k][0],].loc[is_male].index
    test_rows_index = y.loc[folds_out[k][1],].loc[is_male].index

    x_train_tmp = X.loc[train_rows_index, features_male]
    x_test_tmp = X.loc[test_rows_index, features_male]

    print(x_test_tmp.shape)

    # calculate shap
    # Assumes models_male holds exactly 5 consecutive estimators per outer fold.
    model_tmp = models_male[k*5:(k+1)*5]
    shap_values = []
    for j in range(5):
        explainer = shap.KernelExplainer(model_tmp[j].predict, x_train_tmp)
        shap_values.append(explainer.shap_values(x_test_tmp))

    # Average the per-estimator SHAP matrices element-wise.
    shap_values_ave = np.mean(shap_values, axis=0)

    tmp_df = pd.DataFrame(shap_values_ave, index=test_rows_index, columns=features_male)
    tmp_df = pd.merge(y.loc[test_rows_index], tmp_df, left_index=True, right_index=True)
    fold_frames.append(tmp_df)

# A single concat avoids re-copying the accumulated frame on every iteration
# and avoids concatenating onto an empty DataFrame.
df_male = pd.concat(fold_frames, axis=0) if fold_frames else pd.DataFrame()

In [None]:
# Bar summary plot (top 10 features) of the male-model SHAP values.
# The first y.shape[1] columns of df_male are the merged-in y columns; the
# remaining columns are the SHAP values.
male_mask = y["SEX.男1.女0"] == 1
male_shap_matrix = np.array(df_male.iloc[:, y.shape[1]:])
male_feature_values = X.loc[male_mask, features_male]
shap.summary_plot(
    male_shap_matrix,
    male_feature_values,
    plot_type="bar",
    color="gray",
    max_display=10,
    show=False,
)

# show=False leaves the figure open, so it can be resized here if needed
# (e.g. plt.gcf().set_size_inches(...) followed by plt.tight_layout()).
plt.savefig("../figure/shap_bar_male.pdf", bbox_inches='tight')

In [None]:
# Dot summary plot (top 10 features) of the male-model SHAP values.
# The first y.shape[1] columns of df_male are the merged-in y columns.
male_shap = df_male.iloc[:, y.shape[1]:]
# Fix: select feature rows by df_male's own index. df_male is ordered fold by
# fold, which need not match the row order of X, and the plot colours each
# SHAP value by the feature value found in the same row position.
shap.summary_plot(np.array(male_shap), X.loc[male_shap.index, features_male],
                  plot_type="dot", max_display=10,
                  show=False)

# show=False leaves the figure open, so it can be resized here if needed
# (e.g. plt.gcf().set_size_inches(...) followed by plt.tight_layout()).
plt.savefig("../figure/shap_dot_male.pdf", bbox_inches='tight')

In [None]:
# Violin summary plot (top 10 features) of the male-model SHAP values.
# The first y.shape[1] columns of df_male are the merged-in y columns.
male_shap = df_male.iloc[:, y.shape[1]:]
# Fix: select feature rows by df_male's own index. df_male is ordered fold by
# fold, which need not match the row order of X, so positional pairing of
# SHAP rows and feature rows could otherwise be wrong.
shap.summary_plot(np.array(male_shap), X.loc[male_shap.index, features_male],
                  plot_type="violin", max_display=10,
                  show=False)

# show=False leaves the figure open, so it can be resized here if needed
# (e.g. plt.gcf().set_size_inches(...) followed by plt.tight_layout()).
plt.savefig("../figure/shap_violin_male.pdf", bbox_inches='tight')

In [None]:
# Persist the male-model table (y columns followed by per-feature SHAP values).
shap_male_csv = "../out/shap_male.csv"
df_male.to_csv(shap_male_csv)

## Calculate SHAP values for the best female model

In [None]:
# Out-of-fold SHAP values for the female model.
# For every outer fold, the fold's boosters are explained with KernelExplainer
# (background = female training rows), the SHAP values of the fold's female test
# rows are averaged over the boosters, and the y columns are merged in front.
is_female = y["SEX.男1.女0"] == 0  # loop-invariant sex mask
fold_frames = []  # one frame per fold, concatenated once after the loop

for k in range(len(folds_out)):
    # select test, train, model sets in a fold (female rows only)
    train_rows_index = y.loc[folds_out[k][0],].loc[is_female].index
    test_rows_index = y.loc[folds_out[k][1],].loc[is_female].index

    x_train_tmp = X.loc[train_rows_index, features_female]
    x_test_tmp = X.loc[test_rows_index, features_female]

    print(x_test_tmp.shape)

    # calculate shap
    # Assumes models_female.boosters holds exactly 5 consecutive boosters per outer fold.
    model_tmp = models_female.boosters[k*5:(k+1)*5]
    shap_values = []
    for j in range(5):
        explainer = shap.KernelExplainer(model_tmp[j].predict, x_train_tmp)
        shap_values.append(explainer.shap_values(x_test_tmp))

    # Average the per-booster SHAP matrices element-wise.
    shap_values_ave = np.mean(shap_values, axis=0)

    tmp_df = pd.DataFrame(shap_values_ave, index=test_rows_index, columns=features_female)
    tmp_df = pd.merge(y.loc[test_rows_index], tmp_df, left_index=True, right_index=True)
    fold_frames.append(tmp_df)

# A single concat avoids re-copying the accumulated frame on every iteration
# and avoids concatenating onto an empty DataFrame.
df_female = pd.concat(fold_frames, axis=0) if fold_frames else pd.DataFrame()

In [None]:
# Bar summary plot (top 10 features) of the female-model SHAP values.
# The first y.shape[1] columns of df_female are the merged-in y columns; the
# remaining columns are the SHAP values.
# Fix: the column slice previously read `iloc[:,,y.shape[1]:]` (stray comma),
# which is a syntax error.
shap.summary_plot(np.array(df_female.iloc[:, y.shape[1]:]), X.loc[y["SEX.男1.女0"]==0, features_female],
                  plot_type="bar", color="gray", max_display=10,
                  show=False)
# show=False leaves the figure open, so it can be resized here if needed
# (e.g. plt.gcf().set_size_inches(...) followed by plt.tight_layout()).
plt.savefig("../figure/shap_bar_female.pdf", bbox_inches='tight')

In [None]:
# Dot summary plot (top 10 features) of the female-model SHAP values.
# The first y.shape[1] columns of df_female are the merged-in y columns.
female_shap = df_female.iloc[:, y.shape[1]:]
# Fix: select feature rows by df_female's own index. df_female is ordered fold
# by fold, which need not match the row order of X, and the plot colours each
# SHAP value by the feature value found in the same row position.
shap.summary_plot(np.array(female_shap), X.loc[female_shap.index, features_female],
                  plot_type="dot", max_display=10,
                  show=False)
# show=False leaves the figure open, so it can be resized here if needed
# (e.g. plt.gcf().set_size_inches(...) followed by plt.tight_layout()).
plt.savefig("../figure/shap_dot_female.pdf", bbox_inches='tight')

In [None]:
# Violin summary plot (top 10 features) of the female-model SHAP values.
# The first y.shape[1] columns of df_female are the merged-in y columns.
female_shap = df_female.iloc[:, y.shape[1]:]
# Fix: select feature rows by df_female's own index. df_female is ordered fold
# by fold, which need not match the row order of X, so positional pairing of
# SHAP rows and feature rows could otherwise be wrong.
shap.summary_plot(np.array(female_shap), X.loc[female_shap.index, features_female],
                  plot_type="violin", max_display=10,
                  show=False)
# show=False leaves the figure open, so it can be resized here if needed
# (e.g. plt.gcf().set_size_inches(...) followed by plt.tight_layout()).
plt.savefig("../figure/shap_violin_female.pdf", bbox_inches='tight')

In [None]:
# Persist the female-model table (y columns followed by per-feature SHAP values).
shap_female_csv = "../out/shap_female.csv"
df_female.to_csv(shap_female_csv)

## Calculate SHAP values for the best both-sex model

In [None]:
# Out-of-fold SHAP values for the both-sex model.
# For every outer fold, the fold's estimators are explained with KernelExplainer
# (background = all training rows), the SHAP values of the fold's test rows are
# averaged over the estimators, and the y columns are merged in front.
fold_frames = []  # one frame per fold, concatenated once after the loop

for k in range(len(folds_out)):
    # select test, train, model sets in a fold (no sex filter)
    train_rows_index = y.loc[folds_out[k][0],].index
    test_rows_index = y.loc[folds_out[k][1],].index

    x_train_tmp = X.loc[train_rows_index, features_both]
    x_test_tmp = X.loc[test_rows_index, features_both]

    print(x_test_tmp.shape)

    # calculate shap
    # Assumes models_both holds exactly 5 consecutive estimators per outer fold.
    model_tmp = models_both[k*5:(k+1)*5]
    shap_values = []
    for j in range(5):
        explainer = shap.KernelExplainer(model_tmp[j].predict, x_train_tmp)
        shap_values.append(explainer.shap_values(x_test_tmp))

    # Average the per-estimator SHAP matrices element-wise.
    shap_values_ave = np.mean(shap_values, axis=0)

    tmp_df = pd.DataFrame(shap_values_ave, index=test_rows_index, columns=features_both)
    tmp_df = pd.merge(y.loc[test_rows_index], tmp_df, left_index=True, right_index=True)
    fold_frames.append(tmp_df)

# A single concat avoids re-copying the accumulated frame on every iteration
# and avoids concatenating onto an empty DataFrame.
df_both = pd.concat(fold_frames, axis=0) if fold_frames else pd.DataFrame()


In [None]:
# Bar summary plot (top 10 features) of the both-sex model SHAP values.
# The first y.shape[1] columns of df_both are the merged-in y columns; the
# remaining columns are the SHAP values.
both_shap_matrix = np.array(df_both.iloc[:, y.shape[1]:])
both_feature_values = X.loc[:, features_both]
shap.summary_plot(
    both_shap_matrix,
    both_feature_values,
    plot_type="bar",
    color="gray",
    max_display=10,
    show=False,
)
# show=False leaves the figure open, so it can be resized here if needed
# (e.g. plt.gcf().set_size_inches(...) followed by plt.tight_layout()).
plt.savefig("../figure/shap_bar_both.pdf", bbox_inches='tight')

In [None]:
# Dot summary plot (top 10 features) of the both-sex model SHAP values.
# The first y.shape[1] columns of df_both are the merged-in y columns.
both_shap = df_both.iloc[:, y.shape[1]:]
# Fix: select feature rows by df_both's own index. df_both is ordered fold by
# fold, which need not match the row order of X, and the plot colours each
# SHAP value by the feature value found in the same row position.
shap.summary_plot(np.array(both_shap), X.loc[both_shap.index, features_both],
                  plot_type="dot", max_display=10,
                  show=False)
# show=False leaves the figure open, so it can be resized here if needed
# (e.g. plt.gcf().set_size_inches(...) followed by plt.tight_layout()).
plt.savefig("../figure/shap_dot_both.pdf", bbox_inches='tight')

In [None]:
# Violin summary plot (top 10 features) of the both-sex model SHAP values.
# The first y.shape[1] columns of df_both are the merged-in y columns.
both_shap = df_both.iloc[:, y.shape[1]:]
# Fix: select feature rows by df_both's own index. df_both is ordered fold by
# fold, which need not match the row order of X, so positional pairing of
# SHAP rows and feature rows could otherwise be wrong.
shap.summary_plot(np.array(both_shap), X.loc[both_shap.index, features_both],
                  plot_type="violin", max_display=10,
                  show=False)
# show=False leaves the figure open, so it can be resized here if needed
# (e.g. plt.gcf().set_size_inches(...) followed by plt.tight_layout()).
plt.savefig("../figure/shap_violin_both.pdf", bbox_inches='tight')

In [None]:
# Persist the both-sex model table (y columns followed by per-feature SHAP values).
shap_both_csv = "../out/shap_both.csv"
df_both.to_csv(shap_both_csv)