In [None]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*")
import shap

import pickle
import joblib

from sklearn.metrics import roc_auc_score

import common_f as cf

# One seed drives both the CV splitter below and the classifier built later.
random_seed = 923
skFold = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_seed)

# Every input file is located relative to the current working directory.
current_path = os.getcwd()
X_train_filePath = current_path + '/Data/X_train.csv'
y_train_filePath = current_path + '/Data/y_train.csv'

# Load the feature-importance table and order it by descending 'Importances';
# features_list then holds the 'Features' column in that order.
imp_df = pd.read_csv(current_path + '/Data/feature_imp.csv', encoding='utf-8')
imp_df = imp_df.sort_values(by='Importances', ascending=False)
features_list = imp_df['Features'].tolist()

# Training matrix and targets; the targets are flattened to a 1-D array.
X_train_val = pd.read_csv(X_train_filePath, encoding='utf-8')
y_train_val = pd.read_csv(y_train_filePath, encoding='utf-8').values.ravel()

# Keep only the ranked features, with columns arranged in ranking order.
X = X_train_val[features_list]

# Two-step pipeline: standardise the inputs, then fit a LightGBM classifier
# seeded with the shared random_seed.
lgbm_pipeline = Pipeline(steps = [('scaler', StandardScaler()),
                                  ('LGBM', LGBMClassifier(random_state = random_seed))])

# The parameter file name carries the seed. It is interpolated from
# random_seed (instead of repeating the literal) so the two cannot drift
# apart if the seed is changed.
lgbm_best_params_path = current_path + \
                        f'/Results/best_params/best_lgbm_params_{random_seed}.pkl'

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load parameter files produced by this project.
with open(lgbm_best_params_path, 'rb') as params_file:
    loaded_best_lgbm_params = pickle.load(params_file)

# best_model is the same object as lgbm_pipeline (an alias, not a copy);
# the loaded parameters are applied to it in place.
# Presumably the keys are pipeline-style names such as 'LGBM__<param>' -- TODO confirm.
best_model = lgbm_pipeline
best_params = loaded_best_lgbm_params
best_model.set_params(**best_params)

# Incremental feature evaluation.
#
# Features are added one at a time in ranking order. For every prefix of
# features_list the pipeline is evaluated with the stratified K-fold splitter
# and the per-fold ROC AUC is recorded. When the prefix reaches
# n_final_features, the fitted pipeline of the best-scoring fold is explained
# with SHAP, its importances are written to disk, and the model is saved.
import copy  # script-local import: used to snapshot fitted pipelines per fold

# Size of the feature subset whose best-fold model is explained and saved.
n_final_features = 10

tmp_f, pipList, train_list, AUC_cv_lst= [], [], [], []

# Targets as a Series so fold rows can be selected positionally with .iloc.
# Built once: it does not depend on the feature subset or the fold.
y = pd.Series(y_train_val)

for f in features_list:
    tmp_f.append(f)
    part_X = X[tmp_f]
    AUC_cv = []
    is_final_subset = len(tmp_f) == n_final_features

    for train_idx, test_idx in skFold.split(part_X, y_train_val):
        X_train, X_test = part_X.iloc[train_idx, :], part_X.iloc[test_idx, :]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        best_model.fit(X_train, y_train)
        y_pred_prob = best_model.predict_proba(X_test)[:, 1]
        AUC_cv.append(roc_auc_score(y_test, y_pred_prob))
        if is_final_subset:
            # best_model is one object that is refit in place on every fold.
            # Appending it directly would make every list entry refer to the
            # most recently fitted state, so the "best fold" lookup below
            # would always return the last fit. Store an independent
            # snapshot of the fitted pipeline instead.
            pipList.append(copy.deepcopy(best_model))
            train_list.append(X_train)
            print(X_test, X_test.dtypes)            

    if is_final_subset:
        # Pick the fold with the highest AUC (first one on ties) together
        # with the training split it was fitted on.
        max_auc = max(AUC_cv)
        max_index = AUC_cv.index(max_auc)
        final_model_pip = pipList[max_index]
        selected_train = train_list[max_index]

        # SHAP values of the pipeline's predict output on that training split.
        explainer = shap.Explainer(final_model_pip.predict, selected_train)
        shap_values = explainer(selected_train)
        shap_name = 'final_features_shap'

        # Per-feature importance = mean absolute SHAP value over the rows.
        feature_names = selected_train.columns.tolist()
        feature_imp = np.abs(shap_values.values).mean(axis = 0)
        total_imp = dict(zip(feature_names, feature_imp.tolist()))
        imp_df = pd.DataFrame({'Features': list(total_imp.keys()),
                          'imp_cv': list(total_imp.values())})

        imp_df.sort_values(by = 'imp_cv', ascending = False, inplace = True)
        cf.plot_importances(imp_df['imp_cv'], imp_df['Features'], 'best model')

        # Rename to the column names used by the input importance table
        # before writing the result out.
        imp_df.columns = ['Features', 'Importances']
        imp_df.to_csv(current_path + '/Data/final_feature_imp.csv')

        joblib.dump(final_model_pip, current_path + '/Results/best_model.pkl')

    # One summary row per feature prefix: mean, std, then each fold's AUC.
    tmp_out = np.array([np.mean(AUC_cv), np.std(AUC_cv)] + AUC_cv)
    AUC_cv_lst.append(np.round(tmp_out, 3))

# Reload the saved model as a sanity check and show it.
model = joblib.load(current_path + '/Results/best_model.pkl')
print(model)

# One row per feature prefix: the feature added at that step, followed by
# the rounded AUC summary collected for it (mean, std, per-fold values).
AUC_df = pd.concat((pd.DataFrame({'Features':tmp_f}), 
                    pd.DataFrame(AUC_cv_lst)), axis = 1)

# Name the per-fold columns from the actual width of the table rather than
# assuming five folds, so the assignment keeps working if the splitter's
# n_splits is changed. Three columns are fixed: Features, mean and std.
n_fold_columns = AUC_df.shape[1] - 3
AUC_df.columns = (['Features', 'AUC_mean', 'AUC_std']
                  + [f'AUC{i}' for i in range(n_fold_columns)])
AUC_df.to_csv(current_path + '/Data/features_auc.csv')