In [25]:
from xgboost import XGBClassifier
from random import random
from imblearn.pipeline import make_pipeline,Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import chi2, SelectKBest, f_classif, mutual_info_classif
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN
from loguru import logger
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from utils import readJSON, preprocess
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
from tqdm import tqdm
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import shap
import collections
from collections import Counter
from datetime import datetime
from sklearn.svm import LinearSVC
from pprint import pprint
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from imblearn.combine import SMOTETomek
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.CRITICAL)
id2feature = {
    0: '非DPN组',
    1: 'DPN组'
}
plt.rcParams["font.sans-serif"]=["SimHei"] #设置中易黑体字体
plt.rcParams["axes.unicode_minus"]=False #该语句解决图像中的“-”负号的乱码问题
data = pd.read_csv('./output/dpn/data/red_preprocress.csv')
X = data.drop(columns='分组')
y = data['分组'].astype(int)
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X),columns=X.columns)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=64)
# X_train.reset_index(drop=True,inplace=True)
# X_test.reset_index(drop=True,inplace=True)
# y_train.reset_index(drop=True,inplace=True)
# y_test.reset_index(drop=True,inplace=True)
logger.add('./log/dpn/pipeline_{time}.log')
Accuracy = []
Precision = []
Recall = []
F1 = []
class FeatureSelector_KBest(BaseEstimator, TransformerMixin):
    def __init__(self, score_func=None, k='all'):
        self.score_func = score_func
        self.k = k

    def fit(self, X, y=None):
        if self.score_func == 'chi2':
            from sklearn.feature_selection import chi2, SelectKBest
            self.cols = X.columns[SelectKBest(chi2,self.k).fit(X, y).get_support()]
        elif self.score_func == 'mutual_info_classif':
            from sklearn.feature_selection import SelectKBest,mutual_info_classif
            self.cols = X.columns[SelectKBest(mutual_info_classif,self.k).fit(X, y).get_support()]
        elif self.score_func == 'f_classif':
            from sklearn.feature_selection import SelectKBest, f_classif
            self.cols = X.columns[SelectKBest(f_classif,self.k).fit(X, y).get_support()]
        return self

    def transform(self, X, y=None):
        return X.loc[:, self.cols]

class FeatureSelector_FromModel(BaseEstimator, TransformerMixin):
    def __init__(self, C=1.0, penalty='l2'):
        self.C = C
        self.penalty = penalty

    def fit(self, X, y=None):
        select_k = SelectFromModel(LinearSVC(C=self.C, penalty=self.penalty, dual=False, random_state=43))
        self.cols = X.columns[select_k.fit(X, y).get_support()]
        return self

    def transform(self, X, y=None):
        return X.loc[:, self.cols]

def objective(trial):
    train_x, valid_x, train_y, valid_y = train_test_split(
        X, y, test_size=0.1, random_state=64)
    print(f'Optuna start {trial.number} trial...')
    # if score_func=='chi2':
    #     Selector = FeatureSelector_KBest('chi2',k=k)
    # elif score_func=='mutual_info_classif':
    #     Selector = FeatureSelector_KBest('mutual_info_classif',k=k)
    # elif score_func=='f_classif':
    #     Selector = FeatureSelector_KBest('f_classif',k=k)
    # selector_fromModel_params = {
    #     'C':trial.suggest_float('C',1e-8,1.0,log=True),
    #     'penality':trial.suggest_categorical("penality", ['l1','l2']),
    # }
    # select_kbest_fromModel = trial.suggest_categorical("chi_embed", ['chi','embed'])
    # if select_kbest_fromModel == 'embed':
    #     Selector = FeatureSelector_FromModel(**selector_fromModel_params)
    # else:
    #     Selector = FeatureSelector_KBest(**selector_kbest_params)

    pipeline_params = {
        # 'selector__score_func':trial.suggest_categorical("score_func", ['chi2','mutual_info_classif','f_classif']),
        # 'selector__k' : trial.suggest_int('k',10,15,step=1),
        'selector__penalty':trial.suggest_categorical("penalty", ['l1','l2']),
        'selector__C':trial.suggest_float('C', 1e-8, 1e-3, log=True),

        'sampler__random_state':43,

        'clf__verbosity': 0,
        'clf__eval_metric': 'logloss',
        'clf__objective': 'binary:logistic',
        'clf__tree_method': 'exact',
        'clf__random_state':43,
        'clf__n_estimators':trial.suggest_int('n_estimators',10,500),
        'clf__max_depth': trial.suggest_int("max_depth", 10,40,step=1),
        'clf__grow_policy': trial.suggest_categorical("grow_policy", ['depthwise', 'lossguide']),
        'clf__learning_rate': trial.suggest_float("learning_rate", 1e-8, 1.0, log=True),
        'clf__gamma': trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        'clf__reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'clf__reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'clf__subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'clf__colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
    }
    model = Pipeline(steps=[
        ('selector',FeatureSelector_FromModel()),
        ('sampler',SMOTETomek(random_state=43)),
        ('clf',XGBClassifier())
    ])
    # print(model.get_params())
    model.set_params(**pipeline_params)
    print(train_x.loc[0,'性别'],train_y.loc[0])
    model.fit(train_x,train_y)
    preds = model.predict(valid_x)
    acc = accuracy_score(valid_y, preds)
    return acc
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30, timeout=600)
logger.info(study.best_value)
logger.info(study.best_params)

pipeline_params = {
    'selector__penalty':study.best_params['penalty'],
    'selector__C':study.best_params['C'],

    # 'selector__score_func':study.best_params['score_func'],
    # 'selector__k':study.best_params['k'],

    'sampler__random_state':43,

    'clf__n_estimators':study.best_params['n_estimators'],
    'clf__max_depth':study.best_params['max_depth'],
    'clf__grow_policy':study.best_params['grow_policy'],
    'clf__learning_rate':study.best_params['learning_rate'],
    'clf__gamma':study.best_params['gamma'],
    'clf__reg_lambda':study.best_params['reg_lambda'],
    'clf__reg_alpha':study.best_params['reg_alpha'],
    'clf__subsample':study.best_params['subsample'],
    'clf__colsample_bytree':study.best_params['colsample_bytree'],
    'clf__verbosity': 0,
    # 'clf__eval_metric': 'logloss',
    'clf__objective': 'binary:logistic',
    'clf__tree_method': 'exact',
    'clf__random_state':43
}
model = Pipeline(steps=[
    ('selector',FeatureSelector_FromModel()),
    ('sampler',SMOTETomek()),
    ('clf',XGBClassifier())
])
model.set_params(**pipeline_params)

selector = model['selector']
logger.info(f'测试集特征筛选前:{X.shape}')
X= selector.fit_transform(X,y)
logger.info(f'测试集特征筛选后:{X.shape}')
sampler = model['sampler']
# logger.info(f'数据平衡前:{Counter(y_test)}')
# X_test,y_test = sampler.fit_resample(X_test,y_test)
# logger.info(f'数据平衡后:{Counter(y_test)}')
clf = model['clf']

kf = StratifiedKFold(n_splits=10,shuffle=True,random_state=64)
for train_index, test_index in kf.split(X,y):
    clf.fit(X.loc[train_index], y[train_index])
    preds = clf.predict(X.loc[test_index])
    accuracy = accuracy_score(y[test_index], preds)
    Accuracy.append(accuracy)
    precision = precision_score(y[test_index], preds)
    Precision.append(precision)
    recall = recall_score(y[test_index], preds)
    Recall.append(recall)
    f1 = f1_score(y[test_index], preds)
    F1.append(f1)
    logger.warning(f'{round(np.mean(accuracy), 3)}\t{round(np.mean(precision), 3)}\t'
                   f'{round(np.mean(recall), 3)}\t{round(np.mean(f1), 3)}')
    explainer = shap.TreeExplainer(clf)
    shap_values_XGBoost_train = explainer.shap_values(X.loc[test_index])
    shap.summary_plot(shap_values_XGBoost_train, X.iloc[test_index])
logger.warning(f'accuracy\t\tmean:{round(np.mean(Accuracy), 3)}\tstd:{round(np.std(Accuracy), 3)}')
logger.warning(f'precision\t\tmean:{round(np.mean(Precision), 3)}\tstd:{round(np.std(Precision), 3)}')
logger.warning(f'recall\t\tmean:{round(np.mean(Recall), 3)}\tstd:{round(np.std(Recall), 3)}')
logger.warning(f'f1\t\tmean:{round(np.mean(F1), 3)}\tstd:{round(np.std(F1), 3)}')

Optuna start 0 trial...
-1.1621263418443806 1


ValueError: at least one array or dtype is required