In [150]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
# !pip install hyperopt
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials      # для оптимизации гиперпараметров
from functools import partial 

In [151]:
df = pd.read_csv('Bank_Personal_Loan_Modelling_train.csv')
X_test = pd.read_csv('Bank_Personal_Loan_Modelling_reserved.csv')

In [152]:
df

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,409,60,36,89,91745,2,2.8,1,0,0,0,0,1,0
1,3738,44,19,30,91423,1,0.5,3,0,0,0,0,1,0
2,3890,26,0,19,93014,1,0.1,2,121,0,0,0,1,0
3,569,34,9,41,92101,2,0.1,1,161,0,0,0,1,1
4,1042,56,32,51,92780,4,1.5,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,4588,37,11,59,94720,4,0.2,3,0,0,0,0,0,0
3996,2342,36,10,91,92028,1,1.5,3,289,0,0,0,1,0
3997,1586,57,31,131,90502,2,2.7,1,0,0,0,0,0,0
3998,3153,40,15,83,90275,1,1.0,3,0,0,0,0,0,0


In [155]:
X_test.drop(columns=['ID', 'ZIP Code'], inplace= True)
df.drop(columns=['ID', 'ZIP Code'], inplace= True)

In [157]:
X_train, y_train = df.drop(columns=['Personal Loan']), df['Personal Loan']

In [158]:
# функции для подбора гиперпараметров
def objective(params, pipeline,  X_train, y_train):
    """
    Кросс-валидация с текущими гиперпараметрами
    :params: гиперпараметры
    :pipeline: модель
    :X_train: матрица признаков
    :y_train: вектор меток объектов
    :return: значение Loss-функции
    """ 

    scores=[]  
    for i in range(10):
        pipeline.set_params(**params)    # задаём модели требуемые параметры
        X_tr, X_te, y_tr, y_te = train_test_split(X_train, y_train, test_size=0.2, 
                                                            random_state=i, stratify=y_train)
        pipeline.fit(X_tr, y_tr)
        score = f1_score(y_te, pipeline.predict(X_te))
        scores.append(score)
    mean_score, std_score = np.mean(scores), np.std(scores)    

    # возвращаем результаты, которые записываются в Trials()
    return   {'loss': -mean_score, 'score': mean_score,'std': std_score, 'params': params, 'status': STATUS_OK}


def get_hp_results(pipeline, pipeline_space, max_evals, obj_func=objective, X=X_train, y=y_train):
    '''
    Подбор оптимальных гиперпараметров с помощью hyperopt, вывод лучших 5 результатов и оптимальной комбинации гиперпараметров
    pipeline - пайплайн модели, 
    pipeline_space - пространство гиперпараметров, 
    max_evals - число итераций подбора оптимальных гиперпараметров
    return: оптимальные гиперпараметры, история подбора гиперпараметров
    '''
    trials = Trials() # сюда будет сохраняться история поиска
    best = fmin( 
                fn=partial(obj_func, pipeline=pipeline, X_train=X, y_train=y), # функция для оптимизации 
                space=pipeline_space,   # пространство поиска гиперпараметров
                algo=tpe.suggest, # алгоритм поиска (байесовская оптимизация)
                max_evals=max_evals,     # число итераций
                trials=trials,    # куда сохранять историю поиска
                rstate=np.random.default_rng(0), # random state
                show_progressbar=True # progressbar
            )
    
    # результаты в виде датафрейма
    results = pd.DataFrame([{**x, **x['params']} for x in  trials.results])
    results.drop(labels=['status', 'loss'], axis=1, inplace=True)
    results.sort_values(by=['score'], ascending=False, inplace=True)
    display(results.head())
    
    #оптимальные значения гиперпараметров
    best_params = results.iloc[0]['params'] 
#     best_params = {key.replace('model__', ''): value for key, value in best_params.items()}
    
    return  best_params, results

In [159]:
#пространство параметров модели
param_space = {
    'max_features': hp.choice('max_features', options=['auto', 'sqrt', 'log2', None]), 
    'min_samples_leaf': hp.choice('min_samples_leaf', options =range(1,25)), 
    'max_depth': hp.choice('max_depth', options =range(3,30)), 
    'criterion': hp.choice('criterion', options=['entropy', 'gini']),
    'class_weight': hp.choice('class_weight', options=[None, 'balanced']), # None, 
}

In [160]:
model = DecisionTreeClassifier(random_state=0)
best_params, history = get_hp_results(model, param_space, max_evals=300)

100%|█████████████████████████████████████████████| 300/300 [00:22<00:00, 13.61trial/s, best loss: -0.9048991366340386]


Unnamed: 0,score,std,params,class_weight,criterion,max_depth,max_features,min_samples_leaf
217,0.904899,0.020506,"{'class_weight': None, 'criterion': 'entropy',...",,entropy,4,,3
157,0.904899,0.020506,"{'class_weight': None, 'criterion': 'entropy',...",,entropy,4,,3
262,0.904899,0.020506,"{'class_weight': None, 'criterion': 'entropy',...",,entropy,4,,3
261,0.904899,0.020506,"{'class_weight': None, 'criterion': 'entropy',...",,entropy,4,,3
260,0.904899,0.020506,"{'class_weight': None, 'criterion': 'entropy',...",,entropy,4,,3


In [161]:
best_params

{'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 4,
 'max_features': None,
 'min_samples_leaf': 3}

In [162]:
best_model = DecisionTreeClassifier(random_state=0, **best_params)

In [163]:
best_model.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_leaf=3,
                       random_state=0)

In [164]:
best_model.predict(X_test)

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,