In [1]:
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from tqdm.notebook import tqdm

import re
import os

from functools import partial
from scipy.stats import mode

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns
import plotly.express as px

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, FunctionTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay, cohen_kappa_score, log_loss, f1_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.isotonic import IsotonicRegression
from sklearn.calibration import CalibrationDisplay
from sklearn.inspection import PartialDependenceDisplay, permutation_importance
from sklearn.linear_model import LogisticRegression
from collections import Counter
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.manifold import TSNE
# import optuna

from View import MyClass
# Helper object from the local View module; its class docstring is printed in the next cell.
mc = MyClass()

In [2]:
# Show the class docstring of the helper to list the methods it documents.
print(mc.__doc__)

Визуализация и предобработка данных
# Первичный обзор работа с памятью
* Метод "view_csv" принимает DataFrame и выводит данные о нем, переводит название столбцов в lower
возвращает data.columns, categorical, numerical
* Метод optim_memory оптимизирует память занимаемую данными, принимает DataFrame, возвращает DataFrame

# Статистика
* Метод "stat_frame" принимает DataFrame и возвращает DataFrame со статистиками
* Метод "corr" принимает DataFrame и target : str
* Метод corr_sign принимает data: DataFrame, drop_sign:List ( корреляция между признаками)
* Метод nonlianer_comun принимает data: DataFrame, numerical: list(список числовых признаков
target: str(по умолчанию "class"), num_sign: int(кол-во столбцов с нелинейной связью)
* Метод "percentile_99_1" принимает data: DataFrame, data_stat : DataFrame( возвращает метод "stat_frame"), flag: bool(
если флаг равен True возвращает фрейм где максимум заменен на 99 перцентиль, а минимум на 1 перцентиль)
по умолчанию flag = False
возвращат данны

<a id="3"></a>
# <h1 style="background-color:lightgray;font-family:newtimeroman;font-size:350%;text-align:center;border-radius: 15px 50px;">Reading Data Files</h1> 

In [3]:
# List the files available in the local `data` directory (path is relative to the working directory).
os.listdir('data')

['sample_submission.csv',
 'test.csv',
 'test_new.csv',
 'train.csv',
 'train_new.csv']

In [4]:
# Load the train/test feature files and the sample submission in one pass,
# then report the shape of the two modelling frames.
train, test, submission = map(
    pd.read_csv,
    ('data/train_new.csv', 'data/test_new.csv', 'data/sample_submission.csv'),
)

print('The dimension of the train dataset is:', train.shape)
print('The dimension of the test dataset is:', test.shape)

The dimension of the train dataset is: (159256, 63)
The dimension of the test dataset is: (106171, 62)


In [5]:
# Preview the sample submission that was just loaded.
submission.head()

Unnamed: 0,id,smoking
0,159256,0.5
1,159257,0.5
2,159258,0.5
3,159259,0.5
4,159260,0.5


In [6]:
# Summary statistics for every column of the training frame.
train.describe()

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,fasting blood sugar,cholesterol,triglyceride,hdl,ldl,hemoglobin,urine protein,serum creatinine,ast,alt,gtp,dental caries,smoking,norm_cholesterol,norm blood sugar,norm_triglyceride,norm_hdl,norm_ldl,norm_hemoglobin,norm_ast,norm_alt,norm_gtp,height_weight,eye,hearing,hearing(left)_freq,hearing(right)_freq,urine protein_freq,serum creatinine_freq,dental caries_freq,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,le_hearing(left),le_hearing(right),le_urine protein,le_serum creatinine,le_dental caries
count,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0
mean,44.284021,165.27896,67.133515,82.979313,0.997513,0.993627,1.023974,1.023421,122.476579,76.879835,98.037619,5.063418,3.293621,1.443737,2.961048,14.805244,1.070195,0.89197,25.368953,26.301232,35.528608,0.197996,0.437365,0.964403,0.359277,0.831749,0.168653,0.245743,0.030743,0.240707,0.256399,0.084945,2.530084,1.991141,2.047395,0.953202,0.954254,0.898909,0.16002,0.682413,0.976026,0.023974,0.976579,0.023421,0.947292,0.03522,0.017488,0.018586,0.070032,0.124215,0.187133,0.224645,0.196621,0.111958,0.04808,0.018731,0.802004,0.197996,0.023974,0.023421,0.070195,3.919695,0.197996
std,11.778226,8.780626,12.394069,8.805302,0.304748,0.304063,0.152969,0.151238,12.447793,8.836436,13.071558,0.71952,1.680392,0.356101,0.658343,1.383302,0.316613,0.172688,7.868008,14.047925,25.892977,0.39849,0.496063,0.185283,0.47979,0.37409,0.374446,0.430528,0.172621,0.427514,0.436646,0.278801,0.388064,0.563109,0.268093,0.145634,0.144154,0.205126,0.0606,0.240691,0.152969,0.152969,0.151238,0.151238,0.22345,0.184336,0.13108,0.13506,0.255202,0.329828,0.390019,0.417349,0.397444,0.315316,0.213936,0.135573,0.39849,0.39849,0.152969,0.151238,0.316613,1.726875,0.39849
min,20.0,145.0,45.0,63.0,0.2,0.2,1.0,1.0,95.0,59.0,75.0,3.518,0.879,0.854,1.552,10.9,1.0,0.5,14.0,9.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5,0.4,2.0,0.023974,0.023421,0.017488,0.018586,0.197996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,40.0,160.0,60.0,77.0,0.8,0.8,1.0,1.0,114.0,70.0,90.0,4.527,1.992,1.164,2.457,13.8,1.0,0.8,20.0,16.0,18.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.266667,1.6,2.0,0.976026,0.976579,0.947292,0.111958,0.802004,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0
50%,40.0,165.0,65.0,83.0,1.0,1.0,1.0,1.0,121.0,78.0,96.0,5.07,2.975,1.397,2.949,15.0,1.0,0.9,24.0,22.0,27.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.0,2.0,0.976026,0.976579,0.947292,0.187133,0.802004,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0,0.0
75%,55.0,170.0,75.0,89.0,1.2,1.2,1.0,1.0,130.0,82.0,103.0,5.613,4.268,1.655,3.44,15.8,1.0,1.0,29.0,32.0,44.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.769231,2.4,2.0,0.976026,0.976579,0.947292,0.196621,0.802004,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5.0,0.0
max,75.0,185.0,100.0,104.0,1.5,1.5,2.0,2.0,154.0,100.0,153.0,6.725,8.51,2.431,4.501,17.6,3.0,1.3,54.0,81.0,152.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.888889,3.0,4.0,0.976026,0.976579,0.947292,0.224645,0.802004,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,8.0,1.0


In [7]:
# Column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159256 entries, 0 to 159255
Data columns (total 63 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   age                    159256 non-null  int64  
 1   height(cm)             159256 non-null  int64  
 2   weight(kg)             159256 non-null  int64  
 3   waist(cm)              159256 non-null  float64
 4   eyesight(left)         159256 non-null  float64
 5   eyesight(right)        159256 non-null  float64
 6   hearing(left)          159256 non-null  float64
 7   hearing(right)         159256 non-null  float64
 8   systolic               159256 non-null  int64  
 9   relaxation             159256 non-null  int64  
 10  fasting blood sugar    159256 non-null  int64  
 11  cholesterol            159256 non-null  float64
 12  triglyceride           159256 non-null  float64
 13  hdl                    159256 non-null  float64
 14  ldl                    159256 non-nu

In [8]:
# Summary statistics for the test frame, for comparison with the training frame.
test.describe()

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,fasting blood sugar,cholesterol,triglyceride,hdl,ldl,hemoglobin,urine protein,serum creatinine,ast,alt,gtp,dental caries,norm_cholesterol,norm blood sugar,norm_triglyceride,norm_hdl,norm_ldl,norm_hemoglobin,norm_ast,norm_alt,norm_gtp,height_weight,eye,hearing,hearing(left)_freq,hearing(right)_freq,urine protein_freq,serum creatinine_freq,dental caries_freq,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,le_hearing(left),le_hearing(right),le_urine protein,le_serum creatinine,le_dental caries
count,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0,106171.0
mean,44.401315,165.184749,67.113845,82.976691,0.996773,0.992755,1.024216,1.02398,122.442739,76.825508,98.07486,5.065461,3.288494,1.44551,2.962255,14.801892,1.069096,0.891744,25.402925,26.312703,35.429223,0.195458,0.964651,0.363395,0.829577,0.168003,0.247591,0.031817,0.242467,0.25911,0.08394,2.529439,1.989528,2.048196,0.952741,0.95319,0.899152,0.160182,0.685491,0.975784,0.024216,0.97602,0.02398,0.947406,0.036093,0.016502,0.018819,0.069501,0.125232,0.185748,0.226926,0.195374,0.111791,0.04797,0.01864,0.804542,0.195458,0.024216,0.02398,0.069096,3.917435,0.195458
std,11.826446,8.703285,12.38696,8.793837,0.305211,0.303201,0.153719,0.152988,12.46712,8.858678,13.007866,0.721091,1.686918,0.357204,0.658276,1.385492,0.311971,0.172598,7.890071,14.08724,25.56077,0.396555,0.18466,0.480979,0.376006,0.37387,0.431615,0.175512,0.428577,0.438148,0.277299,0.387902,0.562893,0.269347,0.146274,0.145651,0.204809,0.060943,0.241535,0.153719,0.153719,0.152988,0.152988,0.223223,0.186522,0.127395,0.135885,0.254306,0.330983,0.388905,0.418847,0.39649,0.315111,0.213703,0.13525,0.396555,0.396555,0.153719,0.152988,0.311971,1.725977,0.396555
min,20.0,145.0,45.0,63.0,0.2,0.2,1.0,1.0,95.0,59.0,75.0,3.518,0.854,0.854,1.578,10.9,1.0,0.5,14.0,8.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5,0.4,2.0,0.024216,0.02398,0.016502,0.01864,0.195458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,40.0,160.0,60.0,77.0,0.8,0.8,1.0,1.0,114.0,70.0,90.0,4.527,1.992,1.164,2.457,13.8,1.0,0.8,20.0,16.0,18.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.266667,1.6,2.0,0.975784,0.97602,0.947406,0.111791,0.804542,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0
50%,40.0,165.0,65.0,83.0,1.0,1.0,1.0,1.0,121.0,78.0,96.0,5.07,2.949,1.397,2.949,15.0,1.0,0.9,24.0,22.0,27.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.0,2.0,0.975784,0.97602,0.947406,0.185748,0.804542,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0,0.0
75%,55.0,170.0,75.0,89.0,1.2,1.2,1.0,1.0,130.0,82.0,103.0,5.613,4.242,1.655,3.44,15.8,1.0,1.0,29.0,33.0,44.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.769231,2.4,2.0,0.975784,0.97602,0.947406,0.195374,0.804542,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5.0,0.0
max,75.0,180.0,100.0,104.1,1.5,1.5,2.0,2.0,154.0,100.0,152.0,6.751,8.562,2.431,4.501,17.6,3.0,1.3,54.0,81.0,151.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,3.0,4.0,0.975784,0.97602,0.947406,0.226926,0.804542,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,8.0,1.0


Ни в обучающем, ни в тестовом наборе данных нет пропущенных значений. Поскольку это синтетический набор данных, для проверки корректности проверим наличие дубликатов. Сначала сравним обучающий набор (train) с тестовым: есть ли наблюдения, которые встречаются в обоих наборах.

In [10]:
# Inner-join the train features (target removed) with test on all shared columns;
# every row that survives the join is present in both sets.
train_features = train.drop(columns = ['smoking'])
to_check = train_features.merge(test)
print(f"Есть {to_check.shape[0]} дублированные наблюдения в наборах обучающих и тестовых данных")

Есть 0 дублированные наблюдения в наборах обучающих и тестовых данных


<a id="5"></a>
# <h1 style="background-color:lightgray;font-family:newtimeroman;font-size:350%;text-align:center;border-radius: 15px 50px;">Baseline Modeling 1.0</h1>

Сначала построим несколько стандартных моделей без конструирования признаков (feature engineering) и без подбора гиперпараметров (HPO). Для начала определим входные признаки и целевую переменную.

In [9]:
# Shapes of the train and test frames as a (train, test) tuple.
train.shape, test.shape

((159256, 63), (106171, 62))

In [10]:
# Target vector and feature matrix: every column except the label is a feature.
Y = train['smoking']
X = train.drop(columns = 'smoking')

# The test frame is used as-is; note this is an alias of `test`, not a copy.
test_cv = test

Сначала определим функцию ансамблирования методом hill climbing («восхождение на холм») для объединения предсказаний моделей.

In [11]:
# pipline = make_pipeline(StandardScaler(),
#                            HistGradientBoostingClassifier(l2_regularization = 0.01,
#                                              early_stopping = False,
#                                              learning_rate = 0.005,
#                                              max_iter = 500,
#                                              max_depth = 9,
#                                              max_bins = 255,
#                                              min_samples_leaf = 15,
#                                              max_leaf_nodes = 10))
# pipline.fit(X_train, Y_train)

In [12]:
# pipline.predict_proba(test_cv)

In [13]:
def hill_climbing(x, y, x_test, weight_range = None):
    """Greedily blend model predictions to maximise ROC-AUC (hill climbing).

    The model with the best standalone ROC-AUC on ``(y, x)`` is the starting
    ensemble. In each round every remaining model is tried with every weight
    ``w`` as ``(1 - w) * ensemble + w * model``; the single (model, weight)
    pair with the highest ROC-AUC is accepted if it strictly improves the
    current score, and that model is removed from the candidate pool. The
    search stops when no pair improves the score or no candidates remain.
    The same sequence of blends is applied to ``x_test``.

    Parameters
    ----------
    x : pandas.DataFrame
        One column of predictions per model, scored against ``y``.
    y : array-like
        True binary labels, in the same row order as ``x``.
    x_test : pandas.DataFrame
        Predictions of the same models (same column names) for the rows to
        be blended with the selected weights.
    weight_range : iterable of float, optional
        Candidate blend weights. Defaults to ``np.arange(-0.5, 0.51, 0.01)``,
        so negative weights are allowed.

    Returns
    -------
    list
        ``[blended_x, blended_x_test]`` - the final blend of ``x`` and the
        corresponding blend of ``x_test``.
    """
    if weight_range is None:
        weight_range = np.arange(-0.5, 0.51, 0.01)

    # Standalone score of each model; ties keep the original column order.
    scores = {col: roc_auc_score(y, x[col]) for col in x.columns}
    ranked = sorted(scores, key = scores.get, reverse = True)

    # Put both frames in the same best-first column order.
    x = x[ranked]
    x_test = x_test[ranked]

    current_best_ensemble = x.iloc[:, 0]
    current_best_test_preds = x_test.iloc[:, 0]
    best_score = scores[ranked[0]]

    # Candidate pool is a plain list of column names, so neither input frame
    # (nor a slice of it) is ever mutated.
    candidates = ranked[1:]

    while candidates:
        k_best, wgt_best = None, None
        for k in candidates:
            for wgt in weight_range:
                potential_ensemble = (1 - wgt) * current_best_ensemble + wgt * x[k]
                cv_score = roc_auc_score(y, potential_ensemble)
                # Strict improvement only: equal scores never replace the incumbent.
                if cv_score > best_score:
                    best_score = cv_score
                    k_best, wgt_best = k, wgt

        if k_best is None:
            # No (model, weight) pair improved the score this round.
            break

        current_best_ensemble = (1 - wgt_best) * current_best_ensemble + wgt_best * x[k_best]
        current_best_test_preds = (1 - wgt_best) * current_best_test_preds + wgt_best * x_test[k_best]
        candidates.remove(k_best)

    return [current_best_ensemble, current_best_test_preds]

Затем мы обучаем несколько стандартных моделей в рамках 5-кратной стратифицированной перекрёстной проверки.

In [14]:


# Hyper-parameter presets, one dictionary per model family.
# NOTE(review): the cross-validation cell visible in this notebook hard-codes
# its own settings and does not reference these dictionaries - confirm whether
# they are still needed.

# CatBoost
params_cb = dict(
    n_estimators = 700,
    subsample = 0.3,
    max_depth = 9,
    leaf_estimation_iterations = 50,
    l2_leaf_reg = 30,
    learning_rate = 0.01,
    colsample_bylevel = 0.3,
    loss_function = "Logloss",
    eval_metric = "AUC",
    task_type = "CPU",
    max_bin = 20,
    verbose = False,
    early_stopping_rounds = 50,
    thread_count = 6,
    random_seed = 42,
)

# XGBoost
params_xgb = dict(
    n_estimators = 700,
    gamma = 2,
    subsample = 0.3,
    min_child_weight = 10,
    booster = "gbtree",
    eval_metric = "auc",
    objective = "binary:logistic",
    learning_rate = 0.01,
    nthread = 6,
    max_depth = 9,
    seed = 27,
)

# LightGBM
params_lgb = dict(
    n_estimators = 700,
    objective = 'binary',
    subsample = 0.3,
    num_leaves = 20,
    reg_alpha = 3,
    reg_lambda = 3,
    learning_rate = 0.01,
    max_depth = 9,
    seed = 27,
    colsample_bytree = 0.7,
)

# Random forest
params_rf = dict(
    max_leaf_nodes = 92,
    n_estimators = 700,
    max_depth = 9,
    min_samples_split = 15,
    min_samples_leaf = 10,
)

# Extra trees (same values as the random-forest preset, kept as a separate dict)
params_et = dict(
    max_leaf_nodes = 92,
    n_estimators = 700,
    max_depth = 9,
    min_samples_split = 15,
    min_samples_leaf = 10,
)

# Histogram gradient boosting
params_gb = dict(
    l2_regularization = 0.01,
    learning_rate = 0.01,
    max_iter = 700,
    max_bins = 255,
    max_depth = 9,
    min_samples_leaf = 15,
    max_leaf_nodes = 92,
)



In [15]:
    # Stand-alone CatBoost-style preset (larger iteration budget, verbose logging).
    # NOTE(review): not referenced by the visible cross-validation cell - confirm it is still needed.
    params = dict(
        n_estimators = 5000,
        learning_rate = 0.01,
        loss_function = "Logloss",
        eval_metric = "AUC",
        task_type = "CPU",
        max_bin = 20,
        verbose = 100,
        max_depth = 6,
        l2_leaf_reg = 10,
        early_stopping_rounds = 30,
        thread_count = 6,
        random_seed = 1234123,
    )

In [22]:
# Shared seed for the CV splitter and every estimator below, so that
# Restart & Run All reproduces the same fold scores and submission files.
SEED = 42

# Per-fold validation scores and per-fold test-set predictions for the two blends:
# a plain average of the six models and the hill-climbing blend.
ens_cv_scores, ens_preds = list(), list()
hill_ens_cv_scores, hill_ens_preds =  list(), list()

sk = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 1, random_state = SEED)
for i, (train_idx, test_idx) in enumerate(sk.split(X, Y)):

    # "test" here is the held-out fold of the training data; the real test set is `test_cv`.
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]
    
    print('----------------------------------------------------------')
    
    ########
    ## RF ##
    ########

    RF_md = RandomForestClassifier(n_estimators = 8000, 
                                   max_depth = 6,
                                   min_samples_split = 15,
                                   min_samples_leaf = 10,
                                   random_state = SEED).fit(X_train, Y_train)
    
    RF_pred = RF_md.predict_proba(X_test)[:, 1]
    RF_score = roc_auc_score(Y_test, RF_pred)

    print('Fold', i, '==> RF oof ROC-AUC score is ==>', RF_score)

    RF_pred_test = RF_md.predict_proba(test_cv)[:, 1]
    
    #################
    ## Extra Trees ##
    #################

    ET_md = ExtraTreesClassifier(n_estimators = 8000, 
                                 max_depth = 6,
                                 min_samples_split = 15,
                                 min_samples_leaf = 10,
                                 random_state = SEED).fit(X_train, Y_train)

    ET_pred = ET_md.predict_proba(X_test)[:, 1]
    ET_score = roc_auc_score(Y_test, ET_pred)

    print('Fold', i, '==> ET oof ROC-AUC score is ==>', ET_score)

    ET_pred_test = ET_md.predict_proba(test_cv)[:, 1]

    ##########################
    ## HistGradientBoosting ##
    ##########################

    # The scaler is fitted inside the pipeline, i.e. on the training part of the fold only.
    hist_md = make_pipeline(StandardScaler(),
                           HistGradientBoostingClassifier(l2_regularization = 0.01,
                                             early_stopping = False,
                                             learning_rate = 0.01,
                                             max_iter = 8000,
                                             max_depth = 6,
                                             max_bins = 20,
                                             min_samples_leaf = 15,
                                             max_leaf_nodes = 10,
                                             random_state = SEED)).fit(X_train, Y_train)
    
    hist_pred = hist_md.predict_proba(X_test)[:, 1]
    hist_score = roc_auc_score(Y_test, hist_pred)

    print('Fold', i, '==> Hist oof ROC-AUC score is ==>', hist_score)  

    hist_pred_test = hist_md.predict_proba(test_cv)[:, 1]

    ##########
    ## LGBM ##
    ##########

    # NOTE(review): LightGBM documents that row subsampling only takes effect when
    # subsample_freq (bagging_freq) is > 0; with the default it looks like
    # `subsample = 0.7` is inert here - confirm before relying on it.
    LGBM_md = LGBMClassifier(objective = 'binary',
                             n_estimators = 8000,
                             max_depth = 6,
                             learning_rate = 0.01,
                             num_leaves = 20,
                             reg_alpha = 3,
                             reg_lambda = 3,
                             subsample = 0.7,
                             colsample_bytree = 0.7,
                             random_state = SEED).fit(X_train, Y_train)

    lgb_pred = LGBM_md.predict_proba(X_test)[:, 1]
    lgb_score = roc_auc_score(Y_test, lgb_pred)

    print('Fold', i, '==> LGBM oof ROC-AUC score is ==>', lgb_score) 

    lgb_pred_test = LGBM_md.predict_proba(test_cv)[:, 1]

    #########
    ## XGB ##
    #########

    XGB_md = XGBClassifier(objective = 'binary:logistic',
                           tree_method = 'hist',
                           colsample_bytree = 0.7, 
                           gamma = 2, 
                           learning_rate = 0.01, 
                           max_depth = 6, 
                           min_child_weight = 10, 
                           n_estimators = 8000, 
                           subsample = 0.7,
                           random_state = SEED).fit(X_train, Y_train)

    xgb_pred = XGB_md.predict_proba(X_test)[:, 1]
    xgb_score = roc_auc_score(Y_test, xgb_pred)

    print('Fold', i, '==> XGB oof ROC-AUC score is ==>', xgb_score)

    xgb_pred_test = XGB_md.predict_proba(test_cv)[:, 1]

    ##############
    ## CatBoost ##
    ##############

    Cat_md = CatBoostClassifier(loss_function = 'Logloss',
                                iterations = 8000,
                                learning_rate = 0.01,
                                depth = 6,
                                random_strength = 0.5,
                                bagging_temperature = 0.7,
                                border_count = 30,
                                l2_leaf_reg = 10,
                                verbose = False, 
                                task_type = 'CPU',
                                random_seed = SEED).fit(X_train, Y_train)

    cat_pred = Cat_md.predict_proba(X_test)[:, 1]
    cat_score = roc_auc_score(Y_test, cat_pred)

    print('Fold', i, '==> CatBoost oof ROC-AUC score is ==>', cat_score)

    cat_pred_test = Cat_md.predict_proba(test_cv)[:, 1]    
    
    ##############
    ## Ensemble ##
    ##############
    
    # Equal-weight average of the six models: *_1 on the held-out fold, *_2 on the real test set.
    ens_pred_1 = (RF_pred + ET_pred + hist_pred + lgb_pred + xgb_pred + cat_pred) / 6
    ens_pred_2 = (RF_pred_test + ET_pred_test + hist_pred_test + lgb_pred_test + xgb_pred_test + cat_pred_test) / 6
    
    ens_score_fold = roc_auc_score(Y_test, ens_pred_1)
    ens_cv_scores.append(ens_score_fold)
    ens_preds.append(ens_pred_2)
    
    print('Fold', i, '==> Average Ensemble oof ROC-AUC score is ==>', ens_score_fold)
    
    ############################
    ## Hill Climbing Ensemble ##
    ############################
    
    x = pd.DataFrame({'RF': RF_pred,
                      'ET': ET_pred, 
                      'Hist': hist_pred, 
                      'LGBM': lgb_pred,
                      'XGB': xgb_pred,
                      'Cat': cat_pred})
    y = Y_test
        
    x_test = pd.DataFrame({'RF': RF_pred_test,
                           'ET': ET_pred_test, 
                           'Hist': hist_pred_test, 
                           'LGBM': lgb_pred_test,
                           'XGB': xgb_pred_test,
                           'Cat': cat_pred_test})
    
    # NOTE(review): the blend weights are selected on this fold's labels and the blend
    # is then scored on the same labels, so this fold score is optimistic compared
    # with the plain average above.
    hill_results = hill_climbing(x, y, x_test)
    
    hill_ens_score_fold = roc_auc_score(y, hill_results[0])
    hill_ens_cv_scores.append(hill_ens_score_fold)
    hill_ens_preds.append(hill_results[1])

    print('Fold', i, '==> Hill Climbing Ensemble oof ROC-AUC score is ==>', hill_ens_score_fold)

----------------------------------------------------------
Fold 0 ==> RF oof ROC-AUC score is ==> 0.837551473679029
Fold 0 ==> ET oof ROC-AUC score is ==> 0.8230278374507637
Fold 0 ==> Hist oof ROC-AUC score is ==> 0.8622076895273596
Fold 0 ==> LGBM oof ROC-AUC score is ==> 0.8696375859417069
Fold 0 ==> XGB oof ROC-AUC score is ==> 0.8700516813335565
Fold 0 ==> CatBoost oof ROC-AUC score is ==> 0.8655223592745885
Fold 0 ==> Average Ensemble oof ROC-AUC score is ==> 0.8674324084162823
Fold 0 ==> Hill Climbing Ensemble oof ROC-AUC score is ==> 0.8706308589203693
----------------------------------------------------------
Fold 1 ==> RF oof ROC-AUC score is ==> 0.8374330845839699
Fold 1 ==> ET oof ROC-AUC score is ==> 0.8232419772001294
Fold 1 ==> Hist oof ROC-AUC score is ==> 0.8660317079329758
Fold 1 ==> LGBM oof ROC-AUC score is ==> 0.8727600540162228
Fold 1 ==> XGB oof ROC-AUC score is ==> 0.8737440210745306
Fold 1 ==> CatBoost oof ROC-AUC score is ==> 0.8686499974042987
Fold 1 ==> Aver

In [23]:
# Mean held-out ROC-AUC of both blends. The fold count is read from the collected
# scores so the message always matches the CV configuration that actually ran.
print(f'Среднее значение ROC-AUC усредняющего ансамбля по {len(ens_cv_scores)} фолдам кросс-валидации составляет', np.mean(ens_cv_scores))
print(f'Среднее значение ROC-AUC ансамбля Hill Climbing по {len(hill_ens_cv_scores)} фолдам кросс-валидации составляет', np.mean(hill_ens_cv_scores))

Средняя совокупность оценок ROC-AUC за 10-кратный период составляет 0.8681734667630273
Hill Climbing Ensemble ROC-AUC набрал более 10 баллов, это 0.8718838492221701


Среднее значение ROC-AUC усредняющего ансамбля по 5 фолдам кросс-валидации составляет 0.8681734667630273  
Среднее значение ROC-AUC ансамбля Hill Climbing по 5 фолдам кросс-валидации составляет 0.8718838492221701

In [24]:
# Preview the submission frame.
# NOTE(review): the saved output already shows model predictions although the
# assignment to `submission['smoking']` happens in the following cell - the output
# appears to come from an earlier run; confirm with Restart & Run All.
submission.head()

Unnamed: 0,id,smoking
0,159256,0.634426
1,159257,0.356203
2,159258,0.406226
3,159259,0.024013
4,159260,0.627077


In [25]:
# Average the per-fold test predictions of each blend and write one submission
# file per blend: first the plain average, then the hill-climbing blend.
# `submission['smoking']` is overwritten on each pass, so after this cell it
# holds the hill-climbing predictions.
for fold_preds, out_file in ((ens_preds, 'ens_sub.csv'),
                             (hill_ens_preds, 'hill_sub.csv')):
    ens_preds_test = pd.DataFrame(fold_preds).apply(np.mean, axis = 0)
    submission['smoking'] = ens_preds_test
    submission.to_csv(out_file, index = False)