# Imports

In [None]:
!pip install catboost
!pip install --upgrade holidays
!pip install optuna

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the data
# files referenced by later cells can be read from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import sys, os
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from datetime import date
from sklearn.model_selection import StratifiedKFold
import optuna

import holidays
ru_holidays = holidays.RU()

original_stdout = sys.stdout
seed = 52

import warnings
warnings.filterwarnings("ignore")

# Data preprocessing

In [None]:
# Single place to change when the data lives somewhere else.
DATA_DIR = "/content/drive/MyDrive/HSE&PSB_hack"
# Daily weather measurements kept as features (column order is preserved).
WEATHER_COLUMNS = ["tempmax", "tempmin", "temp", "feelslike", "humidity", "windspeed"]

# "Статус брони" is dropped from the training data only.
# NOTE(review): presumably it is absent from test and/or reveals the target — confirm.
train_df = pd.read_excel(f"{DATA_DIR}/train.xlsx").drop(columns=["Unnamed: 0", "Статус брони"])
test_df = pd.read_excel(f"{DATA_DIR}/test.xlsx").drop(columns=["Unnamed: 0"])

# Build the weather lookup table in one idempotent chain: keep the selected
# columns, parse the date column and use it as the index so it can be joined
# onto booking dates. Re-running this cell always starts from the raw CSV.
spb_weather = (
    pd.read_csv(f"{DATA_DIR}/spb_weather_old.csv")
    .loc[:, ["datetime"] + WEATHER_COLUMNS]
    .assign(datetime=lambda weather: pd.to_datetime(weather["datetime"]))
    .set_index("datetime")
)

In [None]:
def f(x, d):
    """Count how many times each room category occurs in a multi-line string.

    Args:
        x: Newline-separated room categories. A line may start with a numeric
            list prefix (e.g. ``"1. "`` or ``"10. "``), which is removed
            before the lookup.
        d: Mapping ``category name -> initial count``. It is copied, never
            mutated, and its key order defines the order of the result.

    Returns:
        list: The counts, in the key order of ``d``.

    Raises:
        KeyError: If a line names a category that is not a key of ``d``.
    """
    import re  # local import keeps this cell runnable on its own

    counts = d.copy()
    for line in x.split("\n"):
        line = line.strip()
        if not line:
            # Blank lines (e.g. from a trailing newline) carry no category.
            continue
        if line[0].isdigit():
            # Drop the enumeration prefix: digits, one separator character,
            # then any whitespace. Handles multi-digit numbers as well.
            line = re.sub(r"^\d+\D?\s*", "", line)
        counts[line] += 1
    return list(counts.values())

def _add_date_features(frame, date_cols):
    """Add calendar, holiday, weather and date-difference features to one frame.

    For every column in ``date_cols`` the value is normalized to midnight and
    the following columns are appended: ``<col>_is_holiday``,
    ``<col>_dayOfweek``, ``<col>_month``, ``<col>_year`` and the
    ``spb_weather`` columns prefixed with ``<col>_``. Relies on the
    module-level ``ru_holidays`` and ``spb_weather`` objects.
    """
    for col in date_cols:
        frame[col] = pd.to_datetime(frame[col]).dt.normalize()
        frame[f"{col}_is_holiday"] = (
            frame[col].dt.strftime('%m/%d/%Y').apply(lambda x: int(x in ru_holidays))
        )
        frame[f"{col}_dayOfweek"] = frame[col].dt.dayofweek
        frame[f"{col}_month"] = frame[col].dt.month
        frame[f"{col}_year"] = frame[col].dt.year
        # Join on `col` itself so that each date column gets the weather of
        # its own day; the prefix keeps the three weather sets apart.
        frame = frame.join(spb_weather.add_prefix(f"{col}_"), on=col, how="left")

    frame["Выезд - Заезд"] = (frame["Выезд"] - frame["Заезд"]).dt.days
    frame["Заезд - бронирование"] = (frame["Заезд"] - frame["Дата бронирования"]).dt.days
    return frame


def prepose(train, test, one_hot=False):
    """Turn the raw booking frames into model-ready feature frames.

    Steps: room-category counts, ordinal encoding of two string columns,
    binary target from "Дата отмены", prepayment / price features, per-category
    aggregate statistics, date / holiday / weather features and, optionally,
    one-hot encoding of the categorical columns.

    NOTE: the aggregate statistics and the median NaN-fill are computed on
    train and test stacked together, so test rows influence train features.

    Args:
        train: Raw training frame; must contain the target column
            "Дата отмены". It is modified in place, so pass a copy.
        test: Raw test frame. It is modified in place, so pass a copy.
        one_hot: If True, replace the categorical columns by dummy columns.

    Returns:
        tuple: ``(train, test)`` feature frames; ``train`` keeps the binary
        target column "Дата отмены", ``test`` does not.
    """
    cat_features = ["Способ оплаты", "Источник", "Гостиница", "Регион"]

    # One count column per known room category.
    d = {"Номер «Стандарт»": 0, "Коттедж с 3 спальнями": 0, "Апартаменты с 2 спальнями с отдельным входом": 0, "Коттедж с 2 спальнями": 0, "Номер «Стандарт» для маломобильных групп населения": 0, "Номер «Люкс»": 0, "Номер «Студия»": 0}
    room_cols = list(d.keys())
    for frame in (train, test):
        frame[room_cols] = frame["Категория номера"].apply(lambda x: f(x, d)).apply(pd.Series)

    # The encoder is fitted on train only and re-used for test.
    enc = OrdinalEncoder()
    col_to_enc = ["Способ оплаты", "Источник"]
    train[col_to_enc] = enc.fit_transform(train[col_to_enc])
    test[col_to_enc] = enc.transform(test[col_to_enc])

    # Target: 1 when a (truthy) cancellation date is present, else 0.
    train["Дата отмены"] = train["Дата отмены"].fillna(0).apply(lambda x: 1 if x else 0)

    cols_to_drop = ["№ брони", "Категория номера"]

    for frame in (train, test):
        frame["is_предоплата"] = frame["Внесена предоплата"].apply(lambda x: 1 if x > 0 else 0)
        frame["процент_предоплаты"] = frame["Внесена предоплата"] / frame["Стоимость"]
        frame["Регион"] = frame["Гостиница"].apply(lambda x: 1 if x in [1, 2] else 2)
        frame["Цена за ночь"] = frame["Стоимость"] / frame["Ночей"]

    # Aggregate statistics of the numeric columns per categorical value.
    # Stacking once (instead of once per iteration) gives the same result
    # because every merge is a row-order-preserving left join.
    n_train = train.shape[0]
    stats = ["mean", "max", "min", "median", "std"]
    df = pd.concat([train, test], axis=0)
    for col1 in cat_features:
        for col2 in ["Стоимость", "Внесена предоплата", "Ночей", "Гостей"]:
            temp = df.groupby(col1)[col2].agg(stats).reset_index()
            temp.columns = [col1] + [f"{col1}_{col2}_{x}" for x in stats]
            df = df.merge(temp, on=col1, how="left")
            # Fills every column that still has NaN, including the target
            # column on test rows (that column is dropped from test below).
            for col in df.columns[df.isna().any()].tolist():
                df[col] = df[col].fillna(df[col].median())
    train, test = df.iloc[:n_train].copy(), df.iloc[n_train:].copy()

    date_cols = ["Дата бронирования", "Заезд", "Выезд"]
    train = _add_date_features(train, date_cols)
    test = _add_date_features(test, date_cols)

    train = train.drop(columns=cols_to_drop + date_cols)
    test = test.drop(columns=cols_to_drop + date_cols)

    # Remove any leftover raw "datetime*" columns; test may not have them all.
    leftover = [col for col in train.columns if col.startswith("datetime")]
    train = train.drop(columns=leftover)
    test = test.drop(columns=leftover, errors="ignore")

    if one_hot:
        # Encode on the stacked frame so train and test share dummy columns.
        n_train = train.shape[0]
        df = pd.concat([train, test], axis=0)
        for col in cat_features:
            temp = pd.get_dummies(df[col]).astype(int)
            temp.columns = [f"{col}_{x}" for x in temp.columns]
            df = pd.concat([df, temp], axis=1)
        df = df.drop(columns=cat_features)
        train, test = df.iloc[:n_train].copy(), df.iloc[n_train:].copy()

    if "Дата отмены" in test.columns:
        test = test.drop(columns=["Дата отмены"])
    print("Train shape:", train.shape, "Test shape:", test.shape,)
    return train, test


# Build two feature sets from fresh copies of the raw frames (prepose modifies
# its arguments): one keeping the categorical columns as single columns and
# one with them expanded into one-hot dummy columns.
train, test = prepose(train_df.copy(), test_df.copy(), one_hot=False)
train_oh, test_oh = prepose(train_df.copy(), test_df.copy(), one_hot=True)

Train shape: (26174, 132) Test shape: (11218, 131)
Train shape: (26174, 175) Test shape: (11218, 174)


# Model

In [None]:
def hill_climbing(x, y, x_test, weight_range=None):
    """Greedily blend model predictions to maximize ROC-AUC on ``(x, y)``.

    Starting from the single best-scoring column of ``x``, each round tries to
    blend every remaining column into the current ensemble as
    ``(1 - w) * ensemble + w * column`` for every ``w`` in ``weight_range``
    and keeps the best strictly improving ``(column, w)`` pair. The search
    stops when no pair improves the score or no columns are left. The same
    blend steps are applied to ``x_test``.

    NOTE: the weights are selected on ``y`` itself, so the ROC-AUC of the
    returned ensemble measured on the same ``(x, y)`` is optimistically biased.

    Args:
        x: DataFrame with one column of predictions per model.
        y: True labels aligned row-wise with ``x``.
        x_test: DataFrame with the same columns as ``x`` holding the models'
            predictions for the data to be blended with the found weights.
        weight_range: Iterable of candidate blend weights. Defaults to
            ``np.arange(-0.5, 0.51, 0.01)``.

    Returns:
        list: ``[ensemble_predictions_for_x, ensemble_predictions_for_x_test]``.
    """
    if weight_range is None:
        weight_range = np.arange(-0.5, 0.51, 0.01)

    # Rank the models by their individual score, best first.
    scores = {col: roc_auc_score(y, x[col]) for col in x.columns}
    ranked = sorted(scores, key=scores.get, reverse=True)
    x = x[ranked]
    x_test = x_test[ranked]

    current_best_ensemble = x.iloc[:, 0]
    current_best_test_preds = x_test.iloc[:, 0]
    best_score = scores[ranked[0]]
    candidates = list(ranked[1:])

    while candidates:
        k_best, wgt_best = None, None
        for k in candidates:
            for wgt in weight_range:
                potential_ensemble = (1 - wgt) * current_best_ensemble + wgt * x[k]
                cv_score = roc_auc_score(y, potential_ensemble)
                if cv_score > best_score:
                    best_score = cv_score
                    k_best, wgt_best = k, wgt

        if k_best is None:
            # No remaining model improves the ensemble.
            break

        current_best_ensemble = (1 - wgt_best) * current_best_ensemble + wgt_best * x[k_best]
        current_best_test_preds = (1 - wgt_best) * current_best_test_preds + wgt_best * x_test[k_best]
        # Each model can be blended in at most once.
        candidates.remove(k_best)

    return [current_best_ensemble, current_best_test_preds]

In [None]:
# 10-fold cross-validation of four base models plus two ensembles.
#
# NOTE: every validation fold is used both for model selection (CatBoost early
# stopping, hill-climbing weight search) and for scoring, so the printed
# "oof" scores are optimistic estimates.
cat_features = ["Способ оплаты", "Источник", "Гостиница", "Регион"]
train[cat_features] = train[cat_features].astype(int)
test[cat_features] = test[cat_features].astype(int)
X, y = train.drop(columns=["Дата отмены"]), train["Дата отмены"]
X_oh = train_oh.drop(columns=["Дата отмены"])

clf_avarage_auc = []
clf_predictions = []

ens_cv_scores, ens_preds = list(), list()
hill_ens_cv_scores, hill_ens_preds = list(), list()
ridge_ens_cv_scores, ridge_ens_preds = list(), list()
hill_ens_cv_recall = list()

skf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=seed)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    oof_preds = pd.DataFrame()
    oof_test_preds = pd.DataFrame()
    print('----------------------------------------------------------')
    x_train_fold, x_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    # Same row split applied to the one-hot encoded feature set.
    x_train_oh_fold, x_test_oh_fold = X_oh.iloc[train_index], X_oh.iloc[test_index]

    # XGBoost with one hot
    xg = XGBClassifier(random_state=seed)
    xg.fit(x_train_oh_fold, y_train_fold,
           eval_set=[(x_test_oh_fold, y_test_fold)],
           verbose=0)
    xg_pred = xg.predict_proba(x_test_oh_fold)[:, 1]
    xg_score = roc_auc_score(y_test_fold, xg_pred)
    print('Fold', i, '==> XG with one hot oof ROC-AUC score is ==>', xg_score)
    xg_test_pred = xg.predict_proba(test_oh)[:, 1]

    # Catboost (early-stopped on the validation fold)
    catb = CatBoostClassifier(eval_metric="AUC", random_state=seed, task_type="GPU", n_estimators=12500, early_stopping_rounds=200)
    catb.fit(x_train_fold, y_train_fold, eval_set=[(x_test_fold, y_test_fold)], verbose=200, cat_features=cat_features)
    catb_pred = catb.predict_proba(x_test_fold)[:, 1]
    catb_score = roc_auc_score(y_test_fold, catb_pred)
    print('Fold', i, '==> Catboost oof ROC-AUC score is ==>', catb_score)
    catb_test_pred = catb.predict_proba(test)[:, 1]

    # LightGBM: its output is silenced by temporarily pointing stdout at the
    # null device. The handle is closed and stdout restored even if fitting
    # raises.
    with open(os.devnull, 'w') as devnull:
        sys.stdout = devnull
        try:
            li = LGBMClassifier(random_state=seed)
            li.fit(x_train_fold, y_train_fold,
                   eval_set=[(x_test_fold, y_test_fold)],
                   categorical_feature=cat_features,
                   eval_metric='auc')
            li_pred = li.predict_proba(x_test_fold)[:, 1]
            li_score = roc_auc_score(y_test_fold, li_pred)
        finally:
            sys.stdout = original_stdout
    print('Fold', i, '==> LightGBM oof ROC-AUC score is ==>', li_score)
    li_test_pred = li.predict_proba(test)[:, 1]

    # (Logistic regression and KNN base models are disabled and not part of
    # the ensembles below.)

    # RF
    rf = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=seed)
    rf.fit(x_train_fold, y_train_fold)
    rf_pred = rf.predict_proba(x_test_fold)[:, 1]
    rf_score = roc_auc_score(y_test_fold, rf_pred)
    print('Fold', i, '==> RF oof ROC-AUC score is ==>', rf_score)
    rf_test_pred = rf.predict_proba(test)[:, 1]

    # Ensemble: plain average of the four models (validation fold / test set)
    ens_pred_1 = (xg_pred + catb_pred + li_pred + rf_pred) / 4
    ens_pred_2 = (xg_test_pred + catb_test_pred + li_test_pred + rf_test_pred) / 4

    ens_score_fold = roc_auc_score(y_test_fold, ens_pred_1)
    ens_cv_scores.append(ens_score_fold)
    ens_preds.append(ens_pred_2)
    print('Fold', i, '==> Average Ensemble oof ROC-AUC score is ==>', ens_score_fold)

    # Hill Climbing Ensemble: blend weights are searched on this fold's
    # validation predictions and then applied to the test predictions.
    x = pd.DataFrame({'Xgboost_oh': xg_pred,
                      'Catboost': catb_pred,
                      'LightGBM': li_pred,
                      'rf': rf_pred})
    y_fold = y_test_fold

    x_test = pd.DataFrame({'Xgboost_oh': xg_test_pred,
                           'Catboost': catb_test_pred,
                           'LightGBM': li_test_pred,
                           'rf': rf_test_pred})

    hill_results = hill_climbing(x, y_fold, x_test)

    hill_ens_score_fold = roc_auc_score(y_fold, hill_results[0])
    hill_ens_cv_scores.append(hill_ens_score_fold)
    hill_ens_preds.append(hill_results[1])

    print('Fold', i, '==> Hill Climbing Ensemble oof ROC-AUC score is ==>', hill_ens_score_fold)

----------------------------------------------------------
Fold 0 ==> XG with one hot oof ROC-AUC score is ==> 0.8349091625724134
Learning rate set to 0.019133
0:	test: 0.8061592	best: 0.8061592 (0)	total: 76.6ms	remaining: 15m 57s


Default metric period is 5 because AUC is/are not implemented for GPU


200:	test: 0.8360696	best: 0.8360696 (200)	total: 11.9s	remaining: 12m 7s
400:	test: 0.8374015	best: 0.8374015 (400)	total: 20.4s	remaining: 10m 16s
600:	test: 0.8392228	best: 0.8392256 (596)	total: 32s	remaining: 10m 32s
800:	test: 0.8397875	best: 0.8398644 (749)	total: 47s	remaining: 11m 25s
1000:	test: 0.8397398	best: 0.8399391 (823)	total: 1m 2s	remaining: 12m 1s
bestTest = 0.8399391472
bestIteration = 823
Shrink model to first 824 iterations.
Fold 0 ==> Catboost oof ROC-AUC score is ==> 0.8399386778616997
Fold 0 ==> LightGBM oof ROC-AUC score is ==> 0.832327949695681
Fold 0 ==> RF oof ROC-AUC score is ==> 0.8039566070250055
Fold 0 ==> Average Ensemble oof ROC-AUC score is ==> 0.8367680758231282
Fold 0 ==> Hill Climbing Ensemble oof ROC-AUC score is ==> 0.8441322321624991
----------------------------------------------------------
Fold 1 ==> XG with one hot oof ROC-AUC score is ==> 0.8423631480530909
Learning rate set to 0.019133
0:	test: 0.8319279	best: 0.8319279 (0)	total: 102ms	r

Default metric period is 5 because AUC is/are not implemented for GPU


200:	test: 0.8650762	best: 0.8650762 (200)	total: 11.4s	remaining: 11m 37s
400:	test: 0.8658480	best: 0.8662852 (355)	total: 22.1s	remaining: 11m 8s
bestTest = 0.8662852049
bestIteration = 355
Shrink model to first 356 iterations.
Fold 1 ==> Catboost oof ROC-AUC score is ==> 0.8662851983574099
Fold 1 ==> LightGBM oof ROC-AUC score is ==> 0.8607432903131188
Fold 1 ==> RF oof ROC-AUC score is ==> 0.8376223692894331
Fold 1 ==> Average Ensemble oof ROC-AUC score is ==> 0.8657920547041138
Fold 1 ==> Hill Climbing Ensemble oof ROC-AUC score is ==> 0.869978276013786
----------------------------------------------------------
Fold 2 ==> XG with one hot oof ROC-AUC score is ==> 0.8677579285851322
Learning rate set to 0.019133
0:	test: 0.8326572	best: 0.8326572 (0)	total: 77.8ms	remaining: 16m 12s


Default metric period is 5 because AUC is/are not implemented for GPU


200:	test: 0.8734327	best: 0.8734327 (200)	total: 12.8s	remaining: 13m 1s
400:	test: 0.8763931	best: 0.8764105 (392)	total: 29.4s	remaining: 14m 47s
600:	test: 0.8772275	best: 0.8772275 (600)	total: 40.7s	remaining: 13m 26s
800:	test: 0.8778894	best: 0.8779353 (788)	total: 51.8s	remaining: 12m 36s
1000:	test: 0.8775424	best: 0.8779362 (827)	total: 1m	remaining: 11m 32s
bestTest = 0.8779361844
bestIteration = 827
Shrink model to first 828 iterations.
Fold 2 ==> Catboost oof ROC-AUC score is ==> 0.8779361857788964
Fold 2 ==> LightGBM oof ROC-AUC score is ==> 0.8749422837372783
Fold 2 ==> RF oof ROC-AUC score is ==> 0.8371749644981875
Fold 2 ==> Average Ensemble oof ROC-AUC score is ==> 0.8736741323742565
Fold 2 ==> Hill Climbing Ensemble oof ROC-AUC score is ==> 0.8825608304165393
----------------------------------------------------------
Fold 3 ==> XG with one hot oof ROC-AUC score is ==> 0.8632214073863965
Learning rate set to 0.019133


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8393877	best: 0.8393877 (0)	total: 76.1ms	remaining: 15m 51s
200:	test: 0.8620859	best: 0.8620859 (200)	total: 13.5s	remaining: 13m 47s
400:	test: 0.8655576	best: 0.8655576 (400)	total: 24.8s	remaining: 12m 28s
600:	test: 0.8677515	best: 0.8677598 (596)	total: 36.3s	remaining: 11m 58s
800:	test: 0.8693790	best: 0.8694865 (787)	total: 44.7s	remaining: 10m 52s
1000:	test: 0.8702658	best: 0.8704310 (943)	total: 56.1s	remaining: 10m 44s
1200:	test: 0.8716611	best: 0.8716629 (1198)	total: 1m 8s	remaining: 10m 40s
1400:	test: 0.8727039	best: 0.8727305 (1396)	total: 1m 19s	remaining: 10m 26s
1600:	test: 0.8733978	best: 0.8734125 (1595)	total: 1m 27s	remaining: 9m 56s
1800:	test: 0.8736035	best: 0.8736576 (1768)	total: 1m 39s	remaining: 9m 49s
2000:	test: 0.8736218	best: 0.8737733 (1920)	total: 1m 50s	remaining: 9m 39s
bestTest = 0.8737732768
bestIteration = 1920
Shrink model to first 1921 iterations.
Fold 3 ==> Catboost oof ROC-AUC score is ==> 0.8737741891955156
Fold 3 ==> LightGB

Default metric period is 5 because AUC is/are not implemented for GPU


200:	test: 0.8728696	best: 0.8728962 (199)	total: 12.1s	remaining: 12m 22s
400:	test: 0.8729614	best: 0.8731671 (279)	total: 25.7s	remaining: 12m 55s
600:	test: 0.8730606	best: 0.8734752 (497)	total: 35.6s	remaining: 11m 45s
bestTest = 0.8734752238
bestIteration = 497
Shrink model to first 498 iterations.
Fold 4 ==> Catboost oof ROC-AUC score is ==> 0.8734757021550941
Fold 4 ==> LightGBM oof ROC-AUC score is ==> 0.8721633228085837
Fold 4 ==> RF oof ROC-AUC score is ==> 0.8476055735253871
Fold 4 ==> Average Ensemble oof ROC-AUC score is ==> 0.8751012525003169
Fold 4 ==> Hill Climbing Ensemble oof ROC-AUC score is ==> 0.8761482171294432
----------------------------------------------------------
Fold 5 ==> XG with one hot oof ROC-AUC score is ==> 0.841843135310076
Learning rate set to 0.019133


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8086190	best: 0.8086190 (0)	total: 130ms	remaining: 27m 9s
200:	test: 0.8585643	best: 0.8585643 (200)	total: 10.6s	remaining: 10m 50s
400:	test: 0.8608915	best: 0.8609512 (399)	total: 23.4s	remaining: 11m 46s
600:	test: 0.8619476	best: 0.8620294 (598)	total: 36.3s	remaining: 11m 57s
800:	test: 0.8627402	best: 0.8629817 (766)	total: 50.5s	remaining: 12m 17s
1000:	test: 0.8630295	best: 0.8631195 (992)	total: 1m 3s	remaining: 12m 4s
1200:	test: 0.8636347	best: 0.8636420 (1189)	total: 1m 17s	remaining: 12m 11s
1400:	test: 0.8640902	best: 0.8641499 (1341)	total: 1m 27s	remaining: 11m 31s
1600:	test: 0.8642133	best: 0.8642234 (1572)	total: 1m 39s	remaining: 11m 16s
1800:	test: 0.8642977	best: 0.8644498 (1673)	total: 1m 52s	remaining: 11m 9s
bestTest = 0.8644497693
bestIteration = 1673
Shrink model to first 1674 iterations.
Fold 5 ==> Catboost oof ROC-AUC score is ==> 0.8644511425690308
Fold 5 ==> LightGBM oof ROC-AUC score is ==> 0.8580490457009244
Fold 5 ==> RF oof ROC-AUC score 

Default metric period is 5 because AUC is/are not implemented for GPU


200:	test: 0.8641035	best: 0.8641228 (184)	total: 11.5s	remaining: 11m 43s
400:	test: 0.8665474	best: 0.8665630 (394)	total: 19.8s	remaining: 9m 58s
600:	test: 0.8679075	best: 0.8679075 (600)	total: 32s	remaining: 10m 34s
800:	test: 0.8692943	best: 0.8693402 (771)	total: 44.8s	remaining: 10m 54s
1000:	test: 0.8698471	best: 0.8699629 (978)	total: 56.5s	remaining: 10m 48s
1200:	test: 0.8704551	best: 0.8704891 (1183)	total: 1m 10s	remaining: 11m 3s
1400:	test: 0.8711173	best: 0.8711843 (1352)	total: 1m 22s	remaining: 10m 56s
1600:	test: 0.8714401	best: 0.8715443 (1575)	total: 1m 39s	remaining: 11m 18s
bestTest = 0.8715443313
bestIteration = 1575
Shrink model to first 1576 iterations.
Fold 6 ==> Catboost oof ROC-AUC score is ==> 0.8715443279313633
Fold 6 ==> LightGBM oof ROC-AUC score is ==> 0.8642936386796489
Fold 6 ==> RF oof ROC-AUC score is ==> 0.8332098098748969
Fold 6 ==> Average Ensemble oof ROC-AUC score is ==> 0.8675392290299413
Fold 6 ==> Hill Climbing Ensemble oof ROC-AUC score 

Default metric period is 5 because AUC is/are not implemented for GPU


200:	test: 0.8496642	best: 0.8497436 (177)	total: 11.7s	remaining: 11m 55s
400:	test: 0.8502142	best: 0.8504007 (375)	total: 23.2s	remaining: 11m 39s
600:	test: 0.8506762	best: 0.8507524 (598)	total: 34.7s	remaining: 11m 26s
800:	test: 0.8511616	best: 0.8512098 (770)	total: 43.1s	remaining: 10m 28s
1000:	test: 0.8513049	best: 0.8514036 (941)	total: 54.5s	remaining: 10m 26s
1200:	test: 0.8516051	best: 0.8516451 (1193)	total: 1m 6s	remaining: 10m 21s
1400:	test: 0.8520620	best: 0.8520620 (1400)	total: 1m 17s	remaining: 10m 13s
1600:	test: 0.8526296	best: 0.8526416 (1599)	total: 1m 26s	remaining: 9m 50s
1800:	test: 0.8532146	best: 0.8532367 (1783)	total: 1m 38s	remaining: 9m 43s
2000:	test: 0.8536114	best: 0.8536794 (1969)	total: 1m 49s	remaining: 9m 36s
2200:	test: 0.8535756	best: 0.8537813 (2096)	total: 2m 1s	remaining: 9m 29s
bestTest = 0.8537812829
bestIteration = 2096
Shrink model to first 2097 iterations.
Fold 7 ==> Catboost oof ROC-AUC score is ==> 0.8537812872521953
Fold 7 ==> Lig

Default metric period is 5 because AUC is/are not implemented for GPU


In [None]:
# Report the mean per-fold validation ROC-AUC of each ensembling strategy.
for message, fold_scores in [
    ('The average ensemble oof ROC-AUC score over the 10-folds is', ens_cv_scores),
    ('The hill climbing ensemble oof ROC-AUC score over the 10-folds is', hill_ens_cv_scores),
]:
    print(message, np.mean(fold_scores))

The average ensemble oof ROC-AUC score over the 10-folds is 0.8589821571508421
The hill climbing ensemble oof ROC-AUC score over the 10-folds is 0.8654688970974037


In [None]:
# 0.863 -> 0.864 -> 0.865
# 0.852 -> 0.8532 -> 0.8536

SyntaxError: invalid syntax (<ipython-input-10-957cc8ed5a46>, line 1)

# Submission

In [None]:
# Stack the per-fold hill-climbing test predictions (one row per fold) and
# average them column-wise, giving one blended prediction per test row.
ens_preds_test = pd.DataFrame(hill_ens_preds).mean(axis=0)

In [None]:
# Write the averaged predictions to submission.csv as a single column,
# without the index and without a header row.
ens_preds_test.to_csv("submission.csv", index=False, header=False)

# Optuna

## LightGBM

In [None]:
def objective(trial):
    """Optuna objective: mean 3-fold validation ROC-AUC of a LightGBM model.

    Reads the module-level ``train`` frame and ``cat_features`` list. Each
    fold's model is early-stopped on the same fold it is scored on, so the
    returned value is an optimistic estimate.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        float: ROC-AUC averaged over the three validation folds.
    """
    X, y = train.drop(columns=["Дата отмены"]), train["Дата отмены"]
    params = {
        "objective": "binary",
        # Track AUC only, so early stopping follows the metric being tuned.
        "metric": "auc",
        "verbosity": -1,
        "n_estimators": 12500,
        "subsample": trial.suggest_float("subsample", 0.1, 1),
        # Row subsampling is only active when the bagging frequency is > 0.
        "subsample_freq": 1,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0),
        "scale_pos_weight": trial.suggest_int("scale_pos_weight", 1, 6),
        "early_stopping_round": 300,
        "random_state": 42,
        "num_leaves": trial.suggest_int("num_leaves", 16, 200),
        "max_depth": trial.suggest_int("max_depth", 4, 20),
        "min_child_samples": trial.suggest_int("min_child_samples", 25, 200),
    }

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=52)
    aucs = []
    for train_index, test_index in skf.split(X, y):
        x_train_fold, x_test_fold = X.iloc[train_index], X.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
        model = LGBMClassifier(**params)
        model.fit(
            x_train_fold, y_train_fold,
            eval_set=[(x_test_fold, y_test_fold)],
            categorical_feature=cat_features,
        )

        aucs.append(roc_auc_score(y_test_fold, model.predict_proba(x_test_fold)[:, 1]))

    return np.mean(aucs)

In [None]:
# Seed the sampler so the hyperparameter search is reproducible across runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=seed),
)
study.optimize(objective, n_trials=500)

In [None]:
# Show the best trial of the study: its sampled hyperparameters and the
# objective value (mean validation ROC-AUC) it reached.
print('Best hyperparameters:', study.best_params)
print('Best auc:', study.best_value)

## XGBoost

In [None]:
def objective(trial):
    """Optuna objective: mean 3-fold validation ROC-AUC of an XGBoost model.

    Reads the module-level one-hot encoded ``train_oh`` frame. Each fold's
    model is early-stopped on the same fold it is scored on, so the returned
    value is an optimistic estimate.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        float: ROC-AUC averaged over the three validation folds.
    """
    X, y = train_oh.drop(columns=["Дата отмены"]), train_oh["Дата отмены"]
    params = {
        "max_depth": trial.suggest_int("max_depth", 4, 16),
        # Log-scaled sampling via suggest_float(log=True).
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 1000),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.1, 1, log=True),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.1, 1, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0, log=True),
        "scale_pos_weight": trial.suggest_int("scale_pos_weight", 1, 6),
        "n_estimators": 12500,
        "eval_metric": "auc",
        "early_stopping_rounds": 300,
        # XGBoost's logging parameter is `verbosity`; 0 means silent.
        "verbosity": 0,
        "random_state": 52
    }

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=52)
    aucs = []
    for train_index, test_index in skf.split(X, y):
        x_train_fold, x_test_fold = X.iloc[train_index], X.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
        model = XGBClassifier(**params)
        model.fit(
            x_train_fold, y_train_fold,
            eval_set=[(x_test_fold, y_test_fold)],
            verbose=0,
        )

        aucs.append(roc_auc_score(y_test_fold, model.predict_proba(x_test_fold)[:, 1]))

    return np.mean(aucs)

In [None]:
# Seed the sampler so the hyperparameter search is reproducible across runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=seed),
)
study.optimize(objective, n_trials=500)

In [None]:
# Show the best trial of the study: its sampled hyperparameters and the
# objective value (mean validation ROC-AUC) it reached.
print('Best hyperparameters:', study.best_params)
print('Best auc:', study.best_value)