## Import


In [33]:
import pandas as pd
import numpy as np
from dateutil.relativedelta import relativedelta
from sklearn.metrics import recall_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from params import *

import warnings
warnings.filterwarnings("ignore")

## Data upload


In [2]:
# Folder holding the competition files (dataset.csv, submission_example.csv).
path2folder = "Alfa Campus Junior ML_ Материалы"

In [3]:
# Raw data, and the submission template whose clientbankpartner_pin values are
# later used to select and order the partners that must be scored.
data = pd.read_csv(f"{path2folder}/dataset.csv")
sub_example = pd.read_csv(f"{path2folder}/submission_example.csv")

## Data preparation

In [4]:
def prep4time_series_2(df, first_date, last_date, is_test=False, train_cutoff="2020-09"):
    """
    Aggregate client acquisitions per partner by weekday and by calendar month.

    Adds ``day_0`` .. ``day_6`` (number of acquired clients whose start date falls
    on Monday .. Sunday, following ``Series.dt.dayofweek``) and one column per
    month in ``[first_date, last_date]`` named ``"<year>-<month>"`` (month not
    zero-padded) holding the number of clients the partner acquired that month.
    The per-client columns are then dropped and duplicate rows removed.

    :param df: raw dataset with at least the columns ``clientbankpartner_pin``,
        ``client_pin`` and ``client_start_date``; the caller's frame is not modified
    :param first_date: first month of the monthly window (anything ``pd.to_datetime`` accepts)
    :param last_date: last month of the monthly window
    :param is_test: if False, only clients acquired strictly before ``train_cutoff``
        contribute to the weekday counts; if True, all clients do
    :param train_cutoff: exclusive upper bound on ``client_start_date`` for the
        weekday counts in train mode
    :return: new data frame with a fresh ``RangeIndex``-based index, without
        ``client_pin`` / ``client_start_date``, de-duplicated
    """
    pin_col = "clientbankpartner_pin"
    # Work on a fresh, positionally indexed frame so the caller's data is never
    # mutated; a stale helper column left by an earlier run is discarded.
    df = df.reset_index(drop=True).drop(columns=["client_start_date_day"], errors="ignore")
    pins = df[pin_col]
    start_dates = pd.to_datetime(df["client_start_date"])

    day_of_week = start_dates.dt.dayofweek
    if not is_test:
        # Rows at/after the cutoff become NaN and therefore match no weekday below.
        day_of_week = day_of_week.where(start_dates < pd.to_datetime(train_cutoff))

    # One explicit column per weekday: the label always matches the weekday it
    # counts, and all seven columns exist even if some weekday never occurs.
    weekday_counts = pd.DataFrame(
        {f"day_{day}": (day_of_week == day).groupby(pins).sum() for day in range(7)}
    ).astype(float)

    month_starts = pd.date_range(
        start=pd.to_datetime(first_date), end=pd.to_datetime(last_date), freq="MS"
    )
    month_labels = [f"{month.year}-{month.month}" for month in month_starts]
    start_year = start_dates.dt.year
    start_month = start_dates.dt.month
    monthly_counts = pd.DataFrame(
        {
            label: ((start_year == month.year) & (start_month == month.month)).groupby(pins).sum()
            for label, month in zip(month_labels, month_starts)
        }
    )

    result = df.join(weekday_counts, on=pin_col)
    if month_labels:
        result = result.join(monthly_counts, on=pin_col)
        # Rows whose partner id is missing have no aggregate; count them as 0.
        result[month_labels] = result[month_labels].fillna(0).astype(int)

    result = result.drop(columns=["client_pin", "client_start_date"])
    return result.drop_duplicates()


def max_zero_month(x):
    """
    Length of the longest run of inactive months between two active months.

    Zeros before the first positive value and zeros after the last positive
    value are not counted: a gap is only recorded once activity resumes.

    :param x: iterable of monthly counts (list, array or pandas Series row)
    :return: int, the longest interior run of zeros; 0 if there is none
    """
    seen_active = False
    gap = 0
    longest_gap = 0
    # Iterate over the values rather than indexing x[i]: integer indexing on a
    # label-indexed Series looks the integer up as a label (or relies on a
    # deprecated positional fallback), which is not what is meant here.
    for value in x:
        if value > 0:
            if seen_active:
                longest_gap = max(gap, longest_gap)
                gap = 0
            else:
                seen_active = True
        elif value == 0 and seen_active:
            gap += 1
    return longest_gap


def _year_month_label(date):
    """Format a timestamp as the ``"<year>-<month>"`` label used for monthly count columns."""
    return f"{date.year}-{date.month}"


def _active_window(row):
    """Return counts from the first active month up to, but excluding, the last active month.

    Returns None when the row has fewer than two active (> 0) months.
    """
    values = row.to_numpy()
    active = np.flatnonzero(values > 0)
    if len(active) > 1:
        return np.nan_to_num(values[active[0]:active[-1]])
    return None


def _window_stat(row, stat):
    """Apply ``stat`` to the row's active window; 0 when there is no window."""
    window = _active_window(row)
    return stat(window) if window is not None else 0


def _max_clients(row):
    """Maximum over the active window; first month's value if exactly one month is active; else 0."""
    window = _active_window(row)
    if window is not None:
        return np.amax(window)
    return row.iloc[0] if (row > 0).sum() == 1 else 0


def _months_since_last_active(row):
    """Number of months from the last active month to the end of the window (1 = last month active)."""
    active = np.flatnonzero(row.to_numpy() > 0)
    return len(row) - active[-1] if len(active) else len(row)


def prep_2(df, first_date="2019-3", last_date="2020-11", n_month=3, is_test=False):
    """
    Build the per-partner feature table from the raw dataset.

    Train mode (``is_test=False``): the last ``n_month`` monthly columns are turned
    into ``target`` (1 if the partner acquired no clients in them, else 0) and
    removed; features use the months before them.
    Test mode (``is_test=True``): rows are restricted to partners listed in the
    module-level ``sub_example`` frame, the first ``n_month`` monthly columns are
    removed and features use the remaining months.

    Monthly columns are finally renamed to ``"1"``, ``"2"``, ... and month-to-month
    differences ``"<i+1>-<i>"`` are appended.

    :param df: raw dataset (see ``prep4time_series_2`` for required columns, plus
        ``partnerrolestart_date`` and ``partner_src_type_ccode``)
    :param first_date: first month, written exactly like the monthly column
        label (``"<year>-<month>"``, no zero padding) because it is used to
        slice columns by label
    :param last_date: last month, same format
    :param n_month: number of months used for the target (train) / skipped (test)
    :param is_test: whether to build the test (scoring) table
    :return: processed data frame
    """
    df = prep4time_series_2(df, first_date, last_date, is_test)
    df["partnerrolestart_date"] = pd.to_datetime(df["partnerrolestart_date"])
    df["partnerrolestart_date_year"] = df["partnerrolestart_date"].dt.year
    df["partnerrolestart_date_month"] = df["partnerrolestart_date"].dt.month
    df["partnerrolestart_date_day"] = df["partnerrolestart_date"].dt.day
    df["partnerrolestart_date_dayofweek"] = df["partnerrolestart_date"].dt.dayofweek

    if not is_test:
        # Target: 1 when nothing was acquired in the last n_month months.
        target_start = _year_month_label(pd.to_datetime(last_date) - relativedelta(months=n_month - 1))
        target_cols = df.loc[:, target_start:last_date].columns
        df["target"] = (df[target_cols].sum(axis=1) <= 0).astype(int)
        df = df.drop(columns=target_cols)
        new_first_date = first_date
        new_last_date = _year_month_label(pd.to_datetime(last_date) - relativedelta(months=n_month))
    else:
        df = df[df["clientbankpartner_pin"].isin(sub_example["clientbankpartner_pin"])].copy()
        # Known partners that appear twice: drop their second row. Guarded so a
        # dataset without these duplicates does not raise IndexError.
        for pin in (169900, 2754, 164386, 280151, 230807):
            pin_rows = df.index[df["clientbankpartner_pin"] == pin]
            if len(pin_rows) > 1:
                df = df.drop(index=pin_rows[1])
        skipped_end = _year_month_label(pd.to_datetime(first_date) + relativedelta(months=n_month - 1))
        df = df.drop(columns=df.loc[:, first_date:skipped_end].columns)
        new_first_date = _year_month_label(pd.to_datetime(first_date) + relativedelta(months=n_month))
        new_last_date = last_date

    # --- Features over the monthly history window ---
    df["clients_count"] = df.loc[:, new_first_date:new_last_date].sum(axis=1)
    # Copy so the column assignments below write to an independent frame.
    df = df[df["clients_count"] > 0].copy()
    monthly = df.loc[:, new_first_date:new_last_date]

    df["days_after_partnership"] = (pd.to_datetime(new_last_date) - df["partnerrolestart_date"]).dt.days

    df["active_months"] = (monthly > 0).sum(axis=1)
    df["last_active_month_ago"] = monthly.apply(_months_since_last_active, axis=1)

    # NOTE(review): the window used for the statistics below stops *before* the
    # last active month (half-open slice), and max_clients_monthly falls back
    # to the first month's value when only one month is active. Both look
    # unintended but are kept unchanged so features stay comparable with
    # earlier runs - confirm before changing.
    df["avg_clients_monthly"] = monthly.apply(lambda row: _window_stat(row, np.mean), axis=1)
    df["median_clients_monthly"] = monthly.apply(lambda row: _window_stat(row, np.median), axis=1)
    df["std_clients_monthly"] = monthly.apply(lambda row: _window_stat(row, np.std), axis=1)
    df["var_clients_monthly"] = monthly.apply(lambda row: _window_stat(row, np.var), axis=1)
    df["max_clients_monthly"] = monthly.apply(_max_clients, axis=1)

    # Pass plain arrays so max_zero_month indexes/iterates positionally.
    df["max_zero_month"] = monthly.apply(lambda row: max_zero_month(row.to_numpy()), axis=1)
    df["is_any_1_month"] = (monthly.iloc[:, -1] > 0).astype(int)
    df["is_any_2_month"] = (monthly.iloc[:, -2] > 0).astype(int)
    df["is_any_3_month"] = (monthly.iloc[:, -3] > 0).astype(int)

    df["last_month_to_avg"] = df[[new_last_date, "avg_clients_monthly"]].apply(
        lambda row: 0 if row.iloc[1] == 0 else row.iloc[0] / row.iloc[1], axis=1)
    df["last_month_to_all"] = monthly.apply(
        lambda row: row.iloc[-1] / sum(row) if sum(row) != 0 else 0, axis=1)

    avg_clients_count_per_ccode_data = df.groupby("partner_src_type_ccode")["clients_count"].mean()
    df["avg_clients_count_per_ccode"] = df["partner_src_type_ccode"].map(avg_clients_count_per_ccode_data)

    one_hot = pd.get_dummies(df['partner_src_type_ccode']).astype(int)
    one_hot.columns = [str(x) + "_oh" for x in one_hot.columns]
    df = df.drop(columns=["partner_src_type_ccode"])
    df = df.join(one_hot)

    # Rename monthly columns: (2019-3, 2019-4, ..., 2020-11) -> (1, 2, ..., 21).
    # Every column whose name contains '-' is treated as a monthly column.
    new_cols = []
    k = 1
    for col in df.columns:
        if '-' in col:
            new_cols.append(str(k))
            k += 1
        else:
            new_cols.append(col)
    df.columns = new_cols

    # Month-over-month differences: column "<i+1>-<i>" = month i+1 minus month i.
    for i in range(1, df.loc[:, '1':str(k - 1)].shape[1]):
        df[f"{str(i + 1)}-{str(i)}"] = df[str(i + 1)] - df[str(i)]

    diffs = df.loc[:, "2-1":f"{k - 1}-{k - 2}"]
    df["max_diff"] = diffs.max(axis=1)
    df["min_diff"] = diffs.min(axis=1)

    # --- Features over the weekday counts (day_5 / day_6 are the weekend) ---
    weekdays = ["day_0", "day_1", "day_2", "day_3", "day_4", "day_5", "day_6"]
    df["avg_weekdays"] = df[weekdays].apply(lambda row: np.mean(row), axis=1)
    df["median_weekdays"] = df[weekdays].apply(lambda row: np.median(row), axis=1)
    df["std_weekdays"] = df[weekdays].apply(lambda row: np.std(row), axis=1)
    df["var_weekdays"] = df[weekdays].apply(lambda row: np.var(row), axis=1)

    workday_total = df[weekdays[:-2]].sum(axis=1)
    weekend_total = df[weekdays[-2:]].sum(axis=1)
    week_total = df[weekdays].sum(axis=1)
    # Masking the denominator yields NaN instead of dividing by zero; NaN -> 0.
    df["weekday2weekends"] = (workday_total / weekend_total.where(weekend_total > 0)).fillna(0)
    df["weekday2all"] = (workday_total / week_total.where(week_total > 0)).fillna(0)
    df["weekends2all"] = (weekend_total / week_total.where(week_total > 0)).fillna(0)
    df["active_in_weekends"] = (weekend_total > 0).astype(int)

    return df

In [5]:
# Train table: features from the months before the last 3, target from the last 3.
# Test table: features from the months after the first 3, restricted to sub_example partners.
train = prep_2(data)
test = prep_2(data, is_test=True)

In [6]:
# Preview the engineered training table.
train

Unnamed: 0,clientbankpartner_pin,partnerrolestart_date,day_0,day_1,day_2,day_3,day_4,day_5,day_6,1,...,max_diff,min_diff,avg_weekdays,median_weekdays,std_weekdays,var_weekdays,weekday2weekends,weekday2all,weekends2all,active_in_weekends
0,122027,2019-02-01,18.0,17.0,10.0,23.0,4.0,27.0,14.0,5,...,30,-21,16.142857,17.0,7.159979,51.265306,1.756098,0.637168,0.362832,1
1,270277,2020-04-01,46.0,60.0,36.0,51.0,0.0,55.0,25.0,0,...,60,-23,39.000000,46.0,19.331691,373.714286,2.412500,0.706960,0.293040,1
2,238679,2020-02-01,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0,...,1,-1,0.571429,0.0,0.728431,0.530612,1.000000,0.500000,0.500000,1
3,118398,2019-09-01,23.0,33.0,7.0,32.0,20.0,45.0,32.0,0,...,16,-23,27.428571,32.0,11.172123,124.816327,1.493506,0.598958,0.401042,1
4,10402,2019-01-01,69.0,77.0,51.0,71.0,44.0,40.0,19.0,92,...,18,-37,53.000000,51.0,19.146055,366.571429,5.288136,0.840970,0.159030,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130537,315972,2020-07-01,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,...,1,-1,0.142857,0.0,0.349927,0.122449,0.000000,1.000000,0.000000,0
130538,4177,2020-06-01,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0,...,2,-2,0.285714,0.0,0.451754,0.204082,0.000000,1.000000,0.000000,0
130630,326846,2020-07-01,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,...,1,-1,0.142857,0.0,0.349927,0.122449,0.000000,1.000000,0.000000,0
130645,4478,2020-06-01,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,...,1,-1,0.142857,0.0,0.349927,0.122449,0.000000,0.000000,1.000000,1


In [7]:
# Preview the engineered test table.
test

Unnamed: 0,clientbankpartner_pin,partnerrolestart_date,day_0,day_1,day_2,day_3,day_4,day_5,day_6,1,...,max_diff,min_diff,avg_weekdays,median_weekdays,std_weekdays,var_weekdays,weekday2weekends,weekday2all,weekends2all,active_in_weekends
1,270277,2020-04-01,48.0,47.0,52.0,119.0,50.0,63.0,94.0,0,...,60,-23,67.571429,52.0,25.971727,674.530612,2.012739,0.668076,0.331924,1
2,238679,2020-02-01,1.0,0.0,2.0,1.0,1.0,1.0,0.0,0,...,1,-1,0.857143,1.0,0.638877,0.408163,5.000000,0.833333,0.166667,1
5,121046,2020-03-01,2.0,0.0,3.0,3.0,0.0,0.0,2.0,0,...,2,-2,1.428571,2.0,1.293626,1.673469,4.000000,0.800000,0.200000,1
6,13084,2020-05-01,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0,...,2,-2,0.571429,1.0,0.494872,0.244898,3.000000,0.750000,0.250000,1
8,276484,2020-04-01,0.0,0.0,2.0,2.0,1.0,4.0,2.0,0,...,3,-2,1.571429,2.0,1.293626,1.673469,0.833333,0.454545,0.545455,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124535,73807,2020-11-01,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,...,1,0,0.142857,0.0,0.349927,0.122449,0.000000,0.000000,1.000000,1
125436,120853,2018-12-01,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,...,1,-1,0.285714,0.0,0.451754,0.204082,1.000000,0.500000,0.500000,1
125813,121120,2020-11-01,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,...,1,0,0.142857,0.0,0.349927,0.122449,0.000000,0.000000,1.000000,1
128169,60667,2020-10-01,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,...,1,-1,0.142857,0.0,0.349927,0.122449,0.000000,1.000000,0.000000,0


In [8]:
# Columns present in test but missing from train (here a one-hot category that
# does not occur among the train rows).
set(test.columns) - set(train.columns)

{'2_oh'}

In [9]:
# Add the missing one-hot column as all zeros so train and test share a schema.
train["2_oh"] = 0

In [10]:
# Put train columns in the same order as test, with the target last.
# NOTE(review): assumes test has no 'target' column yet - re-running this cell
# after predictions were written to test would duplicate it.
train = train[list(test.columns) + ['target']]

In [11]:
# Class balance of the target (1 = no new clients in the target months).
train["target"].value_counts()

target
1    4966
0    2929
Name: count, dtype: int64

## Ensemble + CV

### RFE

In [12]:
# 80/20 hold-out split of the training table; identifier and raw date columns are not features.
X_train, X_test, y_train, y_test = train_test_split(train.drop(columns=["clientbankpartner_pin", "partnerrolestart_date", "target"]), train["target"], train_size=0.8, random_state=42)

# Shallow random forest used as a baseline and as the estimator for RFE below.
classifier = RandomForestClassifier(max_depth=6, n_estimators=100, random_state=42)

classifier.fit(X_train, y_train)

In [13]:
# Hold-out ROC-AUC of the baseline random forest.
roc_auc_score(y_test, classifier.predict_proba(X_test)[:, 1])

0.8673588758641213

In [14]:
# Feature selection. With n_features_to_select=None scikit-learn's RFE keeps
# half of the features (the output below lists 39 selected columns).
rfe = RFE(classifier, n_features_to_select= None)
rfe = rfe.fit(X_train, y_train)
# ranking_ == 1 marks a selected feature; support_ is the matching boolean mask.
print(rfe.ranking_)
X_train.columns[rfe.support_]

[ 1  5  9  1 14  1  7 28 35 34 33 31 32 36 27 26 18 15  8  1  1  1  1  1
  1  4  1 38  1  1  1  1  1  1  1  1  1  2  1  1  1  1  1  1 12 39 30 40
 37 29 17 23 25 20 24 22 16 21 13  6 10 11  3  1  1  1  1  1  1  1  1  1
  1  1  1  1  1 19]


Index(['day_0', 'day_3', 'day_5', '13', '14', '15', '16', '17', '18',
       'partnerrolestart_date_month', 'partnerrolestart_date_dayofweek',
       'clients_count', 'days_after_partnership', 'active_months',
       'last_active_month_ago', 'avg_clients_monthly',
       'median_clients_monthly', 'std_clients_monthly', 'var_clients_monthly',
       'max_zero_month', 'is_any_1_month', 'is_any_2_month', 'is_any_3_month',
       'last_month_to_avg', 'last_month_to_all', '14-13', '15-14', '16-15',
       '17-16', '18-17', 'max_diff', 'min_diff', 'avg_weekdays',
       'median_weekdays', 'std_weekdays', 'var_weekdays', 'weekday2weekends',
       'weekday2all', 'weekends2all'],
      dtype='object')

In [61]:
def hill_climbing(x, y, x_test):
    """
    Greedily blend model predictions to maximise ROC-AUC against ``y``.

    Models are ranked by their individual ROC-AUC; the best one is the starting
    blend. At every step each remaining model is tried with every weight in
    ``np.arange(-0.5, 0.51, 0.01)`` as ``(1 - w) * blend + w * model``; the
    single (model, weight) pair that improves the score the most is applied to
    both the validation blend and the test blend, and that model is removed
    from the pool. The search stops when no pair improves the score or the
    pool is empty. Because weights may be negative, blended values are not
    guaranteed to stay inside [0, 1].

    :param x: DataFrame, one column of validation predictions per model
    :param y: true labels for the rows of ``x``
    :param x_test: DataFrame with the same columns as ``x`` holding test predictions
    :return: ``[blended validation predictions, blended test predictions]``
    """
    # Rank models by standalone score, best first (stable for ties).
    ranked = sorted(x.columns, key=lambda name: roc_auc_score(y, x[name]), reverse=True)

    best_blend = x[ranked[0]]
    best_test_blend = x_test[ranked[0]]
    # Track the remaining models by name instead of dropping columns in place
    # from a frame derived from ``x``.
    candidates = list(ranked[1:])
    weight_range = np.arange(-0.5, 0.51, 0.01)

    while candidates:
        best_score = roc_auc_score(y, best_blend)
        best_name, best_weight = None, None
        for name in candidates:
            for weight in weight_range:
                trial = (1 - weight) * best_blend + weight * x[name]
                trial_score = roc_auc_score(y, trial)
                if trial_score > best_score:
                    best_score = trial_score
                    best_name, best_weight = name, weight

        if best_name is None:
            break  # no remaining model improves the blend

        best_blend = (1 - best_weight) * best_blend + best_weight * x[best_name]
        best_test_blend = (1 - best_weight) * best_test_blend + best_weight * x_test[best_name]
        candidates.remove(best_name)

    return [best_blend, best_test_blend]

In [62]:
import sys, os
original_stdout = sys.stdout

In [63]:
import contextlib
import os

# Columns treated as categorical by LightGBM and excluded from the
# "without one hot / without cat" model variants.
cat_features = ['partnerrolestart_date_year', 'partnerrolestart_date_month',
                'partnerrolestart_date_day', '0_oh', '1_oh', '2_oh', '3_oh', '4_oh', '5_oh', 'active_in_weekends']

one_hot_cols = ['0_oh', '1_oh', '3_oh', '4_oh', '5_oh']

# Non-feature columns of the test table ('target' exists only after a previous run of the notebook).
test_drop_cols = ['clientbankpartner_pin', 'partnerrolestart_date']
if 'target' in test.columns:
    test_drop_cols.append('target')

ens_cv_scores, ens_preds = list(), list()
hill_ens_cv_scores, hill_ens_preds = list(), list()
ridge_ens_cv_scores, ridge_ens_preds = list(), list()
hill_ens_cv_recall = list()

X, y = train.drop(columns=["clientbankpartner_pin", "partnerrolestart_date", "target"]), train["target"]

# Fold-independent inputs, computed once instead of on every fold.
rfe_cols = X.columns[rfe.support_]
test_features = test.drop(columns=test_drop_cols)
test_features_no_cat = test.drop(columns=cat_features + test_drop_cols)
test_features_rfe = test[rfe_cols]

# NOTE(review): each fold's held-out part is used as eval_set, for the hill-climbing
# weight search and for the reported score, so the reported numbers are optimistic.
# NOTE(review): recent XGBoost releases expect eval_metric in the constructor
# rather than in fit() - confirm against the installed version.
skf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=52)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print('----------------------------------------------------------')
    x_train_fold, x_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
    x_train_fold_no_cat = x_train_fold.drop(columns=cat_features)
    x_test_fold_no_cat = x_test_fold.drop(columns=cat_features)

    # XGBoost without one hot
    xg_model_withount_oh = XGBClassifier(random_state=52, **xg_params_without_oh)
    xg_model_withount_oh.fit(x_train_fold_no_cat, y_train_fold,
                             eval_set=[(x_test_fold_no_cat, y_test_fold)],
                             eval_metric='auc',
                             verbose=0)
    xg_model_withount_oh_pred = xg_model_withount_oh.predict_proba(x_test_fold_no_cat)[:, 1]
    xg_model_withount_oh_score = roc_auc_score(y_test_fold, xg_model_withount_oh_pred)
    print('Fold', i, '==> XG without one hot oof ROC-AUC score is ==>', xg_model_withount_oh_score)
    xg_model_withount_oh_test_pred = xg_model_withount_oh.predict_proba(test_features_no_cat)[:, 1]

    # XGBoost with one hot
    xg = XGBClassifier(random_state=52, **xg_params)
    xg.fit(x_train_fold, y_train_fold,
           eval_set=[(x_test_fold, y_test_fold)],
           eval_metric='auc',
           verbose=0)
    xg_pred = xg.predict_proba(x_test_fold)[:, 1]
    xg_score = roc_auc_score(y_test_fold, xg_pred)
    print('Fold', i, '==> XG with one hot oof ROC-AUC score is ==>', xg_score)
    xg_test_pred = xg.predict_proba(test_features)[:, 1]

    # LightGBM with categorical features.
    # Training chatter is sent to os.devnull; the context managers close the
    # handle and restore stdout even if fitting raises.
    with open(os.devnull, 'w') as devnull, contextlib.redirect_stdout(devnull):
        li = LGBMClassifier(random_state=52, **li_cat_features_params)
        li.fit(x_train_fold, y_train_fold,
               eval_set=[(x_test_fold, y_test_fold)],
               categorical_feature=cat_features,
               eval_metric='auc')
        li_pred = li.predict_proba(x_test_fold)[:, 1]
        li_score = roc_auc_score(y_test_fold, li_pred)
    print('Fold', i, '==> LightGBM oof ROC-AUC score is ==>', li_score)
    li_test_pred = li.predict_proba(test_features)[:, 1]

    # LightGBM without categorical features
    with open(os.devnull, 'w') as devnull, contextlib.redirect_stdout(devnull):
        li_withount_cat = LGBMClassifier(random_state=52, **li_params)
        li_withount_cat.fit(x_train_fold_no_cat, y_train_fold,
                            eval_set=[(x_test_fold_no_cat, y_test_fold)],
                            eval_metric='auc')
        li_withount_cat_pred = li_withount_cat.predict_proba(x_test_fold_no_cat)[:, 1]
        li_withount_cat_score = roc_auc_score(y_test_fold, li_withount_cat_pred)
    print('Fold', i, '==> LightGBM without cat features oof ROC-AUC score is ==>', li_withount_cat_score)
    li_withount_cat_test_pred = li_withount_cat.predict_proba(test_features_no_cat)[:, 1]

    # Logreg with rfe
    logreg = LogisticRegression(random_state=52)
    logreg.fit(x_train_fold[rfe_cols], y_train_fold)
    logreg_pred = logreg.predict_proba(x_test_fold[rfe_cols])[:, 1]
    logreg_score = roc_auc_score(y_test_fold, logreg_pred)
    print('Fold', i, '==> Logreg with RFE oof ROC-AUC score is ==>', logreg_score)
    logreg_test_pred = logreg.predict_proba(test_features_rfe)[:, 1]

    # RF with rfe
    rf = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=52)
    rf.fit(x_train_fold[rfe_cols], y_train_fold)
    rf_pred = rf.predict_proba(x_test_fold[rfe_cols])[:, 1]
    rf_score = roc_auc_score(y_test_fold, rf_pred)
    print('Fold', i, '==> RF with RFE oof ROC-AUC score is ==>', rf_score)
    rf_test_pred = rf.predict_proba(test_features_rfe)[:, 1]

    # Ensemble: plain average of the six models
    ens_pred_1 = (xg_model_withount_oh_pred + xg_pred + li_pred + li_withount_cat_pred + logreg_pred + rf_pred) / 6
    ens_pred_2 = (xg_model_withount_oh_test_pred + xg_test_pred + li_test_pred + li_withount_cat_test_pred + logreg_test_pred + rf_test_pred) / 6

    ens_score_fold = roc_auc_score(y_test_fold, ens_pred_1)
    ens_cv_scores.append(ens_score_fold)
    ens_preds.append(ens_pred_2)
    print('Fold', i, '==> Average Ensemble oof ROC-AUC score is ==>', ens_score_fold)

    # Hill Climbing Ensemble
    oof_preds = pd.DataFrame({'XGB_without_oh': xg_model_withount_oh_pred,
                              'XGB': xg_pred,
                              'LightGBM': li_pred,
                              'LightGBM_without_cat': li_withount_cat_pred,
                              'Logreg': logreg_pred,
                              'rf': rf_pred})

    oof_test_preds = pd.DataFrame({'XGB_without_oh': xg_model_withount_oh_test_pred,
                                   'XGB': xg_test_pred,
                                   'LightGBM': li_test_pred,
                                   'LightGBM_without_cat': li_withount_cat_test_pred,
                                   'Logreg': logreg_test_pred,
                                   'rf': rf_test_pred})

    hill_results = hill_climbing(oof_preds, y_test_fold, oof_test_preds)

    # Recall at a 0.5 threshold on the blended validation predictions.
    hill_ens_cv_recall.append(recall_score(y_test_fold, (hill_results[0] > 0.5).astype(int)))
    hill_ens_score_fold = roc_auc_score(y_test_fold, hill_results[0])
    hill_ens_cv_scores.append(hill_ens_score_fold)
    hill_ens_preds.append(hill_results[1])

    print('Fold', i, '==> Hill Climbing Ensemble oof ROC-AUC score is ==>', hill_ens_score_fold)

----------------------------------------------------------
Fold 0 ==> XG without one hot oof ROC-AUC score is ==> 0.8357448444935827
Fold 0 ==> XG with one hot oof ROC-AUC score is ==> 0.8381449104181403
Fold 0 ==> LightGBM oof ROC-AUC score is ==> 0.8368573214028197
Fold 0 ==> LightGBM without cat features oof ROC-AUC score is ==> 0.8330460579174708
Fold 0 ==> Logreg with RFE oof ROC-AUC score is ==> 0.8296845921948072
Fold 0 ==> RF with RFE oof ROC-AUC score is ==> 0.8346014654479781
Fold 0 ==> Average Ensemble oof ROC-AUC score is ==> 0.8379800990241791
Fold 0 ==> Hill Climbing Ensemble oof ROC-AUC score is ==> 0.8388453588424747
----------------------------------------------------------
Fold 1 ==> XG without one hot oof ROC-AUC score is ==> 0.854646651238489
Fold 1 ==> XG with one hot oof ROC-AUC score is ==> 0.8569883464610186
Fold 1 ==> LightGBM oof ROC-AUC score is ==> 0.8569883464610187
Fold 1 ==> LightGBM without cat features oof ROC-AUC score is ==> 0.8549007354708456
Fold 1 

In [64]:
# Report the real number of evaluated folds instead of a hard-coded count
# (the splitter above produces n_splits * n_repeats folds).
n_folds = len(ens_cv_scores)
print(f'The average ensemble oof ROC-AUC score over the {n_folds}-folds is', np.mean(ens_cv_scores))
print(f'The hill climbing ensemble oof ROC-AUC score over the {n_folds}-folds is', np.mean(hill_ens_cv_scores))
print(f'The hill climbing ensemble oof recall score over the {n_folds}-folds is', np.mean(hill_ens_cv_recall))

The average ensemble oof ROC-AUC score over the 30-folds is 0.8622724576162868
The hill climbing ensemble oof ROC-AUC score over the 30-folds is 0.8651791989029975
The hill climbing ensemble oof recall score over the 30-folds is 0.9530890991107939


In [65]:
# Average the per-fold hill-climbing test predictions (one row per fold,
# one column per test partner) into a single score per partner.
ens_preds_test = pd.DataFrame(hill_ens_preds).apply(np.mean, axis = 0).to_numpy()

# Stored positionally: assumes the rows of test are in the order they were scored.
test['target'] = ens_preds_test

In [66]:
def f(x):  # Look up the predicted score for a clientbankpartner_pin
    pin_rows = test["clientbankpartner_pin"] == x
    # First matching row wins; raises IndexError when the pin is absent from test.
    return test.loc[pin_rows, "target"].values[0]


sub_example["score"] = sub_example["clientbankpartner_pin"].apply(f)

In [59]:
# Hill-climbing weights range over [-0.5, 0.5], so a blended score can drop
# below zero; floor the scores at 0.
sub_example["score"] = sub_example["score"].apply(lambda x: max(0, x))

In [60]:
# Write the submission file without the pandas index column.
sub_example.to_csv("submission.csv", index=False)

In [46]:
# Preview the final submission table.
sub_example

Unnamed: 0,clientbankpartner_pin,score
0,6781,0.149815
1,236905,0.244713
2,125779,0.435899
3,1952,0.491446
4,4872,0.709634
...,...,...
4503,121120,0.641642
4504,60667,0.815838
4505,5065,0.821993
4506,133125,0.582581
