In [1]:
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from tqdm.notebook import tqdm

import re
import os

from functools import partial
from scipy.stats import mode

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns
import plotly.express as px

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, FunctionTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.isotonic import IsotonicRegression
from sklearn.calibration import CalibrationDisplay
from sklearn.inspection import PartialDependenceDisplay, permutation_importance
from sklearn.linear_model import LogisticRegression
from collections import Counter
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVC
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.manifold import TSNE
import optuna


In [18]:
data = pd.DataFrame(columns=['a', 'b', 'c'])
a = {
    "a":1,
    "b":2,
    "c":3
}

In [21]:
data = data.append(a, ignore_index=True)
data

Unnamed: 0,a,b,c
0,1,2,3
1,1,2,3
2,1,2,3


In [13]:
data.loc[data].c1 = 4
data.loc[data].c2 = 5
data.loc[data].c3 = 6

ValueError: Cannot index with multidimensional key

<a id="3"></a>
# <h1 style="background-color:lightgray;font-family:newtimeroman;font-size:350%;text-align:center;border-radius: 15px 50px;">Reading Data Files</h1> 

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sub = pd.read_csv('data/sample_submission.csv')

print('The dimension of the train dataset is:', train.shape)
print('The dimension of the test dataset is:', test.shape)

The dimension of the train dataset is: (5237980, 17)
The dimension of the test dataset is: (33000, 16)


In [3]:
train.describe()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id
count,5237980.0,5237980.0,5237980.0,5237760.0,5237980.0,5237760.0,5237760.0,2343638.0,2380800.0,5237760.0,5237980.0,5237760.0,5237980.0,5237760.0,5237892.0,5237980.0
mean,99.28856,241.51,270.0,5715293.0,-0.01189619,0.9999955,45100250.0,1.001713,0.9996601,0.9997263,51813.59,1.000264,53575.68,0.999992,-0.04756125,13310.05
std,57.87176,138.5319,158.7451,20515910.0,0.8853374,0.002532497,139841300.0,0.7214705,0.0121692,0.002499345,111421.4,0.002510042,129355.4,0.002497509,9.45286,7619.271
min,0.0,0.0,0.0,0.0,-1.0,0.935285,4316.61,7.7e-05,0.786988,0.934915,0.0,0.939827,0.0,0.938008,-385.2898,0.0
25%,49.0,122.0,130.0,84534.15,-1.0,0.998763,5279575.0,0.996332,0.9971,0.998529,7374.72,0.999029,7823.7,0.998781,-4.559755,6729.0
50%,99.0,242.0,270.0,1113604.0,0.0,0.999967,12882640.0,0.999883,0.999889,0.999728,21969.0,1.000207,23017.92,0.999997,-0.06020069,13345.0
75%,149.0,361.0,410.0,4190951.0,1.0,1.001174,32700130.0,1.003318,1.00259,1.000905,55831.68,1.001414,57878.41,1.001149,4.409552,19907.0
max,199.0,480.0,540.0,2982028000.0,1.0,1.077488,7713682000.0,437.9531,1.309732,1.077488,30287840.0,1.077836,54405000.0,1.077675,446.0704,26454.0


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5237980 entries, 0 to 5237979
Data columns (total 17 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   stock_id                 int64  
 1   date_id                  int64  
 2   seconds_in_bucket        int64  
 3   imbalance_size           float64
 4   imbalance_buy_sell_flag  int64  
 5   reference_price          float64
 6   matched_size             float64
 7   far_price                float64
 8   near_price               float64
 9   bid_price                float64
 10  bid_size                 float64
 11  ask_price                float64
 12  ask_size                 float64
 13  wap                      float64
 14  target                   float64
 15  time_id                  int64  
 16  row_id                   object 
dtypes: float64(11), int64(5), object(1)
memory usage: 679.4+ MB


In [5]:
test.describe()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,time_id
count,33000.0,33000.0,33000.0,33000.0,33000.0,33000.0,33000.0,14771.0,15000.0,33000.0,33000.0,33000.0,33000.0,33000.0,33000.0
mean,99.5,479.0,270.0,6636939.0,-0.147848,1.000151,50988970.0,0.99823,0.998085,0.999915,63294.9,1.000407,66512.53,1.000154,26372.0
std,57.73518,0.816509,158.747484,18468270.0,0.918109,0.00189,169510500.0,0.023915,0.014532,0.001931,104273.6,0.00184,141507.3,0.001838,47.631244
min,0.0,478.0,0.0,0.0,-1.0,0.97335,4316.61,0.804124,0.892179,0.973023,25.87,0.973132,35.34,0.973025,26290.0
25%,49.75,478.0,130.0,333351.3,-1.0,0.999381,5956303.0,0.994066,0.996249,0.999206,9993.66,0.999628,12122.94,0.999399,26331.0
50%,99.5,479.0,270.0,1652495.0,-1.0,1.00024,13227110.0,0.999833,0.999873,1.000034,28766.32,1.000451,32314.84,1.000229,26372.0
75%,149.25,480.0,410.0,5296197.0,1.0,1.00108,34777210.0,1.003095,1.002058,1.000862,71123.38,1.001297,75120.55,1.001059,26413.0
max,199.0,480.0,540.0,478154800.0,1.0,1.009317,2834017000.0,1.364387,1.10214,1.008752,3033798.0,1.010889,13092090.0,1.009047,26454.0


* there are missing values, there are more than  of them in two columns 50%

In [6]:
# одинаковые строки
to_check = pd.merge(train.drop(columns = ['target'], axis = 1), test, how='inner', on='row_id')
to_check.shape

(33000, 31)

In [7]:
train = train[~train['row_id'].isin(test['row_id'])]

In [8]:
to_check = pd.merge(train.drop(columns = ['target'], axis = 1), test, how='inner', on='row_id')
to_check.shape

(0, 31)

In [9]:
train = train.drop_duplicates()

In [10]:
train.shape

(5204980, 17)

In [11]:
train = train.drop(["row_id", "stock_id", "date_id"], axis=1)
test = test.drop(["row_id", "stock_id", "date_id"], axis=1)

# memory

In [12]:
def mem_usage(pandas_obj, flag=True, type_obj='int'):
        if isinstance(pandas_obj, pd.DataFrame):
            usage_b = pandas_obj.memory_usage(deep=True).sum()
        else:  # исходим из предположения о том, что если это не DataFrame, то это Series
            usage_b = pandas_obj.memory_usage(deep=True)
        usage_mb = usage_b / 1024 ** 2  # преобразуем байты в мегабайты
        if flag:
            return "До преобразования {}: {:03.2f} MB".format(type_obj, usage_mb)
        else:
            return "Послеле преобразования {}: {:03.2f} MB".format(type_obj, usage_mb)

def optim_memory(data):
        for dtype in ['float', 'int', 'object']:
            selected_dtype = data.select_dtypes(include=[dtype])
            mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
            mean_usage_mb = mean_usage_b / 1024 ** 2
            print("Average memory usage for {} columns: {:03.2f} MB".format(dtype, mean_usage_mb))
        print("*" * 20)

        gl_int = data.select_dtypes(include=['int'])
        converted_int = gl_int.apply(pd.to_numeric, downcast='unsigned')
        print(mem_usage(gl_int))
        print(mem_usage(converted_int, flag=False))
        print("*" * 20)

        gl_float = data.select_dtypes(include=['float'])
        converted_float = gl_float.apply(pd.to_numeric, downcast='float')

        print(mem_usage(gl_float, type_obj='float'))
        print(mem_usage(converted_float, flag=False, type_obj='float'))
        print("*" * 20)

        optimized_gl = data.copy()

        optimized_gl[converted_int.columns] = converted_int
        optimized_gl[converted_float.columns] = converted_float

        print(mem_usage(data, type_obj='DataFrame numerical'))
        print(mem_usage(optimized_gl, flag=False, type_obj='DataFrame numerical'))

        return optimized_gl

In [13]:
train = optim_memory(train)

Average memory usage for float columns: 39.71 MB
Average memory usage for int columns: 39.71 MB
Average memory usage for object columns: 39.71 MB
********************
До преобразования int: 158.84 MB
Послеле преобразования int: 99.28 MB
********************
До преобразования float: 476.53 MB
Послеле преобразования float: 258.12 MB
********************
До преобразования DataFrame numerical: 595.66 MB
Послеле преобразования DataFrame numerical: 317.69 MB


In [14]:
test = optim_memory(test)

Average memory usage for float columns: 0.23 MB
Average memory usage for int columns: 0.19 MB
Average memory usage for object columns: 0.00 MB
********************
До преобразования int: 0.76 MB
Послеле преобразования int: 0.38 MB
********************
До преобразования float: 2.52 MB
Послеле преобразования float: 1.26 MB
********************
До преобразования DataFrame numerical: 3.27 MB
Послеле преобразования DataFrame numerical: 1.64 MB


In [15]:
print("train\n")
for i in train.columns:
    print(i, train[i].isna().sum())
print("\ntest\n")
for i in train.columns:
    print(i, train[i].isna().sum())

train

seconds_in_bucket 0
imbalance_size 220
imbalance_buy_sell_flag 0
reference_price 220
matched_size 220
far_price 2876113
near_price 2839180
bid_price 220
bid_size 0
ask_price 220
ask_size 0
wap 220
target 88
time_id 0

test

seconds_in_bucket 0
imbalance_size 220
imbalance_buy_sell_flag 0
reference_price 220
matched_size 220
far_price 2876113
near_price 2839180
bid_price 220
bid_size 0
ask_price 220
ask_size 0
wap 220
target 88
time_id 0


In [16]:
train = train.drop(["near_price", "far_price"], axis=1)
test = test.drop(["near_price", "far_price"], axis=1)

In [17]:
def median_mode(data):
        col_obj = []
        col_num = []
        for i in tqdm(data.columns):
            if data[i].isna().sum() == 0:
                continue
            if data[i].dtype == "O":

                col_obj.append(i)
                data[i] = data[i].fillna(data[i].mode()[0])
            else:
                col_num.append(i)
                data[i] = data[i].fillna(data[i].median())
        print(f"на моду заменены значения в колонках: {col_obj}")
        print(f"на медиану заменены значения в колонках: {col_num}")
        return data

In [18]:
train = median_mode(train)
test = median_mode(test)

  0%|          | 0/12 [00:00<?, ?it/s]

на моду заменены значения в колонках: []
на медиану заменены значения в колонках: ['imbalance_size', 'reference_price', 'matched_size', 'bid_price', 'ask_price', 'wap', 'target']


  0%|          | 0/11 [00:00<?, ?it/s]

на моду заменены значения в колонках: []
на медиану заменены значения в колонках: []


In [19]:
# замена на 99 перцентиль все значения выше, замена на 1 перцентиль все значения ниже
data_perc = train.copy()
for col in tqdm(data_perc.columns):
    len_more_quant = data_perc.loc[data_perc[col] > data_perc[col].quantile(0.99)].shape[0]
    len_less_quant = data_perc.loc[data_perc[col] < data_perc[col].quantile(0.01)].shape[0]
    if len_more_quant > 0:
        data_perc.loc[data_perc[col] > data_perc[col].quantile(0.99), col] = data_perc[col].quantile(0.99)
    if len_less_quant > 0:
        data_perc.loc[data_perc[col] < data_perc[col].quantile(0.01), col] = data_perc[col].quantile(0.01)

  0%|          | 0/12 [00:00<?, ?it/s]

In [20]:
train = data_perc

# find anomalies

In [21]:
from sklearn.ensemble import IsolationForest

In [22]:
iso_forest = IsolationForest(n_estimators=100, contamination='auto', max_features=len(train.drop("target", axis=1).columns), n_jobs=-1, random_state=0)
iso_forest.fit(train.drop("target", axis=1))

IsolationForest(max_features=11, n_jobs=-1, random_state=0)

In [23]:
iso_forest_prediction = iso_forest.predict(train.drop("target", axis=1))
iso_forest_prediction_test = iso_forest.predict(test)

In [24]:
# кол-во предполагаемых аномалий
(iso_forest_prediction == -1).sum(), (iso_forest_prediction_test == -1).sum()

(911786, 6107)

In [25]:
train['anomal'] = iso_forest_prediction
test['anomal'] = iso_forest_prediction_test

In [26]:
# train = train.loc[train['anomal'] != -1]
# test = test.loc[test['anomal'] != -1]
# train.shape, test.shape

In [27]:
# train.columns

<a id="5"></a>
# <h1 style="background-color:lightgray;font-family:newtimeroman;font-size:350%;text-align:center;border-radius: 15px 50px;">Baseline Modeling 1.0</h1>

Во-первых, мы начинаем с создания некоторых стандартных моделей без разработки функциональных возможностей или HPO. Сначала мы определяем входные и целевые функции.

In [28]:
X = train.drop(columns = ['target'], axis = 1)
Y = train['target']

test_cv = test

Сначала мы определяем функцию ансамбля Хилла для прогнозирования модели ансамбля.

In [32]:
def hill_climbing(x, y, x_test):
    
    # Evaluating oof predictions
    scores = {}
    for col in x.columns:
        scores[col] = mean_absolute_error(y, x[col])

    # Sorting the model scores
    scores = {k: v for k, v in sorted(scores.items(), key = lambda item: item[1], reverse = True)}

    # Sort oof_df and test_preds
    x = x[list(scores.keys())]
    x_test = x_test[list(scores.keys())]

    STOP = False
    current_best_ensemble = x.iloc[:,0]
    current_best_test_preds = x_test.iloc[:,0]
    MODELS = x.iloc[:,1:]
    weight_range = np.arange(-0.5, 0.51, 0.01) 
    history = [mean_absolute_error(y, current_best_ensemble)]
    j = 0

    while not STOP:
        j += 1
        potential_new_best_cv_score = mean_absolute_error(y, current_best_ensemble)
        k_best, wgt_best = None, None
        for k in MODELS:
            for wgt in weight_range:
                potential_ensemble = (1 - wgt) * current_best_ensemble + wgt * MODELS[k]
                cv_score = mean_absolute_error(y, potential_ensemble)
                if cv_score > potential_new_best_cv_score:
                    potential_new_best_cv_score = cv_score
                    k_best, wgt_best = k, wgt

        if k_best is not None:
            current_best_ensemble = (1 - wgt_best) * current_best_ensemble + wgt_best * MODELS[k_best]
            current_best_test_preds = (1 - wgt_best) * current_best_test_preds + wgt_best * x_test[k_best]
            MODELS.drop(k_best, axis = 1, inplace = True)
            if MODELS.shape[1] == 0:
                STOP = True
            history.append(potential_new_best_cv_score)
        else:
            STOP = True
        
    hill_ens_pred_1 = current_best_ensemble
    hill_ens_pred_2 = current_best_test_preds
    
    return [hill_ens_pred_1, hill_ens_pred_2]

Затем мы создаем несколько стандартных моделей в рамках процедуры 5-кратной перекрестной проверки.

In [33]:
ens_cv_scores, ens_preds = list(), list()
hill_ens_cv_scores, hill_ens_preds =  list(), list()

sk = KFold(n_splits = 5)
for i, (train_idx, test_idx) in enumerate(sk.split(X, Y)):

    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]
    
    print('----------------------------------------------------------')
    
    ########
    ## RF ##
    ########
    print("randomForest fit, fold: {}".format(i))
    RF_md = RandomForestRegressor(n_estimators = 1000, 
                                   max_depth = 8,
                                   min_samples_split = 15,
                                   min_samples_leaf = 10).fit(X_train, Y_train)
    
    RF_pred = RF_md.predict(X_test)
    RF_score = mean_absolute_error(Y_test, RF_pred)

    print('Fold', i, '==> RF oof MAE score is ==>', RF_score)

    RF_pred_test = RF_md.predict(test_cv)
    
    #################
    ## Extra Trees ##
    #################
    print("ExtraTrees fit, fold: {}".format(i))
    ET_md = ExtraTreesRegressor(n_estimators = 1000, 
                                 max_depth = 8,
                                 min_samples_split = 15,
                                 min_samples_leaf = 10).fit(X_train, Y_train)

    ET_pred = ET_md.predict(X_test)
    ET_score = mean_absolute_error(Y_test, ET_pred)

    print('Fold', i, '==> ET oof MAE score is ==>', ET_score)

    ET_pred_test = ET_md.predict(test_cv)

    ##########################
    ## HistGradientBoosting ##
    ##########################
    
    print("pipeline fit, fold: {}".format(i))
    hist_md = make_pipeline(StandardScaler(),
                           HistGradientBoostingRegressor(l2_regularization = 0.01,
                                             early_stopping = False,
                                             learning_rate = 0.005,
                                             max_iter = 1000,
                                             max_depth = 8,
                                             max_bins = 255,
                                             min_samples_leaf = 15,
                                             max_leaf_nodes = 10)).fit(X_train, Y_train)
    
#     hist_md = HistGradientBoostingClassifier(l2_regularization = 0.01,
#                                              early_stopping = False,
#                                              learning_rate = 0.01,
#                                              max_iter = 500,
#                                              max_depth = 5,
#                                              max_bins = 255,
#                                              min_samples_leaf = 15,
#                                              max_leaf_nodes = 10).fit(X_train, Y_train)
    
    hist_pred = hist_md.predict(X_test)
    hist_score = mean_absolute_error(Y_test, hist_pred)

    print('Fold', i, '==> Hist oof MAE score is ==>', hist_score)  

    hist_pred_test = hist_md.predict(test_cv)

    ##########
    ## LGBM ##
    ##########
        
    print("lgbm fit, fold: {}".format(i))
    LGBM_md = LGBMRegressor(objective = 'regression',
                             n_estimators = 1000,
                             max_depth = 8,
                             learning_rate = 0.005,
                             num_leaves = 20,
                             reg_alpha = 3,
                             reg_lambda = 3,
                             subsample = 0.7,
                             colsample_bytree = 0.7).fit(X_train, Y_train)

    lgb_pred = LGBM_md.predict(X_test)
    lgb_score = mean_absolute_error(Y_test, lgb_pred)

    print('Fold', i, '==> LGBM oof MAE score is ==>', lgb_score) 

    lgb_pred_test = LGBM_md.predict(test_cv)

    #########
    ## XGB ##
    #########
    
    print("xgb fit, fold: {}".format(i))
    XGB_md = XGBRegressor(
#         objective = 'regression',
                           tree_method = 'hist',
                           colsample_bytree = 0.7, 
                           gamma = 2, 
                           learning_rate = 0.005, 
                           max_depth = 8, 
                           min_child_weight = 10, 
                           n_estimators = 1000, 
                           subsample = 0.7).fit(X_train, Y_train)

    xgb_pred = XGB_md.predict(X_test)
    xgb_score = mean_absolute_error(Y_test, xgb_pred)

    print('Fold', i, '==> XGB oof MAE score is ==>', xgb_score)

    xgb_pred_test = XGB_md.predict(test_cv)

    ##############
    ## CatBoost ##
    ##############
    
    print("CatBoost fit, fold: {}".format(i))
    Cat_md = CatBoostRegressor(loss_function = 'MAE',
                                iterations = 1000,
                                learning_rate = 0.005,
                                depth = 8,
                                random_strength = 0.5,
                                bagging_temperature = 0.7,
                                border_count = 30,
                                l2_leaf_reg = 5,
                                verbose = False, 
                                task_type = 'CPU').fit(X_train, Y_train)

    cat_pred = Cat_md.predict(X_test)
    cat_score = mean_absolute_error(Y_test, cat_pred)

    print('Fold', i, '==> CatBoost oof MAE score is ==>', cat_score)

    cat_pred_test = Cat_md.predict(test_cv)
    
    ##############
    ## Ensemble ##
    ##############
    
    print("ensemble fit, fold: {}".format(i))
    ens_pred_1 = (RF_pred + ET_pred + hist_pred + lgb_pred + xgb_pred + cat_pred) / 6
    ens_pred_2 = (RF_pred_test + ET_pred_test + hist_pred_test + lgb_pred_test + xgb_pred_test + cat_pred_test) / 6
    
    ens_score_fold = mean_absolute_error(Y_test, ens_pred_1)
    ens_cv_scores.append(ens_score_fold)
    ens_preds.append(ens_pred_2)
    
    print('Fold', i, '==> Average Ensemble oof MAE score is ==>', ens_score_fold)
    
    ############################
    ## Hill Climbing Ensemble ##
    ############################
    
    print("Hill fit, fold: {}".format(i))
    x = pd.DataFrame({'RF': RF_pred,
                      'ET': ET_pred, 
                      'Hist': hist_pred, 
                      'LGBM': lgb_pred,
                      'XGB': xgb_pred,
                      'Cat': cat_pred})
    y = Y_test
        
    x_test = pd.DataFrame({'RF': RF_pred_test,
                           'ET': ET_pred_test, 
                           'Hist': hist_pred_test, 
                           'LGBM': lgb_pred_test,
                           'XGB': xgb_pred_test,
                           'Cat': cat_pred_test})
    
    hill_results = hill_climbing(x, y, x_test)
    
    hill_ens_score_fold = mean_absolute_error(y, hill_results[0])
    hill_ens_cv_scores.append(hill_ens_score_fold)
    hill_ens_preds.append(hill_results[1])

    print('Fold', i, '==> Hill Climbing Ensemble oof MAE score is ==>', hill_ens_score_fold)

----------------------------------------------------------
randomForest fit, fold: 0
Fold 0 ==> RF oof MAE score is ==> 5.591070736758035
ExtraTrees fit, fold: 0
Fold 0 ==> ET oof MAE score is ==> 5.587777620849835
pipeline fit, fold: 0
Fold 0 ==> Hist oof MAE score is ==> 5.577308162864549
lgbm fit, fold: 0
Fold 0 ==> LGBM oof MAE score is ==> 5.570430369459393
xgb fit, fold: 0
Fold 0 ==> XGB oof MAE score is ==> 5.549135
CatBoost fit, fold: 0
Fold 0 ==> CatBoost oof MAE score is ==> 5.565437882243595
ensemble fit, fold: 0
Fold 0 ==> Average Ensemble oof MAE score is ==> 5.567780251254094
Hill fit, fold: 0
Fold 0 ==> Hill Climbing Ensemble oof MAE score is ==> 6.295613555249986
----------------------------------------------------------
randomForest fit, fold: 1
Fold 1 ==> RF oof MAE score is ==> 6.846640928546536
ExtraTrees fit, fold: 1
Fold 1 ==> ET oof MAE score is ==> 6.868780761362562
pipeline fit, fold: 1
Fold 1 ==> Hist oof MAE score is ==> 6.844662847390942
lgbm fit, fold: 1
Fo

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt



In [None]:
print('Средняя совокупность оценок ROC-AUC за 10-кратный период составляет', np.mean(ens_cv_scores))
print('Hill Climbing Ensemble ROC-AUC набрал более 10 баллов, это', np.mean(hill_ens_cv_scores))

In [None]:
ens_preds_test = pd.DataFrame(hill_ens_preds).apply(np.mean, axis = 0)

sub['target'] = ens_preds_test
submission.to_csv('submission.csv', index = False)