In [1]:
import pandas as pd
import random
import os
import torch
from tqdm.notebook import tqdm
from catboost import CatBoostClassifier, Pool, metrics, cv
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

import warnings
warnings.filterwarnings("ignore")

In [2]:
seed = 42

random.seed(seed)
np.random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)

# torch.manual_seed(seed)
# torch.cuda.manual_seed(seed)
# torch.backends.cudnn.deterministic = True

In [3]:
## data_colection

df = pd.read_csv('/kaggle/input/playground-series-s4e7/train.csv')
df = df.drop(columns=['id'])

test_df = pd.read_csv('/kaggle/input/playground-series-s4e7/test.csv')

df.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0


In [None]:
cor_mat = df.corr(method="pearson",numeric_only=True)
mask = np.triu(np.ones_like(cor_mat))

plt.figure(figsize=(8, 6))
sns.heatmap(cor_mat, cmap='coolwarm', fmt='.2f', annot=True, mask=mask)
plt.show()

In [4]:
## feats_engin

nums = [ # рофлсон
]
cats = [c for c in df.columns if c not in nums+['id', 'Response']]

In [12]:
df.nunique()

Gender                      2
Age                        66
Driving_License             2
Region_Code                53
Previously_Insured          2
Vehicle_Age                 3
Vehicle_Damage              2
Annual_Premium          51728
Policy_Sales_Channel      152
Vintage                   290
Response                    2
dtype: int64

In [None]:
for feature in nums:
    X = df[[feature]].values  # Подготовим данные для одного признака
    inertias = []  
    
    # Пробуем разное количество кластеров от 1 до 10
    for k in tqdm(range(1, 6)):
        kmeans =  KMeans(n_clusters=k, random_state=42)
        kmeans.fit(X)
        inertias.append(kmeans.inertia_)
        
    ratios = []
    for i in range(1, len(inertias) - 1):
        ratio = (inertias[i-1] - inertias[i]) / (inertias[i] - inertias[i+1])
        ratios.append(ratio)
    
    optimal_clusters = np.argmax(ratios) + 2
    print(f'оптимальным будет {optimal_clusters}')
    
    # Построение графика «правила локтя» для текущего признака
    plt.figure(figsize=(8, 5))
    plt.plot(range(1, 6), inertias, marker='o')
    plt.title(f'Elbow Method for {feature}')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Inertia')
    plt.show()

In [None]:
best_n_clusters = [2, 3, 2]

In [None]:
for n, feature in tqdm(enumerate(nums), total = len(nums)):
    X = df[[feature]].values 
    X_test = test_df[[feature]].values 
    
    kmeans =  KMeans(n_clusters=best_n_clusters[n], random_state=42)
    predict_train = kmeans.fit_predict(X)
    predict_test = kmeans.predict(X_test)
    
    df[f'{feature}_KMeans'] = predict_train
    test_df[f'{feature}_KMeans'] = predict_test
    
    df[f'{feature}_KMeans'] = df[f'{feature}_KMeans'].astype(int) 
    test_df[f'{feature}_KMeans'] = test_df[f'{feature}_KMeans'].astype(int) 
    
    cats.append(f'{feature}_KMeans')

In [None]:
nums_post_process = []
for c in tqdm(nums, total = len(nums)):
    df[f"{c}_square"] = df[c]**2
    df[f"{c}_sqrt"] = df[c]**(0.5)
    df[f"{c}_log"] = np.log(df[c] + 1)
    
    test_df[f"{c}_square"] = test_df[c]**2
    test_df[f"{c}_sqrt"] = test_df[c]**(0.5)
    test_df[f"{c}_log"] = np.log(test_df[c] + 1)
    
    nums_post_process.append(f"{c}_square")
    nums_post_process.append(f"{c}_sqrt")
    nums_post_process.append(f"{c}_log")

    
nums = nums + nums_post_process

In [5]:
## feats_transform

# for c in tqdm(cats):
#     df[c] = f'{c}_'+df[c].astype(str)

# uni = set()|
# for c in tqdm(cats):
#     uni = uni | set(df[c])

# tokens = dict(zip(uni, 
#                   range(len(uni))))
# for i,k in enumerate(tokens):
#     print (k, tokens[k])
#     if i == 5:
#         break


# for column in tqdm(cats, total = len(cats)):
#     encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
#     df[column] = encoder.fit_transform(df[[column]])
#     test_df[column] = encoder.transform(test_df[[column]])
#     df[column] = df[column].astype('int')
#     test_df[column] = test_df[column].astype('int')


# for column in cats:
#     df[column] = df[column].astype('str')
#     test_df[column] = test_df[column].astype('str')


gender_mapping = {'Male': 0, 'Female': 1}
vehicle_age_mapping = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
vehicle_damage_mapping = {'No': 0, 'Yes': 1}
        
df['Gender'] = df['Gender'].map(gender_mapping)
test_df['Gender'] = test_df['Gender'].map(gender_mapping)
df['Vehicle_Age'] = df['Vehicle_Age'].map(vehicle_age_mapping)
test_df['Vehicle_Age'] = test_df['Vehicle_Age'].map(vehicle_age_mapping)
df['Vehicle_Damage'] = df['Vehicle_Damage'].map(vehicle_damage_mapping)
test_df['Vehicle_Damage'] = test_df['Vehicle_Damage'].map(vehicle_damage_mapping)

for column in cats:
    df[column] = df[column].astype('int')
    #df[column] = df[column].astype('category')
    test_df[column] = test_df[column].astype('int')
    #test_df[column] = test_df[column].astype('category')

In [7]:
pd.set_option('display.max_columns', None)
df.head(5)

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,0,21,1,35,0,1,1,65101,124,187,0
1,0,43,1,28,0,2,1,58911,26,288,1
2,1,25,1,14,1,0,0,38043,152,254,0
3,1,35,1,1,0,1,1,2630,156,76,0
4,1,36,1,15,1,1,0,31951,152,294,0


In [None]:
## feats_treat

In [None]:
## feats_select

In [6]:
## data_split

DEBUG = True

if DEBUG:
    df = df.iloc[:100*1000]

# **Just Catboost**

In [None]:
## data_split

train_df, val_df = train_test_split(df, test_size=.20, random_state=2)

train_df.reset_index(drop=True, 
               inplace=True)
val_df.reset_index(drop=True, 
                 inplace=True)

y = train_df['Response']
X = train_df.drop(columns = 'Response')
y_val = val_df['Response']
X_val = val_df.drop(columns = 'Response')

In [None]:
# model = CatBoostClassifier(
#     #scale_pos_weight=scale_pos_weight_value,
#     #auto_class_weights='SqrtBalanced',
#     #auto_class_weights='Balanced',
#     loss_function='Logloss',
#     eval_metric='AUC',
#     random_seed=42,
#     logging_level='Silent',
#     iterations=1000,
#     task_type="GPU",
#     devices='0',
#     use_best_model=True,
#     early_stopping_rounds=100
# )

model = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',
    learning_rate=0.05,
    iterations=1000,
    depth=9,
    l2_leaf_reg=0.5,
    task_type='GPU',
    devices='0',
    random_seed=42,
    logging_level='Silent',  
)

model.fit(
    X, y,
    cat_features=cats,
    eval_set=[(X_val, y_val)],
    logging_level='Verbose', #Verbose
)
    
print(" AUC по test: ", model.get_best_score()['validation']['AUC'])

predict = model.predict_proba(test_df.drop(columns=['id']))[:,1:].flatten()
submission = pd.DataFrame({'id' : test_df.id, 'target' : predict})
submission.to_csv('Catboost_submission.csv', index=False)

# **Catboost KFold**

In [None]:
## data_split
y = df['Response']
X = df.drop(columns = 'Response')

In [None]:
## model_trainer

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

catboost_cross_val_predictions = []
for _, (train_index, test_index) in tqdm(enumerate(skf.split(X, y)), total = 5):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model = CatBoostClassifier(
        #scale_pos_weight=scale_pos_weight_value,
        auto_class_weights='SqrtBalanced',
        #auto_class_weights='Balanced',
        loss_function='Logloss',
        eval_metric='AUC',
        learning_rate=0.05,
        random_seed=42,
        logging_level='Silent',
        iterations=5000,
        task_type="GPU",
        depth=9,
        devices='0',
        use_best_model=True,
        early_stopping_rounds=100
    )

    model.fit(
        X_train, y_train,
        cat_features=cats,
        eval_set=[(X_test, y_test)],
        logging_level='Silent', #Verbose
    )
    
    print(" AUC по test: ", model.get_best_score()['validation']['AUC'])

    predict = model.predict_proba(test_df.drop(columns=['id']))[:,1:].flatten()
    catboost_cross_val_predictions.append(predict)

In [None]:
first_iter = True
for predict in catboost_cross_val_predictions:
    if first_iter != True:
        result_proba = result_proba + predict
    else:
        result_proba = predict

result_proba = result_proba/5

submission = pd.DataFrame({'id' : test_df.id, 'target' : result_proba})
submission.to_csv('Catboost_submission_crossval.csv', index=False)

# **Catboost Kfold + Meta Catboost**

In [None]:
## data_split

train_df, val_df = train_test_split(df, test_size=.20, random_state=42)

train_df.reset_index(drop=True, 
               inplace=True)
val_df.reset_index(drop=True, 
                 inplace=True)

y = train_df['Response']
X = train_df.drop(columns = 'Response')
y_val = val_df['Response']
X_val = val_df.drop(columns = 'Response')

In [None]:
## model_trainer

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

models_holder = []

for _, (train_index, test_index) in tqdm(enumerate(skf.split(X, y)), total = 5):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model = CatBoostClassifier(
        #scale_pos_weight=scale_pos_weight_value,
        auto_class_weights='SqrtBalanced',
        #auto_class_weights='Balanced',
        loss_function='Logloss',
        eval_metric='AUC',
        random_seed=42,
        depth=9,
        l2_leaf_reg=0.5,
        logging_level='Silent',
        iterations=1000,
        task_type="GPU",
        devices='0',
        learning_rate=0.05,
        use_best_model=True,
        early_stopping_rounds=100
    )

    model.fit(
        X_train, y_train,
        cat_features=cats,
        eval_set=[(X_test, y_test)],
        logging_level='Silent', #Verbose
    )
    
    y_pred_proba = model.predict_proba(X_val)[:, 1:]
    
#     auc_score = roc_auc_score(y_val.values, y_pred_proba)
#     print("AUC по val: ", auc_score)
    
    print("AUC по test: ",model.get_best_score()['validation']['AUC'])

    models_holder.append(model)

In [None]:
for n, proba in enumerate(catboost_cross_val_predict):
    X_val[f'model_{n}_redict'] = proba
    
for n, proba in enumerate(catboost_cross_test_predict):
    test_df[f'model_{n}_redict'] = proba    

In [None]:
model = CatBoostClassifier(
    auto_class_weights='SqrtBalanced',
    loss_function='Logloss',
    eval_metric='AUC',
    learning_rate=0.01,
    iterations=1000,
    depth=6,
    task_type='GPU',
    devices='0',
    random_seed=42,
    logging_level='Silent',  
)


model.fit(
    X_val, y_val,
    cat_features=cats,
    eval_set=[(X_val_, y_val_)],
    logging_level='Verbose', #Verbose
)
    
print(" AUC по test: ", model.get_best_score()['validation']['AUC'])

predict = model.predict_proba(test_df.drop(columns=['id']))[:,1:].flatten()
submission = pd.DataFrame({'id' : test_df.id, 'target' : predict})
submission.to_csv('Catboost_submission_METACatboost.csv', index=False)

# **All models is all your need**

In [7]:
df

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,0,21,1,35,0,1,1,65101,124,187,0
1,0,43,1,28,0,2,1,58911,26,288,1
2,1,25,1,14,1,0,0,38043,152,254,0
3,1,35,1,1,0,1,1,2630,156,76,0
4,1,36,1,15,1,1,0,31951,152,294,0
...,...,...,...,...,...,...,...,...,...,...,...
99995,1,21,1,39,0,0,1,33399,160,159,0
99996,0,22,1,8,1,0,0,37709,152,167,0
99997,0,26,1,28,1,0,0,52776,152,216,0
99998,0,59,1,10,1,1,0,32377,124,273,0


In [8]:
y = df['Response']
X = df.drop(columns = 'Response')

In [9]:
X, X_late, y, y_late  = train_test_split(X, y, test_size=.10, random_state=42, stratify=y)

X.reset_index(drop=True, 
               inplace=True)
X_late.reset_index(drop=True, 
                 inplace=True)
y.reset_index(drop=True, 
               inplace=True)
y_late.reset_index(drop=True, 
                 inplace=True)

In [12]:
## Proba holders

base_models_proba_train = pd.DataFrame()
base_models_proba_test = pd.DataFrame()
base_models_proba_late = pd.DataFrame()

In [13]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for counter, (train_index, test_index) in tqdm(enumerate(skf.split(X, y)), total = 5):
    X_tr, y_tr = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[test_index], y.iloc[test_index]
    eval_set = [(X_val, y_val)]
    
    lgbm_params = {
        'n_estimators': 1,
        'verbose': -1,
        'max_depth': 6,    
        'metric': 'auc',                   
        'max_bin': 262143,                   
        'num_leaves': 223,
        'learning_rate': 0.3, 
        'min_child_samples': 54,
        'subsample': 0.5395,
        'colsample_bytree': 0.5475,
        'reg_alpha': 3.4444,              
        'reg_lambda': 2.8749e-05,  
        'device': 'cpu'                    
    }

    lgbm_model = lgb.LGBMClassifier(**lgbm_params, random_state=42)
    lgbm_model.fit(X_tr, y_tr, eval_set=eval_set, eval_metric='auc', callbacks=[lgb.early_stopping(30)])
    best_iteration = lgbm_model.best_iteration_
    
    base_models_proba_train[f'LGBMClassifier_{counter}'] = lgbm_model.predict_proba(X, num_iteration=best_iteration)[:, 1]
    base_models_proba_late[f'LGBMClassifier_{counter}'] = lgbm_model.predict_proba(X_late, num_iteration=best_iteration)[:, 1]
    base_models_proba_test[f'LGBMClassifier_{counter}'] = lgbm_model.predict_proba(test_df.drop(columns=['id']), num_iteration=best_iteration)[:, 1]

  0%|          | 0/5 [00:00<?, ?it/s]

Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1]	valid_0's auc: 0.84428
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1]	valid_0's auc: 0.847206
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1]	valid_0's auc: 0.845757
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1]	valid_0's auc: 0.84711
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1]	valid_0's auc: 0.850786


In [15]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for counter, (train_index, test_index) in tqdm(enumerate(skf.split(X, y)), total = 5):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model = CatBoostClassifier(
        #scale_pos_weight=scale_pos_weight_value,
        auto_class_weights='SqrtBalanced',
        #auto_class_weights='Balanced',
        loss_function='Logloss',
        eval_metric='AUC',
        random_seed=42,
        depth=9,
        l2_leaf_reg=0.5,
        logging_level='Silent',
        iterations=1,
        task_type="GPU",
        devices='0',
        learning_rate=0.05,
        use_best_model=True,
        early_stopping_rounds=30
    )

    model.fit(
        X_train, y_train,
        cat_features=cats,
        eval_set=[(X_test, y_test)],
        logging_level='Silent', #Verbose
    )

    
#     auc_score = roc_auc_score(y_val.values, y_pred_proba)
#     print("AUC по val: ", auc_score)
    
    print("AUC по Val: ", model.get_best_score()['validation']['AUC'], " фолд: ", str(counter))

    base_models_proba_train[f'CatBoostClassifier_{counter}'] = model.predict_proba(X)[:, 1]
    base_models_proba_late[f'CatBoostClassifier_{counter}'] = model.predict_proba(X_late)[:, 1]
    base_models_proba_test[f'CatBoostClassifier_{counter}'] = model.predict_proba(test_df.drop(columns=['id']))[:, 1]

  0%|          | 0/5 [00:00<?, ?it/s]

Default metric period is 5 because AUC is/are not implemented for GPU


AUC по Val:  0.8398891985416412  фолд:  0


Default metric period is 5 because AUC is/are not implemented for GPU


AUC по Val:  0.8435580432415009  фолд:  1


Default metric period is 5 because AUC is/are not implemented for GPU


AUC по Val:  0.8457962572574615  фолд:  2


Default metric period is 5 because AUC is/are not implemented for GPU


AUC по Val:  0.8519709706306458  фолд:  3


Default metric period is 5 because AUC is/are not implemented for GPU


AUC по Val:  0.8431099355220795  фолд:  4


In [16]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for counter, (train_index, test_index) in tqdm(enumerate(skf.split(X, y)), total = 5):
    X_tr, y_tr = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[test_index], y.iloc[test_index]
    eval_set = [(X_val, y_val)]
    
    xgb_params = {
    'n_estimators': 2200,
    'eta': 0.05,
    'alpha':  0.2545607592482198,
    'subsample': 0.8388163485383147, 
    'colsample_bytree': 0.2732499701466825, 
    'max_depth': 16,
    'min_child_weight': 5,
    'gamma': 0.0017688666476104672,
    'eval_metric': 'auc',
    'max_bin': 262143,
    'tree_method':"gpu_hist",
    'device':"cuda",
}

    xgb_model = xgb.XGBClassifier(**xgb_params, early_stopping_rounds=30, random_state=42)
    xgb_model.fit(X_tr, y_tr, eval_set=eval_set, verbose=False)
    best_iteration = xgb_model.best_iteration
    print("AUC по Val: ", xgb_model.best_score, " фолд: ", str(counter)) 
    
    base_models_proba_train[f'XGBClassifier_{counter}'] = xgb_model.predict_proba(X, iteration_range=(0, best_iteration))[:, 1]
    base_models_proba_late[f'XGBClassifier_{counter}'] = xgb_model.predict_proba(X_late, iteration_range=(0, best_iteration))[:, 1]
    base_models_proba_test[f'XGBClassifier_{counter}'] = xgb_model.predict_proba(test_df.drop(columns=['id']), iteration_range=(0, best_iteration))[:, 1]

  0%|          | 0/5 [00:00<?, ?it/s]

AUC по Val:  0.8650752590193729  фолд:  0
AUC по Val:  0.8740962728223561  фолд:  1
AUC по Val:  0.8694725022800922  фолд:  2
AUC по Val:  0.8707928029790102  фолд:  3
AUC по Val:  0.8705754420426596  фолд:  4


In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Определяем индексы категориальных признаков в DataFrame
cat_idxs = [df.columns.get_loc(col) for col in categorical_columns]

# Определяем количество уникальных значений для каждого категориального признака
cat_dims = [df[col].nunique() for col in categorical_columns]

tabnet_params = dict(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=1,
    n_d = 16,
    n_a = 16,
    n_steps = 4,
    optimizer_fn = torch.optim.Adam,
    optimizer_params = dict(lr = (2e-2)),
    scheduler_params = dict(T_0=200, T_mult=1, eta_min=1e-4, last_epoch=-1, verbose=False),
    scheduler_fn = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts,
    seed = 42,
    verbose = 10
    
)

for counter, (train_index, test_index) in tqdm(enumerate(skf.split(X, y)), total = 5):
    print(f'Training fold {fold + 1}')
    X_train, X_val = X.iloc[trn_ind].values, X.iloc[val_ind].values
    y_train, y_val = y.iloc[trn_ind].values.reshape(-1,1), y.iloc[val_ind].values.reshape(-1,1)


    clf =  TabNetClassifier(**tabnet_params)
    clf.fit(
      X_train, y_train,
      eval_set=[(X_val, y_val)],
      max_epochs = 200,
      patience = 20,
      batch_size = 1024*20, 
      virtual_batch_size = 128*20,
      num_workers = 4,
      drop_last = False,
      loss_fn=torch.nn.BCEWithLogitsLoss()
      )
      
    base_models_proba_train[f'TabNetClassifier_{counter}'] = clf.predict_proba(X)[:, 1]
    base_models_proba_late[f'TabNetClassifier_{counter}'] = clf.predict_proba(X_late)[:, 1]
    base_models_proba_test[f'TabNetClassifier_{counter}'] = clf.predict_proba(test_df.drop(columns=['id']))[:, 1]

In [17]:
eval_set = [(base_models_proba_late, y_late)]

xgb_params = {
    'n_estimators': 200,
    'eta': 0.05,
    'alpha':  0.2545607592482198,
    'subsample': 0.8388163485383147, 
    'colsample_bytree': 0.2732499701466825, 
    'max_depth': 16,
    'min_child_weight': 5,
    'gamma': 0.0017688666476104672,
    'eval_metric': 'auc',
    'max_bin': 262143,
    'tree_method':"gpu_hist",
    'device':"cuda",
}

xgb_model = xgb.XGBClassifier(**xgb_params, early_stopping_rounds=50, random_state=42)
xgb_model.fit(base_models_proba_train, y, eval_set=eval_set, verbose=True)

[0]	validation_0-auc:0.85807
[1]	validation_0-auc:0.86240
[2]	validation_0-auc:0.86531
[3]	validation_0-auc:0.86640
[4]	validation_0-auc:0.86820
[5]	validation_0-auc:0.86824
[6]	validation_0-auc:0.86871
[7]	validation_0-auc:0.86860
[8]	validation_0-auc:0.86896
[9]	validation_0-auc:0.86853
[10]	validation_0-auc:0.86881
[11]	validation_0-auc:0.86899
[12]	validation_0-auc:0.86883
[13]	validation_0-auc:0.86881
[14]	validation_0-auc:0.86900
[15]	validation_0-auc:0.86873
[16]	validation_0-auc:0.86857
[17]	validation_0-auc:0.86879
[18]	validation_0-auc:0.86879
[19]	validation_0-auc:0.86878


In [18]:
predict = xgb_model.predict_proba(base_models_proba_test)[:,1:].flatten()
submission = pd.DataFrame({'id' : test_df.id, 'target' : predict})
submission.to_csv('AllModels_submission.csv', index=False)