In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from tqdm import tqdm
tqdm.pandas()

import catboost as cb
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold
from scipy.stats import gmean, rankdata
import time

import matplotlib.pyplot as plt
from typing import List, Optional

pd.set_option('display.max_columns', None)

  from pandas import Panel


In [2]:
train = pd.read_csv("G:/data/geekbrains-competitive-data-analysis/processed_train.csv")
test = pd.read_csv("G:/data/geekbrains-competitive-data-analysis/processed_test.csv")
train.head(3)

Unnamed: 0,AGE,AMOUNT_ANNUITY,AMOUNT_CREDIT,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,APPLICATION_NUMBER,CHILDRENS,DAYS_ON_LAST_JOB,DLR,DTI,DTI>36,EDUCATION_LEVEL,EXTERNAL_SCORING_RATING_1,EXTERNAL_SCORING_RATING_2,EXTERNAL_SCORING_RATING_3,FAMILY_SIZE,FAMILY_STATUS,FLAG_EMAIL,FLAG_PHONE,GENDER,NAME_CONTRACT_TYPE,OWN_CAR_AGE,REGION_POPULATION,TARGET,TOTAL_SALARY,active_count_day_overdue,active_credit_type,active_last_update_date,active_mean_prolong,active_overdue,active_sum_credit,active_sum_credit_limit,active_sum_debt,active_sum_overdue,age_by_decade,age_year,amt_instalment,amt_payment,appl_type_payment,closed_credit_end,closed_credit_type,closed_last_update_date,closed_long_credit,closed_mean_credit,closed_mean_prolong,closed_overdue,closed_overdue_max,closed_sum_credit,contracts_status,count_active_credit,count_close_credit,days_entry_payment,days_instalment,end_days_credit,mean_amount_goods_payment,mean_end_days_credit,mean_recent_days_credit,min_days_credit,name_goods_category,name_yield_group,num_instalment_number,num_instalment_version,payment_months,share_balance_payments,sum_amount_payment
0,15728.0,25128.0,855000.0,0.0,0.0,0.0,0.0,1.0,2.0,123687442.0,1.0,1719.0,,0.16,0.0,4.0,0.700784,0.645914,0.71657,3.0,1.0,0.0,0.0,1.0,0.0,11.0,0.019101,0.0,157500.0,,,,,,,,,,3.0,43.0,6275.925,3960.37125,0.0,1.015719,3.0,2162.0,2162.0,56362.5,0.0,0.0,0.083643,56362.5,0.0,,1.0,2659.0,2653.0,,68787.18,,,,19.0,2.0,5.75,1.0,34.03,1.100051,14439.24
1,,,,,,,,,,123597908.0,,,0.522774,,,,,,,,,,,,0.0,,,1.0,,0.0,3.0,10.0,0.0,0.0,595813.5,0.0,311476.5,0.0,,,11349.9,11349.9,0.0,1.228765,3.0,83.0,98.0,137876.22,0.0,0.0,0.100676,689381.1,0.0,4.0,5.0,457.0,450.0,1326.0,331908.75,473.25,494.5,57.0,27.0,1.0,9.0,1.0,,1.0,0.0
2,21557.0,42660.0,1006920.0,0.0,0.0,7.0,0.0,0.0,4.0,123526683.0,0.0,3618.0,0.733075,0.316,0.0,1.0,,0.682149,0.267869,2.0,1.0,0.0,1.0,0.0,0.0,,0.026392,0.0,135000.0,0.0,3.0,1.0,0.0,0.0,2160000.0,0.0,1583442.0,0.0,5.0,59.0,20169.4725,20169.4725,3.0,1.184895,3.0,149.0,246.0,630586.8,0.0,0.0,0.0,4414107.6,0.0,2.0,7.0,1917.5,1917.25,683.0,707715.0,683.0,1464.0,408.0,27.0,0.0,10.0,0.5,23.6,1.1,18814.5


In [3]:
def catboost_cross_validation(params, X, y, cv, categorical = None):
    """
    Кросс-валидация для модели catbooost.

    Parameters
    ----------
    params: dict
        Словарь гиперпараметров модели.

    X: pandas.core.frame.DataFrame
        Матрица признако для обучения модели.

    y: pandas.core.frame.Series
        Вектор целевой переменной для обучения модели.

    cv: KFold or StratifiedKFold generator.
        Объект KFold / StratifiedKFold для определения
        стратегии кросс-валидации модели.

    categorical: str, optional, default = None
        Список категориальных признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    estimators: list
        Список с объектами обученной модели.

    oof_preds: np.array
        Вектор OOF-прогнозов.

    """
    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")
    X[categorical] = X[categorical].astype(str)

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        model = cb.CatBoostClassifier(**params)
        model.fit(
            x_train, y_train, categorical,
#             eval_set=[(x_train, y_train), (x_valid, y_valid)]
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds

In [4]:
cb_params_2000 = {
    "n_estimators": 2000,
    "learning_rate": 0.05,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "GPU",
    "max_bin": 20,
    "silent": True,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 17
}

In [5]:
def get_score(df_train):
    train = df_train.drop(["TARGET", "APPLICATION_NUMBER"], axis=1)
    target = df_train["TARGET"]

    categorial = train.dtypes[train.dtypes == "object"].index
    cv = KFold(n_splits=5, random_state=13, shuffle=True)

    estimators, oof_preds = catboost_cross_validation(
        params=cb_params_2000, X=train, y=target, cv=cv, categorical=categorial
    )
    
    oof_score = roc_auc_score(target, oof_preds)
    
    print(f"OOF-score = {round(oof_score, 5)}")
    return estimators, oof_preds

In [6]:
estimators, oof_preds = get_score(train)

Mon Sep 28 15:45:02 2020, Cross-Validation, 110093 rows, 67 cols
Fold 1, Valid score = 0.71637
Fold 2, Valid score = 0.72932
Fold 3, Valid score = 0.72219
Fold 4, Valid score = 0.71836
Fold 5, Valid score = 0.71474
Score by each fold: [0.71637, 0.72932, 0.72219, 0.71836, 0.71474]
OOF-score = 0.72011


In [7]:
categorial = train.dtypes[train.dtypes == "object"].index

In [8]:
cb_params_1000 = {
    "n_estimators": 2000,
    "learning_rate": 0.05,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "GPU",
    "max_bin": 20,
    "silent": True,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 17,
    "cat_features": "categorial"
}

In [9]:
train = train.select_dtypes(exclude=["object"])
train.head(3)

Unnamed: 0,AGE,AMOUNT_ANNUITY,AMOUNT_CREDIT,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,APPLICATION_NUMBER,CHILDRENS,DAYS_ON_LAST_JOB,DLR,DTI,DTI>36,EDUCATION_LEVEL,EXTERNAL_SCORING_RATING_1,EXTERNAL_SCORING_RATING_2,EXTERNAL_SCORING_RATING_3,FAMILY_SIZE,FAMILY_STATUS,FLAG_EMAIL,FLAG_PHONE,GENDER,NAME_CONTRACT_TYPE,OWN_CAR_AGE,REGION_POPULATION,TARGET,TOTAL_SALARY,active_count_day_overdue,active_credit_type,active_last_update_date,active_mean_prolong,active_overdue,active_sum_credit,active_sum_credit_limit,active_sum_debt,active_sum_overdue,age_by_decade,age_year,amt_instalment,amt_payment,appl_type_payment,closed_credit_end,closed_credit_type,closed_last_update_date,closed_long_credit,closed_mean_credit,closed_mean_prolong,closed_overdue,closed_overdue_max,closed_sum_credit,contracts_status,count_active_credit,count_close_credit,days_entry_payment,days_instalment,end_days_credit,mean_amount_goods_payment,mean_end_days_credit,mean_recent_days_credit,min_days_credit,name_goods_category,name_yield_group,num_instalment_number,num_instalment_version,payment_months,share_balance_payments,sum_amount_payment
0,15728.0,25128.0,855000.0,0.0,0.0,0.0,0.0,1.0,2.0,123687442.0,1.0,1719.0,,0.16,0.0,4.0,0.700784,0.645914,0.71657,3.0,1.0,0.0,0.0,1.0,0.0,11.0,0.019101,0.0,157500.0,,,,,,,,,,3.0,43.0,6275.925,3960.37125,0.0,1.015719,3.0,2162.0,2162.0,56362.5,0.0,0.0,0.083643,56362.5,0.0,,1.0,2659.0,2653.0,,68787.18,,,,19.0,2.0,5.75,1.0,34.03,1.100051,14439.24
1,,,,,,,,,,123597908.0,,,0.522774,,,,,,,,,,,,0.0,,,1.0,,0.0,3.0,10.0,0.0,0.0,595813.5,0.0,311476.5,0.0,,,11349.9,11349.9,0.0,1.228765,3.0,83.0,98.0,137876.22,0.0,0.0,0.100676,689381.1,0.0,4.0,5.0,457.0,450.0,1326.0,331908.75,473.25,494.5,57.0,27.0,1.0,9.0,1.0,,1.0,0.0
2,21557.0,42660.0,1006920.0,0.0,0.0,7.0,0.0,0.0,4.0,123526683.0,0.0,3618.0,0.733075,0.316,0.0,1.0,,0.682149,0.267869,2.0,1.0,0.0,1.0,0.0,0.0,,0.026392,0.0,135000.0,0.0,3.0,1.0,0.0,0.0,2160000.0,0.0,1583442.0,0.0,5.0,59.0,20169.4725,20169.4725,3.0,1.184895,3.0,149.0,246.0,630586.8,0.0,0.0,0.0,4414107.6,0.0,2.0,7.0,1917.5,1917.25,683.0,707715.0,683.0,1464.0,408.0,27.0,0.0,10.0,0.5,23.6,1.1,18814.5


In [10]:
x_train, x_valid = train_test_split(
    train.drop(["TARGET", "APPLICATION_NUMBER"], axis=1), train_size=0.7, random_state=1
)
y_train, y_valid = train_test_split(
    train["TARGET"], train_size=0.7, random_state=1
)

Получаем вредсказание для xgboost

In [11]:
xgb_params = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.1,
    "n_estimators": 1000,
    "nthread": 6,
    "seed": 27,
}

In [12]:
dtrain = xgb.DMatrix(x_train, y_train)
dvalid = xgb.DMatrix(x_valid, y_valid)

model = xgb.train(
    dtrain=dtrain,
    params=xgb_params,
    num_boost_round=500,
    evals=[(dtrain, "train"), (dvalid, "valid")],
    early_stopping_rounds=20,
    verbose_eval=100,
)
col = x_train.columns
train['xgb'] = model.predict(xgb.DMatrix(train[col]))
test['xgb'] = model.predict(xgb.DMatrix(test[col]))

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.69912	valid-auc:0.68173
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 20 rounds.
Stopping. Best iteration:
[70]	train-auc:0.82128	valid-auc:0.72097



Получаем вредсказание для lightgbm

In [13]:
lgbm_params = {
    "n_estimators": 1000,
    'objective': 'binary',
    'metric': 'auc',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0,
    "early_stopping_round": 50,
}

In [14]:
train_data = lgb.Dataset(x_train, label=y_train)
test_data = lgb.Dataset(x_valid, label=y_valid)

model = lgb.train(
    params = lgbm_params,
    train_set = train_data,
    valid_sets = test_data,
    verbose_eval=100,
)

train['lgb'] = model.predict(train[col])
test['lgb'] = model.predict(test[col])



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.718672
[200]	valid_0's auc: 0.72163
Early stopping, best iteration is:
[208]	valid_0's auc: 0.721802


Получаем вредсказание для catboost

In [15]:
cb_params = {
    "n_estimators": 2000,
    "learning_rate": 0.05,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "silent": True,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 17
}

In [16]:
model = cb.CatBoostClassifier(**cb_params)
model.fit(
    x_train, y_train, #categorical,
    eval_set=[(x_train, y_train), (x_valid, y_valid)]
)
train['cb'] = model.predict_proba(train[col])[:, 1]
test['cb'] = model.predict_proba(test[col])[:, 1]

In [17]:
def rank(x):    
    return np.mean(rankdata(x))

In [18]:
train['mean_pred'] = train[['xgb', 'lgb', 'cb']].mean(axis=1)
train['gmean_pred'] = train[['xgb', 'lgb', 'cb']].agg(gmean, axis=1)
train['avg_rank_pred'] = train[['xgb', 'lgb', 'cb']].agg(rank, axis=1)

In [19]:
estimators, oof_preds = get_score(train)

Mon Sep 28 15:47:21 2020, Cross-Validation, 110093 rows, 73 cols
Fold 1, Valid score = 0.85112
Fold 2, Valid score = 0.84567
Fold 3, Valid score = 0.83757
Fold 4, Valid score = 0.84762
Fold 5, Valid score = 0.83961
Score by each fold: [0.85112, 0.84567, 0.83757, 0.84762, 0.83961]
OOF-score = 0.84423


In [20]:
test['mean_pred'] = test[['xgb', 'lgb', 'cb']].mean(axis=1)
test['gmean_pred'] = test[['xgb', 'lgb', 'cb']].agg(gmean, axis=1)
test['avg_rank_pred'] = test[['xgb', 'lgb', 'cb']].agg(rank, axis=1)

In [27]:
y_pred = np.zeros(test.shape[0])
for estimator in estimators:
    y_pred += estimator.predict_proba(test)[:, 1]
    
y_pred = pd.DataFrame({
    "APPLICATION_NUMBER": test["APPLICATION_NUMBER"].astype('int32'),
    "TARGET": y_pred / len(estimators)
})
y_pred.to_csv("predict-5.csv", index=False)

Как результат переобучились. Имеет смысл найти в обучающей выборки похожие заявки как в тестовой и повторить эксперемент

In [31]:
target = ["TARGET"]
colums = ['xgb', 'lgb', 'cb', 'mean_pred', 'gmean_pred', 'avg_rank_pred', "APPLICATION_NUMBER"]
train_df = train[colums+target]

In [32]:
estimators, oof_preds = get_score(train_df)

Mon Sep 28 16:01:50 2020, Cross-Validation, 110093 rows, 6 cols
Fold 1, Valid score = 0.84036
Fold 2, Valid score = 0.83302
Fold 3, Valid score = 0.82642
Fold 4, Valid score = 0.8324
Fold 5, Valid score = 0.82868
Score by each fold: [0.84036, 0.83302, 0.82642, 0.8324, 0.82868]
OOF-score = 0.83195


In [33]:
y_pred = np.zeros(test.shape[0])
for estimator in estimators:
    y_pred += estimator.predict_proba(test[colums])[:, 1]
    
y_pred = pd.DataFrame({
    "APPLICATION_NUMBER": test["APPLICATION_NUMBER"].astype('int32'),
    "TARGET": y_pred / len(estimators)
})
y_pred.to_csv("predict-6.csv", index=False)