## Import


In [389]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from collections import Counter
from dateutil.relativedelta import relativedelta
from collections import defaultdict
import seaborn as sns
from sklearn.cluster import KMeans
from statsmodels.tsa.arima.model import ARIMA
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

## Data upload


In [None]:
train = pd.read_csv("train.csv")
cat_features = ['partnerrolestart_date_year', 'partnerrolestart_date_month',
                'partnerrolestart_date_day', '0_oh', '1_oh', '2_oh', '3_oh', '4_oh', '5_oh', 'active_in_weekends']

### Catboost

In [11]:
from sklearn.metrics import roc_auc_score
import optuna


def objective(trial):
    X_train, X_test, y_train, y_test = train_test_split(
        train.drop(columns=["clientbankpartner_pin", "partnerrolestart_date", "target"] + cat_features), train["target"],
        train_size=0.7, random_state=42)

    param = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.2, log=True),
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 1),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "depth": trial.suggest_int("depth", 1, 15),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "od_type": "Iter",
        "od_wait": 200

    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.05, 1)

    model = CatBoostClassifier(**param, silent=True)
    model.fit(X_train, y_train, eval_set=(X_test, y_test))
    predictions = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, predictions)
    return auc

In [12]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

[I 2024-05-08 11:08:28,864] A new study created in memory with name: no-name-383ebcc9-8319-4f1d-a282-78da265c1b47
[I 2024-05-08 11:10:31,131] Trial 0 finished with value: 0.8682167603823399 and parameters: {'learning_rate': 0.08814349405829225, 'objective': 'CrossEntropy', 'colsample_bylevel': 0.8659175971957315, 'min_data_in_leaf': 51, 'depth': 12, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.4570597275207318}. Best is trial 0 with value: 0.8682167603823399.
[I 2024-05-08 11:11:42,837] Trial 1 finished with value: 0.8680225575687953 and parameters: {'learning_rate': 0.029083326157568347, 'objective': 'CrossEntropy', 'colsample_bylevel': 0.22267647933166929, 'min_data_in_leaf': 32, 'depth': 12, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.62401324510348}. Best is trial 0 with value: 0.8682167603823399.
[I 2024-05-08 11:12:55,481] Trial 2 finished with value: 0.8629439572882563 and parameters: {'learning_rate': 0.0139690190768558

KeyboardInterrupt: 

In [13]:
print('Best hyperparameters:', study.best_params)
print('Best auc:', study.best_value)

Best hyperparameters: {'learning_rate': 0.023034450123221473, 'objective': 'CrossEntropy', 'colsample_bylevel': 0.38751922976147585, 'min_data_in_leaf': 41, 'depth': 8, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.562886679659466}
Best auc: 0.8713100571306767


### LightGBM

In [138]:
def objective(trial):
    X, y = train.drop(columns=["clientbankpartner_pin", "partnerrolestart_date", "target"]), train["target"]
    params = {
        "objective": "binary",
        "eval_metric": "log_loss",
        "verbosity": -1,
        "n_estimators": 12500,
        "subsample": trial.suggest_float("subsample", 0.1, 1),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0),
        "scale_pos_weight": trial.suggest_int("scale_pos_weight", 1, 6),
        "early_stopping_round": 200,
        "random_state": 42,
        "num_leaves": trial.suggest_int("num_leaves", 16, 200),
        "max_depth": trial.suggest_int("max_depth", 4, 20),
        "min_child_samples": trial.suggest_int("min_child_samples", 25, 200),
    }
    
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=52)
    aucs = []
    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        x_train_fold, x_test_fold = X.iloc[train_index], X.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
        model = LGBMClassifier(**params)
        model.fit(x_train_fold, y_train_fold,
              eval_set=[(x_test_fold, y_test_fold)],
              eval_metric='auc',
                  categorical_feature=cat_features
              )
       
        aucs.append(roc_auc_score(y_test_fold, model.predict_proba(x_test_fold)[:, 1]))

    auc = np.mean(aucs)
    return auc

In [139]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)

[I 2024-05-14 23:07:38,905] A new study created in memory with name: no-name-112ac180-65d5-4297-8713-f8d8a9cf08eb
[I 2024-05-14 23:07:49,146] Trial 0 finished with value: 0.8596938300717331 and parameters: {'subsample': 0.7598164353777649, 'learning_rate': 0.029515071642261616, 'colsample_bytree': 0.5597574343095532, 'reg_alpha': 0.4528622486093408, 'reg_lambda': 0.6690994168414974, 'scale_pos_weight': 4, 'num_leaves': 88, 'max_depth': 12, 'min_child_samples': 64}. Best is trial 0 with value: 0.8596938300717331.
[I 2024-05-14 23:07:51,346] Trial 1 finished with value: 0.8607554765255377 and parameters: {'subsample': 0.9042565501939949, 'learning_rate': 0.05453158083237125, 'colsample_bytree': 0.48466480498802733, 'reg_alpha': 0.41974277345184496, 'reg_lambda': 0.6373073166340991, 'scale_pos_weight': 2, 'num_leaves': 77, 'max_depth': 5, 'min_child_samples': 38}. Best is trial 1 with value: 0.8607554765255377.
[I 2024-05-14 23:07:53,189] Trial 2 finished with value: 0.8575283351991532 an

KeyboardInterrupt: 

In [140]:
print('Best hyperparameters:', study.best_params)
print('Best auc:', study.best_value)

Best hyperparameters: {'subsample': 0.7115185946603728, 'learning_rate': 0.024023736944555056, 'colsample_bytree': 0.14329151151137828, 'reg_alpha': 0.30974329894538843, 'reg_lambda': 0.6316534790805843, 'scale_pos_weight': 1, 'num_leaves': 128, 'max_depth': 4, 'min_child_samples': 162}
Best auc: 0.8629725459694857


### XGBoost

In [344]:
def objective(trial):
    X, y = train.drop(columns=["clientbankpartner_pin", "partnerrolestart_date", "target"] + cat_features), train[
        "target"]
    params = {
        "max_depth": trial.suggest_int("max_depth", 4, 16),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.2),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 1000),
        "gamma": trial.suggest_loguniform("gamma", 1e-8, 1.0),
        "subsample": trial.suggest_loguniform("subsample", 0.1, 1),
        "colsample_bynode": trial.suggest_loguniform("colsample_bynode", 0.1, 1),
        "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.1, 1),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 1.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 1.0),
        "scale_pos_weight": trial.suggest_int("scale_pos_weight", 1, 6),
        "n_estimators": 12500,
        "eval_metric": "auc",
        "early_stopping_rounds": 300,
        "verbose": 0,
        "random_state": 52
    }

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=52)
    aucs = []
    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        x_train_fold, x_test_fold = X.iloc[train_index], X.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
        model = XGBClassifier(**params)
        model.fit(x_train_fold, y_train_fold,
                  eval_set=[(x_test_fold, y_test_fold)],
                  verbose=0
                  )

        aucs.append(roc_auc_score(y_test_fold, model.predict_proba(x_test_fold)[:, 1]))

    auc = np.mean(aucs)
    return auc

In [345]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)

[I 2024-05-15 21:50:58,298] A new study created in memory with name: no-name-1eec8245-aac6-47db-907a-e46a47426da6
[I 2024-05-15 21:51:07,852] Trial 0 finished with value: 0.8623945179743387 and parameters: {'max_depth': 5, 'learning_rate': 0.11097175894264766, 'min_child_weight': 12, 'gamma': 0.00027421806236354213, 'subsample': 0.4774855565190035, 'colsample_bynode': 0.2378381997451015, 'colsample_bytree': 0.39389282840325407, 'reg_alpha': 0.1014622524365475, 'reg_lambda': 0.00031032869839250595, 'scale_pos_weight': 1}. Best is trial 0 with value: 0.8623945179743387.
[I 2024-05-15 21:51:19,032] Trial 1 finished with value: 0.8574470777893065 and parameters: {'max_depth': 5, 'learning_rate': 0.01192313123553818, 'min_child_weight': 191, 'gamma': 0.003084917300011102, 'subsample': 0.5645860835739724, 'colsample_bynode': 0.12361007722141602, 'colsample_bytree': 0.7068101208644, 'reg_alpha': 0.6901796623401767, 'reg_lambda': 6.374077899718462e-06, 'scale_pos_weight': 4}. Best is trial 0 w

KeyboardInterrupt: 

In [346]:
print('Best hyperparameters:', study.best_params)
print('Best auc:', study.best_value)

Best hyperparameters: {'max_depth': 4, 'learning_rate': 0.01590961559443953, 'min_child_weight': 34, 'gamma': 5.405851652756049e-06, 'subsample': 0.8067693448318983, 'colsample_bynode': 0.2549830746836031, 'colsample_bytree': 0.29219772531440413, 'reg_alpha': 0.00019832334804630278, 'reg_lambda': 9.74768990317862e-05, 'scale_pos_weight': 3}
Best auc: 0.8635503886523165
