In [1]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np 
from sklearn.model_selection import KFold, train_test_split
import matplotlib.pyplot as plt, gc, os
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import mlflow.lightgbm
from xgboost import XGBClassifier
import pickle
import warnings 
import xgboost as xgb
import time

# Suppress all warnings from this point on (no category or module filter is given).
# NOTE(review): this also hides deprecation warnings from the libraries imported
# above; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")


In [2]:
# Presumably the categorical feature columns (the name suggests so); this list is
# not referenced by the cells visible in this part of the notebook.
cat_cols = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]


In [3]:
# Load the "Woe_balanced" training dataframe from parquet. Later cells read its
# `target` and `customer_ID` columns and the columns listed in FEATURES.
train = pd.read_parquet("../data/final/train_woe.parquet")

In [4]:
# Feature report read from CSV (presumably information-value results - TODO confirm).
# Its "useful" column is the source of the model's feature names (FEATURES).
iv_features = pd.read_csv("../reports/iv_features.csv")

In [6]:
# Model inputs: every entry of the report's "useful" column except the label.
# Filtering (instead of list.remove) drops *every* "target" entry rather than only
# the first one, so the label can never slip into the feature list, and it does
# not raise when "target" is absent from the report.
FEATURES = [name for name in iv_features["useful"].to_list() if name != "target"]

In [8]:
#Competition metric
def amex_metric(y_true, y_pred):
    """Return the competition score for binary labels and predicted scores.

    The score is the mean of two parts, both computed with a row weight of 20
    for label 0 and 1 for any other label:

    * the share of all positive labels found among the highest-scored rows
      whose cumulative weight stays within 4% of the total weight;
    * the weighted Gini-style sum obtained when ordering rows by ``y_pred``,
      divided by the same sum obtained when ordering rows by ``y_true``.

    Parameters
    ----------
    y_true : array-like
        True labels, one per row.
    y_pred : array-like
        Predicted scores, same length as ``y_true``.

    Returns
    -------
    float
        ``0.5 * (gini_by_prediction / gini_by_label + top_share)``.
    """
    # Column 0 holds the label, column 1 the prediction.
    pairs = np.array([y_true, y_pred]).T

    def _ranked(sort_col):
        # Rows sorted descending on `sort_col`, with their per-row weights.
        order = pairs[:, sort_col].argsort()[::-1]
        rows = pairs[order]
        return rows, np.where(rows[:, 0] == 0, 20, 1)

    def _gini(sort_col):
        # Weighted gap between the cumulative-positive curve and the
        # cumulative-weight curve for the chosen ordering.
        rows, row_weight = _ranked(sort_col)
        random_curve = np.cumsum(row_weight / np.sum(row_weight))
        weighted_pos = rows[:, 0] * row_weight
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - random_curve) * row_weight)

    by_pred, pred_weight = _ranked(1)
    cutoff = int(0.04 * np.sum(pred_weight))
    captured = by_pred[np.cumsum(pred_weight) <= cutoff]
    top_four = np.sum(captured[:, 0]) / np.sum(by_pred[:, 0])

    gini_by_pred = _gini(1)
    gini_by_label = _gini(0)

    return 0.5 * (gini_by_pred / gini_by_label + top_four)

## XGBoost

In [9]:
# Model Parameters
# Booster settings passed unchanged to xgb.train for every CV fold.

xgb_params = {
    "max_depth": 20,
    "learning_rate": 0.05,
    "max_delta_step": 3,
    "subsample": 0.6,
    "sampling_method": "gradient_based",
    "lambda": 0.8,
    "alpha": 0.8,
    # NOTE(review): a GPU tree method - presumably needs a CUDA-enabled
    # XGBoost build on the machine running this notebook; confirm.
    "tree_method": "gpu_hist",
    # NOTE(review): hard-coded constant; presumably derived from the class
    # ratio of the training data - TODO document where it comes from.
    "scale_pos_weight": 0.3317302992934773,
    "max_bin": 20,
    "colsample_bytree": 0.6,
    "eval_metric": "logloss",
    "objective": "binary:logistic",
    "predictor": "auto",
}


In [10]:
# Setting MLFlow
# Resolve the experiment id by name and create the experiment only when it does
# not exist yet. Looking it up first (instead of create-then-catch-everything)
# lets genuine failures, e.g. an unreachable tracking store, surface with their
# own error instead of an AttributeError on a None lookup result.
experiment_name = "XGBoost - WoE Balanced + IV Balanced"
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    exp_id = mlflow.create_experiment(name=experiment_name)
else:
    exp_id = existing_experiment.experiment_id

In [11]:
# Cross-validated XGBoost training.
# For every fold: train a booster with early stopping, save and log it, record
# split-count feature importances, score the validation rows with amex_metric
# and keep the out-of-fold (OOF) predictions. After the loop the OOF frames are
# concatenated and scored once more as the overall CV result.
mlflow.autolog()
importances = []   # one (feature, importance_<fold>) frame per fold
oof = []           # one (customer_ID, target, oof_pred) frame per fold
TRAIN_SUBSAMPLE = 1.0   # fraction of each fold's training rows actually used

# Plain (unstratified) KFold over rows; KFold.split ignores the `y` argument.
# NOTE(review): customer_ID is not used for splitting. If `train` holds several
# rows per customer, the same customer can appear in both the training and the
# validation part of a fold - consider GroupKFold. TODO confirm.
skf = KFold(n_splits = 5, shuffle=True, random_state=42)

# KFold yields *positional* row numbers while .loc selects by *label*.
# Translating positions to labels keeps the selection correct for any unique
# index (for a default RangeIndex the two coincide, as before).
if not train.index.is_unique:
    raise ValueError('train must have a unique index to map fold positions to rows')
row_labels = train.index

# save_model does not create missing directories.
os.makedirs('../models', exist_ok=True)

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train.target)):

    if TRAIN_SUBSAMPLE<1.0:
        # A local generator reproduces the draw of np.random.seed(42) followed by
        # np.random.choice without reseeding the notebook-wide global RNG.
        subsample_rng = np.random.RandomState(42)
        train_idx = subsample_rng.choice(train_idx,
                       int(len(train_idx)*TRAIN_SUBSAMPLE), replace=False)

    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#'*25)

    # One explicit run per fold inside the configured experiment. Without it,
    # autologging and the manual log_* calls create implicit runs in the default
    # experiment and leave one of them open across folds.
    with mlflow.start_run(experiment_id=exp_id, run_name=f'fold_{fold+1}'):
        train_rows = row_labels[train_idx]
        valid_rows = row_labels[valid_idx]

        X_train = train.loc[train_rows, FEATURES]
        y_train = train.loc[train_rows, 'target']
        X_valid = train.loc[valid_rows, FEATURES]
        y_valid = train.loc[valid_rows, 'target']

        dtrain = xgb.DMatrix(X_train, y_train)
        del X_train, y_train
        gc.collect()
        d_valid = xgb.DMatrix(X_valid, y_valid)
        del X_valid
        gc.collect()

        model = xgb.train(
            xgb_params,
            dtrain=dtrain,
            evals=[(dtrain, 'train'), (d_valid, 'test')],
            num_boost_round=9999,
            early_stopping_rounds=100,
            verbose_eval=100,
        )

        # NOTE: the saved/logged booster keeps every trained round, including
        # those after the best iteration.
        model.save_model(f'../models/XGB_V_fold{fold}.xgb')
        mlflow.xgboost.log_model(model, "XGBClassifier")

        split_counts = model.get_score(importance_type='weight')
        importances.append(pd.DataFrame({
            'feature': list(split_counts.keys()),
            f'importance_{fold}': list(split_counts.values()),
        }))

        # The native Booster.predict uses all trees by default; restrict it to
        # the best early-stopped iteration so the rounds trained after the
        # optimum do not influence the validation predictions.
        oof_preds = model.predict(
            d_valid, iteration_range=(0, model.best_iteration + 1))
        acc = amex_metric(y_valid.values, oof_preds)
        mlflow.log_metric("Kaggle Metric for XGBClassifier", acc)

        print("Kaggle Metric=", acc,'\n')

        fold_oof = train.loc[valid_rows, ['customer_ID', 'target']].copy()
        fold_oof['oof_pred'] = oof_preds
        oof.append(fold_oof)

        # Release per-fold objects before the next fold builds its matrices.
        del split_counts, fold_oof, y_valid
        del dtrain, d_valid, model
        gc.collect()

print('#'*25)
oof = pd.concat(oof, axis=0, ignore_index=True).set_index('customer_ID')
acc= amex_metric(oof.target.values, oof.oof_pred.values)
print('OVERAL CV Kaggle Metric = ', acc)


2022/10/05 21:19:40 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2022/10/05 21:19:40 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


#########################
### Fold 1
### Train size 4425160 Valid size 1106291
### Training with 100% fold data...
#########################


2022/10/05 21:19:47 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'b918e58a2a1e4c6db6ab63993224fc03', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	train-logloss:0.66319	test-logloss:0.66452
[100]	train-logloss:0.21480	test-logloss:0.29217
[200]	train-logloss:0.16169	test-logloss:0.26243
[300]	train-logloss:0.12373	test-logloss:0.24317
[400]	train-logloss:0.09736	test-logloss:0.22923
[500]	train-logloss:0.07859	test-logloss:0.21946
[600]	train-logloss:0.06472	test-logloss:0.21259
[700]	train-logloss:0.05477	test-logloss:0.20802
[800]	train-logloss:0.04659	test-logloss:0.20413
[900]	train-logloss:0.04056	test-logloss:0.20135
[1000]	train-logloss:0.03590	test-logloss:0.19898
[1100]	train-logloss:0.03202	test-logloss:0.19712
[1200]	train-logloss:0.02899	test-logloss:0.19579
[1300]	train-logloss:0.02641	test-logloss:0.19470
[1400]	train-logloss:0.02430	test-logloss:0.19373
[1500]	train-logloss:0.02257	test-logloss:0.19283
[1600]	train-logloss:0.02102	test-logloss:0.19206
[1700]	train-logloss:0.01975	test-logloss:0.19153
[1800]	train-logloss:0.01859	test-logloss:0.19091
[1900]	train-logloss:0.01759	test-logloss:0.19051
[2000]	train