In [1]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
import matplotlib.pyplot as plt, gc, os
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import mlflow.lightgbm
from xgboost import XGBClassifier
import pickle
import warnings
import xgboost as xgb
import time
from src.metric import amex_metric
warnings.filterwarnings("ignore")

In [2]:
cat_cols = [
    "B_30",
    "B_38",
    "D_114",
    "D_116",
    "D_117",
    "D_120",
    "D_126",
    "D_63",
    "D_64",
    "D_66",
    "D_68",
]

In [3]:
VERSION = "01"

In [4]:
# Woe_balanced dataframe
train = pd.read_parquet("../data/processed/train_w_labels.parquet")
train=train.reset_index()

In [5]:
iv_features = pd.read_csv("../reports/iv_features.csv")

In [6]:
# train.drop('WOE_target',axis=1, inplace=True)
FEATURES = iv_features["useful"].to_list()
FEATURES.remove("target")
FEATURES = FEATURES[:-4]

In [7]:
train.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2018-03-13,0.934745,0,0.009382,1.007647,0.006104,0.135021,0.0,0.007174,...,-1,-1,0,0,0.0,,0,0.00297,0,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,2018-03-25,0.880519,6,0.034684,1.004028,0.006911,0.165509,0.0,0.005068,...,-1,-1,0,0,0.0,,0,0.003169,0,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,2018-03-12,0.880875,0,0.004284,0.812649,0.00645,,0.0,0.007196,...,-1,-1,0,0,0.0,,0,0.000834,0,0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,2018-03-29,0.621776,0,0.012564,1.006183,0.007829,0.287766,0.0,0.009937,...,-1,-1,0,0,0.0,,0,0.00556,0,0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,2018-03-30,0.8719,0,0.007679,0.815746,0.001247,,0.0,0.005528,...,-1,-1,0,0,0.0,,0,0.006944,0,0


## XgBoost


In [11]:
# Model Parameters

xgb_params = {
    "max_depth": 5,
    "learning_rate": 0.055,
    #'max_delta_step':3,
    #"subsample": 0.7,
    "sampling_method": "gradient_based",
    # "lambda": 0.7,
    #"alpha": 0.8,
    "tree_method": "gpu_hist",
    #"scale_pos_weight": 0.25,
    # "max_bin": 20,
    "colsample_bytree": 0.6,
    "eval_metric": "logloss",
    "objective": "binary:logistic",
    "predictor": "gpu_predictor",
}

In [12]:
# Setting MLFlow
experiment_name = "XGBoost - WoE Balanced + IV Balanced"
try:
    exp_id = mlflow.create_experiment(name=experiment_name)
except Exception as e:
    exp_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

In [19]:
FEATURES = train.columns.to_list()
FEATURES.remove("target")
FEATURES= FEATURES[3:]

In [13]:
mlflow.autolog()
importances = []
oof = []
TRAIN_SUBSAMPLE = 1.0

skf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train.target)):

    if TRAIN_SUBSAMPLE < 1.0:
        np.random.seed(42)
        train_idx = np.random.choice(
            train_idx, int(len(train_idx) * TRAIN_SUBSAMPLE), replace=False
        )
        np.random.seed(None)

    print("#" * 25)
    print("### Fold", fold + 1)
    print("### Train size", len(train_idx), "Valid size", len(valid_idx))
    print(f"### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...")
    print("#" * 25)

    X_train = train.loc[train_idx, FEATURES]
    y_train = train.loc[train_idx, "target"]
    X_valid = train.loc[valid_idx, FEATURES]
    y_valid = train.loc[valid_idx, "target"]

    dtrain = xgb.DMatrix(X_train, y_train)
    del X_train, y_train
    gc.collect()
    d_valid = xgb.DMatrix(X_valid, y_valid)
    del X_valid
    gc.collect()
    model = xgb.train(
        xgb_params,
        dtrain=dtrain,
        evals=[(dtrain, "train"), (d_valid, "test")],
        num_boost_round=9999,
        early_stopping_rounds=100,
        verbose_eval=100,
    )

    model.save_model(f"../models/XGB_Version{VERSION}_fold{fold}.xgb")
    mlflow.xgboost.log_model(model, "XGBClassifier")

    dd = model.get_score(importance_type="weight")
    df = pd.DataFrame({"feature": dd.keys(), f"importance_{fold}": dd.values()})
    importances.append(df)

    oof_preds = model.predict(d_valid)
    acc = amex_metric(y_valid.values, oof_preds)
    mlflow.log_metric("Kaggle Metric for XGBClassifier", acc)

    print("Kaggle Metric=", acc, "\n")

    df = train.loc[valid_idx, ["customer_ID", "target"]].copy()
    df["oof_pred"] = oof_preds
    oof.append(df)

    del dd, df
    del d_valid, model
    gc.collect()
print("#" * 25)
oof = pd.concat(oof, axis=0, ignore_index=True).set_index("customer_ID")
acc = amex_metric(oof.target.values, oof.oof_pred.values)
print("OVERAL CV Kaggle Metric = ", acc)

2022/11/01 17:13:05 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2022/11/01 17:13:05 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


#########################
### Fold 1
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################


2022/11/01 17:13:06 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'ebb6f585dcf3446091607ede19442a72', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	train-logloss:0.66660	test-logloss:0.66650
[100]	train-logloss:0.28520	test-logloss:0.28714
[200]	train-logloss:0.26785	test-logloss:0.27298
[300]	train-logloss:0.26091	test-logloss:0.26883
[400]	train-logloss:0.25614	test-logloss:0.26674


KeyboardInterrupt: 

In [None]:
# import os
# os.system('systemctl poweroff')

## Random Forest

In [60]:
# Setting MLFlow
experiment_name = "RandomForest - WoE Balanced + IV Balanced"
try:
    exp_id = mlflow.create_experiment(name=experiment_name)
except Exception as e:
    exp_id = mlflow.get_experiment_by_name(experiment_name).experiment_id
mlflow.autolog()
importances = []
oof = []
TRAIN_SUBSAMPLE = 1.0

2022/11/01 16:34:53 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2022/11/01 16:34:53 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [61]:
skf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train.target)):

    if TRAIN_SUBSAMPLE < 1.0:
        np.random.seed(42)
        train_idx = np.random.choice(
            train_idx, int(len(train_idx) * TRAIN_SUBSAMPLE), replace=False
        )
        np.random.seed(None)

    print("#" * 25)
    print("### Fold", fold + 1)
    print("### Train size", len(train_idx), "Valid size", len(valid_idx))
    print(f"### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...")
    print("#" * 25)

    X_train = train.loc[train_idx, FEATURES]
    y_train = train.loc[train_idx, "target"]
    X_valid = train.loc[valid_idx, FEATURES]
    y_valid = train.loc[valid_idx, "target"]

    model = RandomForestClassifier(n_jobs=-1)
    model.fit(X_train, y_train)

    #model.save_model(f"../models/RegLog_{VERSION}_fold{fold}.xgb")
    mlflow.sklearn.log_model(model, "RegLog")

    #dd = model.get_score(importance_type="weight")
    # df = pd.DataFrame({"feature": dd.keys(), f"importance_{fold}": dd.values()})
    # importances.append(df)

    oof_preds = model.predict(X_valid)
    acc = amex_metric(y_valid.values, oof_preds)
    mlflow.log_metric("Kaggle Metric for RegLog", acc)

    print("Kaggle Metric=", acc, "\n")

    df = train.loc[valid_idx, ["customer_ID", "target"]].copy()
    df["oof_pred"] = oof_preds
    oof.append(df)

    del df
    del X_valid, model
    gc.collect()
print("#" * 25)
oof = pd.concat(oof, axis=0, ignore_index=True).set_index("customer_ID")
acc = amex_metric(oof.target.values, oof.oof_pred.values)
print("OVERAL CV Kaggle Metric = ", acc)

#########################
### Fold 1
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################




Kaggle Metric= 0.5502397072824545 

#########################
### Fold 2
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################




Kaggle Metric= 0.55571971576264 

#########################
### Fold 3
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################




Kaggle Metric= 0.5629403701937159 

#########################
### Fold 4
### Train size 367131 Valid size 91782
### Training with 100% fold data...
#########################




Kaggle Metric= 0.5499840840689397 

#########################
### Fold 5
### Train size 367131 Valid size 91782
### Training with 100% fold data...
#########################




Kaggle Metric= 0.5526674836574395 

#########################
OVERAL CV Kaggle Metric =  0.5547401981009079


## Logistic Regression

In [62]:
# Setting MLFlow
experiment_name = "RegLog - WoE Balanced + IV Balanced"
try:
    exp_id = mlflow.create_experiment(name=experiment_name)
except Exception as e:
    exp_id = mlflow.get_experiment_by_name(experiment_name).experiment_id
mlflow.autolog()
importances = []
oof = []
TRAIN_SUBSAMPLE = 1.0


2022/11/01 16:39:29 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2022/11/01 16:39:29 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [63]:
train.fillna(-127,inplace=True)
skf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train.target)):

    if TRAIN_SUBSAMPLE < 1.0:
        np.random.seed(42)
        train_idx = np.random.choice(
            train_idx, int(len(train_idx) * TRAIN_SUBSAMPLE), replace=False
        )
        np.random.seed(None)

    print("#" * 25)
    print("### Fold", fold + 1)
    print("### Train size", len(train_idx), "Valid size", len(valid_idx))
    print(f"### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...")
    print("#" * 25)

    X_train = train.loc[train_idx, FEATURES]
    y_train = train.loc[train_idx, "target"]
    X_valid = train.loc[valid_idx, FEATURES]
    y_valid = train.loc[valid_idx, "target"]

    model = LogisticRegression()
    model.fit(X_train, y_train)

    #model.save_model(f"../models/RegLog_{VERSION}_fold{fold}.xgb")
    mlflow.sklearn.log_model(model, "RegLog")

    #dd = model.get_score(importance_type="weight")
    #df = pd.DataFrame({"feature": dd.keys(), f"importance_{fold}": dd.values()})
    #importances.append(df)

    oof_preds = model.predict(X_valid)
    acc = amex_metric(y_valid.values, oof_preds)
    mlflow.log_metric("Kaggle Metric for RegLog", acc)

    print("Kaggle Metric=", acc, "\n")

    df = train.loc[valid_idx, ["customer_ID", "target"]].copy()
    df["oof_pred"] = oof_preds
    oof.append(df)

    del df
    del X_valid, model
    gc.collect()
print("#" * 25)
oof = pd.concat(oof, axis=0, ignore_index=True).set_index("customer_ID")
acc = amex_metric(oof.target.values, oof.oof_pred.values)
print("OVERAL CV Kaggle Metric = ", acc)

#########################
### Fold 1
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################
Kaggle Metric= 0.5615669116900607 

#########################
### Fold 2
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################
Kaggle Metric= 0.5663568476478444 

#########################
### Fold 3
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################
Kaggle Metric= 0.5638556793607863 

#########################
### Fold 4
### Train size 367131 Valid size 91782
### Training with 100% fold data...
#########################
Kaggle Metric= 0.5566326166751184 

#########################
### Fold 5
### Train size 367131 Valid size 91782
### Training with 100% fold data...
#########################
Kaggle Metric= 0.5604082283971887 

#########################
OVERAL CV Kaggle Metric =  0.5619966622962171
