In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import gc
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import warnings
import xgboost as xgb
from src.metric import amex_metric

warnings.filterwarnings("ignore")

In [2]:
cat_cols = [
    "B_30",
    "B_38",
    "D_114",
    "D_116",
    "D_117",
    "D_120",
    "D_126",
    "D_63",
    "D_64",
    "D_66",
    "D_68",
]

In [3]:
VERSION = "05"
PATH = "../data/processed/"

In [4]:
# Woe_balanced dataframe
train = pd.read_parquet(PATH + "train_woebalanced_20bins.parquet")
train = train.reset_index()

In [5]:
enc = OneHotEncoder()
enc.fit_transform(train[cat_cols])

<458913x50 sparse matrix of type '<class 'numpy.float64'>'
	with 5048043 stored elements in Compressed Sparse Row format>

In [6]:
train = train.drop('target', axis=1)

def process_and_feature_engineer(df):
    # FEATURE ENGINEERING FROM
    # https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
    all_cols = [c for c in list(df.columns) if c not in ["customer_ID", "S_2"]]
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]
    num_features = [col for col in all_cols if col not in cat_features]

    test_num_agg = df.groupby("customer_ID")[num_features].agg(
        ["mean", "std", "min", "max", "last"]
    )
    test_num_agg.columns = ["_".join(x) for x in test_num_agg.columns]

    test_cat_agg = df.groupby("customer_ID")[cat_features].agg(
        ["count", "last", "nunique"]
    )
    test_cat_agg.columns = ["_".join(x) for x in test_cat_agg.columns]

    df = pd.concat([test_num_agg, test_cat_agg], axis=1)
    del test_num_agg, test_cat_agg
    print("shape after engineering", df.shape)

    return df


train = process_and_feature_engineer(train)

labels = pd.read_csv("../data/raw/train_labels.csv")
labels = labels.set_index("customer_ID")
train = train.merge(labels, left_index=True, right_index=True, how='left')

train = train.reset_index()

shape after engineering (458913, 918)


iv_features = pd.read_csv("../reports/iv_features_20bins.csv")


FEATURES = iv_features["useful"].to_list()
FEATURES.remove("target")
while np.NaN in FEATURES: FEATURES.remove(np.NaN)


## XgBoost


In [7]:
# Model Parameters

xgb_params = {
    "max_depth": 4,
    "learning_rate": 0.045,
    "max_delta_step": 3,
    "subsample": 0.7,
    "sampling_method": "gradient_based",
    "tree_method": "gpu_hist",
    "colsample_bytree": 0.6,
    "eval_metric": "logloss",
    "objective": "binary:logistic",
    "predictor": "gpu_predictor",
}

In [8]:
# Setting MLFlow
experiment_name = "XGBoost - WoE Balanced + IV Balanced"
try:
    exp_id = mlflow.create_experiment(name=experiment_name)
except Exception as e:
    exp_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

In [11]:
FEATURES = train.columns.to_list()
FEATURES.remove("target")
FEATURES=FEATURES[1:]

In [12]:
mlflow.autolog()
importances = []
oof = []
TRAIN_SUBSAMPLE = 1.0

skf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train.target)):

    if TRAIN_SUBSAMPLE < 1.0:
        np.random.seed(42)
        train_idx = np.random.choice(
            train_idx, int(len(train_idx) * TRAIN_SUBSAMPLE), replace=False
        )
        np.random.seed(None)

    print("#" * 25)
    print("### Fold", fold + 1)
    print("### Train size", len(train_idx), "Valid size", len(valid_idx))
    print(f"### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...")
    print("#" * 25)

    X_train = train.loc[train_idx, FEATURES]
    y_train = train.loc[train_idx, "target"]
    X_valid = train.loc[valid_idx, FEATURES]
    y_valid = train.loc[valid_idx, "target"]

    dtrain = xgb.DMatrix(X_train, y_train)
    del X_train, y_train
    gc.collect()
    d_valid = xgb.DMatrix(X_valid, y_valid)
    del X_valid
    gc.collect()
    model = xgb.train(
        xgb_params,
        dtrain=dtrain,
        evals=[(dtrain, "train"), (d_valid, "test")],
        num_boost_round=9999,
        early_stopping_rounds=100,
        verbose_eval=100,
    )

    model.save_model(f"../models/XGB_Version{VERSION}_fold{fold}.xgb")
    mlflow.xgboost.log_model(model, "XGBClassifier")

    dd = model.get_score(importance_type="weight")
    df = pd.DataFrame({"feature": dd.keys(), f"importance_{fold}": dd.values()})
    importances.append(df)

    oof_preds = model.predict(d_valid)
    acc = amex_metric(y_valid.values, oof_preds)
    mlflow.log_metric("Kaggle Metric for XGBClassifier", acc)

    print("Kaggle Metric=", acc, "\n")

    df = train.loc[valid_idx, ["customer_ID", "target"]].copy()
    df["oof_pred"] = oof_preds
    oof.append(df)

    del dd, df
    del d_valid, model
    gc.collect()
print("#" * 25)
oof = pd.concat(oof, axis=0, ignore_index=True).set_index("customer_ID")
acc = amex_metric(oof.target.values, oof.oof_pred.values)
print("OVERAL CV Kaggle Metric = ", acc)

2022/11/03 14:37:51 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2022/11/03 14:37:51 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


#########################
### Fold 1
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################


2022/11/03 14:37:58 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd92b0cf16d154174bb93b2bbb6fe9d94', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	train-logloss:0.66504	test-logloss:0.66503
[100]	train-logloss:0.24432	test-logloss:0.24563
[200]	train-logloss:0.22996	test-logloss:0.23298
[300]	train-logloss:0.22461	test-logloss:0.22916
[400]	train-logloss:0.22133	test-logloss:0.22734
[500]	train-logloss:0.21884	test-logloss:0.22616
[600]	train-logloss:0.21683	test-logloss:0.22539
[700]	train-logloss:0.21499	test-logloss:0.22481
[800]	train-logloss:0.21333	test-logloss:0.22438
[900]	train-logloss:0.21178	test-logloss:0.22400
[1000]	train-logloss:0.21031	test-logloss:0.22372
[1100]	train-logloss:0.20896	test-logloss:0.22348
[1200]	train-logloss:0.20769	test-logloss:0.22327
[1300]	train-logloss:0.20648	test-logloss:0.22315
[1400]	train-logloss:0.20530	test-logloss:0.22312
[1500]	train-logloss:0.20416	test-logloss:0.22303
[1600]	train-logloss:0.20299	test-logloss:0.22294
[1700]	train-logloss:0.20190	test-logloss:0.22291
[1800]	train-logloss:0.20080	test-logloss:0.22287
[1900]	train-logloss:0.19971	test-logloss:0.22287
[1989]	train

KeyboardInterrupt: 

In [None]:
# import os
# os.system('systemctl poweroff')

## Random Forest


In [60]:
# Setting MLFlow
experiment_name = "RandomForest - WoE Balanced + IV Balanced"
try:
    exp_id = mlflow.create_experiment(name=experiment_name)
except Exception as e:
    exp_id = mlflow.get_experiment_by_name(experiment_name).experiment_id
mlflow.autolog()
importances = []
oof = []
TRAIN_SUBSAMPLE = 1.0

2022/11/01 16:34:53 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2022/11/01 16:34:53 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [61]:
skf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train.target)):

    if TRAIN_SUBSAMPLE < 1.0:
        np.random.seed(42)
        train_idx = np.random.choice(
            train_idx, int(len(train_idx) * TRAIN_SUBSAMPLE), replace=False
        )
        np.random.seed(None)

    print("#" * 25)
    print("### Fold", fold + 1)
    print("### Train size", len(train_idx), "Valid size", len(valid_idx))
    print(f"### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...")
    print("#" * 25)

    X_train = train.loc[train_idx, FEATURES]
    y_train = train.loc[train_idx, "target"]
    X_valid = train.loc[valid_idx, FEATURES]
    y_valid = train.loc[valid_idx, "target"]

    model = RandomForestClassifier(n_jobs=-1)
    model.fit(X_train, y_train)

    # model.save_model(f"../models/RegLog_{VERSION}_fold{fold}.xgb")
    mlflow.sklearn.log_model(model, "RegLog")

    # dd = model.get_score(importance_type="weight")
    # df = pd.DataFrame({"feature": dd.keys(), f"importance_{fold}": dd.values()})
    # importances.append(df)

    oof_preds = model.predict(X_valid)
    acc = amex_metric(y_valid.values, oof_preds)
    mlflow.log_metric("Kaggle Metric for RegLog", acc)

    print("Kaggle Metric=", acc, "\n")

    df = train.loc[valid_idx, ["customer_ID", "target"]].copy()
    df["oof_pred"] = oof_preds
    oof.append(df)

    del df
    del X_valid, model
    gc.collect()
print("#" * 25)
oof = pd.concat(oof, axis=0, ignore_index=True).set_index("customer_ID")
acc = amex_metric(oof.target.values, oof.oof_pred.values)
print("OVERAL CV Kaggle Metric = ", acc)

#########################
### Fold 1
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################




Kaggle Metric= 0.5502397072824545 

#########################
### Fold 2
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################




Kaggle Metric= 0.55571971576264 

#########################
### Fold 3
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################




Kaggle Metric= 0.5629403701937159 

#########################
### Fold 4
### Train size 367131 Valid size 91782
### Training with 100% fold data...
#########################




Kaggle Metric= 0.5499840840689397 

#########################
### Fold 5
### Train size 367131 Valid size 91782
### Training with 100% fold data...
#########################




Kaggle Metric= 0.5526674836574395 

#########################
OVERAL CV Kaggle Metric =  0.5547401981009079


## Logistic Regression


In [62]:
# Setting MLFlow
experiment_name = "RegLog - WoE Balanced + IV Balanced"
try:
    exp_id = mlflow.create_experiment(name=experiment_name)
except Exception as e:
    exp_id = mlflow.get_experiment_by_name(experiment_name).experiment_id
mlflow.autolog()
importances = []
oof = []
TRAIN_SUBSAMPLE = 1.0

2022/11/01 16:39:29 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2022/11/01 16:39:29 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [63]:
train.fillna(-127, inplace=True)
skf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train.target)):

    if TRAIN_SUBSAMPLE < 1.0:
        np.random.seed(42)
        train_idx = np.random.choice(
            train_idx, int(len(train_idx) * TRAIN_SUBSAMPLE), replace=False
        )
        np.random.seed(None)

    print("#" * 25)
    print("### Fold", fold + 1)
    print("### Train size", len(train_idx), "Valid size", len(valid_idx))
    print(f"### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...")
    print("#" * 25)

    X_train = train.loc[train_idx, FEATURES]
    y_train = train.loc[train_idx, "target"]
    X_valid = train.loc[valid_idx, FEATURES]
    y_valid = train.loc[valid_idx, "target"]

    model = LogisticRegression()
    model.fit(X_train, y_train)

    # model.save_model(f"../models/RegLog_{VERSION}_fold{fold}.xgb")
    mlflow.sklearn.log_model(model, "RegLog")

    # dd = model.get_score(importance_type="weight")
    # df = pd.DataFrame({"feature": dd.keys(), f"importance_{fold}": dd.values()})
    # importances.append(df)

    oof_preds = model.predict(X_valid)
    acc = amex_metric(y_valid.values, oof_preds)
    mlflow.log_metric("Kaggle Metric for RegLog", acc)

    print("Kaggle Metric=", acc, "\n")

    df = train.loc[valid_idx, ["customer_ID", "target"]].copy()
    df["oof_pred"] = oof_preds
    oof.append(df)

    del df
    del X_valid, model
    gc.collect()
print("#" * 25)
oof = pd.concat(oof, axis=0, ignore_index=True).set_index("customer_ID")
acc = amex_metric(oof.target.values, oof.oof_pred.values)
print("OVERAL CV Kaggle Metric = ", acc)

#########################
### Fold 1
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################
Kaggle Metric= 0.5615669116900607 

#########################
### Fold 2
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################
Kaggle Metric= 0.5663568476478444 

#########################
### Fold 3
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################
Kaggle Metric= 0.5638556793607863 

#########################
### Fold 4
### Train size 367131 Valid size 91782
### Training with 100% fold data...
#########################
Kaggle Metric= 0.5566326166751184 

#########################
### Fold 5
### Train size 367131 Valid size 91782
### Training with 100% fold data...
#########################
Kaggle Metric= 0.5604082283971887 

#########################
OVERAL CV Kaggle Metric =  0.5619966622962171
