In [1]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np 
from sklearn.model_selection import KFold, train_test_split
import matplotlib.pyplot as plt, gc, os
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import mlflow.lightgbm
from xgboost import XGBClassifier
import pickle
import warnings 
import xgboost as xgb
import lightgbm as lgb
import time

warnings.filterwarnings("ignore")


In [2]:
# Names of the columns treated as categorical.
cat_cols = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]


In [3]:
# Load the training frame from parquet.
# NOTE(review): the file name suggests WoE-encoded, class-balanced, downcast
# data — confirm against the script that produced the file.
train = pd.read_parquet("../data/train_woe_balanced_downcast.parquet")

In [4]:
# Read the feature-selection table; its "useful" column is what FEATURES is built from.
iv_features = pd.read_csv("../data/iv_features.csv")

In [6]:
# train.drop('WOE_target',axis=1, inplace=True)
# Model inputs: every non-null entry of the "useful" column except the label.
# Filtering (rather than list.remove) does not raise when "target" is missing
# and keeps NaN padding from a ragged CSV out of the feature list.
FEATURES = [
    name
    for name in iv_features["useful"].dropna().to_list()
    if name != "target"
]

['D_39',
 'D_41',
 'D_43',
 'B_5',
 'R_2',
 'D_46',
 'B_8',
 'D_50',
 'R_3',
 'P_3',
 'D_53',
 'S_5',
 'S_6',
 'R_4',
 'B_12',
 'S_8',
 'D_56',
 'B_13',
 'S_9',
 'D_59',
 'D_60',
 'S_11',
 'D_63',
 'D_64',
 'D_68',
 'S_12',
 'R_6',
 'S_13',
 'B_21',
 'D_69',
 'D_70',
 'D_71',
 'D_72',
 'S_15',
 'P_4',
 'D_76',
 'B_24',
 'B_26',
 'D_78',
 'D_79',
 'S_16',
 'D_81',
 'D_82',
 'D_83',
 'R_15',
 'D_84',
 'D_86',
 'R_19',
 'B_32',
 'S_20',
 'R_20',
 'R_21',
 'D_89',
 'D_91',
 'D_92',
 'D_94',
 'R_24',
 'D_96',
 'S_23',
 'S_25',
 'S_26',
 'D_102',
 'D_103',
 'D_104',
 'D_105',
 'D_107',
 'R_26',
 'R_27',
 'D_112',
 'S_27',
 'D_113',
 'D_114',
 'D_115',
 'D_116',
 'D_117',
 'D_118',
 'D_119',
 'D_120',
 'D_121',
 'D_122',
 'D_124',
 'D_125',
 'D_126',
 'D_127',
 'D_128',
 'D_129',
 'B_41',
 'B_42',
 'D_130',
 'D_132',
 'D_133',
 'D_134',
 'D_135',
 'D_136',
 'D_137',
 'D_138',
 'D_139',
 'D_140',
 'D_141',
 'D_142',
 'D_143',
 'D_145']

In [7]:
#Competition metric
def amex_metric(y_true, y_pred):
    """Return the competition score for binary labels and predicted scores.

    The score is the mean of two components:

    * the share of all positives found among the highest-scored rows that
      make up the top 4% of the total sample weight, and
    * the weighted gini of the prediction ranking divided by the weighted
      gini of the ranking by the true label.

    Rows with label 0 carry weight 20, all other rows weight 1.

    Parameters
    ----------
    y_true : 1-D array-like of labels.
    y_pred : 1-D array-like of scores, same length as ``y_true``.

    Returns
    -------
    float
    """
    # Column 0 holds the labels, column 1 the scores.
    stacked = np.array([y_true, y_pred]).T

    def _ranked(col):
        # Rows ordered by the given column, largest value first.
        return stacked[stacked[:, col].argsort()[::-1]]

    def _row_weights(rows):
        return np.where(rows[:, 0] == 0, 20, 1)

    def _weighted_gini(col):
        rows = _ranked(col)
        w = _row_weights(rows)
        baseline = np.cumsum(w / np.sum(w))
        weighted_pos = rows[:, 0] * w
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - baseline) * w)

    # Top-4% capture: positives inside the weight cut-off over all positives.
    by_score = _ranked(1)
    w_by_score = _row_weights(by_score)
    inside_cut = np.cumsum(w_by_score) <= int(0.04 * np.sum(w_by_score))
    top_four = np.sum(by_score[inside_cut][:, 0]) / np.sum(by_score[:, 0])

    gini_pred = _weighted_gini(1)
    gini_true = _weighted_gini(0)
    return 0.5 * (gini_pred / gini_true + top_four)

In [None]:
# Model Parameters
# Booster configuration handed to xgb.train().
# NOTE(review): the captured training run ended with a CUDA out-of-memory
# error; max_depth=20 on the GPU is a plausible contributor — confirm before
# the next full run.
xgb_params = {
    # tree shape and step size
    'max_depth': 20,
    'learning_rate': 0.05,
    'max_delta_step': 3,
    # row sampling
    'subsample': 0.6,
    'sampling_method': 'gradient_based',
    # L2 / L1 regularisation
    'lambda': 0.8,
    'alpha': 0.8,
    # GPU histogram builder
    'tree_method': 'gpu_hist',
    'scale_pos_weight': 0.3317302992934773,
    'max_bin': 20,
    # column sampling per tree
    'colsample_bytree': 0.6,
    # objective and evaluation
    'eval_metric': 'logloss',
    'objective': 'binary:logistic',
    'predictor': 'auto',
}


## XGBoost

In [None]:
# Setting MLFlow
# Resolve the experiment id: reuse the experiment when it already exists,
# otherwise create it. Looking it up first (instead of catching every
# exception from create_experiment) lets unrelated failures, such as an
# unreachable tracking server, surface with their real error.
experiment_name = "XGBoost - WoE Balanced + IV Balanced"
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    exp_id = mlflow.create_experiment(name=experiment_name)
else:
    exp_id = existing_experiment.experiment_id

In [None]:
# 5-fold cross-validation of the XGBoost booster.
# Each fold trains inside its own MLflow run (bound to exp_id), saves the
# booster, records feature importances and collects out-of-fold predictions.
mlflow.autolog()
importances = []   # one feature-importance frame per fold
oof_frames = []    # one out-of-fold prediction frame per fold
TRAIN_SUBSAMPLE = 1.0

# KFold yields positional indices, so rows and columns are selected by
# position (.iloc). get_loc raises KeyError for a feature missing from train.
# Assumes column names are unique.
feature_pos = [train.columns.get_loc(col) for col in FEATURES]

# NOTE(review): KFold ignores the y argument of split(), so folds are not
# stratified. If train holds several rows per customer_ID, rows of the same
# customer can fall on both sides of a split — consider a grouped splitter.
skf = KFold(n_splits = 5, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train.target)):

    if TRAIN_SUBSAMPLE<1.0:
        # Local generator: same draw as seeding the global RNG with 42, but
        # without touching global random state.
        rng = np.random.RandomState(42)
        train_idx = rng.choice(train_idx,
                               int(len(train_idx)*TRAIN_SUBSAMPLE), replace=False)

    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#'*25)

    X_train = train.iloc[train_idx, feature_pos]
    y_train = train['target'].iloc[train_idx]
    X_valid = train.iloc[valid_idx, feature_pos]
    y_valid = train['target'].iloc[valid_idx]

    # The pandas slices are dropped as soon as the DMatrix copies exist.
    dtrain = xgb.DMatrix(X_train, y_train)
    del X_train, y_train
    gc.collect()
    d_valid = xgb.DMatrix(X_valid, y_valid)
    del X_valid
    gc.collect()

    # One explicit run per fold: autologging, the logged model and the fold
    # metric all land in the configured experiment, and the run is closed
    # when the fold finishes (also on error).
    with mlflow.start_run(experiment_id=exp_id, run_name=f'XGB_fold{fold}'):
        model = xgb.train(
            xgb_params,
            dtrain=dtrain,
            evals=[(dtrain, 'train'), (d_valid, 'test')],
            num_boost_round=9999,
            early_stopping_rounds=100,
            verbose_eval=100,
        )

        model.save_model(f'../models/XGB_V_fold{fold}.xgb')
        mlflow.xgboost.log_model(model, "XGBClassifier")

        split_counts = model.get_score(importance_type='weight')
        importance_df = pd.DataFrame({'feature': split_counts.keys(),
                                      f'importance_{fold}': split_counts.values()})
        importances.append(importance_df)

        # Score with the trees up to the best early-stopping iteration; the
        # booster returned by xgb.train also contains the rounds trained after it.
        best_iteration = getattr(model, 'best_iteration', None)
        if best_iteration is not None:
            oof_preds = model.predict(d_valid, iteration_range=(0, best_iteration + 1))
        else:
            oof_preds = model.predict(d_valid)

        acc = amex_metric(y_valid.values, oof_preds)
        mlflow.log_metric("Kaggle Metric for XGBClassifier", acc)

    print("Kaggle Metric=", acc,'\n')

    fold_oof = train[['customer_ID', 'target']].iloc[valid_idx].copy()
    fold_oof['oof_pred'] = oof_preds
    oof_frames.append(fold_oof)

    # Free both DMatrix objects and the booster before the next fold builds
    # new ones; the original kept dtrain alive across folds.
    del split_counts, importance_df, fold_oof
    del dtrain, d_valid, model
    gc.collect()

print('#'*25)
oof = pd.concat(oof_frames, axis=0, ignore_index=True).set_index('customer_ID')
acc = amex_metric(oof.target.values, oof.oof_pred.values)
print('OVERAL CV Kaggle Metric = ', acc)


2022/09/30 13:56:05 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2022/09/30 13:56:05 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/09/30 13:56:05 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.


#########################
### Fold 1
### Train size 4425160 Valid size 1106291
### Training with 100% fold data...
#########################


2022/09/30 13:56:13 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'c5bf80da3b3b495699e8165c02873152', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	train-logloss:0.66611	test-logloss:0.66730
[100]	train-logloss:0.22298	test-logloss:0.29632
[200]	train-logloss:0.17260	test-logloss:0.26810
[300]	train-logloss:0.13489	test-logloss:0.24888
[400]	train-logloss:0.10641	test-logloss:0.23391
[500]	train-logloss:0.08708	test-logloss:0.22371
[600]	train-logloss:0.07266	test-logloss:0.21630
[700]	train-logloss:0.06127	test-logloss:0.21105
[800]	train-logloss:0.05249	test-logloss:0.20670
[900]	train-logloss:0.04561	test-logloss:0.20351
[1000]	train-logloss:0.04044	test-logloss:0.20081
[1100]	train-logloss:0.03616	test-logloss:0.19894
[1200]	train-logloss:0.03265	test-logloss:0.19729
[1300]	train-logloss:0.02981	test-logloss:0.19609
[1400]	train-logloss:0.02732	test-logloss:0.19494
[1500]	train-logloss:0.02523	test-logloss:0.19394
[1600]	train-logloss:0.02356	test-logloss:0.19304
[1700]	train-logloss:0.02201	test-logloss:0.19247
[1800]	train-logloss:0.02068	test-logloss:0.19179
[1900]	train-logloss:0.01950	test-logloss:0.19135
[2000]	train

XGBoostError: [15:24:39] ../src/c_api/../data/../common/device_helpers.cuh:428: Memory allocation error on worker 0: std::bad_alloc: cudaErrorMemoryAllocation: out of memory
- Free memory: 499646464
- Requested memory: 913359440

Stack trace:
  [bt] (0) /home/victor/Documents/1-DataScience/1-Projetos/American_Express-Default_Prediction/venv/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x3a5799) [0x7f060f33c799]
  [bt] (1) /home/victor/Documents/1-DataScience/1-Projetos/American_Express-Default_Prediction/venv/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x3a9bab) [0x7f060f340bab]
  [bt] (2) /home/victor/Documents/1-DataScience/1-Projetos/American_Express-Default_Prediction/venv/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x8da20) [0x7f060f024a20]
  [bt] (3) /home/victor/Documents/1-DataScience/1-Projetos/American_Express-Default_Prediction/venv/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x3f9337) [0x7f060f390337]
  [bt] (4) /home/victor/Documents/1-DataScience/1-Projetos/American_Express-Default_Prediction/venv/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x3fa333) [0x7f060f391333]
  [bt] (5) /home/victor/Documents/1-DataScience/1-Projetos/American_Express-Default_Prediction/venv/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x5d59de) [0x7f060f56c9de]
  [bt] (6) /home/victor/Documents/1-DataScience/1-Projetos/American_Express-Default_Prediction/venv/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x5d9384) [0x7f060f570384]
  [bt] (7) /home/victor/Documents/1-DataScience/1-Projetos/American_Express-Default_Prediction/venv/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x5daff5) [0x7f060f571ff5]
  [bt] (8) /home/victor/Documents/1-DataScience/1-Projetos/American_Express-Default_Prediction/venv/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x1e5aa5) [0x7f060f17caa5]



## Logistic Regression & Random Forest


# Keyword arguments for sklearn's LogisticRegression.
reg_log_params = dict(
    penalty='l2',
    max_iter=200,
    warm_start=True,
    n_jobs=1,
)
# Keyword arguments for sklearn's RandomForestClassifier.
fores_params = {
    'bootstrap': True,
    'criterion': 'gini',
    'max_depth': 20,
    # 'auto' is deprecated and rejected by newer scikit-learn releases; for
    # classifiers it meant sqrt(n_features), which 'sqrt' spells out.
    'max_features': 'sqrt',
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.01,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'min_weight_fraction_leaf': 0.0,
    'n_estimators': 10,
    'n_jobs': -1,
    'oob_score': False,
    'random_state': 42,
    'verbose': 0,
    'warm_start': False,
}

# Setting MLFlow
# Resolve the experiment id: reuse the experiment when it already exists,
# otherwise create it. Looking it up first (instead of catching every
# exception from create_experiment) lets unrelated failures, such as an
# unreachable tracking server, surface with their real error.
experiment_name = "RegLog, Forest raw dataset"
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    exp_id = mlflow.create_experiment(name=experiment_name)
else:
    exp_id = existing_experiment.experiment_id

# 5-fold cross-validation of the scikit-learn models listed in modelclasses.
# All folds are logged into a single MLflow run of the configured experiment.
mlflow.autolog()

# Each entry: [short name, estimator class, constructor kwargs].
modelclasses = [
        #["reg_log",LogisticRegression,reg_log_params],
        ["forest", RandomForestClassifier, fores_params]
                ]

# TRAIN RANDOM SEED
SEED = 42

# FILL NAN VALUE
NAN_VALUE = -127 # will fit in int8

# FOLDS PER MODEL
FOLDS = 5


importances = []
oof_frames = []   # one out-of-fold prediction frame per (fold, model)
TRAIN_SUBSAMPLE = 1.0

# KFold yields positional indices, so rows and columns are selected by
# position (.iloc). get_loc raises KeyError for a feature missing from train.
# Assumes column names are unique.
feature_pos = [train.columns.get_loc(col) for col in FEATURES]

skf = KFold(n_splits = FOLDS, shuffle=True, random_state=SEED)
with mlflow.start_run(experiment_id=exp_id):
    for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train.target)):

        if TRAIN_SUBSAMPLE<1.0:
            # Local generator: same draw as seeding the global RNG, without
            # touching global random state.
            rng = np.random.RandomState(SEED)
            train_idx = rng.choice(train_idx,
                                   int(len(train_idx)*TRAIN_SUBSAMPLE), replace=False)

        X_train = train.iloc[train_idx, feature_pos]
        y_train = train['target'].iloc[train_idx]
        X_valid = train.iloc[valid_idx, feature_pos]
        y_valid = train['target'].iloc[valid_idx]

        for modelname, Model, param_list in modelclasses:
            print('#'*25)
            print('### Fold',fold+1)
            print('### Train size',len(train_idx),'Valid size',len(valid_idx))
            print(f'### Training model {modelname.upper()}')
            print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
            print('#'*25)

            model = Model(**param_list)
            model.fit(X_train, y_train)

            # Context manager so the file handle is flushed and closed.
            with open(f'../models/{modelname}_fold{fold}.pkl', 'wb') as model_file:
                pickle.dump(model, model_file)

            # The metric ranks customers by score, so it needs probabilities;
            # hard 0/1 labels from predict() collapse the ranking into ties.
            # Assumes a binary 0/1 target so that column 1 is the positive class.
            oof_preds = model.predict_proba(X_valid)[:, 1]
            acc = amex_metric(y_valid.values, oof_preds)
            print("Kaggle Metric=", acc,'\n')

            # step=fold keeps one value per fold instead of piling every fold
            # onto the same step; the artifact path is a plain folder name
            # and is logged once per fold.
            mlflow.log_metric(f"Kaggle Metric for {modelname}", acc, step=fold)
            mlflow.sklearn.log_model(model, f"{modelname}_fold{fold}")
            print("Model saved in run %s" % mlflow.active_run().info.run_id)

            fold_oof = train[['customer_ID', 'target']].iloc[valid_idx].copy()
            fold_oof['oof_pred'] = oof_preds
            fold_oof['model_name'] = modelname
            oof_frames.append(fold_oof)

            del fold_oof, model

        del X_train, y_train
        del X_valid, y_valid


print('#'*25)
oof = pd.concat(oof_frames, axis=0, ignore_index=True).set_index('customer_ID')
# Overall out-of-fold score per model.
for modelname, _model_class, _model_kwargs in modelclasses:
    model_rows = oof[oof['model_name'] == modelname]
    acc = amex_metric(model_rows['target'].values, model_rows['oof_pred'].values)
    print(f'OVERAL CV Kaggle Metric for {modelname} = {acc}')



# LGBM

# Setting MLFlow
# Resolve the experiment id: reuse the experiment when it already exists,
# otherwise create it. Looking it up first (instead of catching every
# exception from create_experiment) lets unrelated failures, such as an
# unreachable tracking server, surface with their real error.
experiment_name = "LightGBM"
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    exp_id = mlflow.create_experiment(name=experiment_name)
else:
    exp_id = existing_experiment.experiment_id

def lgb_amex_metric(y_pred, y_true):
    """Adapter exposing ``amex_metric`` as a LightGBM custom eval function.

    Parameters
    ----------
    y_pred : predictions supplied by LightGBM for the evaluated dataset.
    y_true : the evaluated dataset object; its labels are read with ``get_label()``.

    Returns
    -------
    tuple
        ``('amex_metric', value, True)``; the trailing ``True`` is LightGBM's
        "higher is better" flag.
    """
    labels = y_true.get_label()
    metric_value = amex_metric(labels, y_pred)
    return 'amex_metric', metric_value, True


# LightGBM training configuration passed to lgb.train().
params = dict(
    objective='binary',
    metric='binary_logloss',
    boosting='dart',
    seed=42,
    num_leaves=100,
    learning_rate=0.01,
    feature_fraction=0.20,
    bagging_freq=10,
    bagging_fraction=0.50,
    n_jobs=-1,
    lambda_l2=2,
    min_data_in_leaf=40,
    device_type='gpu',
    max_bin=64,
)
# 5-fold cross-validation of the LightGBM model.
# Each fold trains inside its own MLflow run (bound to exp_id), pickles the
# booster and fills its slice of the out-of-fold prediction array.

# Create a numpy array to store test predictions
#test_predictions = np.zeros(len(test))
# Create a numpy array to store out of folds predictions (filled by row position)
oof_predictions = np.zeros(len(train))

# Column positions of the features; get_loc raises KeyError for a feature
# missing from train. Assumes column names are unique.
feature_pos = [train.columns.get_loc(col) for col in FEATURES]

skf = KFold(n_splits = 5, shuffle=True, random_state=42)
for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train.target)):
    print(' ')
    print('-'*50)
    print(f'Training fold {fold} with {len(FEATURES)} features...')
    x_train, x_val = train.iloc[train_idx, feature_pos], train.iloc[valid_idx, feature_pos]
    y_train, y_val = train['target'].iloc[train_idx], train['target'].iloc[valid_idx]
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_valid = lgb.Dataset(x_val, y_val)

    with mlflow.start_run(experiment_id=exp_id, run_name=f'LGBM_fold{fold}'):
        # Early stopping and periodic logging are passed as callbacks, the
        # form accepted by both LightGBM 3.3 and 4.x.
        # NOTE(review): params uses 'boosting': 'dart'; LightGBM documents
        # that early stopping is not available in dart mode — confirm whether
        # the 1500-round patience is meant to have any effect.
        model = lgb.train(
            params = params,
            train_set = lgb_train,
            num_boost_round = 10500,
            valid_sets = [lgb_train, lgb_valid],
            feval = lgb_amex_metric,
            callbacks = [lgb.early_stopping(1500), lgb.log_evaluation(500)],
            )
        # Save the fold model; the context manager closes the file handle.
        with open(f'../models/LGBM_fold{fold}.pkl', 'wb') as model_file:
            pickle.dump(model, model_file)
        # Predict validation
        val_pred = model.predict(x_val)
        # Add to out of folds array
        oof_predictions[valid_idx] = val_pred
        # Predict the test set
        #test_pred = model.predict(test[FEATURES])
        #test_predictions += test_pred / 5
        # Compute fold metric
        score = amex_metric(y_val, val_pred)
        print(f'Our fold {fold} CV score is {score}')
        # Log this fold's own score and give the artifact a plain folder name
        # (the original logged stale `acc` and used the leftover `Model` class).
        mlflow.log_metric("Kaggle Metric for LightGbm", score)
        mlflow.lightgbm.log_model(model, "LGBM")

    del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
    gc.collect()
# Compute out of folds metric against the label column of train.
score = amex_metric(train['target'], oof_predictions)
print(f'Our out of folds CV score is {score}')
# Create a dataframe to store out of folds predictions
oof_df = pd.DataFrame({'customer_ID': train['customer_ID'], 'target': train['target'], 'prediction': oof_predictions})
# Written next to the other project data; the previous Colab-only path
# (/content/drive/...) does not exist where this notebook reads ../data.
oof_df.to_csv('../data/oof_lgbm_dart_baseline_5fold_seed42.csv', index = False)
# Create a dataframe to store test prediction
# test_df = pd.DataFrame({'customer_ID': test['customer_ID'], 'prediction': test_predictions})
# test_df.to_csv(f'/content/drive/MyDrive/Amex/Predictions/test_lgbm_dart_baseline_fold_5_seed42.csv', index = False)

