# American Express - Default Prediction


# Data Set Problems
American Express is a globally integrated payments company. The largest payment card issuer in the world, they provide customers with access to products, insights, and experiences that enrich lives and build business success.

We’ll apply our machine learning skills to predict credit default, which allows lenders to optimize their lending decisions.

Data pre-processing and feature engineering will be performed to prepare the dataset before it is used by the machine learning model.

# Objectives
The objective of this competition is to predict the probability that a customer does not pay back their credit card balance amount in the future based on their monthly customer profile.

# Data Set Description
The dataset contains aggregated profile features for each customer at each statement date. Features are anonymized and normalized, and fall into the following general categories,

D = Delinquency variables

S = Spend variables

P = Payment variables

B = Balance variables

R = Risk variables

with the following features being categorical:

['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

Our task is to predict, for each customer_ID, the probability of a future payment default (target = 1).

Note that the negative class has been subsampled for this dataset at 5%, and thus receives a 20x weighting in the scoring metric.

# Notebook objective:

Comparison of 4 encoders (mean encoding, WoE encoding, label encoding, frequency encoding) vs original variables

# Results

- Mean encoding and WoE encoding are the best encoding types

- The best score (0.782) was obtained by averaging 2 predictions: the predictions obtained from the dataset of numeric variables, and the predictions obtained from the dataset of numeric variables concatenated with the categorical variables encoded with mean encoding

- Predictions obtained with a Kaggle team trick scored 0.799 (see the last code cell)

# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
import random

import warnings 
warnings.filterwarnings('ignore')

# Loading Data

In [None]:
# Keep only the last row of every customer_ID group; train ends up indexed by
# customer_ID while test keeps customer_ID as a regular column.
train = (
    pd.read_feather('../input/amexfeather/train_data.ftr')
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID')
)

test = (
    pd.read_feather('../input/amexfeather/test_data.ftr')
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID')
)
test.reset_index(inplace=True)

# customer ids of the test rows, reused when the submission files are written
ids = test["customer_ID"]

# Data insight

In [None]:
# select numerical and categorical features
def divideFeatures(df):
    """Split a frame into its numerical and categorical feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features and, optionally, a ``target`` column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The columns with a numeric dtype (``target`` excluded) and the columns
        with the ``category`` dtype, in that order.
    """
    # errors='ignore': a frame without a numeric ``target`` column is split as
    # well instead of raising KeyError.
    numerical_features = df.select_dtypes(include=[np.number]).drop(
        columns=['target'], errors='ignore'
    )
    categorical_features = df.select_dtypes(include=['category'])
    return numerical_features, categorical_features

In [None]:
# Group the training column names by the prefix that encodes their typology
def _columns_with_prefix(prefix):
    """Return the names of the train columns that start with `prefix`."""
    return [name for name in train.columns if name.startswith(prefix)]


payment_vars = _columns_with_prefix("P_")
risk_vars = _columns_with_prefix("R_")
balance_vars = _columns_with_prefix("B_")
delinquency_vars = _columns_with_prefix("D_")
spend_vars = _columns_with_prefix("S_")

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() only added a stray "None" line
# after every report.
for group in (payment_vars, risk_vars, balance_vars, delinquency_vars, spend_vars):
    train[group].info()

In [None]:
# Number of missing values per column, one report per typology
for typology_columns in (payment_vars, risk_vars, balance_vars, delinquency_vars, spend_vars):
    missing_per_column = train[typology_columns].isna().sum()
    print(missing_per_column)

In [None]:
# Preparation for handling the NaN values on train and test together:
# the test data has no target column, so give it a sentinel value (-1)
# that the training rows do not use.
test["target"] = -1

In [None]:
# stack the training rows on top of the test rows under a fresh 0..n-1 index
data = pd.concat([train, test], ignore_index=True)

In [None]:
# drop every column that has fewer than 80% non-missing values
min_non_null = int(0.80 * len(data))
data = data.dropna(axis="columns", thresh=min_non_null)
data.shape

In [None]:
# Split the combined frame into its numeric columns ("target" excluded) and
# its categorical columns; the cells below use the `.columns` of both results.
numerical_features, categorical_features = divideFeatures(data)

In [None]:
# convert the numeric columns to float32 before the mean values are computed
# (the original note mentions float16 as the source dtype)
num_cols = numerical_features.columns
data[num_cols] = data[num_cols].astype(np.float32)

In [None]:
# replace the missing values of every numeric column with that column's mean
# (computed over the combined train + test frame)
num_cols = numerical_features.columns
column_means = data[num_cols].mean()
data[num_cols] = data[num_cols].fillna(column_means)

In [None]:
# convert the categorical columns to plain strings so that a "NONE" label can
# be introduced for missing values
cat_cols = categorical_features.columns
data[cat_cols] = data[cat_cols].astype(str)

In [None]:
# fill the missing values of the categorical variables with "NONE".
# Two things made the previous `data[cols].fillna("NONE", inplace=True)` a no-op:
#   * the column selection returns a temporary copy, so the in-place fill never
#     reached `data`;
#   * after the cast to str, missing values are the literal string "nan",
#     which fillna() does not treat as missing.
# The result is therefore assigned back, and both real NaN and "nan" strings
# are replaced.
cat_cols = categorical_features.columns
data[cat_cols] = data[cat_cols].fillna("NONE").replace("nan", "NONE")

In [None]:
# turn the string columns back into pandas categoricals
cat_cols = categorical_features.columns
data[cat_cols] = data[cat_cols].astype("category")

In [None]:
# split the combined frame again: rows carrying the sentinel target (-1) are
# the test rows, everything else is training data
is_test_row = data["target"] == -1
train = data[~is_test_row].reset_index(drop=True)
test = data[is_test_row].reset_index(drop=True)

In [None]:
import gc

# train and test have been rebuilt from the combined frame, so drop the
# reference to it and run a garbage-collection pass.
del data
gc.collect()

# Coding functions

In [None]:
from sklearn import preprocessing

# label encoding
def lab_enc(df_train, df_cv, column):
    """Label-encode `column`, fitting the encoder on the training frame only.

    Values of `df_cv[column]` that the encoder did not see during fitting are
    replaced by 0, and 0 is appended to the encoder's classes before the
    validation column is transformed.

    Note: `df_cv[column]` is overwritten in place with the replaced values.

    Returns a tuple ``(encoded_train, encoded_cv)``.
    """
    encoder = preprocessing.LabelEncoder()
    encoder.fit(df_train[column])
    encoded_train = encoder.transform(df_train[column])

    # classes learned from the training data, before the extra 0 is appended
    known_classes = encoder.classes_
    df_cv[column] = df_cv[column].map(
        lambda value: value if value in known_classes else 0
    )

    encoder.classes_ = np.append(known_classes, 0)
    encoded_cv = encoder.transform(df_cv[column])
    return encoded_train, encoded_cv

In [None]:
#Source: https://www.kaggle.com/bhavikapanara/frequency-encoding
def freq_enc(df_train, df_cv, column):
    """Frequency-encode `column` in both frames.

    Every value is replaced by its relative frequency (group size divided by
    the number of rows) inside the frame it belongs to: training values use
    the training frequencies, validation values use the validation ones.

    Returns a tuple ``(encoded_train, encoded_cv)`` of Series.
    """
    train_share = df_train.groupby(column).size() / len(df_train)
    cv_share = df_cv.groupby(column).size() / len(df_cv)

    def _lookup(frame, share):
        # replace each value by the share of rows carrying that value
        return frame[column].apply(lambda level: share[level])

    return _lookup(df_train, train_share), _lookup(df_cv, cv_share)

In [None]:
# display the labels of the categorical columns found by divideFeatures(data)
categorical_features.columns

In [None]:
# mean encoding: every category is replaced by the mean of the target observed
# for that category in the training data

mean_enc_columns = ("D_63", "B_30", "B_38", "D_114", "D_116", "D_117", "D_120", "D_126")

# one category -> target-mean lookup table per encoded column
(mean_encode1, mean_encode2, mean_encode3, mean_encode4,
 mean_encode5, mean_encode6, mean_encode7, mean_encode8) = (
    train.groupby(col)["target"].mean() for col in mean_enc_columns
)

mean_enc_tables = (mean_encode1, mean_encode2, mean_encode3, mean_encode4,
                   mean_encode5, mean_encode6, mean_encode7, mean_encode8)

# append the "<column>_mean_enc" columns in the same order as the tables
for col, table in zip(mean_enc_columns, mean_enc_tables):
    train.loc[:, col + "_mean_enc"] = train[col].map(table).astype('float', copy=False)

In [None]:
# apply the mean-encoding tables learned on the training data to the test data

test_mean_enc_plan = (
    ("D_63", mean_encode1),
    ("B_30", mean_encode2),
    ("B_38", mean_encode3),
    ("D_114", mean_encode4),
    ("D_116", mean_encode5),
    ("D_117", mean_encode6),
    ("D_120", mean_encode7),
    ("D_126", mean_encode8),
)

for col, table in test_mean_enc_plan:
    test[col + "_mean_enc"] = test[col].map(table).astype('float', copy=False)

In [None]:
# WoE (Weight of Evidence Encoding)

def _woe_table(column, woe_name, eps=0.000001):
    """Build the per-category WoE table of `column` from the training data.

    The returned frame is indexed by category and has three columns:
      * "good"   - mean of the target for the category, i.e. the share of
                   rows with target == 1 (name kept from the original formula);
      * "bad"    - 1 - good, with exact zeros replaced by `eps`;
      * woe_name - log(good / bad).
    """
    table = train.groupby(column)["target"].mean().to_frame(name="good")

    # bad probability is 1 - good probability; a small value replaces exact
    # zeros to avoid a division by zero in the denominator
    table["bad"] = 1 - table["good"]
    table["bad"] = np.where(table["bad"] == 0, eps, table["bad"])

    # guard the numerator the same way: a category whose target mean is 0
    # would otherwise be encoded as log(0) = -inf
    good_for_log = table["good"].where(table["good"] != 0, eps)

    table[woe_name] = np.log(good_for_log / table["bad"])
    return table


woe1 = _woe_table("D_63", "woe1")
woe2 = _woe_table("B_30", "woe2")
woe3 = _woe_table("B_38", "woe3")
woe4 = _woe_table("D_114", "woe4")
woe5 = _woe_table("D_116", "woe5")
woe6 = _woe_table("D_117", "woe6")
woe7 = _woe_table("D_120", "woe7")
woe8 = _woe_table("D_126", "woe8")

In [None]:
# Attach the WoE value of each row's category to the training frame,
# one "woe<k>_encode" column per encoded variable
train_woe_plan = (
    ("D_63", woe1),
    ("B_30", woe2),
    ("B_38", woe3),
    ("D_114", woe4),
    ("D_116", woe5),
    ("D_117", woe6),
    ("D_120", woe7),
    ("D_126", woe8),
)

for position, (col, table) in enumerate(train_woe_plan, start=1):
    woe_values = table[f"woe{position}"]
    train.loc[:, f"woe{position}_encode"] = train[col].map(woe_values).astype('float', copy=False)

In [None]:
# apply the WoE tables learned on the training data to the test data

test_woe_plan = (
    ("D_63", woe1),
    ("B_30", woe2),
    ("B_38", woe3),
    ("D_114", woe4),
    ("D_116", woe5),
    ("D_117", woe6),
    ("D_120", woe7),
    ("D_126", woe8),
)

for position, (col, table) in enumerate(test_woe_plan, start=1):
    woe_values = table[f"woe{position}"]
    test[f"woe{position}_encode"] = test[col].map(woe_values).astype('float', copy=False)

In [None]:
# update variables: recompute the numeric / categorical split from train,
# which now also holds the "*_mean_enc" and "woe*_encode" columns added above
numerical_features, categorical_features = divideFeatures(train)

In [None]:
# target vector, and feature frames without the target and the S_2 column;
# the test features use exactly the training columns, in the same order
y = train['target']
X_train_ori = train.drop(columns=['target', 'S_2'])
X_test_ori = test.loc[:, X_train_ori.columns]

In [None]:
# numeric part of the feature frames (original numerics + engineered encodings)
numeric_cols = numerical_features.columns
X_train_num = X_train_ori[numeric_cols]
X_test_num = X_test_ori[numeric_cols]

In [None]:
# Original numeric variables only. The columns are selected by name instead of
# the former positional slice `iloc[:, 0:146]`, which silently picked the wrong
# columns whenever the number of retained features changed: everything except
# the engineered "*_mean_enc" and "woe*_encode" columns is kept.
engineered_suffixes = ("_mean_enc", "_encode")
original_numeric_cols = [
    col for col in X_train_num.columns if not col.endswith(engineered_suffixes)
]
X_train_num_ori = X_train_num[original_numeric_cols]
X_test_num_ori = X_test_num[original_numeric_cols]

In [None]:
# Original numeric variables + mean encodings. The columns are selected by
# name instead of the former positional slice `iloc[:, 0:154]`, which depended
# on the exact number of retained features: only the "woe*" columns are left out.
numeric_and_mean_cols = [
    col for col in X_train_num.columns if not col.startswith("woe")
]
X_train_num_mean = X_train_num[numeric_and_mean_cols]
X_test_num_mean = X_test_num[numeric_and_mean_cols]

In [None]:
# names of the WoE columns: the numeric columns whose name starts with "w"
woe_vars = [name for name in X_train_num.columns if name[:1] == "w"]

# original numeric variables followed by the WoE encodings
train_parts = [X_train_num_ori, X_train_num[woe_vars]]
test_parts = [X_test_num_ori, X_test_num[woe_vars]]
X_train_num_woe = pd.concat(train_parts, axis="columns")
X_test_num_woe = pd.concat(test_parts, axis="columns")

In [None]:
# categorical part of the feature frames
categorical_cols = categorical_features.columns
X_train_cat = X_train_ori[categorical_cols]
X_test_cat = X_test_ori[categorical_cols]

In [None]:
# Label Encoding: lab_enc returns a (train, test) pair for every categorical column
label_pairs = {
    name: lab_enc(X_train_cat, X_test_cat, name) for name in X_train_cat.columns
}

X_train_le = pd.DataFrame({name: pair[0] for name, pair in label_pairs.items()})
X_test_le = pd.DataFrame({name: pair[1] for name, pair in label_pairs.items()})

In [None]:
# Frequency Encoding: freq_enc returns a (train, test) pair for every categorical column
freq_pairs = {
    name: freq_enc(X_train_cat, X_test_cat, name) for name in X_train_cat.columns
}

X_test_freq = pd.DataFrame({name: pair[1] for name, pair in freq_pairs.items()})
X_train_freq = pd.DataFrame({name: pair[0] for name, pair in freq_pairs.items()})

In [None]:
# original numeric variables followed by the label-encoded categoricals
train_le_parts = [X_train_num_ori, X_train_le]
test_le_parts = [X_test_num_ori, X_test_le]
X_train_num_le = pd.concat(train_le_parts, axis="columns")
X_test_num_le = pd.concat(test_le_parts, axis="columns")

In [None]:
# original numeric variables followed by the frequency-encoded categoricals
train_freq_parts = [X_train_num_ori, X_train_freq]
test_freq_parts = [X_test_num_ori, X_test_freq]
X_train_num_freq = pd.concat(train_freq_parts, axis="columns")
X_test_num_freq = pd.concat(test_freq_parts, axis="columns")

In [None]:
# Reporting util for different optimizers
def report_perf(optimizer, X, y, title="model", callbacks=None):
    """Fit a search optimizer, print a timing/score summary, return the best parameters.

    Parameters
    ----------
    optimizer : a sklearn or a skopt optimizer
        Must expose ``fit``, ``cv_results_``, ``best_score_``, ``best_index_``
        and ``best_params_``.
    X : the training set
    y : our target
    title : str
        A label for the experiment, printed in the summary line.
    callbacks : optional
        When given, it is forwarded to ``optimizer.fit`` as ``callback=``.

    Returns
    -------
    The optimizer's ``best_params_``.
    """
    # Imported locally so the helper does not depend on another notebook cell
    # having imported these names first.
    import pprint
    from time import time

    start = time()

    if callbacks is not None:
        optimizer.fit(X, y, callback=callbacks)
    else:
        optimizer.fit(X, y)

    results = pd.DataFrame(optimizer.cv_results_)
    best_score = optimizer.best_score_
    # standard deviation of the test score of the best candidate
    best_score_std = results.iloc[optimizer.best_index_].std_test_score
    best_params = optimizer.best_params_

    elapsed = time() - start
    n_candidates = len(optimizer.cv_results_['params'])
    # f-string instead of "%" formatting: the title is inserted verbatim, so a
    # "%" inside it can no longer be misread as a format directive.
    print(f"{title} took {elapsed:.2f} seconds,  candidates checked: {n_candidates:d}, "
          f"best CV score: {best_score:.3f} \u00B1 {best_score_std:.3f}")
    print('Best parameters:')
    pprint.pprint(best_params)
    print()
    return best_params

In [None]:
# Metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer

# Wrap the ROC AUC score as a scorer suitable for model selection
# (greater_is_better=True: higher values are better).
# NOTE(review): newer scikit-learn releases appear to replace `needs_threshold`
# with `response_method` — confirm against the installed version.
roc_auc = make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)

In [None]:
from sklearn.model_selection import StratifiedKFold
# Setting a 5-fold stratified cross-validation; the folds are shuffled with a
# fixed random_state so that they are the same on every run
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
import lightgbm as lgb

# Fixed settings of the LightGBM classifier handed to the hyper-parameter search
base_lgb_settings = dict(
    boosting_type='gbdt',
    metric='auc',
    objective='binary',
    n_jobs=1,
    verbose=-1,
    random_state=0,
)
clf = lgb.LGBMClassifier(**base_lgb_settings)

In [None]:
from skopt.space import Real, Categorical, Integer

# Search space of the Bayesian optimisation, one dimension per hyper-parameter
grid_search = dict(
    # Maximum tree leaves for base learners
    num_leaves=Integer(2, 256),
    # Minimal number of data in one leaf
    min_child_samples=Integer(5, 100),
    # L2 regularization
    reg_lambda=Real(1e-8, 10.0, 'log-uniform'),
    # L1 regularization
    reg_alpha=Real(1e-8, 10.0, 'log-uniform'),
    # Weighting of the minority class (Only for binary classification)
    scale_pos_weight=Real(1.0, 500.0, 'uniform'),
    # presumably the fraction of features / rows sampled per tree and how
    # often (in iterations) the rows are resampled — confirm in the LightGBM docs
    feature_fraction=Real(0.4, 1.0, 'uniform'),
    bagging_fraction=Real(0.4, 1.0, 'uniform'),
    bagging_freq=Integer(1, 7),
)

In [None]:
from skopt import BayesSearchCV

# Settings of the Bayesian hyper-parameter search
bayes_search_settings = {
    'estimator': clf,
    'search_spaces': grid_search,
    'scoring': roc_auc,
    'cv': skf,
    'n_iter': 3000,                                     # max number of trials
    'n_points': 3,                                      # number of hyperparameter sets evaluated at the same time
    'n_jobs': -1,                                       # number of jobs
    'iid': False,                                       # if not iid it optimizes on the cv score
    'return_train_score': False,
    'refit': False,
    'optimizer_kwargs': {'base_estimator': 'GP'},       # optimizer parameters: we use Gaussian Process (GP)
    'random_state': 0,                                  # random state for replicability
}

opt = BayesSearchCV(**bayes_search_settings)

In [None]:
from skopt.callbacks import DeadlineStopper, DeltaYStopper
from time import time
import pprint
import joblib

# MODEL 1 (X_train_num_ori, X_test_num_ori)

search_stoppers = [
    DeltaYStopper(delta=0.0001),            # stop when the gain of the optimization becomes too small
    DeadlineStopper(total_time=60 * 40),    # time limit of 40 minutes
]
overdone_control, time_limit_control = search_stoppers

best_params1 = report_perf(
    opt, X_train_num_ori, y, title='LightGBM', callbacks=search_stoppers
)

In [None]:
# MODEL 2 (X_train_num_mean, X_test_num_mean)

search_stoppers = [
    DeltaYStopper(delta=0.0001),            # stop when the gain of the optimization becomes too small
    DeadlineStopper(total_time=60 * 40),    # time limit of 40 minutes
]
overdone_control, time_limit_control = search_stoppers

best_params2 = report_perf(
    opt, X_train_num_mean, y, title='LightGBM', callbacks=search_stoppers
)

In [None]:
# MODEL 3 (X_train_num_woe, X_test_num_woe)

search_stoppers = [
    DeltaYStopper(delta=0.0001),            # stop when the gain of the optimization becomes too small
    DeadlineStopper(total_time=60 * 40),    # time limit of 40 minutes
]
overdone_control, time_limit_control = search_stoppers

best_params3 = report_perf(
    opt, X_train_num_woe, y, title='LightGBM', callbacks=search_stoppers
)

In [None]:
# MODEL 4 (X_train_num_le, X_test_num_le)

search_stoppers = [
    DeltaYStopper(delta=0.0001),            # stop when the gain of the optimization becomes too small
    DeadlineStopper(total_time=60 * 40),    # time limit of 40 minutes
]
overdone_control, time_limit_control = search_stoppers

best_params4 = report_perf(
    opt, X_train_num_le, y, title='LightGBM', callbacks=search_stoppers
)

In [None]:
# MODEL 5 (X_train_num_freq, X_test_num_freq)

search_stoppers = [
    DeltaYStopper(delta=0.0001),            # stop when the gain of the optimization becomes too small
    DeadlineStopper(total_time=60 * 40),    # time limit of 40 minutes
]
overdone_control, time_limit_control = search_stoppers

best_params5 = report_perf(
    opt, X_train_num_freq, y, title='LightGBM', callbacks=search_stoppers
)

In [None]:
# One final classifier per feature set: the fixed settings are shared, the
# tuned hyper-parameters come from the matching Bayesian search.
fixed_lgb_settings = dict(
    boosting_type='gbdt',
    metric='auc',
    objective='binary',
    n_jobs=1,
    verbose=-1,
    random_state=0,
)

clf1 = lgb.LGBMClassifier(**fixed_lgb_settings, **best_params1)
clf2 = lgb.LGBMClassifier(**fixed_lgb_settings, **best_params2)
clf3 = lgb.LGBMClassifier(**fixed_lgb_settings, **best_params3)
clf4 = lgb.LGBMClassifier(**fixed_lgb_settings, **best_params4)
clf5 = lgb.LGBMClassifier(**fixed_lgb_settings, **best_params5)

In [None]:
# train every classifier on its own feature set, in order
training_plan = (
    (clf1, X_train_num_ori),
    (clf2, X_train_num_mean),
    (clf3, X_train_num_woe),
    (clf4, X_train_num_le),
    (clf5, X_train_num_freq),
)

for model, features in training_plan:
    model.fit(features, y)

In [None]:
def _positive_class_proba(model, features):
    """Return the flattened second column of the model's predict_proba output."""
    probabilities = model.predict_proba(features)
    return probabilities[:, 1].ravel()


predictions1 = _positive_class_proba(clf1, X_test_num_ori)
predictions2 = _positive_class_proba(clf2, X_test_num_mean)
predictions3 = _positive_class_proba(clf3, X_test_num_woe)
predictions4 = _positive_class_proba(clf4, X_test_num_le)
predictions5 = _positive_class_proba(clf5, X_test_num_freq)

In [None]:
def _write_submission(prediction, filename):
    """Pair the test customer ids with `prediction`, save the CSV and return the frame."""
    frame = pd.DataFrame({'customer_ID':ids, 'prediction': prediction})
    frame.to_csv(filename, index = False)
    return frame


# one submission per model
submission1 = _write_submission(predictions1, "submission_num_ori.csv")
submission2 = _write_submission(predictions2, "submission_num_mean.csv")
submission3 = _write_submission(predictions3, "submission_num_woe.csv")
submission4 = _write_submission(predictions4, "submission_num_le.csv")
submission5 = _write_submission(predictions5, "submission_num_freq.csv")

# predictions average

submission1_2 = _write_submission(
    (predictions1 + predictions2) / 2, "submission1_2.csv")
submission1_2_3 = _write_submission(
    (predictions1 + predictions2 + predictions3) / 3, "submission1_2_3.csv")
submission1_2_3_4 = _write_submission(
    (predictions1 + predictions2 + predictions3 + predictions4) / 4, "submission1_2_3_4.csv")
submission1_2_3_4_5 = _write_submission(
    (predictions1 + predictions2 + predictions3 + predictions4 + predictions5) / 5,
    "submission1_2_3_4_5.csv")

In [None]:
import glob
from scipy.stats import rankdata

# Collect every prediction file provided as a notebook input, skipping the
# competition's own CSV files. The list is built once: the former second,
# identical glob and the bare `paths` expression had no effect.
# NOTE(review): glob does not guarantee the order of the returned paths, while
# the blending weights are applied by position — confirm the pairing (e.g. by
# printing `paths`) before trusting the blend.
paths = [x for x in glob.glob('../input/*/*.csv') if 'amex-default-prediction' not in x]
dfs = [pd.read_csv(x).sort_values(by='customer_ID') for x in paths]

# keep every prediction inside the [0, 1] range
for df in dfs:
    df['prediction'] = np.clip(df['prediction'], 0, 1)

In [None]:
# Weighted average of the collected prediction files; weights are positional
# and paired with `dfs` in order.
weights = [0.52, 0.87, 0.95, 0.57, 1, 0.8]

if not dfs:
    raise ValueError("no prediction files found to blend")

submit1 = pd.read_csv('../input/amex-default-prediction/sample_submission.csv')
submit1['prediction'] = 0.0

used_weight = 0.0
for df, weight in zip(dfs, weights):
    # Align on customer_ID rather than on the row index: the frames in `dfs`
    # were sorted without resetting their index, so a plain Series addition
    # would pair rows by their original file position.
    prediction_by_customer = df.set_index('customer_ID')['prediction']
    submit1['prediction'] += submit1['customer_ID'].map(prediction_by_customer) * weight
    used_weight += weight

# Normalise by the weights that were actually applied: zip() stops at the
# shorter of `dfs` / `weights`, so summing all weights could skew the average.
submit1['prediction'] /= used_weight

# was `submit.to_csv(...)`: `submit` is never defined (NameError)
submit1.to_csv('mean_submission.csv', index=None)