In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
application = pd.read_csv('../input/credit-card-approval-prediction/application_record.csv')
credit = pd.read_csv('../input/credit-card-approval-prediction/credit_record.csv')

# only model in the intersection cases between 2 dataset
ids = set(application['ID']).intersection(set(credit['ID']))
application = application[application['ID'].isin(ids)]
credit = credit[credit['ID'].isin(ids)]

In [3]:
application.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36457 entries, 0 to 434812
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   36457 non-null  int64  
 1   CODE_GENDER          36457 non-null  object 
 2   FLAG_OWN_CAR         36457 non-null  object 
 3   FLAG_OWN_REALTY      36457 non-null  object 
 4   CNT_CHILDREN         36457 non-null  int64  
 5   AMT_INCOME_TOTAL     36457 non-null  float64
 6   NAME_INCOME_TYPE     36457 non-null  object 
 7   NAME_EDUCATION_TYPE  36457 non-null  object 
 8   NAME_FAMILY_STATUS   36457 non-null  object 
 9   NAME_HOUSING_TYPE    36457 non-null  object 
 10  DAYS_BIRTH           36457 non-null  int64  
 11  DAYS_EMPLOYED        36457 non-null  int64  
 12  FLAG_MOBIL           36457 non-null  int64  
 13  FLAG_WORK_PHONE      36457 non-null  int64  
 14  FLAG_PHONE           36457 non-null  int64  
 15  FLAG_EMAIL           36457 non-null

# **WOE**

I'm going to use WOE transformation with Logistic Regression model to see if it can help improve the result. WOE transformation helps secure monotonic relationship between independent variable and dependent variable.

https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html

In [4]:
def calc_woe(feature, target):
    lst = []
    feature = feature.fillna("NULL")
    
    if len(feature.unique()) > 10:
        raise ValueError('This method currently only supports categorical features with cardinality less than 10')
    
    for group in list(feature.unique()):
        good = len(feature[(feature == group) & (target == 0)])
        bad = len(feature[(feature == group) & (target == 1)])
        lst.append([group, good, bad])

    data = pd.DataFrame(lst, columns=['Group', 'Good', 'Bad'])
    data['Distribution Good'] = data['Good'] / data['Good'].sum()
    data['Distribution Bad'] = data['Bad'] / data['Bad'].sum()
    data['WoE'] = np.log(data['Distribution Good'] / data['Distribution Bad'])
    
    data = data.replace({'WoE': {np.inf: 0, -np.inf: 0}})
    
    return data

In [5]:
def woe_transform(feature_train, feature_test, target_train):
    feature_train = feature_train.copy()
    feature_test = feature_test.copy()
    target_train = target_train.copy()
    
    result = calc_woe(feature_train, target_train)
    
    for i in range(len(result)):
        feature_train[feature_train == result.iloc[i]['Group']] = result.iloc[i]['WoE']
        feature_test[feature_test == result.iloc[i]['Group']] = result.iloc[i]['WoE']
        
    return feature_train, feature_test

# **Feature Engineering**

OCCUPATION TYPE

Because jobs in OCCUPATION_TYPE are generic, they will be less prone to overfit. Besides,dropping any value may cause a noticable loss of information.

In [6]:
def impute_occupation_type(application):
    
    x = application.copy()
    
    probability = x['OCCUPATION_TYPE'].value_counts().to_numpy()/x['OCCUPATION_TYPE'].value_counts().sum()
    job_list = x['OCCUPATION_TYPE'].value_counts().index.to_numpy()
    indexes = range(len(x['OCCUPATION_TYPE'].value_counts()))
    null_size = len(x[x['OCCUPATION_TYPE'].isnull()]['OCCUPATION_TYPE'])
    
    random_index = np.random.choice(a=indexes, size=null_size, p=probability)
    
    x.loc[:,'IMPUTED_OCCUPATION_TYPE'] = 0
    x.loc[x['OCCUPATION_TYPE'].isnull(),'IMPUTED_OCCUPATION_TYPE'] = 1
    x.loc[x['OCCUPATION_TYPE'].isnull(),'OCCUPATION_TYPE'] = job_list[random_index]

    return x

DAYS_EMPLOYED

In [7]:
def create_unemployed_column(application):
    x = application.copy()
    
    x.loc[x['DAYS_EMPLOYED']<=0,'UNEMPLOYED'] = 0 
    x.loc[x['DAYS_EMPLOYED']>0,'UNEMPLOYED'] = 1 
    
    return x

Remove outliers

In [8]:
continuous_columns = ['CNT_CHILDREN','CNT_FAM_MEMBERS','DAYS_BIRTH','DAYS_EMPLOYED','CNT_FAM_MEMBERS']
def remove_outliers(data, column):
    if column == 'DAYS_EMPLOYED':
        Q1 = data[data[column]<=0][column].quantile(0.25)
        Q3 = data[data[column]<=0][column].quantile(0.75)        
        IQR = Q3-Q1
        return data[((Q1-1.5*IQR <= data[column]) & (data[column] <= Q3+1.5*IQR)) | data[column]>0]
    else:
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)
        IQR = Q3-Q1
        return data[(Q1-1.5*IQR <= data[column]) & (data[column] <= Q3+1.5*IQR)]

Transform Skewed Data

In [9]:
before = application['CNT_FAM_MEMBERS'].skew()
after = np.log(application['CNT_FAM_MEMBERS']).skew()
print('Skewness coefficient')
print('CNT_FAM_MEMBERS ------')
print(f'Before: {before}')
print(f'After:  {after}')

before = application['CNT_CHILDREN'].skew()
after = np.power(application['CNT_CHILDREN'],1/7).skew()
print('CNT_CHILDREN ------')
print(f'Before: {before}')
print(f'After:  {after}')

before = application['AMT_INCOME_TOTAL'].skew()
after = np.log(application['AMT_INCOME_TOTAL']).skew()
print('AMT_INCOME_TOTAL ------')
print(f'Before: {before}')
print(f'After:  {after}')

# Only transform the ones < 0 (customers currently being employed)
before = application.loc[application['DAYS_EMPLOYED']<0,'DAYS_EMPLOYED'].skew()
after = (-1*np.sqrt(-1*application.loc[application['DAYS_EMPLOYED']<0,'DAYS_EMPLOYED'])).skew()
print('DAYS_EMPLOYED ------')
print(f'Before: {before}')
print(f'After:  {after}')

Skewness coefficient
CNT_FAM_MEMBERS ------
Before: 1.2985959074733653
After:  -0.2253083071969985
CNT_CHILDREN ------
Before: 2.5693822021105657
After:  0.842558906737619
AMT_INCOME_TOTAL ------
Before: 2.739009876253129
After:  0.09531218001198272
DAYS_EMPLOYED ------
Before: -1.731240402655581
After:  -0.6094234619014184


In [10]:
def transform_skewed_data(application):
    x = application.copy()
    
    x.loc[:,'CNT_FAM_MEMBERS'] = np.log(x['CNT_FAM_MEMBERS'])
    x.loc[:,'CNT_CHILDREN'] = np.power(x['CNT_CHILDREN'],1/7)
    x.loc[:,'AMT_INCOME_TOTAL'] = np.log(x['AMT_INCOME_TOTAL'])
    x.loc[application['DAYS_EMPLOYED']<0,'DAYS_EMPLOYED']  = -1*np.sqrt(-1*x.loc[application['DAYS_EMPLOYED']<0,'DAYS_EMPLOYED'])
    
    return x

Encode dataset

In [11]:
from sklearn.preprocessing import OneHotEncoder
categorical_columns = ['CODE_GENDER','FLAG_OWN_CAR', 'FLAG_OWN_REALTY','NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS',
                       'NAME_HOUSING_TYPE','OCCUPATION_TYPE']

def encode(features, encode_cols):
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    encoder.fit(application[encode_cols].dropna())
    
    x = features.copy().reset_index()
    x = x.join(pd.DataFrame(encoder.transform(x[encode_cols])))
    x = x.drop(categorical_columns,axis=1)
    return x

# **Labels**

From https://www.kaggle.com/rikdifos/eda-vintage-analysis, we choose to discard all records that last for less than 20 months to reduce noises (too short obeservation window won't be able to show the behaviour of the customers).Any customers that default for 60 days or more are labeled as bad customers. Otherwise, they are labeled as good.

In [12]:
def get_credit_status(credit):
    group=credit.groupby('ID')
    pivot_tb = credit.pivot(index = 'ID', columns = 'MONTHS_BALANCE', values = 'STATUS')
    pivot_tb['open_month'] = group['MONTHS_BALANCE'].min()
    pivot_tb['end_month'] = group['MONTHS_BALANCE'].max() 
    pivot_tb['ID'] = pivot_tb.index
    pivot_tb = pivot_tb[['ID', 'open_month', 'end_month']]
    pivot_tb['window'] = pivot_tb['end_month'] - pivot_tb['open_month'] 
    pivot_tb.reset_index(drop = True, inplace = True)
    credit0 = credit.copy()
    credit0 = pd.merge(credit0, pivot_tb, on = 'ID', how = 'left') 
    credit0=credit0[credit0['window']>=20]
    credit0['status']=np.where((credit0['STATUS']=='2')| (credit0['STATUS']=='3')|(credit0['STATUS']=='4')|(credit0['STATUS']=='5'),1,0)
    
    return credit0

# **Processing Data**

In [13]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

def oversampling(X, y, sampling_strategy=0.5):

    sm = SMOTE(sampling_strategy=sampling_strategy)
    X, y = sm.fit_resample(X,y)
    return X, y

def downsampling(X, y, sampling_strategy=0.5):
    us = RandomUnderSampler(sampling_strategy=sampling_strategy)
    X, y = us.fit_resample(X,y)
    return X,y

In [14]:
def process_datasets(x_train, x_test, y_train, y_test, oversampling_strategy=0.5, down_sampling_strategy=0.5, transform_skewed = True, woe_cols = [], encode_cols = categorical_columns):
    """
    This function is a wrapper function for all of the preproccessing steps
    
    - x_train: unprocessed train application dataset
    _ x_test: unprocessed test application dataset
    - y_train: unprocessed train credit dataset
    - y_test : unprocessed test credit dataset
    - oversampling_factor:  oversample the positive cases by this factor (because the current label method only has 3% as positive)
    - down_sampling_amount: remove this amount of negative cases to balance positive/negative cases
    _ transform_skewed: if True, transform any skewed continuous data in the application datasset
    """
    
    # encode x
    x_train = impute_occupation_type(x_train)
    x_train = create_unemployed_column(x_train)
    if transform_skewed:
        x_train = transform_skewed_data(x_train)
    
    x_test = impute_occupation_type(x_test)
    x_test = create_unemployed_column(x_test)
    if transform_skewed:
        x_test = transform_skewed_data(x_test)
    
    # encode y
    y_train = get_credit_status(y_train)[['ID','status']]
    y_test = get_credit_status(y_test)[['ID','status']]

    y_train = y_train.groupby('ID').any().reset_index()
    y_test = y_test.groupby('ID').any().reset_index()
    
    # encode x
    encode_cols = encode_cols.copy()
    for col in woe_cols:
        x_train[col], x_test[col] = woe_transform(x_train[col], x_test[col], y_train['status']) 
        if col in encode_cols:
            encode_cols.remove(col)

    x_train = encode(x_train, encode_cols)
    x_test = encode(x_test, encode_cols)
    
    # Merge x and y together to make sure the ids matches

    merged_train = x_train.merge(y_train, on='ID')
    merged_test = x_test.merge(y_test, on='ID')
    
    # remove outliers
    for col in continuous_columns:
        merged_train = remove_outliers(merged_train, col)
    
    x_train = merged_train.drop(['ID', 'status'],axis=1)
    x_test = merged_test.drop(['ID', 'status'],axis=1)
    y_train = merged_train['status']
    y_test = merged_test['status']
    
    # oversampling/downsampling
    if oversampling_strategy:
        x_train, y_train = oversampling(x_train, y_train, sampling_strategy=oversampling_strategy)
    if down_sampling_strategy:
        x_train, y_train = downsampling(x_train, y_train, sampling_strategy=down_sampling_strategy)
    
    return x_train, x_test, y_train, y_test

# **Cross Validation**

In [15]:
from sklearn.utils import shuffle

# train/val/test
# 64/16/20
train_size = len(application)*64//100
val_size = len(application)*16//100
test_size = len(application)- train_size - val_size
fold_size = val_size
print(f'Train size: {train_size}, Validation size: {val_size}, Test size: {test_size}')

Train size: 23332, Validation size: 5833, Test size: 7292


Split cross-validation/test

In [16]:
application = shuffle(application)

cv_application = application[:train_size+val_size].copy()
test_application = application[train_size+val_size:].copy()

In [17]:
from sklearn.linear_model import LogisticRegression
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
    
def initiate_model(model_name):
    if model_name == 'LogisticRegression':
        return LogisticRegression(max_iter=400)
    elif model_name == 'KNN':
        return KNeighborsClassifier(n_neighbors=900)
    elif model_name == 'RandomForest':
        return RandomForestClassifier(n_jobs=-1, max_samples=0.8, max_features='sqrt') 
    elif model_name == 'NeuroNetwork':
        nn_model = keras.Sequential([layers.Dense(64, activation='relu'),
                                     layers.Dense(128, activation='relu'),
                                     layers.Dense(128, activation='relu'),
                                     layers.Dense(1, activation='sigmoid')])

        nn_model.compile(keras.optimizers.Adam(), keras.losses.BinaryCrossentropy(), metrics=[keras.metrics.BinaryAccuracy()])
        
        return nn_model
    else:
        raise ValueError('This model is not currently supported.')

In [18]:
from sklearn.metrics import fbeta_score, accuracy_score, precision_score, recall_score, precision_recall_curve, auc

f_beta = 0.5

def cross_validation(model_name, application, credit, epochs=10, oversampling_strategy=0.5, down_sampling_strategy=0.5, transform_skewed = True, woe_cols=[]):
    """
    This function performs cross validation and acts as a wrapper function for preproccessing, fitting, and evaluating steps
    
    - model: machine learning model
    - application: application dataset
    - credit: credit dataset
    - epochs: this only apply to the Neural Network, number of epochs to train
    - oversampling_strategy:  oversample the positive cases so that postive/negative = oversampling_strategy (because the current label method only has 3% as positive)
    - down_sampling_strategy: remove negative cases to so that (negative-postive)/positive = down_sampling_strategy
    _ transform_skewed: if True, transform any skewed continuous data in the application datasset
    - threshold: this only apply to the Neural Network, threshold for the decision boundary
    """
    application = shuffle(application)
    total_acc = 0
    total_f05 = 0
    total_precision = 0
    total_recall = 0
    models = list()
    thresholds_list = list()
    
    for i in range(5):
        model = initiate_model(model_name)
        
        x_train = application[:fold_size*i+1].append(application[fold_size*(i+1)-1:]).copy()
        x_test = application[fold_size*i:fold_size*(i+1)].copy()

        y_train = credit[credit['ID'].isin(x_train['ID'])].copy()
        y_test = credit[credit['ID'].isin(x_test['ID'])].copy()
    
    
        x_train, x_test, y_train, y_test = process_datasets(x_train, x_test, y_train, y_test, oversampling_strategy=oversampling_strategy, 
                                                            down_sampling_strategy=down_sampling_strategy, transform_skewed=transform_skewed, woe_cols=woe_cols)
        
        if str(type(model)) == "<class 'tensorflow.python.keras.engine.sequential.Sequential'>":
            model.fit(x_train, y_train, epochs=epochs, verbose=0)
        else:
            model = model.fit(x_train, y_train)
        
        predictions = model.predict(x_test)
        if str(type(model)) == "<class 'tensorflow.python.keras.engine.sequential.Sequential'>" or str(type(model)) == "<class 'sklearn.linear_model._logistic.LogisticRegression'>":
            if str(type(model)) == "<class 'tensorflow.python.keras.engine.sequential.Sequential'>":
                probs = predictions
            else:
                probs = model.predict_proba(x_test)[:,1]
            
            pre, rec, thresholds = precision_recall_curve(y_test, probs)
            f = (1+np.power(f_beta,2)) * pre * rec / (np.power(f_beta,2)*pre + rec)
            threshold = thresholds[np.argmax(f)]
            thresholds_list.append(threshold)
            predictions = probs >= threshold
            print('Threshold:{} , F05: {}'.format(threshold, f[np.argmax(f)]))
        
        models.append(model)
        total_acc = total_acc + accuracy_score(y_test, predictions)
        total_f05 = total_f05 + fbeta_score(y_test, predictions, beta=0.5)
        total_precision = total_precision + precision_score(y_test, predictions)
        total_recall = total_recall + recall_score(y_test,predictions)
    
    if thresholds_list:
        return total_acc/5, total_f05/5, total_precision/5, total_recall/5, models, thresholds_list
    else:
        return total_acc/5, total_f05/5, total_precision/5, total_recall/5, models

# **Models Selection**

> **Logistic Regression**

In [19]:
woe_columns = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE','NAME_EDUCATION_TYPE',
               'NAME_FAMILY_STATUS','NAME_HOUSING_TYPE','FLAG_MOBIL','FLAG_WORK_PHONE','FLAG_PHONE','FLAG_EMAIL']

acc, f05, precision, recall, models, lr_thresholds = cross_validation('LogisticRegression', cv_application, credit, oversampling_strategy=0.5, 
                                                       down_sampling_strategy=0.5, transform_skewed = True, woe_cols=woe_columns)

Threshold:0.997303909062253 , F05: 0.7584269662921349
Threshold:0.9959900980193352 , F05: 0.8498023715415021
Threshold:0.9934921400052862 , F05: 0.8192090395480226
Threshold:0.9924691970411856 , F05: 0.8366533864541833
Threshold:0.9920339208202269 , F05: 0.7878787878787877


In [20]:
print(f'Acc: {acc}')
print(f'f05: {f05}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')

Acc: 0.9852280612812592
f05: 0.8103941103429262
Precision: 1.0
Recall: 0.46484838477859747


> **KNN**

In [21]:
acc, f05, precision, recall, models = cross_validation('KNN', cv_application, credit, oversampling_strategy=0.5, down_sampling_strategy=0.5, transform_skewed = True)

In [22]:
print(f'Acc: {acc}')
print(f'f05: {f05}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')

Acc: 0.9852110150013754
f05: 0.8145593468805208
Precision: 1.0
Recall: 0.4699016450629882


> **Random Forest**

In [23]:
acc, f05, precision, recall, models = cross_validation('RandomForest', cv_application, credit, oversampling_strategy=0.5, down_sampling_strategy=0.5, transform_skewed = True)

In [24]:
print(f'Acc: {acc}')
print(f'f05: {f05}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')

Acc: 0.9819989007134351
f05: 0.6955951747502545
Precision: 0.7610706504824151
Recall: 0.5210997991861236


> **Neuronetwork**


In [25]:
acc, f05, precision, recall, models, nn_thresholds = cross_validation('NeuroNetwork', cv_application, credit, epochs=10, oversampling_strategy=0.5, 
                                                        down_sampling_strategy=0.5, transform_skewed = True)

Threshold:1.0 , F05: 0.8847736625514403
Threshold:1.0 , F05: 0.7803468208092486
Threshold:1.0 , F05: 0.7267441860465117
Threshold:1.0 , F05: 0.8547008547008547
Threshold:1.0 , F05: 0.7920792079207921


In [26]:
print(f'Acc: {acc}')
print(f'f05: {f05}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')

Acc: 0.9852001119462603
f05: 0.8077289464057694
Precision: 1.0
Recall: 0.46824272267934236


# **Testing**

This is the best model chosen from model selection

In [27]:
application = shuffle(application)

x_train = cv_application.copy()
x_test = test_application.copy()

y_train = credit[credit['ID'].isin(x_train['ID'])].copy()
y_test = credit[credit['ID'].isin(x_test['ID'])].copy()

woe_columns = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE','NAME_EDUCATION_TYPE',
               'NAME_FAMILY_STATUS','NAME_HOUSING_TYPE','FLAG_MOBIL','FLAG_WORK_PHONE','FLAG_PHONE','FLAG_EMAIL']

x_train, x_test, y_train, y_test = process_datasets(x_train, x_test, y_train, y_test, oversampling_strategy=0.5, 
                                                    down_sampling_strategy=0.5, transform_skewed = True, woe_cols=woe_columns)

In [28]:
model = initiate_model('LogisticRegression')
model.fit(x_train, y_train)

probs = model.predict_proba(x_test)[:,1]

# if using Logistic Regression or NN, use the average threshold from CV
threshold = sum(lr_thresholds)/len(lr_thresholds)
predictions = probs >= threshold

print(f'Acc: {accuracy_score(y_test, predictions)}')
print(f'f05: {fbeta_score(y_test, predictions, beta=0.5)}')
print(f'Precision: {precision_score(y_test, predictions)}')
print(f'Recall: {recall_score(y_test,predictions)}')

Acc: 0.9833333333333333
f05: 0.8058608058608059
Precision: 1.0
Recall: 0.4536082474226804


# **Conclusion**

* In this notebook, we performed different feature engineering, transformations, over/downsampling methods to train a model to classify good/bad credits with the provided unbalanced dataset.
* For mode selection, we use f05 score as the main criteria to deal with this unbalanced data.
* From the 4 models we tried to train above, Logistic Regression, KNN, and RandomForest produce similar results for cross validation. Random Forest has lower f05 score than others. For potential future needs for inferencing, we select the fitted Logistic Regression as the final model. Then, we run the Logistic Regression with the test set and it produces a similar result to its CV f05 score, which is about 0.8.
* WOE transformation didn't show much effect on the Logistic Regression, probably because we only performed transformation on categorical variable with cardinality lower than 20, left out the ones with high cardinality and continuous variable.

# **Acknowledgements**
*  https://www.kaggle.com/rikdifos/eda-vintage-analysis
*  https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html