In [None]:
import re
import datetime as dt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

import os
# Lift pandas' display limits: passing None removes the cap on the number of
# columns, the number of rows and the width of each cell.
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)


## 0.Load Data

In [None]:
# Every raw table is stored in this dict under its dataset name.
data = {}

# Application tables: loaded with the first CSV column as the DataFrame index.
# NOTE(review): that column is presumably SK_ID_CURR - confirm against the CSV.
file1 = {
    'application_test': 'application_test.csv',
    'application_train': 'application_train.csv',
}

# Auxiliary tables: loaded with read_csv's default index handling.
file2 = {
    'bureau_balance': 'bureau_balance.csv',
    'bureau': 'bureau.csv',
    'credit_card_balance': 'credit_card_balance.csv',
    'installments_payments': 'installments_payments.csv',
    'POS_CASH_balance': 'POS_CASH_balance.csv',
    'previous_application': 'previous_application.csv',
}

# Both groups are read the same way; only the read_csv options differ.
for file_group, read_options in ((file1, {'index_col': 0}), (file2, {})):
    for key, name in file_group.items():
        data[key] = pd.read_csv(f'../dataset/{name}', **read_options)
        print(f'Dataset: {key} - Shape: {data[key].shape}')


## 1.Data Preparation

Data cleaning + create new features + group values of each feature + encoding

In [None]:
def one_hot_encoding(df, col_list):
    """Return a copy of ``df`` with ``col_list`` replaced by one-hot columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode. It is not modified.
    col_list : list of str
        Names of the columns to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``df`` followed by the encoded columns, named
        by ``OneHotEncoder.get_feature_names_out``.
    """
    encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded = encoder.fit_transform(df[col_list])
    one_hot_df = pd.DataFrame(
        one_hot_encoded,
        columns=encoder.get_feature_names_out(col_list),
        # Reuse the caller's index: concat(axis=1) aligns on index labels, so a
        # fresh RangeIndex would misalign rows (and add all-NaN rows) whenever
        # df is not indexed 0..n-1.
        index=df.index,
    )
    return pd.concat([df.drop(columns=col_list), one_hot_df], axis=1)

In [None]:
def label_encoding(df, col_list):
    """Overwrite each column in ``col_list`` with integer label codes.

    A new ``LabelEncoder`` is fitted for every column and discarded afterwards.
    The frame is modified in place and the same object is returned.
    """
    for column in col_list:
        df[column] = preprocessing.LabelEncoder().fit_transform(df[column])
    return df

In [None]:
def handling_outlier(df):
    """Placeholder for outlier handling; currently returns ``df`` unchanged."""
    # TODO: no outlier treatment is implemented yet.
    return df

In [None]:
def checking_missing_values(df):
    """Placeholder for missing-value handling; currently returns ``df`` unchanged."""
    # TODO: no missing-value treatment is implemented yet.
    return df

def percentage_null(df):
    """Return the percentage of missing values for every column that has any.

    The result is a Series indexed by column name, sorted from the most to the
    least incomplete column. Columns without missing values are left out.
    """
    null_counts = df.isnull().sum()
    null_percent = null_counts / df.shape[0] * 100
    has_missing = null_percent > 0
    return null_percent[has_missing].sort_values(ascending=False)

# Print the missing-value percentages of every loaded table.
for key, frame in data.items():
    print(f'Percentage of missing values in {key}:')
    print(percentage_null(frame),'\n')

### a.application_train + application_test

In [None]:
def application_processing(df):
    """Clean and feature-engineer an application_train / application_test frame.

    The work is done on a copy, so the frame passed in is left untouched, and
    the processed frame is returned.

    Steps:
      * blank out the value 365243 in DAYS_EMPLOYED and regroup a few categories
        (CODE_GENDER, NAME_FAMILY_STATUS, NAME_TYPE_SUITE, ORGANIZATION_TYPE);
      * encode NAME_EDUCATION_TYPE as an ordinal 1-5;
      * add ratio features built from DAYS_* and AMT_* columns;
      * collapse the phone / e-mail flags into FLAG_CONTACT_PHONE and the
        FLAG_DOCUMENT_* flags into FLAG_DOCUMENT_36, dropping the source flags.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw application table containing the columns referenced below.

    Returns
    -------
    pandas.DataFrame
        The processed copy.

    Raises
    ------
    KeyError
        If NAME_EDUCATION_TYPE holds a non-null value that is not in the
        ordinal mapping (missing values are passed through as NaN).
    """
    # Work on a copy so re-running the cell never sees half-processed input.
    df = df.copy()

    #CLEANING
    # mask() replaces the matching rows with NaN and upcasts an integer column
    # to float itself, instead of relying on setitem upcasting.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].mask(df['DAYS_EMPLOYED'] == 365243)
    df['CODE_GENDER'] = df['CODE_GENDER'].replace({'XNA': 'F'})

    df['NAME_FAMILY_STATUS'] = df['NAME_FAMILY_STATUS'].replace({
        'Unknown': 'Married',
        'Civil marriage': 'Married',
        'Single / not married': 'Unmarried',
        'Separated': 'Unmarried',
        'Widow': 'Unmarried',
    })

    df['NAME_TYPE_SUITE'] = df['NAME_TYPE_SUITE'].replace({'Other_A': 'Other', 'Other_B': 'Other'})
    # Keep only the first word of the organisation type, e.g. 'Trade: type 7' -> 'Trade'.
    df['ORGANIZATION_TYPE'] = df['ORGANIZATION_TYPE'].str.replace(':', '', regex=False).str.split(' ').str[0]

    mapping_education_type = {
        'Lower secondary': 1,
        'Secondary / secondary special': 2,
        'Incomplete higher': 3,
        'Higher education': 4,
        'Academic degree': 5
    }
    education = df['NAME_EDUCATION_TYPE']
    # Fail loudly on unexpected labels rather than silently mapping them to NaN.
    unmapped = set(education.dropna().unique()) - set(mapping_education_type)
    if unmapped:
        raise KeyError(f'Unmapped NAME_EDUCATION_TYPE values: {sorted(map(str, unmapped))}')
    df['NAME_EDUCATION_TYPE'] = education.map(mapping_education_type)

    #CREATE NEW COLUMNS
    df['DAYS_EMPLOYED_PERCENT'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_CREDIT_PERCENT'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['INCOME_PER_MEMBER'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['ANNUITY_INCOME_PERCENT'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']

    #FLAG_CONTACT(PHONE + EMAIL)
    df['FLAG_CONTACT_PHONE'] = df['FLAG_CONT_MOBILE'] * (df['FLAG_EMP_PHONE'] + df['FLAG_WORK_PHONE'])
    df = df.drop(['FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL'], axis=1)

    #FLAG_DOCUMENT
    # NOTE(review): FLAG_DOCUMENT_3 is multiplied by a sum that contains itself;
    # confirm this is the intended combination of documents 3 and 6.
    df['FLAG_DOCUMENT_36'] = df['FLAG_DOCUMENT_3'] * (df['FLAG_DOCUMENT_6'] + df['FLAG_DOCUMENT_3'])
    # Drop FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21 now that they are summarised.
    df = df.drop([f'FLAG_DOCUMENT_{number}' for number in range(2, 22)], axis=1)

    #HOUSE
    # TODO: housing features are not implemented yet.

    #RENAME COLUMN FOR MERGING
    # TODO: column renaming is not implemented yet.

    return df

### b.bureau + bureau_balance

In [None]:
def cleaning_bureau(data):
    """Filter and tidy the raw bureau table.

    The input frame is not modified; a cleaned copy is returned.

    Rows are dropped when DAYS_CREDIT_UPDATE, DAYS_ENDDATE_FACT or
    DAYS_CREDIT_ENDDATE lies more than 2922 days in the past, or when
    AMT_CREDIT_SUM is exactly 0. Missing DAYS_ENDDATE_FACT and
    DAYS_CREDIT_ENDDATE values are kept: a missing date is not an implausible
    one, and this function itself writes NaN into DAYS_CREDIT_ENDDATE.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw bureau table.

    Returns
    -------
    pandas.DataFrame
        Cleaned table without the AMT_ANNUITY and CREDIT_CURRENCY columns.
    """
    max_history_days = 2922   # about 8 years (2922 / 365.25)
    max_future_days = 50 * 365

    # Drop all the unbelievable values (too far in the past), keeping missing dates.
    update_ok = data['DAYS_CREDIT_UPDATE'] >= -max_history_days
    fact_ok = (data['DAYS_ENDDATE_FACT'] >= -max_history_days) | data['DAYS_ENDDATE_FACT'].isna()
    enddate_ok = (data['DAYS_CREDIT_ENDDATE'] >= -max_history_days) | data['DAYS_CREDIT_ENDDATE'].isna()
    # copy() so the assignments below never write into a slice of the caller's frame.
    data = data.loc[update_ok & fact_ok & enddate_ok].copy()

    # For loans due in more than 50 years, blank out the end date only.
    # The column selector matters: without it the whole row, including the
    # SK_ID_CURR / SK_ID_BUREAU keys, would be set to NaN.
    data.loc[data['DAYS_CREDIT_ENDDATE'] > max_future_days, 'DAYS_CREDIT_ENDDATE'] = np.nan

    # A credit that has an actual end date in the past cannot still be Active.
    ended_but_active = (data['DAYS_ENDDATE_FACT'] < 0) & (data['CREDIT_ACTIVE'] == 'Active')
    data.loc[ended_but_active, 'CREDIT_ACTIVE'] = 'Closed'

    # Drop column with high missing values and not useful
    data = data.drop(['AMT_ANNUITY', 'CREDIT_CURRENCY'], axis=1)

    # Drop entries with AMT_CREDIT_SUM = 0 (rows with a missing sum are kept).
    data = data.loc[data['AMT_CREDIT_SUM'] != 0]

    return data

In [None]:
def feature_engineering_bureau_balance(bureau_balance, recent_years=3, alpha=0.7):
    """Aggregate the monthly bureau_balance history to one row per SK_ID_BUREAU.

    The input frame is not modified, so the function can be re-run safely on
    the same raw table.

    Parameters
    ----------
    bureau_balance : pandas.DataFrame
        Table with SK_ID_BUREAU, MONTHS_BALANCE and STATUS columns.
    recent_years : int, default 3
        Number of most recent yearly periods (0, 1, ...) that get their own
        ``*_PERIOD_<n>`` aggregates; everything older goes into ``*_THE_REST``.
    alpha : float, default 0.7
        Smoothing factor passed to ``ewm``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by SK_ID_BUREAU with upper-cased ``<COLUMN>_<STAT>``
        columns; missing aggregates are filled with 0.

    Notes
    -----
    * MONTHS_BALANCE is converted to whole years before aggregating, so the
      MONTHS_BALANCE_MEAN / MONTHS_BALANCE_MAX outputs are measured in years.
    * Rows are ordered from the most recent month to the oldest within each
      credit, so ``first`` is the most recent record and ``last`` the oldest.
    * NOTE(review): with that ordering the final EWM value (taken by ``last``)
      gives the largest weight to the oldest month - confirm this is intended.
    """
    status_dict = {'C': 0, '0': 1, '1': 2, '2': 3, 'X': 4, '3': 5, '4': 6, '5': 7}

    # Change the negative value to positive and encode the status.
    months_ago = bureau_balance['MONTHS_BALANCE'].abs()
    status = bureau_balance['STATUS'].map(status_dict)

    # assign() builds a new frame, so the caller's table is never altered.
    balance = bureau_balance.assign(
        MONTHS_BALANCE=months_ago,
        STATUS=status,
        WEIGHTED_STATUS=status / (months_ago + 1),  # +1 avoids division by zero
    )

    # Sorting to calculate moving averages
    balance = balance.sort_values(by=['SK_ID_BUREAU', 'MONTHS_BALANCE'], ascending=[True, True])

    # From here on MONTHS_BALANCE holds the age of the record in whole years.
    balance['MONTHS_BALANCE'] = balance['MONTHS_BALANCE'] // 12

    # Exponentially weighted moving averages of the weighted and the raw status.
    for source, target in (('WEIGHTED_STATUS', 'EXP_WEIGHTED_STATUS'),
                           ('STATUS', 'EXP_ENCODED_STATUS')):
        balance[target] = balance.groupby('SK_ID_BUREAU')[source] \
                                 .transform(lambda x: x.ewm(alpha=alpha).mean())

    # Aggregating data for each SK_ID_BUREAU
    bureau_balance_agg = balance.groupby(['SK_ID_BUREAU']).agg({
        'MONTHS_BALANCE': ['mean', 'max'],
        'STATUS': ['mean', 'max', 'first'],
        'WEIGHTED_STATUS': ['mean', 'sum', 'first'],
        'EXP_WEIGHTED_STATUS': ['last'],
        'EXP_ENCODED_STATUS': ['last']
    })
    bureau_balance_agg.columns = ['_'.join(ele).upper() for ele in bureau_balance_agg.columns]

    # The same statistics are computed for every yearly period and for the rest.
    period_stats = {
        'STATUS': ['mean', 'max', 'last', 'first'],
        'WEIGHTED_STATUS': ['mean', 'max', 'first', 'last'],
        'EXP_WEIGHTED_STATUS': ['last'],
        'EXP_ENCODED_STATUS': ['last']
    }

    # Aggregating data for each of the most recent years
    period_frames = []
    for period in range(recent_years):
        period_group = balance.loc[balance['MONTHS_BALANCE'] == period] \
                              .groupby('SK_ID_BUREAU').agg(period_stats)
        period_group.columns = ['_'.join(col).upper() + '_PERIOD_' + str(period)
                                for col in period_group.columns]
        period_frames.append(period_group)

    # Everything older than the explicit periods; the boundary is spelled out
    # rather than read from the loop variable left over after the loop.
    balance_agg_rest_years = balance.loc[balance['MONTHS_BALANCE'] >= recent_years] \
                                    .groupby(['SK_ID_BUREAU']).agg(period_stats)
    balance_agg_rest_years.columns = ['_'.join(ele).upper() + '_THE_REST'
                                      for ele in balance_agg_rest_years.columns]
    period_frames.append(balance_agg_rest_years)

    # Outer-merge so a credit missing from one period keeps its other periods.
    balance_agg_all_years = period_frames[0]
    for frame in period_frames[1:]:
        balance_agg_all_years = balance_agg_all_years.merge(frame, on='SK_ID_BUREAU', how='outer')

    bureau_balance_agg = bureau_balance_agg.merge(balance_agg_all_years, on='SK_ID_BUREAU', how='inner')

    return bureau_balance_agg.fillna(0)

In [None]:
def feature_engineering_bureau(bureau, final_bureau_balance):
    """Build per-client (SK_ID_CURR) features from the bureau table.

    Parameters
    ----------
    bureau : pandas.DataFrame
        Bureau table, one row per credit (SK_ID_BUREAU).
    final_bureau_balance : pandas.DataFrame
        Per-credit aggregates, left-joined onto ``bureau`` via SK_ID_BUREAU.

    Returns
    -------
    pandas.DataFrame
        One row per SK_ID_CURR holding
        * ``<COLUMN>_MEAN_OVERALL``: mean of every column over all credits, and
        * ``<COLUMN>_<STAT>_CREDIT_ACTIVE_<STATUS>``: statistics computed
          separately for Active, Closed and Other credits.
    """
    eps = 0.0001  # added to every denominator to avoid division by zero

    def ratio(numerator, denominator):
        return numerator / (denominator + eps)

    # Merge the bureau_balance with bureau
    bureau = bureau.merge(final_bureau_balance, on='SK_ID_BUREAU', how='left')

    # ---- New features for DAYS columns ----
    days_credit = bureau['DAYS_CREDIT']
    days_enddate = bureau['DAYS_CREDIT_ENDDATE']
    days_fact = bureau['DAYS_ENDDATE_FACT']
    days_update = bureau['DAYS_CREDIT_UPDATE']
    day_overdue = bureau['CREDIT_DAY_OVERDUE']

    bureau['CREDIT_AGE'] = days_credit.abs()
    bureau['CREDIT_DURATION'] = (days_credit - days_enddate).abs()
    # NOTE(review): this is end date minus start date, i.e. a signed duration,
    # not the time remaining - confirm the intended meaning of the name.
    bureau['DAYS_CREDIT_LEFT'] = days_enddate - days_credit
    bureau['FLAG_IS_OVERDUE'] = (day_overdue > 0).astype('int64')
    # Positive overdue days are kept, anything else (including NaN) becomes 0.
    bureau['DAYS_OVERDUE_DURATION'] = day_overdue.where(day_overdue > 0, 0)
    bureau['DAYS_OVERDUE_RATIO'] = ratio(day_overdue, bureau['CREDIT_DURATION'])
    bureau['DAYS_EARLY_REPAYMENT'] = (days_fact - days_enddate).abs()
    # NOTE(review): the flag only checks that the scheduled end date is positive;
    # it does not compare it with DAYS_ENDDATE_FACT - confirm this is intended.
    bureau['FLAG_IS_EARLY_REPAYMENT'] = (days_enddate > 0).astype('int64')
    bureau['DAYS_SINCE_LAST_UPDATE'] = days_update - days_credit
    bureau['CREDIT_ENDDATE_UPDATE_DIFF'] = (days_update - days_enddate).abs()

    # ---- New features for AMT columns ----
    credit_sum = bureau['AMT_CREDIT_SUM']
    debt = bureau['AMT_CREDIT_SUM_DEBT']
    credit_limit = bureau['AMT_CREDIT_SUM_LIMIT']
    overdue_amount = bureau['AMT_CREDIT_SUM_OVERDUE']
    max_overdue = bureau['AMT_CREDIT_MAX_OVERDUE']
    credit_duration = bureau['CREDIT_DURATION']
    overdue_duration = bureau['DAYS_OVERDUE_DURATION']

    bureau['CREDIT_UTILIZATION_RATIO'] = ratio(debt, credit_sum)
    bureau['AMT_OVERDUE_RATIO'] = ratio(overdue_amount, credit_sum)
    bureau['PROLONGATION_FREQUENCY'] = ratio(bureau['CNT_CREDIT_PROLONG'], credit_duration)
    bureau['CREDIT_DEBT_DIFF'] = credit_sum - debt
    bureau['DEBT_LIMIT_RATIO'] = ratio(debt, credit_limit)
    bureau['MAX_OVERDUE_DEBT_RATIO'] = ratio(max_overdue, debt)
    bureau['TOTAL_RISK_SCORE'] = (
        bureau['CREDIT_UTILIZATION_RATIO'] + bureau['AMT_OVERDUE_RATIO'] + bureau['DEBT_LIMIT_RATIO']
    )
    # NOTE(review): despite the name, the denominator is the overdue amount,
    # not the credit sum - confirm.
    bureau['DEBT_CREDIT_RATIO'] = ratio(debt, overdue_amount)
    bureau['OVERDUE_SEVERITY'] = ratio(max_overdue, overdue_amount)
    bureau['OVERDUE_DURATION_RATIO'] = ratio(overdue_duration, credit_duration)
    bureau['OVERDUE_SEVERITY_RATIO'] = ratio(bureau['OVERDUE_SEVERITY'], overdue_duration)
    bureau['RISK_EXPOSURE_RATIO'] = ratio(debt + overdue_amount, credit_limit)

    # Combine every credit type outside the main five into 'Other'.
    main_credit_types = ['Consumer credit', 'Credit card', 'Car loan', 'Mortgage', 'Microloan']
    credit_type = bureau['CREDIT_TYPE']
    bureau['CREDIT_TYPE'] = credit_type.where(credit_type.isin(main_credit_types), 'Other')

    # Only keep Active and Closed status; anything else becomes 'Other'.
    main_statuses = ['Active', 'Closed']
    credit_active = bureau['CREDIT_ACTIVE']
    bureau['CREDIT_ACTIVE'] = credit_active.where(credit_active.isin(main_statuses), 'Other')

    # Statistics computed per client within each CREDIT_ACTIVE category.
    mean_max_min = ['mean', 'max', 'min']
    mean_max = ['mean', 'max']
    mean_min = ['mean', 'min']
    max_sum = ['max', 'sum']
    only_sum = ['sum']
    aggregate_CREDIT_ACTIVE = {
        'CREDIT_AGE': mean_max_min,
        'CREDIT_DURATION': mean_max_min,
        'DAYS_CREDIT_LEFT': mean_max_min,
        'FLAG_IS_OVERDUE': only_sum,
        'DAYS_OVERDUE_DURATION': mean_max_min,
        'DAYS_OVERDUE_RATIO': mean_max_min,
        'DAYS_EARLY_REPAYMENT': mean_max,
        'FLAG_IS_EARLY_REPAYMENT': only_sum,
        'DAYS_SINCE_LAST_UPDATE': mean_max_min,
        'CREDIT_ENDDATE_UPDATE_DIFF': mean_max_min,
        'CREDIT_UTILIZATION_RATIO': mean_max_min,
        'AMT_OVERDUE_RATIO': mean_max_min,
        'PROLONGATION_FREQUENCY': mean_max_min,
        'CREDIT_DEBT_DIFF': mean_max_min,
        'DEBT_LIMIT_RATIO': mean_max_min,
        'MAX_OVERDUE_DEBT_RATIO': mean_max_min,
        'TOTAL_RISK_SCORE': mean_max_min,
        'DEBT_CREDIT_RATIO': mean_max_min,
        'OVERDUE_SEVERITY': mean_max_min,
        'OVERDUE_DURATION_RATIO': mean_max_min,
        'OVERDUE_SEVERITY_RATIO': mean_max_min,
        'RISK_EXPOSURE_RATIO': mean_max_min,
        'DAYS_CREDIT': ['mean', 'min', 'max', 'last'],
        'CREDIT_DAY_OVERDUE': mean_max,
        'DAYS_CREDIT_ENDDATE': mean_max,
        'DAYS_ENDDATE_FACT': mean_min,
        'AMT_CREDIT_MAX_OVERDUE': max_sum,
        'CNT_CREDIT_PROLONG': max_sum,
        'AMT_CREDIT_SUM': ['sum', 'max'],
        'AMT_CREDIT_SUM_DEBT': only_sum,
        'AMT_CREDIT_SUM_LIMIT': max_sum,
        'AMT_CREDIT_SUM_OVERDUE': max_sum,
        'DAYS_CREDIT_UPDATE': mean_min,
    }

    # One aggregate frame per status, outer-merged so that clients lacking a
    # given status still keep their other columns.
    per_status_frames = []
    for status in ('Active', 'Closed', 'Other'):
        status_agg = bureau.loc[bureau['CREDIT_ACTIVE'] == status] \
                           .groupby('SK_ID_CURR').agg(aggregate_CREDIT_ACTIVE)
        suffix = '_CREDIT_ACTIVE_' + status.upper()
        status_agg.columns = pd.Index(['_'.join(col_name).upper() + suffix
                                       for col_name in status_agg.columns.tolist()])
        per_status_frames.append(status_agg)

    agg_bureau_credit = per_status_frames[0]
    for status_agg in per_status_frames[1:]:
        agg_bureau_credit = agg_bureau_credit.merge(status_agg, on='SK_ID_CURR', how='outer')

    # One-hot encoding for CREDIT_ACTIVE, CREDIT_TYPE
    credit_active_dummies = pd.get_dummies(bureau['CREDIT_ACTIVE'], prefix='CREDIT_ACTIVE')
    credit_type_dummies = pd.get_dummies(bureau['CREDIT_TYPE'], prefix='CREDIT_TYPE')
    bureau = pd.concat(
        [bureau.drop(['CREDIT_ACTIVE', 'CREDIT_TYPE'], axis=1), credit_active_dummies, credit_type_dummies],
        axis=1,
    )

    # Finally, aggregate every remaining column by its mean per SK_ID_CURR.
    bureau_agg = bureau.drop('SK_ID_BUREAU', axis=1).groupby('SK_ID_CURR').agg('mean')
    bureau_agg.columns = [col + '_MEAN_OVERALL' for col in bureau_agg.columns]

    return bureau_agg.merge(agg_bureau_credit, on='SK_ID_CURR', how='outer')

In [None]:
def bureau_processing(bureau, bureau_balance):
    """Run the full bureau pipeline and return per-client bureau features.

    The bureau table is cleaned, the bureau_balance history is aggregated, and
    both are combined into one row per SK_ID_CURR. Columns in which at least
    half of the values are missing are dropped from the result.
    """
    cleaned_bureau = cleaning_bureau(bureau)
    balance_features = feature_engineering_bureau_balance(bureau_balance)
    bureau_features = feature_engineering_bureau(cleaned_bureau, balance_features)

    # Keep only columns whose share of missing values is below 50 %.
    missing_share = bureau_features.isna().mean()
    return bureau_features.loc[:, missing_share < 0.5]

### c.credit_card_balance

### d.installments_payments

### e.previous_application

In [None]:
def previous_application_processing(df):
    """Return a cleaned copy of the previous_application table.

    The value 365243 in the DAYS_* columns listed below and the value 4000000
    in SELLERPLACE_AREA are replaced with NaN. Only the affected cell is
    blanked, and the frame passed in is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw previous_application table.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``df``.
    """
    df = df.copy()

    days_columns = [
        'DAYS_FIRST_DRAWING',
        'DAYS_FIRST_DUE',
        'DAYS_LAST_DUE_1ST_VERSION',
        'DAYS_LAST_DUE',
        'DAYS_TERMINATION',
    ]
    for column in days_columns:
        df[column] = df[column].mask(df[column] == 365243)

    # mask() touches this one column only (selecting rows without a column
    # would blank every field of the matching rows, keys included) and it
    # upcasts an integer column to float so NaN can be stored.
    df['SELLERPLACE_AREA'] = df['SELLERPLACE_AREA'].mask(df['SELLERPLACE_AREA'] == 4000000)

    return df

### f.POS_CASH_balance

### g.main

In [None]:
# Clean each raw table; the results are written back into the shared `data` dict.
for split in ('application_train', 'application_test'):
    data[split] = application_processing(data[split])
data['bureau_final'] = bureau_processing(data['bureau'], data['bureau_balance'])


# Because later steps group by SK_ID_CURR, the main dataset is not edited
# directly: the cleaned table is stored under a separate key.
data['previous_application_cleaned'] = previous_application_processing(data['previous_application'])

In [None]:
# Merged dataset


In [None]:
# Create new feature when merge
#         train_data = previous_application[['AMT_CREDIT', 'AMT_ANNUITY', 'CNT_PAYMENT']].dropna()
#         train_data['CREDIT_ANNUITY_RATIO'] = train_data['AMT_CREDIT'] / (train_data['AMT_ANNUITY'] + 1)
#         #value to predict is our CNT_PAYMENT
#         train_value = train_data.pop('CNT_PAYMENT')
        
#         #test data would be our application_train data
#         test_data = data_to_predict[['AMT_CREDIT','AMT_ANNUITY']].fillna(0)
#         test_data['CREDIT_ANNUITY_RATIO'] = test_data['AMT_CREDIT'] / (test_data['AMT_ANNUITY'] + 1)
        
#         lgbmr = LGBMRegressor(max_depth = 9, n_estimators = 5000, n_jobs = -1, learning_rate = 0.3, 
#                               random_state = 125)
#         lgbmr.fit(train_data, train_value)
#         #dumping the model to pickle file
#         with open('cnt_payment_predictor_lgbmr.pkl', 'wb') as f:
#             pickle.dump(lgbmr, f)
#         #predicting the CNT_PAYMENT for test_data
#         cnt_payment = lgbmr.predict(test_data)

#Create new column PREV_COUNT which is the number of previous_application of each SK_ID_CURR to see correlation
# appli_prev_app2 = data['application_train'][['SK_ID_CURR', 'TARGET']]
# appli_prev_app2 = appli_prev_app2.merge(target_dist[['SK_ID_CURR', 'count']], on=['SK_ID_CURR'], how='left')
# appli_prev_app2.rename(columns={'count': 'PREV_COUNT'}, inplace=True)
# appli_prev_app2.head()

In [1]:
# Feature Selection (check correlation) + Feature Engineering

## 2.Build model