# Libraries and Functions

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import phik
    
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

import plotly
plotly.offline.init_notebook_mode(connected=True)
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from datetime import datetime

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

# 1. Load Data

In [2]:
def one_hot_encoding(df, col_list):
    """Replace the columns in ``col_list`` with their one-hot encoded indicators.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing every column named in ``col_list``.
    col_list : list of str
        Names of the categorical columns to encode.

    Returns
    -------
    pd.DataFrame
        The remaining original columns followed by the indicator columns
        (named by ``OneHotEncoder.get_feature_names_out``). The index of
        ``df`` is preserved.
    """
    encoder = OneHotEncoder(sparse_output=False)
    encoded = encoder.fit_transform(df[col_list])

    # The encoder returns a bare array. Give it the caller's index so that the
    # column-wise concat below pairs each encoded row with its source row.
    # With a default RangeIndex here, a frame whose index is not 0..n-1
    # (e.g. after row filtering) would be misaligned and padded with NaN rows.
    one_hot_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(col_list),
        index=df.index,
    )
    return pd.concat([df.drop(columns=col_list), one_hot_df], axis=1)

In [3]:
def load_data(path='..//dataset//'):
    """Read the four raw CSV tables under ``path`` into module-level globals.

    After the call, ``application_train``, ``application_test``, ``bureau``
    and ``bureau_balance`` are available as global DataFrames. ``path`` is
    prepended verbatim to each ``<table>.csv`` file name, so it must end with
    a path separator. Nothing is returned.
    """
    module_scope = globals()
    table_names = ('application_train', 'application_test', 'bureau', 'bureau_balance')
    for table_name in table_names:
        # Same effect as a ``global`` declaration followed by an assignment.
        module_scope[table_name] = pd.read_csv(path + table_name + '.csv')

    print('Done loading')

In [4]:
# Populate the global application_train / application_test / bureau /
# bureau_balance frames from the default dataset directory.
load_data()

Done loading


In [5]:
# Coverage check: how many applicants (SK_ID_CURR) in application_train also
# have at least one record in bureau?
bureau_curr_ids = bureau['SK_ID_CURR']
train_curr_ids = application_train['SK_ID_CURR']
shared_curr_ids = set(bureau_curr_ids).intersection(set(train_curr_ids))

print('Number of unique SK_ID_CURR in bureau:', bureau_curr_ids.nunique())
print('Number of unique SK_ID_CURR in application_train:', train_curr_ids.nunique())
print('Number of unique SK_ID_CURR in both bureau and application_train:', len(shared_curr_ids))

Number of unique SK_ID_CURR in bureau: 263491
Number of unique SK_ID_CURR in application_train: 246009
Number of unique SK_ID_CURR in both bureau and application_train: 210809


In [6]:
# Coverage check: how many credits (SK_ID_BUREAU) in bureau also have a
# monthly history in bureau_balance?
bureau_credit_ids = bureau['SK_ID_BUREAU']
balance_credit_ids = bureau_balance['SK_ID_BUREAU']
shared_credit_ids = set(bureau_credit_ids).intersection(set(balance_credit_ids))

print('Number of unique SK_ID_BUREAU in bureau:', bureau_credit_ids.nunique())
print('Number of unique SK_ID_BUREAU in bureau_balance:', balance_credit_ids.nunique())
print('Number of unique SK_ID_BUREAU in both bureau and bureau_balance:', len(shared_credit_ids))

Number of unique SK_ID_BUREAU in bureau: 1465325
Number of unique SK_ID_BUREAU in bureau_balance: 817395
Number of unique SK_ID_BUREAU in both bureau and bureau_balance: 523515


# 2. Clean Data

## 2.1 Handling null data

In [7]:
def handle_null(data):
    """Impute the missing values of the bureau table and drop CREDIT_CURRENCY.

    The caller's frame is left untouched: every change is made on the new
    frame returned by ``drop`` through whole-column reassignment. A progress
    line is printed after each imputation step.

    Parameters
    ----------
    data : pd.DataFrame
        Bureau records with the columns CREDIT_ACTIVE, DAYS_CREDIT_ENDDATE,
        DAYS_ENDDATE_FACT, AMT_ANNUITY, AMT_CREDIT_MAX_OVERDUE,
        AMT_CREDIT_SUM, AMT_CREDIT_SUM_LIMIT and AMT_CREDIT_SUM_DEBT.
        CREDIT_CURRENCY is dropped when present.

    Returns
    -------
    pd.DataFrame
        A new frame with the imputations applied.
    """
    # errors='ignore' lets the function run on a frame that was already
    # cleaned (e.g. when the calling cell is re-executed) instead of raising
    # KeyError on the missing column.
    data = data.drop(columns=['CREDIT_CURRENCY'], errors='ignore')

    data['AMT_ANNUITY'] = data['AMT_ANNUITY'].fillna(0) # Assume the loan does not require annuity payment
    print('Fill missing values in AMT_ANNUITY with 0')

    data['AMT_CREDIT_MAX_OVERDUE'] = data['AMT_CREDIT_MAX_OVERDUE'].fillna(0) # Assume the loan has not been overdue
    print('Fill missing values in AMT_CREDIT_MAX_OVERDUE with 0')

    # Fill missing values in DAYS_CREDIT_ENDDATE in three passes:
    #   1. credits that are no longer active get 0;
    #   2. active credits with a known DAYS_ENDDATE_FACT reuse that value;
    #   3. whatever is still missing gets the median of the plausible
    #      positive end dates (0 < value < 80 years, in days).
    end_date = data['DAYS_CREDIT_ENDDATE']
    end_fact = data['DAYS_ENDDATE_FACT']
    not_active = data['CREDIT_ACTIVE'].isin(['Closed', 'Sold', 'Bad debt'])
    active = data['CREDIT_ACTIVE'] == 'Active'

    end_date = end_date.mask(end_date.isna() & not_active, 0)
    end_date = end_date.mask(end_date.isna() & active & end_fact.notna(), end_fact)
    median_days_credit_enddate = end_date[(end_date > 0) & (end_date < 80*365)].median()
    data['DAYS_CREDIT_ENDDATE'] = end_date.fillna(median_days_credit_enddate)
    print('Fill missing values in DAYS_CREDIT_ENDDATE')

    data['DAYS_ENDDATE_FACT'] = data['DAYS_ENDDATE_FACT'].fillna(9999) # Assume the loan has not ended
    print('Fill missing values in DAYS_ENDDATE_FACT with 9999')

    median_amt_credit_sum = data['AMT_CREDIT_SUM'].median()
    data['AMT_CREDIT_SUM'] = data['AMT_CREDIT_SUM'].fillna(median_amt_credit_sum)
    print('Fill missing values in AMT_CREDIT_SUM with median')

    median_amt_credit_sum_limit = data['AMT_CREDIT_SUM_LIMIT'].median()
    data['AMT_CREDIT_SUM_LIMIT'] = data['AMT_CREDIT_SUM_LIMIT'].fillna(median_amt_credit_sum_limit)
    print('Fill missing values in AMT_CREDIT_SUM_LIMIT with median')

    data['AMT_CREDIT_SUM_DEBT'] = data['AMT_CREDIT_SUM_DEBT'].fillna(0) # Assume the loan has been paid off
    print('Fill missing values in AMT_CREDIT_SUM_DEBT with 0')

    return data

In [8]:
# Rebind `bureau` to its null-imputed version. The raw frame is no longer
# reachable afterwards; call load_data() again to get it back.
bureau = handle_null(bureau)

Fill missing values in AMT_ANNUITY with 0
Fill missing values in AMT_CREDIT_MAX_OVERDUE with 0
Fill missing values in DAYS_CREDIT_ENDDATE
Fill missing values in DAYS_ENDDATE_FACT with 9999
Fill missing values in AMT_CREDIT_SUM with median
Fill missing values in AMT_CREDIT_SUM_LIMIT with median
Fill missing values in AMT_CREDIT_SUM_DEBT with 0


-------------------------------
## 2.2 Handling Outliers

In [9]:
def handle_outlier(data, min_days_credit_enddate=-2922, min_days_credit_update=-100*365):
    """Drop bureau rows whose day offsets are below the given lower bounds.

    A row is kept only when both conditions hold:

    * ``DAYS_CREDIT_ENDDATE > min_days_credit_enddate``
      (default -2922, i.e. about 8 years expressed in days);
    * ``DAYS_CREDIT_UPDATE > min_days_credit_update``
      (default -36500, i.e. 100 years expressed in days).

    Rows with NaN in either column fail the comparison and are dropped too.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with DAYS_CREDIT_ENDDATE and DAYS_CREDIT_UPDATE columns.
    min_days_credit_enddate, min_days_credit_update : number
        Exclusive lower bounds; the defaults reproduce the original
        hard-coded thresholds.

    Returns
    -------
    pd.DataFrame
        An independent copy of the surviving rows (original index kept), so
        later in-place edits never write into a slice of ``data``.
    """
    keep = (
        (data['DAYS_CREDIT_ENDDATE'] > min_days_credit_enddate)
        & (data['DAYS_CREDIT_UPDATE'] > min_days_credit_update)
    )
    return data.loc[keep].copy()

In [10]:
# Keep only the bureau rows that pass the thresholds defined in handle_outlier.
bureau = handle_outlier(bureau)

-----------------------
## 2.3 Misc

In [11]:
def other_update(data):
    """Relabel 'Active' credits that have a negative DAYS_ENDDATE_FACT as 'Closed'.

    The frame is edited in place and the same object is returned.
    """
    flagged_active = data['CREDIT_ACTIVE'].eq('Active')
    negative_end_fact = data['DAYS_ENDDATE_FACT'].lt(0)
    data.loc[negative_end_fact & flagged_active, 'CREDIT_ACTIVE'] = 'Closed'

    return data

In [12]:
# Apply the CREDIT_ACTIVE relabelling, then persist the cleaned table.
bureau = other_update(bureau)
# Written to the current working directory; the frame's index is included
# as the first (unnamed) CSV column because index=False is not passed.
bureau.to_csv('bureau_cleaned.csv', mode='w+')

---------------------------------
# 3. Feature Engineering

In [13]:
def cleaning_bureau(data):
    """Run the full bureau cleaning pipeline on a raw bureau frame.

    This is the one-call equivalent of the three cleaning steps defined
    earlier in the notebook, applied in the same order:

    1. ``handle_null``    - drop CREDIT_CURRENCY and impute missing values
       (prints one progress line per imputed column);
    2. ``handle_outlier`` - drop rows with out-of-range DAYS_CREDIT_ENDDATE /
       DAYS_CREDIT_UPDATE;
    3. ``other_update``   - relabel 'Active' credits with a negative
       DAYS_ENDDATE_FACT as 'Closed'.

    Delegating to those helpers, instead of carrying a copy of their bodies,
    keeps a single source of truth for the cleaning rules.

    Parameters
    ----------
    data : pd.DataFrame
        Raw bureau records.

    Returns
    -------
    pd.DataFrame
        The cleaned frame.
    """
    # NULL HANDLING
    data = handle_null(data)

    # OUTLIERS HANDLING
    data = handle_outlier(data)

    # MISCELLANEOUS
    data = other_update(data)

    return data

In [14]:
def feature_engineering_bureau_balance(bureau_balance, n_periods=1):
    """Aggregate the monthly bureau-balance history into one row per SK_ID_BUREAU.

    The input frame is never modified, so the function can be called
    repeatedly on the same raw data and always returns the same result.

    Parameters
    ----------
    bureau_balance : pd.DataFrame
        Needs the columns SK_ID_BUREAU, MONTHS_BALANCE and STATUS. STATUS is
        expected to hold the raw codes 'C', '0'-'5' and 'X'; any other value
        is mapped to NaN.
    n_periods : int, default 1
        Number of leading 12-month buckets (0, 1, ...) that each get their
        own ``*_PERIOD_<k>`` columns. All later buckets are pooled into the
        ``*_THE_REST`` columns. The default reproduces the original
        behaviour (bucket 0 versus everything else).

    Returns
    -------
    pd.DataFrame
        Indexed by SK_ID_BUREAU, with the overall, per-period and
        "the rest" aggregates; missing aggregates are filled with 0.

    Raises
    ------
    ValueError
        If ``n_periods`` is smaller than 1.
    """
    if n_periods < 1:
        raise ValueError('n_periods must be at least 1')

    status_dict = {'C': 0, '0': 1, '1': 2, '2': 3, 'X': 4, '3': 5, '4': 6, '5': 7}

    # Change the negative value to positive
    months_abs = bureau_balance['MONTHS_BALANCE'].abs()
    encoded_status = bureau_balance['STATUS'].map(status_dict)

    # assign() returns a new frame, so the caller's bureau_balance keeps its
    # raw MONTHS_BALANCE / STATUS values.
    balance = bureau_balance.assign(
        MONTHS_BALANCE=months_abs,
        STATUS=encoded_status,
        WEIGHTED_STATUS=encoded_status / (months_abs + 1),  # +1 avoids division by zero
    ).sort_values(
        by=['SK_ID_BUREAU', 'MONTHS_BALANCE'], ascending=[True, True]
    )  # Sorting to calculate moving averages

    # From here on MONTHS_BALANCE is a 12-month bucket index (0 = months 0-11).
    balance['MONTHS_BALANCE'] = balance['MONTHS_BALANCE'] // 12

    # NOTE(review): rows are ordered by increasing |MONTHS_BALANCE|, so the
    # final EWM value of each group (picked up by 'last' below) sits on the
    # row with the largest |MONTHS_BALANCE| - confirm this weighting
    # direction is the intended one.
    by_credit = balance.groupby('SK_ID_BUREAU')

    # Using exponential weighted moving average to calculate the weighted status
    balance['EXP_WEIGHTED_STATUS'] = by_credit['WEIGHTED_STATUS'] \
        .transform(lambda x: x.ewm(alpha = 0.7).mean())

    # Using exponential moving average to calculate the status
    balance['EXP_ENCODED_STATUS'] = by_credit['STATUS'] \
        .transform(lambda x: x.ewm(alpha = 0.7).mean())

    # Aggregating data for each SK_ID_BUREAU
    bureau_balance_agg = balance.groupby(['SK_ID_BUREAU']).agg({
        'MONTHS_BALANCE' : ['mean','max'],
        'STATUS' : ['mean'],
        'WEIGHTED_STATUS' : ['mean'],
        'EXP_WEIGHTED_STATUS' : ['last'],
        'EXP_ENCODED_STATUS' : ['last']
        })
    bureau_balance_agg.columns = ['_'.join(ele).upper() for ele in bureau_balance_agg.columns]

    period_stats = {
        'STATUS': ['mean'],
        'WEIGHTED_STATUS': ['mean'],
        'EXP_WEIGHTED_STATUS': ['mean'],
        'EXP_ENCODED_STATUS': ['mean']
    }

    # Aggregating data separately for each of the first n_periods buckets
    balance_agg_all_years = None
    for period in range(n_periods):
        period_group = balance.loc[balance['MONTHS_BALANCE'] == period] \
            .groupby('SK_ID_BUREAU').agg(period_stats)
        period_group.columns = ['_'.join(col).upper() + '_PERIOD_' + str(period) for col in period_group.columns]

        if balance_agg_all_years is None:
            balance_agg_all_years = period_group
        else:
            balance_agg_all_years = balance_agg_all_years.merge(period_group, on='SK_ID_BUREAU', how='outer')

    # Everything after the last individually reported bucket is pooled.
    last_period = n_periods - 1
    balance_agg_rest_years = balance[balance.MONTHS_BALANCE > last_period] \
        .groupby(['SK_ID_BUREAU']).agg(period_stats)
    balance_agg_rest_years.columns = ['_'.join(ele).upper() + '_THE_REST' for ele in balance_agg_rest_years.columns]

    balance_agg_all_years = balance_agg_all_years.merge(balance_agg_rest_years, on = 'SK_ID_BUREAU', how = 'outer')
    bureau_balance_agg = bureau_balance_agg.merge(balance_agg_all_years, on = 'SK_ID_BUREAU', how = 'inner')

    # A credit without rows in a given bucket has no aggregate there; use 0.
    bureau_balance_agg = bureau_balance_agg.fillna(0)

    return bureau_balance_agg

In [15]:
def feature_engineering_bureau(bureau, final_bureau_balance):
    """Build per-applicant (SK_ID_CURR) features from the bureau table.

    Steps:
      1. left-join the per-credit balance aggregates on SK_ID_BUREAU;
      2. derive day-based and amount-based features for every credit;
      3. collapse rare CREDIT_TYPE / CREDIT_ACTIVE categories into 'Other';
      4. aggregate per SK_ID_CURR separately for each CREDIT_ACTIVE category;
      5. one-hot encode the two categorical columns and take the overall
         per-SK_ID_CURR mean of every remaining column;
      6. outer-join the results of steps 4 and 5.

    Parameters
    ----------
    bureau : pd.DataFrame
        Cleaned bureau records. It is not modified: the merge in step 1
        produces the working frame.
    final_bureau_balance : pd.DataFrame
        Output of ``feature_engineering_bureau_balance`` (keyed by SK_ID_BUREAU).

    Returns
    -------
    pd.DataFrame
        Indexed by SK_ID_CURR. Overall means are suffixed ``_MEAN_OVERALL``;
        per-status aggregates are suffixed ``_CREDIT_ACTIVE_<STATUS>``.
    """
    # Small offset added to every denominator to avoid division by zero.
    eps = 0.0001

    # Merge the bureau_balance with bureau
    bureau = bureau.merge(final_bureau_balance, on='SK_ID_BUREAU', how='left')

    # New features for DAYS columns (vectorized; no per-row Python lambdas):
    bureau['CREDIT_AGE'] = bureau['DAYS_CREDIT'].abs()
    bureau['CREDIT_DURATION'] = (bureau['DAYS_CREDIT'] - bureau['DAYS_CREDIT_ENDDATE']).abs()
    # NOTE(review): this is the signed counterpart of CREDIT_DURATION, not
    # the number of days left from "now" - confirm the name matches intent.
    bureau['DAYS_CREDIT_LEFT'] = bureau['DAYS_CREDIT_ENDDATE'] - bureau['DAYS_CREDIT']
    day_overdue = bureau['CREDIT_DAY_OVERDUE']
    bureau['FLAG_IS_OVERDUE'] = (day_overdue > 0).astype('int64')
    # where() keeps positive values and writes 0 elsewhere (NaN included),
    # matching max(0, x) applied element by element.
    bureau['DAYS_OVERDUE_DURATION'] = day_overdue.where(day_overdue > 0, 0)
    bureau['DAYS_OVERDUE_RATIO'] = bureau['CREDIT_DAY_OVERDUE'] / (bureau['CREDIT_DURATION'] + eps)
    bureau['DAYS_EARLY_REPAYMENT'] = (bureau['DAYS_ENDDATE_FACT'] - bureau['DAYS_CREDIT_ENDDATE']).abs()
    # NOTE(review): the flag is 1 when DAYS_CREDIT_ENDDATE > 0; it does not
    # compare the actual end date with the scheduled one - confirm intent.
    bureau['FLAG_IS_EARLY_REPAYMENT'] = (bureau['DAYS_CREDIT_ENDDATE'] > 0).astype('int64')
    bureau['DAYS_SINCE_LAST_UPDATE'] = bureau['DAYS_CREDIT_UPDATE'] - bureau['DAYS_CREDIT']
    bureau['CREDIT_ENDDATE_UPDATE_DIFF'] = (bureau['DAYS_CREDIT_UPDATE'] - bureau['DAYS_CREDIT_ENDDATE']).abs()

    # New features for AMT columns:
    bureau['CREDIT_UTILIZATION_RATIO'] = bureau['AMT_CREDIT_SUM_DEBT'] / (bureau['AMT_CREDIT_SUM'] + eps)
    bureau['AMT_OVERDUE_RATIO'] = bureau['AMT_CREDIT_SUM_OVERDUE'] / (bureau['AMT_CREDIT_SUM'] + eps)
    bureau['PROLONGATION_FREQUENCY'] = bureau['CNT_CREDIT_PROLONG'] / (bureau['CREDIT_DURATION'] + eps)
    bureau['CREDIT_DEBT_DIFF'] = bureau['AMT_CREDIT_SUM'] - bureau['AMT_CREDIT_SUM_DEBT']
    bureau['DEBT_LIMIT_RATIO'] = bureau['AMT_CREDIT_SUM_DEBT'] / (bureau['AMT_CREDIT_SUM_LIMIT'] + eps)
    bureau['MAX_OVERDUE_DEBT_RATIO'] = bureau['AMT_CREDIT_MAX_OVERDUE'] / (bureau['AMT_CREDIT_SUM_DEBT'] + eps)

    # NOTE(review): despite its name this divides the debt by the overdue
    # amount (AMT_CREDIT_SUM_OVERDUE), not by the credit sum - confirm intent.
    bureau['DEBT_CREDIT_RATIO'] = bureau['AMT_CREDIT_SUM_DEBT'] / (bureau['AMT_CREDIT_SUM_OVERDUE'] + eps)
    bureau['OVERDUE_SEVERITY'] = bureau['AMT_CREDIT_MAX_OVERDUE'] / (bureau['AMT_CREDIT_SUM_OVERDUE'] + eps)
    bureau['OVERDUE_DURATION_RATIO'] = bureau['DAYS_OVERDUE_DURATION'] / (bureau['CREDIT_DURATION'] + eps)

    # Combine all other credit types into an 'Other' category
    # (except Consumer credit, Credit card, Car loan, Mortgage, Microloan)
    credit_types_to_keep = ['Consumer credit', 'Credit card', 'Car loan', 'Mortgage', 'Microloan']
    bureau['CREDIT_TYPE'] = bureau['CREDIT_TYPE'].where(
        bureau['CREDIT_TYPE'].isin(credit_types_to_keep), 'Other'
    )

    # Only keep Active and Closed status; every other status becomes 'Other'
    statuses_to_keep = ['Active', 'Closed']
    bureau['CREDIT_ACTIVE'] = bureau['CREDIT_ACTIVE'].where(
        bureau['CREDIT_ACTIVE'].isin(statuses_to_keep), 'Other'
    )

    # Aggregate with respect to 'SK_ID_CURR' in order to merge with application_train
    # First, aggregate based on the category of CREDIT_ACTIVE
    aggregate_CREDIT_ACTIVE = {
        'CREDIT_AGE' : ['mean'],
        'CREDIT_DURATION' : ['mean'],
        'DAYS_CREDIT_LEFT' : ['mean'],
        'FLAG_IS_OVERDUE' : ['sum'],
        'DAYS_OVERDUE_DURATION' : ['mean'],
        'DAYS_OVERDUE_RATIO' : ['mean'],
        'DAYS_EARLY_REPAYMENT' : ['mean'],
        'FLAG_IS_EARLY_REPAYMENT' : ['sum'],
        'DAYS_SINCE_LAST_UPDATE' : ['mean'],
        'CREDIT_ENDDATE_UPDATE_DIFF' : ['mean'],
        'CREDIT_UTILIZATION_RATIO' : ['mean'],
        'AMT_OVERDUE_RATIO' : ['mean'],
        'PROLONGATION_FREQUENCY' : ['mean'],
        'CREDIT_DEBT_DIFF' : ['mean'],
        'DEBT_LIMIT_RATIO' : ['mean'],
        'MAX_OVERDUE_DEBT_RATIO' : ['mean'],
        'DEBT_CREDIT_RATIO' : ['mean'],
        'OVERDUE_SEVERITY' : ['mean'],
        'OVERDUE_DURATION_RATIO' : ['mean'],
        'DAYS_CREDIT' : ['mean','min','max'],
        'CREDIT_DAY_OVERDUE' : ['mean'],
        'DAYS_CREDIT_ENDDATE' : ['mean','max'],
        'DAYS_ENDDATE_FACT' : ['mean','max'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'AMT_CREDIT_SUM' : ['sum','mean'],
        'AMT_CREDIT_SUM_DEBT': ['sum', 'mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean','sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean','sum'],
        'DAYS_CREDIT_UPDATE' : ['mean','min'],
    }

    agg_bureau_credit = None
    for credit_status in ['Active', 'Closed', 'Other']:
        status_rows = bureau.loc[bureau['CREDIT_ACTIVE'] == credit_status]
        status_agg = status_rows.groupby('SK_ID_CURR').agg(aggregate_CREDIT_ACTIVE)
        status_agg.columns = pd.Index(['_'.join(col_name).upper() + '_CREDIT_ACTIVE_' + credit_status.upper()
                                       for col_name in status_agg.columns.tolist()])

        if agg_bureau_credit is None:
            agg_bureau_credit = status_agg
        else:
            # Outer join: an applicant may have credits in only some statuses.
            agg_bureau_credit = agg_bureau_credit.merge(status_agg, on='SK_ID_CURR', how='outer')

    bureau = one_hot_encoding(bureau, ['CREDIT_ACTIVE', 'CREDIT_TYPE'])

    # Finally, aggregate based on 'SK_ID_CURR'
    bureau_agg = bureau.drop('SK_ID_BUREAU', axis = 1).groupby('SK_ID_CURR').agg('mean')
    bureau_agg.columns = [col + '_MEAN_OVERALL' for col in bureau_agg.columns]
    bureau_agg = bureau_agg.merge(agg_bureau_credit, on='SK_ID_CURR', how='outer')

    return bureau_agg

In [16]:
# Build the per-credit balance aggregates first, then fold them into the
# per-applicant bureau feature table.
bureau_balance_final = feature_engineering_bureau_balance(bureau_balance)
bureau_final = feature_engineering_bureau(bureau, bureau_balance_final)

In [17]:
# Preview the aggregated feature table (one row per SK_ID_CURR).
bureau_final

Unnamed: 0_level_0,DAYS_CREDIT_MEAN_OVERALL,CREDIT_DAY_OVERDUE_MEAN_OVERALL,DAYS_CREDIT_ENDDATE_MEAN_OVERALL,DAYS_ENDDATE_FACT_MEAN_OVERALL,AMT_CREDIT_MAX_OVERDUE_MEAN_OVERALL,CNT_CREDIT_PROLONG_MEAN_OVERALL,AMT_CREDIT_SUM_MEAN_OVERALL,AMT_CREDIT_SUM_DEBT_MEAN_OVERALL,AMT_CREDIT_SUM_LIMIT_MEAN_OVERALL,AMT_CREDIT_SUM_OVERDUE_MEAN_OVERALL,DAYS_CREDIT_UPDATE_MEAN_OVERALL,AMT_ANNUITY_MEAN_OVERALL,MONTHS_BALANCE_MEAN_MEAN_OVERALL,MONTHS_BALANCE_MAX_MEAN_OVERALL,STATUS_MEAN_MEAN_OVERALL,WEIGHTED_STATUS_MEAN_MEAN_OVERALL,EXP_WEIGHTED_STATUS_LAST_MEAN_OVERALL,EXP_ENCODED_STATUS_LAST_MEAN_OVERALL,STATUS_MEAN_PERIOD_0_MEAN_OVERALL,WEIGHTED_STATUS_MEAN_PERIOD_0_MEAN_OVERALL,EXP_WEIGHTED_STATUS_MEAN_PERIOD_0_MEAN_OVERALL,EXP_ENCODED_STATUS_MEAN_PERIOD_0_MEAN_OVERALL,STATUS_MEAN_THE_REST_MEAN_OVERALL,WEIGHTED_STATUS_MEAN_THE_REST_MEAN_OVERALL,EXP_WEIGHTED_STATUS_MEAN_THE_REST_MEAN_OVERALL,...,DEBT_LIMIT_RATIO_MEAN_CREDIT_ACTIVE_OTHER,MAX_OVERDUE_DEBT_RATIO_MEAN_CREDIT_ACTIVE_OTHER,DEBT_CREDIT_RATIO_MEAN_CREDIT_ACTIVE_OTHER,OVERDUE_SEVERITY_MEAN_CREDIT_ACTIVE_OTHER,OVERDUE_DURATION_RATIO_MEAN_CREDIT_ACTIVE_OTHER,DAYS_CREDIT_MEAN_CREDIT_ACTIVE_OTHER,DAYS_CREDIT_MIN_CREDIT_ACTIVE_OTHER,DAYS_CREDIT_MAX_CREDIT_ACTIVE_OTHER,CREDIT_DAY_OVERDUE_MEAN_CREDIT_ACTIVE_OTHER,DAYS_CREDIT_ENDDATE_MEAN_CREDIT_ACTIVE_OTHER,DAYS_CREDIT_ENDDATE_MAX_CREDIT_ACTIVE_OTHER,DAYS_ENDDATE_FACT_MEAN_CREDIT_ACTIVE_OTHER,DAYS_ENDDATE_FACT_MAX_CREDIT_ACTIVE_OTHER,AMT_CREDIT_MAX_OVERDUE_MEAN_CREDIT_ACTIVE_OTHER,CNT_CREDIT_PROLONG_SUM_CREDIT_ACTIVE_OTHER,AMT_CREDIT_SUM_SUM_CREDIT_ACTIVE_OTHER,AMT_CREDIT_SUM_MEAN_CREDIT_ACTIVE_OTHER,AMT_CREDIT_SUM_DEBT_SUM_CREDIT_ACTIVE_OTHER,AMT_CREDIT_SUM_DEBT_MEAN_CREDIT_ACTIVE_OTHER,AMT_CREDIT_SUM_LIMIT_MEAN_CREDIT_ACTIVE_OTHER,AMT_CREDIT_SUM_LIMIT_SUM_CREDIT_ACTIVE_OTHER,AMT_CREDIT_SUM_OVERDUE_MEAN_CREDIT_ACTIVE_OTHER,AMT_CREDIT_SUM_OVERDUE_SUM_CREDIT_ACTIVE_OTHER,DAYS_CREDIT_UPDATE_MEAN_CREDIT_ACTIVE_OTHER,DAYS_CREDIT_UPDATE_MIN_CREDIT_ACTIVE_OTHER
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
0,-63.000000,0.0,237.000000,9999.000000,0.000,0.0,8.551350e+04,77566.500000,0.000000,0.0,-28.000000,0.000,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
1,-2348.000000,0.0,-2044.000000,9999.000000,11666.385,0.0,2.835000e+04,0.000000,0.000000,0.0,-18.000000,0.000,3.777778,6.000000,0.185185,0.002523,0.012893,0.999994,0.000000,0.000000,0.000000,0.000000,0.185185,0.002523,0.002421,...,,,,,,,,,,,,,,,,,,,,,,,,,
2,-810.333333,0.0,-572.166667,2657.000000,1257.330,0.0,4.239018e+04,16069.500000,0.000000,0.0,-625.500000,0.000,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
3,-1131.428571,0.0,-658.714286,3488.714286,0.000,0.0,6.913607e+04,28395.642857,0.000000,0.0,-805.000000,0.000,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
4,-773.333333,0.0,474.500000,4619.000000,0.000,0.0,4.704240e+05,209084.250000,0.000000,0.0,-63.333333,0.000,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307504,-1149.750000,0.0,-557.750000,1759.500000,0.000,0.0,2.165383e+05,29941.875000,0.000000,0.0,-742.750000,0.000,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
307505,-461.500000,0.0,5931.500000,9999.000000,0.000,0.0,1.809000e+06,0.000000,270000.000000,0.0,-12.000000,90463.500,0.307692,1.000000,0.692308,0.237443,0.134161,0.999997,0.500000,0.228333,0.255484,0.500000,0.357143,0.016918,0.016318,...,,,,,,,,,,,,,,,,,,,,,,,,,
307506,-1390.500000,0.0,-1115.500000,-1095.250000,0.000,0.0,6.481575e+04,0.000000,0.000000,0.0,-862.500000,0.000,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
307508,-765.428571,0.0,610.285714,5577.285714,0.000,0.0,7.866153e+05,35964.000000,257.817857,0.0,-154.571429,3639.465,1.056418,1.428571,1.257893,0.192758,0.095737,1.033572,0.571429,0.161617,0.180409,0.606867,0.686465,0.031141,0.035007,...,,,,,,,,,,,,,,,,,,,,,,,,,


In [18]:
# Persist the feature table to the current working directory; the
# SK_ID_CURR index is written as the first CSV column.
bureau_final.to_csv('bureau_test.csv', mode='w+')

In [19]:
# Coverage check: how many applicants in application_train received bureau
# features (bureau_final is indexed by SK_ID_CURR)?
feature_curr_ids = bureau_final.index
train_curr_ids = application_train['SK_ID_CURR']
shared_curr_ids = set(feature_curr_ids).intersection(set(train_curr_ids))

print('Number of unique SK_ID_CURR in bureau_final:', feature_curr_ids.nunique())
print('Number of unique SK_ID_CURR in application_train:', train_curr_ids.nunique())
print('Number of unique SK_ID_CURR in both bureau_final and application_train:', len(shared_curr_ids))

Number of unique SK_ID_CURR in bureau_final: 263487
Number of unique SK_ID_CURR in application_train: 246009
Number of unique SK_ID_CURR in both bureau_final and application_train: 210806


------------------------------
# 4. FEATURE SELECTION