In this section, I will clean the data of two files: `installments_payments.csv` and `credit_card_balance.csv`. After that, with the knowledge and insights gained from the Exploratory Data Analysis, I will come up with a good set of features using Feature Engineering. 

# Preliminaries

## 1. Loading Libraries

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import uniform

import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

from sklearn.impute import SimpleImputer

## 2. Loading Data

In [2]:
# Load the two raw tables that are cleaned and aggregated below.
# Forward slashes are used instead of Windows-only ".\\" separators: they are
# accepted on Windows and POSIX alike, so the notebook runs on either platform.
# The paths are relative to the current working directory.
installments_payments = pd.read_csv("./Data/final_project_DP_dataset/installments_payments.csv")
cc_balance = pd.read_csv("./Data/final_project_DP_dataset/credit_card_balance.csv")

# Data Cleaning and Feature Engineering

## 1. `installments_payments.csv`

In [3]:
def installments_payments_preprocessing(installments_payments):
    """Clean the raw instalment-payments table.

    Steps, in order:
      1. Remove rows whose ``SK_ID_CURR`` is missing.
      2. Replace missing ``AMT_INSTALMENT`` / ``DAYS_ENTRY_PAYMENT`` with 0.
      3. Remove rows whose ``NUM_INSTALMENT_VERSION`` is one of a fixed list
         of values treated as outliers.

    Parameters
    ----------
    installments_payments : pandas.DataFrame
        Must contain ``SK_ID_CURR``, ``AMT_INSTALMENT``,
        ``DAYS_ENTRY_PAYMENT`` and ``NUM_INSTALMENT_VERSION``.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; the input frame is left unmodified.
    """
    # Select by boolean mask rather than dropping by index label: dropping by
    # label would also remove valid rows that merely share an index label
    # with a NaN row when the index is not unique.  The explicit copy keeps
    # the column assignment below from touching the caller's frame.
    has_customer_id = installments_payments['SK_ID_CURR'].notna()
    cleaned = installments_payments[has_customer_id].copy()

    # Replace the NaN values in AMT_INSTALMENT and DAYS_ENTRY_PAYMENT with 0
    zero_filled_columns = ['AMT_INSTALMENT', 'DAYS_ENTRY_PAYMENT']
    cleaned[zero_filled_columns] = cleaned[zero_filled_columns].fillna(value=0)

    # Handling outliers of the NUM_INSTALMENT_VERSION column
    # NOTE(review): the list is hard-coded; presumably taken from the EDA --
    # confirm it is still appropriate if the dataset changes.
    outlier_versions = [68, 178, 52, 48, 47, 49, 50]
    is_outlier = cleaned['NUM_INSTALMENT_VERSION'].isin(outlier_versions)

    return cleaned[~is_outlier]

In [4]:
def compute_ema(df, column, alpha=0.5, group_keys=None):
    """Return the exponentially weighted moving average of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; rows must already be in the order the
        average should be accumulated in.
    column : str
        Name of the column to smooth.
    alpha : float, default 0.5
        Smoothing factor passed to ``Series.ewm``.
    group_keys : list of str, optional
        When given, the average is restarted for every distinct combination
        of these columns so values of one group never influence another.
        When omitted, the average runs over the whole column.

    Returns
    -------
    pandas.Series
        Smoothed values, indexed like ``df``.
    """
    if group_keys is None:
        return df[column].ewm(alpha=alpha).mean()

    # Same per-group pattern the credit-card feature function uses.
    return df.groupby(group_keys, sort=False)[column].transform(
        lambda values: values.ewm(alpha=alpha).mean()
    )


def installment_payment_features_generating(installments_payments):
    """Build one row of instalment-payment features per ``SK_ID_CURR``.

    The rows are first aggregated per previous loan (``SK_ID_PREV``), both
    over the whole history and over instalments with
    ``DAYS_INSTALMENT > -365`` (suffix ``_LAST_1_YEAR``), and the per-loan
    aggregates are then aggregated again per customer.

    Parameters
    ----------
    installments_payments : pandas.DataFrame
        Must contain ``SK_ID_CURR``, ``SK_ID_PREV``,
        ``NUM_INSTALMENT_NUMBER``, ``DAYS_INSTALMENT``,
        ``DAYS_ENTRY_PAYMENT``, ``AMT_INSTALMENT`` and ``AMT_PAYMENT``.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Customer-level features keyed by ``SK_ID_CURR``.
    """
    loan_keys = ['SK_ID_CURR', 'SK_ID_PREV']

    # sort_values without inplace returns a new frame, so the helper columns
    # added below never leak into the caller's data.
    payments = installments_payments.sort_values(
        by=loan_keys + ['NUM_INSTALMENT_NUMBER'],
        ascending=True,
    )

    payments['DAYS_PAYMENT_DIFF'] = payments['DAYS_INSTALMENT'] - payments['DAYS_ENTRY_PAYMENT']
    # The "last year" filter below (DAYS_INSTALMENT > -365) shows that day
    # columns are offsets where larger means more recent; a payment entered
    # after its due day therefore yields a negative difference.
    # NOTE(review): assumes DAYS_ENTRY_PAYMENT uses the same offset scale -- confirm.
    payments['IS_DELAYED'] = payments['DAYS_PAYMENT_DIFF'] < 0
    payments['AMT_PAYMENT_DIFF'] = payments['AMT_INSTALMENT'] - payments['AMT_PAYMENT']
    # +1 in the denominator guards against division by zero.
    payments['AMT_PAYMENT_RATIO'] = payments['AMT_PAYMENT'] / (payments['AMT_INSTALMENT'] + 1)

    # Exponential moving averages are restarted for every loan; a single
    # running average over the whole sorted frame would carry one
    # loan's/customer's history into the next one.
    for source_column in ('DAYS_PAYMENT_DIFF', 'AMT_PAYMENT_DIFF', 'AMT_PAYMENT_RATIO'):
        payments['EXP_' + source_column] = compute_ema(
            payments, source_column, group_keys=loan_keys
        )

    # ---- Aggregation per previous loan, whole history ----
    prev_aggregations = {
        'NUM_INSTALMENT_NUMBER': ['max'],
        'DAYS_INSTALMENT': ['max', 'min'],
        'DAYS_ENTRY_PAYMENT': ['max', 'min'],
        'AMT_INSTALMENT': ['mean', 'sum', 'max'],
        'AMT_PAYMENT': ['mean', 'sum', 'max'],
        'DAYS_PAYMENT_DIFF': ['mean', 'min', 'max'],
        'EXP_DAYS_PAYMENT_DIFF': ['last'],
        'IS_DELAYED': ['sum'],
        'AMT_PAYMENT_DIFF': ['mean', 'min', 'max'],
        'AMT_PAYMENT_RATIO': ['mean', 'min', 'max'],
        'EXP_AMT_PAYMENT_DIFF': ['last'],
        'EXP_AMT_PAYMENT_RATIO': ['last'],
    }

    group_overall = payments.groupby(['SK_ID_PREV', 'SK_ID_CURR'], as_index=False).agg(prev_aggregations)
    group_overall.columns = ['_'.join(col).upper() for col in group_overall.columns]
    # The key columns have an empty second level, hence the trailing "_".
    group_overall.rename(columns={'SK_ID_PREV_': 'SK_ID_PREV', 'SK_ID_CURR_': 'SK_ID_CURR'}, inplace=True)

    # ---- Aggregation per previous loan, last year only ----
    last_year_aggregations = {
        'AMT_INSTALMENT': ['mean', 'sum', 'max'],
        'AMT_PAYMENT': ['mean', 'sum', 'max'],
        'DAYS_PAYMENT_DIFF': ['mean', 'min', 'max'],
        'AMT_PAYMENT_RATIO': ['mean', 'min', 'max'],
        'AMT_PAYMENT_DIFF': ['mean', 'min', 'max'],
        'EXP_DAYS_PAYMENT_DIFF': ['last'],
        'EXP_AMT_PAYMENT_RATIO': ['last'],
        'EXP_AMT_PAYMENT_DIFF': ['last'],
    }
    recent_payments = payments[payments['DAYS_INSTALMENT'] > -365]
    last_year_agg = recent_payments.groupby('SK_ID_PREV').agg(last_year_aggregations)
    last_year_agg.columns = ['_'.join(col).upper() + '_LAST_1_YEAR' for col in last_year_agg.columns]
    last_year_columns = list(last_year_agg.columns)

    installments_payments_agg_prev = group_overall.merge(last_year_agg, on='SK_ID_PREV', how='outer')

    # ---- Aggregation per customer ----
    main_features_aggregations = {
        'AMT_INSTALMENT_MEAN': ['mean', 'sum', 'max'],
        'AMT_INSTALMENT_SUM': ['mean', 'sum', 'max'],
        'AMT_INSTALMENT_MAX': ['mean'],
        'AMT_PAYMENT_MEAN': ['mean', 'sum', 'max'],
        'AMT_PAYMENT_SUM': ['mean', 'sum', 'max'],
        'AMT_PAYMENT_MAX': ['mean'],
        'DAYS_PAYMENT_DIFF_MEAN': ['mean', 'min', 'max'],
        'DAYS_PAYMENT_DIFF_MIN': ['mean', 'min'],
        'DAYS_PAYMENT_DIFF_MAX': ['mean', 'max'],
        'AMT_PAYMENT_RATIO_MEAN': ['mean', 'min', 'max'],
        'AMT_PAYMENT_RATIO_MIN': ['mean', 'min'],
        'AMT_PAYMENT_RATIO_MAX': ['mean', 'max'],
        'AMT_PAYMENT_DIFF_MEAN': ['mean', 'min', 'max'],
        'AMT_PAYMENT_DIFF_MIN': ['mean', 'min'],
        'AMT_PAYMENT_DIFF_MAX': ['mean', 'max'],
        'EXP_DAYS_PAYMENT_DIFF_LAST': ['mean'],
        'EXP_AMT_PAYMENT_RATIO_LAST': ['mean'],
        'EXP_AMT_PAYMENT_DIFF_LAST': ['mean'],
    }
    grouped_main_features = installments_payments_agg_prev.groupby('SK_ID_CURR').agg(main_features_aggregations)
    grouped_main_features.columns = ['_'.join(col).upper() for col in grouped_main_features.columns]

    # The last-year columns are averaged per customer.  They are selected by
    # name instead of by a hard-coded column position, so the selection
    # survives edits to the aggregation dictionaries above.
    # NOTE(review): NUM_INSTALMENT_NUMBER_MAX, DAYS_INSTALMENT_*,
    # DAYS_ENTRY_PAYMENT_* and IS_DELAYED_SUM are computed per loan but are
    # not carried into the customer-level output; this matches the previous
    # output schema -- confirm whether that omission is intended.
    grouped_remaining_features = (
        installments_payments_agg_prev[['SK_ID_CURR'] + last_year_columns]
        .groupby('SK_ID_CURR')
        .mean()
    )
    installments_payments_aggregated = grouped_main_features.merge(
        grouped_remaining_features, on='SK_ID_CURR', how='inner'
    )

    return installments_payments_aggregated


## 2. `credit_card_balance.csv`

In [7]:
def cc_balance_preprocessing(cc_balance, payment_outlier_threshold=4000000):
    """Clean the raw credit-card-balance table.

    Steps, in order:
      1. Blank out ``AMT_PAYMENT_CURRENT`` values above
         ``payment_outlier_threshold``.
      2. Replace every missing value in the ``float64`` / ``int64`` columns
         with that column's mean.

    The outlier is removed *before* imputing so that it neither inflates the
    mean used for filling nor leaves a missing value behind in the result.

    Parameters
    ----------
    cc_balance : pandas.DataFrame
        Must contain ``AMT_PAYMENT_CURRENT``.  The frame is updated in place.
    payment_outlier_threshold : float, default 4000000
        ``AMT_PAYMENT_CURRENT`` values strictly greater than this are treated
        as outliers.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in, after cleaning.
    """
    # Handling outliers (removing abruptly large values in AMT_PAYMENT_CURRENT).
    # A single .loc assignment is used instead of chained indexing
    # (df[col][mask] = ...), which may write to a temporary copy and is a
    # silent no-op when pandas copy-on-write is enabled.
    is_outlier = cc_balance['AMT_PAYMENT_CURRENT'] > payment_outlier_threshold
    if is_outlier.any():
        cc_balance.loc[is_outlier, 'AMT_PAYMENT_CURRENT'] = np.nan

    # Replace NaN values in the numerical columns with the column mean.
    # Integer columns cannot hold NaN, so they pass through unchanged and keep
    # their integer dtype (identifier columns stay integers).
    # NOTE(review): float identifier columns containing NaN would also be
    # mean-filled here, as before -- confirm the IDs are complete.
    columns = cc_balance.select_dtypes(include=['float64', 'int64']).columns
    column_means = cc_balance[columns].mean()
    cc_balance[columns] = cc_balance[columns].fillna(column_means)

    return cc_balance

In [8]:
def _cc_aggregate_by_prev(frame, aggregations, suffix=''):
    """Aggregate ``frame`` per SK_ID_PREV and flatten labels to 'COLUMN_STAT' + suffix."""
    aggregated = frame.groupby('SK_ID_PREV').agg(aggregations)
    aggregated.columns = ['_'.join(label).upper() + suffix for label in aggregated.columns]
    return aggregated


def _cc_outer_merge_on_prev(frames):
    """Outer-merge a list of SK_ID_PREV-keyed frames from left to right."""
    merged = frames[0]
    for frame in frames[1:]:
        merged = merged.merge(frame, on='SK_ID_PREV', how='outer')
    return merged


def cc_balance_features_generating(cc_balance):
    """Build one row of credit-card-balance features per ``SK_ID_CURR``.

    Monthly rows are enriched with ratio/difference features and grouped
    exponentially weighted averages, aggregated per card (``SK_ID_PREV``) in
    four ways (whole history, per contract status, per year bucket, share of
    each contract status), merged, and finally averaged per customer.

    Parameters
    ----------
    cc_balance : pandas.DataFrame
        Monthly credit-card records containing the identifier columns
        ``SK_ID_PREV`` / ``SK_ID_CURR``, ``MONTHS_BALANCE``,
        ``NAME_CONTRACT_STATUS`` and the ``AMT_*``, ``CNT_*`` and ``SK_DPD*``
        columns referenced below.  The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``SK_ID_CURR`` (kept as a regular column) holding the mean
        of every per-card feature.
    """
    # assign() returns a new frame, so taking the absolute value does not
    # overwrite MONTHS_BALANCE in the caller's data.
    cc_balance = cc_balance.assign(MONTHS_BALANCE=np.abs(cc_balance['MONTHS_BALANCE']))
    # Largest month offset first within each card, so the last row of a card
    # is the one with the smallest offset and 'last' picks it up.
    cc_balance = cc_balance.sort_values(by=['SK_ID_PREV', 'MONTHS_BALANCE'], ascending=[True, False])

    # One-hot encoding the categorical column NAME_CONTRACT_STATUS
    contract_status_dummies = pd.get_dummies(cc_balance['NAME_CONTRACT_STATUS'], prefix="CONTRACT")

    # ---- Row-level engineered features ----
    # NOTE(review): AMT_DRAWINGS_CURRENT / CNT_DRAWINGS_CURRENT look like
    # totals of the ATM/OTHER/POS parts, and CNT_INSTALMENT_MATURE_CUM is not
    # a drawing count -- confirm these sums are intended.
    withdrawal_cols = ['AMT_DRAWINGS_ATM_CURRENT', 'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT', 'AMT_DRAWINGS_POS_CURRENT']
    cc_balance['AMT_DRAWING_SUM'] = cc_balance[withdrawal_cols].sum(axis=1)
    drawing_cols = ['CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_CURRENT', 'CNT_DRAWINGS_OTHER_CURRENT', 'CNT_DRAWINGS_POS_CURRENT', 'CNT_INSTALMENT_MATURE_CUM']
    cc_balance['CNT_DRAWING_SUM'] = cc_balance[drawing_cols].sum(axis=1)
    # Every ratio adds 1 to its denominator to avoid division by zero.
    cc_balance['BALANCE_LIMIT_RATIO'] = cc_balance['AMT_BALANCE'] / (cc_balance['AMT_CREDIT_LIMIT_ACTUAL'] + 1)
    cc_balance['AMT_INTEREST_RECEIVABLE'] = cc_balance['AMT_TOTAL_RECEIVABLE'] - cc_balance['AMT_RECEIVABLE_PRINCIPAL']
    cc_balance['PAYMENT_BALANCE_RATIO'] = cc_balance['AMT_PAYMENT_CURRENT'] / (cc_balance['AMT_BALANCE'] + 1)
    cc_balance['MIN_PAYMENT_RATIO'] = cc_balance['AMT_PAYMENT_CURRENT'] / (cc_balance['AMT_INST_MIN_REGULARITY'] + 1)
    cc_balance['MIN_PAYMENT_TOTAL_RATIO'] = cc_balance['AMT_PAYMENT_TOTAL_CURRENT'] / (cc_balance['AMT_INST_MIN_REGULARITY'] + 1)
    cc_balance['MIN_PAYMENT_DIFF'] = cc_balance['AMT_PAYMENT_CURRENT'] - cc_balance['AMT_INST_MIN_REGULARITY']
    cc_balance['MIN_PAYMENT_TOTAL_DIFF'] = cc_balance['AMT_PAYMENT_TOTAL_CURRENT'] - cc_balance['AMT_INST_MIN_REGULARITY']
    cc_balance['PAYMENT_RECEIVABLE_RATIO'] = cc_balance['AMT_PAYMENT_TOTAL_CURRENT'] / (cc_balance['AMT_TOTAL_RECEIVABLE'] + 1)
    cc_balance['SK_DPD_RATIO'] = cc_balance['SK_DPD'] / (cc_balance['SK_DPD_DEF'] + 1)
    # NOTE(review): grouping by (SK_ID_CURR, MONTHS_BALANCE) accumulates across
    # a customer's cards within one month, not across months -- confirm this
    # is the intended meaning of "cumulative".
    customer_month = cc_balance.groupby(['SK_ID_CURR', 'MONTHS_BALANCE'])
    cc_balance['CUMULATIVE_PAYMENT'] = customer_month['AMT_PAYMENT_TOTAL_CURRENT'].cumsum()
    cc_balance['CUMULATIVE_BALANCE'] = customer_month['AMT_BALANCE'].cumsum()

    # Compute the Exponential Weighted Moving Average, restarted per card
    rolling_columns = [
        'AMT_BALANCE',
        'AMT_CREDIT_LIMIT_ACTUAL',
        'AMT_RECEIVABLE_PRINCIPAL',
        'AMT_RECIVABLE',
        'AMT_TOTAL_RECEIVABLE',
        'AMT_DRAWING_SUM',
        'CNT_DRAWING_SUM',
        'BALANCE_LIMIT_RATIO',
        'AMT_INTEREST_RECEIVABLE',
        'MIN_PAYMENT_RATIO',
        'MIN_PAYMENT_DIFF',
        'MIN_PAYMENT_TOTAL_RATIO',
        'MIN_PAYMENT_TOTAL_DIFF',
        'SK_DPD_RATIO',
        'CUMULATIVE_PAYMENT',
        'CUMULATIVE_BALANCE',
    ]
    exp_weighted_columns = ['EXP_' + name for name in rolling_columns]
    cc_balance[exp_weighted_columns] = cc_balance.groupby(['SK_ID_CURR', 'SK_ID_PREV'])[rolling_columns].transform(
        lambda values: values.ewm(alpha=0.7).mean()
    )

    # ---- Aggregate the whole history by SK_ID_PREV ----
    aggregations = {
        'SK_ID_CURR': ['first'],
        'MONTHS_BALANCE': ['max'],
        'AMT_BALANCE': ['sum', 'mean', 'max'],
        'AMT_CREDIT_LIMIT_ACTUAL': ['sum', 'mean', 'max'],
        'AMT_DRAWINGS_ATM_CURRENT': ['sum', 'max'],
        'AMT_DRAWINGS_CURRENT': ['sum', 'max'],
        'AMT_DRAWINGS_OTHER_CURRENT': ['sum', 'max'],
        'AMT_DRAWINGS_POS_CURRENT': ['sum', 'max'],
        'AMT_INST_MIN_REGULARITY': ['mean', 'min', 'max'],
        'AMT_PAYMENT_CURRENT': ['mean', 'min', 'max'],
        'AMT_PAYMENT_TOTAL_CURRENT': ['mean', 'min', 'max'],
        'AMT_RECEIVABLE_PRINCIPAL': ['sum', 'mean', 'max'],
        'AMT_RECIVABLE': ['sum', 'mean', 'max'],
        'AMT_TOTAL_RECEIVABLE': ['sum', 'mean', 'max'],
        'CNT_DRAWINGS_ATM_CURRENT': ['sum', 'max'],
        'CNT_DRAWINGS_CURRENT': ['sum', 'max'],
        'CNT_DRAWINGS_OTHER_CURRENT': ['sum', 'max'],
        'CNT_DRAWINGS_POS_CURRENT': ['sum', 'max'],
        'CNT_INSTALMENT_MATURE_CUM': ['sum', 'max', 'min'],
        'SK_DPD': ['sum', 'max'],
        'SK_DPD_DEF': ['sum', 'max'],

        'AMT_DRAWING_SUM': ['sum', 'max'],
        'CNT_DRAWING_SUM': ['sum', 'max'],
        'BALANCE_LIMIT_RATIO': ['mean', 'max', 'min'],
        'AMT_INTEREST_RECEIVABLE': ['min', 'mean'],
        'PAYMENT_BALANCE_RATIO': ['min', 'mean'],
        'MIN_PAYMENT_RATIO': ['min', 'mean'],
        'MIN_PAYMENT_TOTAL_RATIO': ['min', 'mean'],
        'MIN_PAYMENT_DIFF': ['min', 'mean'],
        'MIN_PAYMENT_TOTAL_DIFF': ['min', 'mean'],
        'PAYMENT_RECEIVABLE_RATIO': ['min', 'mean'],
        'SK_DPD_RATIO': ['max', 'mean'],
        'CUMULATIVE_PAYMENT': ['max', 'sum', 'mean'],
        'CUMULATIVE_BALANCE': ['max', 'sum', 'mean'],
    }
    # Every smoothed column contributes its final value, in rolling_columns order.
    aggregations.update({name: ['last'] for name in exp_weighted_columns})

    cc_balance_aggregated_overall = _cc_aggregate_by_prev(cc_balance, aggregations)
    cc_balance_aggregated_overall.rename(columns={'SK_ID_CURR_FIRST': 'SK_ID_CURR'}, inplace=True)

    # ---- Statistics shared by the per-status and per-year aggregations ----
    # Key order is significant: it fixes the order of the output columns.
    subset_statistics = {
        'SK_DPD': ['sum', 'max'],
        'SK_DPD_DEF': ['sum', 'max'],
        'BALANCE_LIMIT_RATIO': ['mean', 'max', 'min'],
        'AMT_INTEREST_RECEIVABLE': ['min', 'mean'],
        'CNT_DRAWING_SUM': ['sum', 'max'],
        'MIN_PAYMENT_RATIO': ['min', 'mean'],
        'MIN_PAYMENT_DIFF': ['min', 'mean'],
        'MIN_PAYMENT_TOTAL_RATIO': ['min', 'mean'],
        'MIN_PAYMENT_TOTAL_DIFF': ['min', 'mean'],
        'SK_DPD_RATIO': ['max', 'mean'],
    }
    category_last_columns = [
        'EXP_AMT_DRAWING_SUM',
        'EXP_BALANCE_LIMIT_RATIO',
        'EXP_CNT_DRAWING_SUM',
        'EXP_MIN_PAYMENT_RATIO',
        'EXP_MIN_PAYMENT_DIFF',
        'EXP_MIN_PAYMENT_TOTAL_DIFF',
        'EXP_MIN_PAYMENT_TOTAL_RATIO',
        'EXP_AMT_INTEREST_RECEIVABLE',
        'EXP_SK_DPD_RATIO',
    ]
    year_last_columns = [
        'EXP_AMT_DRAWING_SUM',
        'EXP_BALANCE_LIMIT_RATIO',
        'EXP_AMT_INTEREST_RECEIVABLE',
        'EXP_CNT_DRAWING_SUM',
        'EXP_MIN_PAYMENT_RATIO',
        'EXP_MIN_PAYMENT_DIFF',
        'EXP_MIN_PAYMENT_TOTAL_RATIO',
        'EXP_MIN_PAYMENT_TOTAL_DIFF',
        'EXP_SK_DPD_RATIO',
        'EXP_CUMULATIVE_PAYMENT',
        'EXP_CUMULATIVE_BALANCE',
    ]
    aggregations_for_categories = {
        **subset_statistics,
        **{name: ['last'] for name in category_last_columns},
    }
    aggregations_for_year = {
        **subset_statistics,
        'CUMULATIVE_PAYMENT': ['max', 'sum', 'mean'],
        'CUMULATIVE_BALANCE': ['max', 'sum', 'mean'],
        **{name: ['last'] for name in year_last_columns},
    }

    # ---- Aggregate data over Contract Status: Active, Completed, everything else ----
    contract_status_categories = ['Active', 'Completed']
    contract_status = cc_balance['NAME_CONTRACT_STATUS']
    status_parts = [
        _cc_aggregate_by_prev(
            cc_balance[contract_status == contract_type],
            aggregations_for_categories,
            '_' + contract_type.upper(),
        )
        for contract_type in contract_status_categories
    ]
    status_parts.append(
        _cc_aggregate_by_prev(
            cc_balance[~contract_status.isin(contract_status_categories)],
            aggregations_for_categories,
            '_REST',
        )
    )
    cc_balance_categories_agg = _cc_outer_merge_on_prev(status_parts)

    # ---- Aggregate data over year buckets: 0, 1, and 2-or-more ----
    year_balance = cc_balance['MONTHS_BALANCE'] // 12
    year_parts = [
        _cc_aggregate_by_prev(
            cc_balance[year_balance == year],
            aggregations_for_year,
            '_YEAR_' + str(year),
        )
        for year in range(2)
    ]
    year_parts.append(
        _cc_aggregate_by_prev(cc_balance[year_balance >= 2], aggregations_for_year, '_YEAR_REST')
    )
    cc_balance_year_agg = _cc_outer_merge_on_prev(year_parts)

    # ---- Share of months spent in each contract status, per card ----
    aggregated_cc_contract = (
        pd.concat([cc_balance[['SK_ID_PREV']], contract_status_dummies], axis=1)
        .groupby('SK_ID_PREV')
        .mean()
    )

    cc_aggregated = _cc_outer_merge_on_prev([
        cc_balance_aggregated_overall,
        cc_balance_categories_agg,
        cc_balance_year_agg,
        aggregated_cc_contract,
    ])

    # Collapse the per-card rows into one row per customer.
    cc_aggregated = cc_aggregated.groupby('SK_ID_CURR', as_index=False).mean()

    return cc_aggregated