# Home Credit Default Feature Construction

In [None]:
import numpy as np
import pandas as pd

### Load the data

In [None]:
# Read each raw CSV from the working directory into its own DataFrame.
# Main application table.
application = pd.read_csv(filepath_or_buffer="application_train.csv")
# Bureau tables.
bureau_balance = pd.read_csv(filepath_or_buffer="bureau_balance.csv")
bureau = pd.read_csv(filepath_or_buffer="bureau.csv")
# Remaining history tables.
credit_card = pd.read_csv(filepath_or_buffer="credit_card_balance.csv")
installments = pd.read_csv(filepath_or_buffer="installments_payments.csv")
pos_cash = pd.read_csv(filepath_or_buffer="POS_CASH_balance.csv")
previous_application = pd.read_csv(filepath_or_buffer="previous_application.csv")

# Preprocessing and feature construction

## Application

In [None]:
# Show the first five rows of `application`, ordered by SK_ID_CURR.
application.sort_values(by="SK_ID_CURR").head(n=5)

## Bureau

In [None]:
# Show the first five rows of `bureau`, ordered by SK_ID_CURR.
bureau.sort_values(by="SK_ID_CURR").head(n=5)

### Derived features

* Number of bureau records
* Maximum days overdue
* Number of active trades
* Days since most recent application
* Maximum credit overdue
* Total credit amount
* Number of credit cards
* Sum of annuity amounts

In [None]:
def bureau_agg_func(g):
    """Collapse one group of bureau rows into a single Series of features.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of one group (the notebook applies this per ``SK_ID_CURR``).
        Must contain ``SK_ID_BUREAU``, ``CREDIT_ACTIVE``,
        ``CREDIT_DAY_OVERDUE``, ``DAYS_CREDIT``, ``AMT_CREDIT_MAX_OVERDUE``,
        ``AMT_CREDIT_SUM``, ``CREDIT_TYPE`` and ``AMT_ANNUITY``.

    Returns
    -------
    pandas.Series
        One value per derived feature, indexed by feature name.
    """
    active_mask = g["CREDIT_ACTIVE"] == "Active"
    credit_card_mask = g["CREDIT_TYPE"] == "Credit card"

    features = {}
    # Largest bureau ID in the group; kept so the result can later be joined
    # on SK_ID_BUREAU.
    features["SK_ID_BUREAU"] = g["SK_ID_BUREAU"].max()
    features["COUNT_BUREAU_RECORDS"] = g.shape[0]
    features["COUNT_ACTIVE"] = active_mask.sum()
    features["MAX_CREDIT_DAY_OVERDUE"] = g["CREDIT_DAY_OVERDUE"].max()
    # Negated maximum of DAYS_CREDIT.
    # NOTE(review): presumably DAYS_CREDIT is a non-positive day offset, so
    # this is the time since the most recent credit -- confirm.
    features["DAYS_SINCE_APPLIED"] = -g["DAYS_CREDIT"].max()
    features["MAX_AMT_CREDIT_MAX_OVERDUE"] = g["AMT_CREDIT_MAX_OVERDUE"].max()
    features["TOTAL_AMT_CREDIT_SUM"] = g["AMT_CREDIT_SUM"].sum()
    features["COUNT_CREDIT_CARD"] = credit_card_mask.sum()
    features["SUM_AMT_ANNUITY"] = g["AMT_ANNUITY"].sum()

    return pd.Series(features)

In [None]:
# Build one row of bureau features per SK_ID_CURR, then preview the result.
bureau_agg = (
    bureau
    .groupby(by="SK_ID_CURR")
    .apply(func=bureau_agg_func)
    .reset_index(drop=False)
)
bureau_agg.head(n=5)

## Bureau balance

In [None]:
# Show the first five rows of `bureau_balance`, ordered by bureau ID and month.
bureau_balance.sort_values(by=["SK_ID_BUREAU", "MONTHS_BALANCE"]).head(n=5)

### Derived features

* Proportion and count of months current (statuses `C` and `X`, i.e. including the unreported status)
* Worst delinquent status

In [None]:
def bureau_balance_agg_func(g):
    """Summarise the monthly ``STATUS`` codes of one group of rows.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of one group (the notebook applies this per ``SK_ID_BUREAU``).
        Must contain a ``STATUS`` column whose values are ``"C"``, ``"X"``
        or strings that ``int()`` can parse.

    Returns
    -------
    pandas.Series
        ``PROP_CURRENT`` (share of rows with status C or X),
        ``COUNT_CURRENT`` (number of such rows) and ``WORST_DQ_STATUS``
        (largest numeric status, with C and X counted as 0).
    """
    current_codes = ["C", "X"]

    def _is_current(code):
        return code in current_codes

    def _delinquency_level(code):
        # C and X carry no delinquency level, so they map to 0.
        if _is_current(code):
            return 0
        return int(code)

    status = g["STATUS"]
    current_flags = status.apply(_is_current)
    delinquency = status.apply(_delinquency_level)

    summary = {
        "PROP_CURRENT": current_flags.mean(),
        "COUNT_CURRENT": current_flags.sum(),
        "WORST_DQ_STATUS": delinquency.max(),
    }
    return pd.Series(summary)

In [None]:
# Build one row of balance features per SK_ID_BUREAU, then preview the result.
bureau_balance_agg = (
    bureau_balance
    .groupby(by="SK_ID_BUREAU")
    .apply(func=bureau_balance_agg_func)
    .reset_index(drop=False)
)
bureau_balance_agg.head(n=5)

### Join application and bureau data

In [None]:
# Attach the balance features to the bureau aggregates with a LEFT join.
# An inner join here would discard every applicant whose retained
# SK_ID_BUREAU has no rows in `bureau_balance_agg`, so all of that applicant's
# bureau features would become missing after the join to `application`.
bureau_features = pd.merge(bureau_agg, bureau_balance_agg,
                           how="left",
                           on="SK_ID_BUREAU")

# Keep every application row; applicants without bureau history get NaN.
df = pd.merge(application, bureau_features,
              how="left",
              on="SK_ID_CURR")

# Release the intermediate frames once they are merged.
del bureau_agg, bureau_balance_agg, bureau_features

## Credit card

In [None]:
# Show the first five rows of `credit_card`, ordered by applicant, previous
# credit ID and month.
credit_card.sort_values(by=["SK_ID_CURR", "SK_ID_PREV", "MONTHS_BALANCE"]).head(n=5)

### Derived features

#### Balance and utilization

* Average utilization
* Max utilization
* Most recent utilization

#### ATM withdrawals

* Average count
* Maximum count

#### Days past due

* Maximum days past due
* Maximum days past due with tolerance

In [None]:
def credit_card_agg_func(g):
    """Summarise one group of credit card balance rows as a Series of features.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of one group (the notebook applies this per ``SK_ID_CURR``).
        Must contain ``AMT_BALANCE``, ``AMT_CREDIT_LIMIT_ACTUAL``,
        ``SK_DPD``, ``SK_DPD_DEF`` and ``CNT_DRAWINGS_ATM_CURRENT``.

    Returns
    -------
    pandas.Series
        Average/maximum balance, average/maximum utilization
        (balance divided by credit limit), maximum of ``SK_DPD`` and
        ``SK_DPD_DEF``, and average/maximum ATM drawing count.
    """
    balance = g["AMT_BALANCE"]
    atm_drawings = g["CNT_DRAWINGS_ATM_CURRENT"]

    # A credit limit of 0 makes the ratio +/-inf, and a single inf would turn
    # both the average and the maximum utilization into inf. Treat those rows
    # as missing so they are skipped by the aggregations instead.
    utilization = (balance / g["AMT_CREDIT_LIMIT_ACTUAL"]).replace(
        [np.inf, -np.inf], np.nan
    )

    d = {
        "AVG_BALANCE": np.mean(balance),
        "MAX_BALANCE": np.max(balance),
        "AVG_UTILIZATION": np.mean(utilization),
        "MAX_UTILIZATION": np.max(utilization),
        "MAX_DPD": np.max(g["SK_DPD"]),
        "MAX_DPD_DEF": np.max(g["SK_DPD_DEF"]),
        "AVG_CNT_DRAWINGS_ATM_CURRENT": np.mean(atm_drawings),
        "MAX_CNT_DRAWINGS_ATM_CURRENT": np.max(atm_drawings),
    }

    return pd.Series(d)

In [None]:
# Build one row of credit card features per SK_ID_CURR, then preview the result.
credit_card_agg = (
    credit_card
    .groupby(by="SK_ID_CURR")
    .apply(func=credit_card_agg_func)
    .reset_index(drop=False)
)
credit_card_agg.head(n=5)

### Join features

In [None]:
# Left-join the credit card features onto the running feature table, then
# release the aggregate frame.
df = df.merge(credit_card_agg, on="SK_ID_CURR", how="left")
del credit_card_agg

## Installments

In [None]:
# Show the first five rows of `installments`, ordered by applicant, previous
# credit ID and instalment number.
installments.sort_values(by=["SK_ID_CURR", "SK_ID_PREV", "NUM_INSTALMENT_NUMBER"]).head(n=5)

### Derived features

* Number of underpayments (less than 50% of amount due)
* Average payment size

In [None]:
def installment_agg_func(g):
    """Summarise one group of instalment payment rows.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of one group (the notebook applies this per ``SK_ID_CURR``).
        Must contain ``AMT_PAYMENT`` and ``AMT_INSTALMENT``.

    Returns
    -------
    pandas.Series
        ``COUNT_UNDERPAYMENT`` (rows where payment / instalment is below 0.5)
        and ``AVG_PAYMENT_SIZE`` (mean of ``AMT_PAYMENT``).
    """
    paid = g["AMT_PAYMENT"]
    paid_fraction = paid / g["AMT_INSTALMENT"]
    underpaid = paid_fraction < 0.5

    summary = {
        "COUNT_UNDERPAYMENT": underpaid.sum(),
        "AVG_PAYMENT_SIZE": paid.mean(),
    }
    return pd.Series(summary)

In [None]:
# Build one row of instalment features per SK_ID_CURR, then preview the result.
installment_agg = (
    installments
    .groupby(by="SK_ID_CURR")
    .apply(func=installment_agg_func)
    .reset_index(drop=False)
)
installment_agg.head(n=5)

### Join features

In [None]:
# Left-join the instalment features onto the running feature table, then
# release the aggregate frame.
df = df.merge(installment_agg, on="SK_ID_CURR", how="left")
del installment_agg

## Point of sale cash

In [None]:
# Show the first five rows of `pos_cash`, ordered by applicant and previous
# credit ID.
pos_cash.sort_values(by=["SK_ID_CURR", "SK_ID_PREV"]).head(n=5)

### Derived features

* Worst days past due status
* Worst days past due status with tolerance

In [None]:
def pos_cash_agg_func(g):
    """Return the largest ``SK_DPD`` and ``SK_DPD_DEF`` of one group of rows.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of one group (the notebook applies this per ``SK_ID_CURR``).
        Must contain ``SK_DPD`` and ``SK_DPD_DEF``.

    Returns
    -------
    pandas.Series
        ``MAX_POS_DPD`` and ``MAX_POS_DPD_DEF``.
    """
    # (output feature name, source column) pairs, in output order.
    feature_sources = (
        ("MAX_POS_DPD", "SK_DPD"),
        ("MAX_POS_DPD_DEF", "SK_DPD_DEF"),
    )
    worst = {feature: g[column].max() for feature, column in feature_sources}
    return pd.Series(worst)

In [None]:
# Build one row of point-of-sale features per SK_ID_CURR, then preview the result.
pos_cash_agg = (
    pos_cash
    .groupby(by="SK_ID_CURR")
    .apply(func=pos_cash_agg_func)
    .reset_index(drop=False)
)
pos_cash_agg.head(n=5)

### Join features

In [None]:
# Left-join the point-of-sale features onto the running feature table, then
# release the aggregate frame.
df = df.merge(pos_cash_agg, on="SK_ID_CURR", how="left")
del pos_cash_agg

## Previous application

In [None]:
# Show the first five rows of `previous_application`, ordered by applicant and
# previous credit ID.
previous_application.sort_values(by=["SK_ID_CURR", "SK_ID_PREV"]).head(n=5)

### Derived features

* Count of previous applications
* Average and maximum credit amount requested
* Averages involving down payment
* Average proportion approved
* Average interest rate of previous applications
* Count or proportion of different loan purposes
* Count or proportion of payment methods
* Count or proportion of rejection reasons
* Count or proportion of insurance requested
* Count of walk-in applications
* Average and maximum interest rate group

In [None]:
def previous_agg_func(g):
    """Summarise one group of previous-application rows as a Series of features.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of one group (the notebook applies this per ``SK_ID_CURR``).
        Must contain ``AMT_APPLICATION``, ``AMT_CREDIT``,
        ``RATE_DOWN_PAYMENT``, ``RATE_INTEREST_PRIMARY``,
        ``NAME_CASH_LOAN_PURPOSE``, ``CODE_REJECT_REASON``,
        ``NFLAG_INSURED_ON_APPROVAL``, ``NAME_PRODUCT_TYPE`` and
        ``NAME_YIELD_GROUP``.

    Returns
    -------
    pandas.Series
        Row count, average/maximum requested amount, average down-payment
        rate, average credit-to-requested ratio, average primary interest
        rate, and counts of selected loan purposes, reject reasons,
        insurance flags, product types and yield groups.
    """
    requested = g["AMT_APPLICATION"]
    purpose = g["NAME_CASH_LOAN_PURPOSE"]
    reject_reason = g["CODE_REJECT_REASON"]
    yield_group = g["NAME_YIELD_GROUP"]

    # A requested amount of 0 makes the ratio +/-inf, and one inf would turn
    # the whole average into inf. Treat those rows as missing so the mean
    # skips them.
    prop_approved = (g["AMT_CREDIT"] / requested).replace(
        [np.inf, -np.inf], np.nan
    )

    # Vectorized prefix match; missing yield groups count as "not low"
    # instead of raising AttributeError on a non-string value.
    is_low_yield = yield_group.astype(str).str.startswith("low", na=False)

    d = {
        "COUNT_PREV_APP": len(g),
        "AVG_PREV_REQ_AMOUNT": np.mean(requested),
        "MAX_PREV_REQ_AMOUNT": np.max(requested),
        "AVG_PREV_RATE_DOWNPAYMENT": np.mean(g["RATE_DOWN_PAYMENT"]),
        "AVG_PREV_PROP_APPROVED": np.mean(prop_approved),
        "AVG_PREV_INT_RATE": np.mean(g["RATE_INTEREST_PRIMARY"]),
        "SUM_PREV_URGENT_NEEDS": np.sum(purpose == "Urgent needs"),
        "SUM_PREV_REPAIRS": np.sum(purpose == "Repairs"),
        "SUM_PREV_OTHER": np.sum(purpose == "Other"),
        "SUM_PREV_LIMIT_REJECT": np.sum(reject_reason == "LIMIT"),
        "SUM_PREV_HC_REJECT": np.sum(reject_reason == "HC"),
        "SUM_PREV_INSURE_REQ": np.sum(g["NFLAG_INSURED_ON_APPROVAL"]),
        "COUNT_PREV_WALK_IN": np.sum(g["NAME_PRODUCT_TYPE"] == "walk-in"),
        "COUNT_PREV_HIGH_YIELD": np.sum(yield_group == "high"),
        "COUNT_PREV_LOW_YIELD": np.sum(is_low_yield),
    }

    return pd.Series(d)

In [None]:
# Build one row of previous-application features per SK_ID_CURR, then preview
# the result.
previous_agg = (
    previous_application
    .groupby(by="SK_ID_CURR")
    .apply(func=previous_agg_func)
    .reset_index(drop=False)
)
previous_agg.head(n=5)

### Join features

In [None]:
# Left-join the previous-application features onto the running feature table.
df = df.merge(previous_agg, on="SK_ID_CURR", how="left")
# SK_ID_BUREAU was only needed as a join key; remove it from the final table.
df.drop(columns="SK_ID_BUREAU", inplace=True)
# Release the aggregate frame and all raw tables.
del previous_agg
del (application, bureau_balance, bureau, credit_card,
     installments, pos_cash, previous_application)