In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import gc
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype
from contextlib import contextmanager
import time
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
import multiprocessing

In [2]:
print(os.listdir("data/"))

['application_test.csv', 'application_train.csv', 'bureau.csv', 'bureau_balance.csv', 'credit_card_balance.csv', 'HomeCredit_columns_description.csv', 'installments_payments.csv', 'POS_CASH_balance.csv', 'previous_application.csv', 'sample_submission.csv']


In [3]:
app_train_df = pd.read_csv("data/application_train.csv")
app_test_df = pd.read_csv("data/application_test.csv")
bureau = pd.read_csv("data/bureau.csv")
bureau_balance = pd.read_csv("data/bureau_balance.csv")
column_description_df = pd.read_csv("data/HomeCredit_columns_description.csv", encoding='ISO-8859-1')

train_labels = app_train_df.iloc[:,:2]
test_id = app_test_df.iloc[:,0]

In [4]:
@contextmanager
def timer(name):
    time_before_func_call = time.time()
    yield
    time_after_func_call = time.time()
    print('##################')
    print('{} - done in {:.0f}s \n'.format(name, time_after_func_call - time_before_func_call))


def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df

    
def column_description_lookup(column):
    print(column_description_df[column_description_df['Row']==column].Description.values[0])

    
def get_app_domain_features(df):
    df = df.copy()
    if 'DAYS_EMPLOYED' in df:
        df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
    df['CREDIT_INCOME_PERCENT'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    df['ANNUITY_INCOME_PERCENT'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['DAYS_EMPLOYED_PERCENT'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    
    return df


def get_previous_app_domain_features(df):
    df = df.copy()
    df['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
    df['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
    df['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
    df['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
    df['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
    df['APP_CREDIT_PERC'] = df['AMT_APPLICATION'] / df['AMT_CREDIT']
    
    return df


def get_installment_domain_features(df):
    df = df.copy()
    df['PAYMENT_PERC'] = df['AMT_PAYMENT'] / df['AMT_INSTALMENT']
    df['PAYMENT_DIFF'] = df['AMT_INSTALMENT'] - df['AMT_PAYMENT']
    df['DPD'] = df['DAYS_ENTRY_PAYMENT'] - df['DAYS_INSTALMENT']
    df['DBD'] = df['DAYS_INSTALMENT'] - df['DAYS_ENTRY_PAYMENT']
    df['DPD'] = df['DPD'].apply(lambda x: x if x > 0 else 0)
    df['DBD'] = df['DBD'].apply(lambda x: x if x > 0 else 0)
    
    return df


def get_poly_features(df, imputer=False):
    poly_features = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']].copy()
    df = df.drop(columns=['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'])
    if 'TARGET' in poly_features:
        poly_features = poly_features.drop(columns = ['TARGET'])
    
    if not imputer:
        imputer = SimpleImputer(strategy = 'median')
        poly_features = imputer.fit_transform(poly_features)
    else:
        poly_features = imputer.transform(poly_features)
    
    poly_transformer = PolynomialFeatures(degree = 3)
    poly_transformer.fit(poly_features)
    poly_features = poly_transformer.transform(poly_features)
    
    poly_features_df = pd.DataFrame(
        poly_features, 
        columns = poly_transformer.get_feature_names(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'])
    )
    
    # Merge polynomial features into training dataframe
    poly_features_df['SK_ID_CURR'] = df['SK_ID_CURR'].values
    df = df.merge(poly_features_df, on = 'SK_ID_CURR', how = 'left')
    
    return df, imputer


def agg_numeric(df, group_by_var):
    """
    Aggregates the numeric values in a dataframe. This can
    be used to create features for each instance of the grouping variable.
    
    Parameters
    --------
        df (dataframe): 
            the dataframe to calculate the statistics on
        group_by_var (string): 
            The variable by which to group the dataframe.
        
    Return
    --------
        agg (dataframe): 
            a dataframe with the statistics aggregated for all numeric columns. 
            Each instance of the grouping variable will have the statistics 
            (mean, min, max, sum; currently supported) calculated. 
            The columns are also renamed to keep track of features created.
    
    """
    # Remove id variables other than the grouping variable
    for col in df:
        if col != group_by_var and 'SK_ID' in col:
            df = df.drop(columns = col)
    
    numeric_df = df.select_dtypes('number').copy()
    numeric_df[group_by_var] = df[group_by_var].values
    agg = numeric_df.groupby(group_by_var).agg(['mean', 'max', 'min', 'sum']).reset_index()

    # Flatten column names
    agg.columns = [
        column_name[0] + "_" + column_name[1] if column_name[1] else column_name[0] \
        for column_name in zip(agg.columns.get_level_values(0), agg.columns.get_level_values(1))
    ]
    
    return agg


def count_categorical(df, group_by_var):
    """
    Computes counts and normalized counts for each observation
    of `group_by_var` of each unique category in every categorical variable
    
    Parameters
    --------
    df : dataframe 
        The dataframe to calculate the value counts for.
        
    group_by_var : string
        The variable by which to group the dataframe. For each unique
        value of this variable, the final dataframe will have one row
    
    Return
    --------
    categorical : dataframe
        A dataframe with counts and normalized counts of each unique category in every categorical variable
        with one row for every unique value of the `group_by_var`.
        
    """
    
    # Get categorical columns
    cat_ohe = pd.get_dummies(df.select_dtypes('category')).copy()
    cat_ohe[group_by_var] = df[group_by_var]
    cat_agg = cat_ohe.groupby(group_by_var).agg(['sum', 'mean']).reset_index()
    
    # Flatten column names
    cat_agg.columns = [
        column_name[0] + "_" + column_name[1] if column_name[1] else column_name[0] \
        for column_name in zip(cat_agg.columns.get_level_values(0), cat_agg.columns.get_level_values(1))
    ]
    
    return cat_agg


def deal_with_cat_feats(train_df, test_df):
    train_df = train_df.copy()
    test_df = test_df.copy()
    
    # List Categorical Features
    cat_feat_names = train_df.select_dtypes('category').columns.to_list()
    train_df[cat_feat_names] = train_df[cat_feat_names].astype("object").fillna("MISSING")
    test_df[cat_feat_names] = test_df[cat_feat_names].astype("object").fillna("MISSING")

    # Remove rows for which category levels exist in train but not in test set
    for feat in cat_feat_names:
        cat_levels_missing_from_test = list(set(train_df[feat].value_counts().index) - set(test_df[feat].value_counts().index))
        if cat_levels_missing_from_test:
            train_df.drop(
                index=train_df[train_df[feat].isin(cat_levels_missing_from_test)].index, 
                inplace=True
            )

    # label encode categorical variables
    cat_feat_levels = {}

    for feat in cat_feat_names:
        le = LabelEncoder()
        train_df[feat] = le.fit_transform(train_df[feat])
        test_df[feat] = le.transform(test_df[feat])
        cat_feat_levels[feat] = le.classes_

        # Replace "MISSING" level with negative value (treated as missing by LGBM)
        if "MISSING" in cat_feat_levels[feat]:
            train_df[feat] = train_df[feat].replace(list(cat_feat_levels[feat]).index("MISSING"), -1)
            test_df[feat] = test_df[feat].replace(list(cat_feat_levels[feat]).index("MISSING"), -1)
            
    train_df[cat_feat_names] = train_df[cat_feat_names].astype("category")
    test_df[cat_feat_names] = test_df[cat_feat_names].astype("category")
    
    return train_df, test_df

<br>
<br>

In [5]:
with timer("Reduce memory"):
    app_train_df = reduce_mem_usage(app_train_df, use_float16=True)
    app_test_df = reduce_mem_usage(app_test_df, use_float16=True)
    bureau = reduce_mem_usage(bureau, use_float16=True)
    bureau_balance = reduce_mem_usage(bureau_balance, use_float16=True)

Memory usage of dataframe is 286.23 MB
Memory usage after optimization is: 59.54 MB
Decreased by 79.2%
Memory usage of dataframe is 45.00 MB
Memory usage after optimization is: 9.40 MB
Decreased by 79.1%
Memory usage of dataframe is 222.62 MB
Memory usage after optimization is: 78.57 MB
Decreased by 64.7%
Memory usage of dataframe is 624.85 MB
Memory usage after optimization is: 156.21 MB
Decreased by 75.0%
##################
Reduce memory - done in 8s 



___

**Domain knowledge features**

These domain knowledge features were inspired by the awesome community Kaggle notebooks [such as this one](https://www.kaggle.com/jsaguiar/lightgbm-with-simple-features).

In [6]:
with timer("Get application domain knowledge features"):
    app_train_df = get_app_domain_features(app_train_df)
    app_test_df = get_app_domain_features(app_test_df)

# Save intermediate dataframes
# app_train_df.to_csv("app_train_with_domain_feats.csv", index=False)
# app_test_df.to_csv("app_test_wth_domain_feats.csv", index=False)

##################
Get application domain knowledge features - done in 0s 



___

**Polynomial features** 

These polynomial features were inspired by [this notebook](https://www.kaggle.com/willkoehrsen/start-here-a-gentle-introduction)

In [7]:
# with timer("Get polynomial features"):
#     app_train_df, missing_imputer = get_poly_features(app_train_df)
#     app_test_df, _ = get_poly_features(app_test_df, imputer=missing_imputer)

# Save intermediate dataframes
#app_train_poly.to_csv("app_train_with_domain_poly_feats.csv", index=False)
#app_test_poly.to_csv("app_test_with_domain_poly_feats.csv", index=False)

___

**Incorporate Bureau and Bureau Balance features**

In [8]:
with timer("Incorporate Bureau Balance features into Bureau dataframe"):
    bureau_balance_agg = pd.merge(
        left = agg_numeric(bureau_balance, group_by_var='SK_ID_BUREAU'), 
        right = count_categorical(bureau_balance, group_by_var='SK_ID_BUREAU'),
        how='inner',
        on='SK_ID_BUREAU'
    )
    
    bureau.merge(bureau_balance_agg, how='left', on='SK_ID_BUREAU')
    
    del bureau_balance, bureau_balance_agg
    gc.collect()
    
with timer("Incorporate Bureau features into applications dataframe"):
    bureau_agg = pd.merge(
        left = agg_numeric(bureau, group_by_var='SK_ID_CURR'), 
        right = count_categorical(bureau, group_by_var='SK_ID_CURR'),
        how='inner',
        on='SK_ID_CURR'
    )

    app_train_df = app_train_df.merge(bureau_agg, on = 'SK_ID_CURR', how = 'left')
    app_test_df = app_test_df.merge(bureau_agg, on = 'SK_ID_CURR', how = 'left')
    
    del bureau, bureau_agg
    gc.collect()

##################
Incorporate Bureau Balance features into Bureau dataframe - done in 17s 

##################
Incorporate Bureau features into applications dataframe - done in 15s 



___

**Incorporate all other features**

More domain knowledge features inspired by [this](https://www.kaggle.com/jsaguiar/lightgbm-with-simple-features) notebook 

In [9]:
with timer("Read in all other dataframes"):
    previous = pd.read_csv("data/previous_application.csv")
    cash = pd.read_csv("data/POS_CASH_balance.csv")
    credit = pd.read_csv('data/credit_card_balance.csv')
    installments = pd.read_csv('data/installments_payments.csv')

with timer("Reduce memory"):
    previous = reduce_mem_usage(previous, use_float16=True)
    cash = reduce_mem_usage(cash, use_float16=True)
    credit = reduce_mem_usage(credit, use_float16=True)
    installments = reduce_mem_usage(installments, use_float16=True)
    
with timer("Get more domain knowledge features"):
    previous = get_previous_app_domain_features(previous)
    installments = get_installment_domain_features(installments)

##################
Read in all other dataframes - done in 24s 

Memory usage of dataframe is 471.48 MB
Memory usage after optimization is: 130.62 MB
Decreased by 72.3%
Memory usage of dataframe is 610.43 MB
Memory usage after optimization is: 171.69 MB
Decreased by 71.9%
Memory usage of dataframe is 673.88 MB
Memory usage after optimization is: 263.69 MB
Decreased by 60.9%
Memory usage of dataframe is 830.41 MB
Memory usage after optimization is: 311.40 MB
Decreased by 62.5%
##################
Reduce memory - done in 11s 

##################
Get more domain knowledge features - done in 8s 



___

In [10]:
with timer("Incorporate previous application features into applications dataframe"):
    previous_agg = pd.merge(
        left = agg_numeric(previous, group_by_var='SK_ID_CURR'), 
        right = count_categorical(previous, group_by_var='SK_ID_CURR'),
        how='inner',
        on='SK_ID_CURR'
    )
    
    # Drop columns with >90% missing values
    previous_agg_missing = (previous_agg.isna().sum() / len(previous_agg))
    previous_agg.drop(columns=previous_agg_missing[previous_agg_missing>=0.9].index.tolist(), inplace=True)
    
    app_train_df = app_train_df.merge(previous_agg, on = 'SK_ID_CURR', how = 'left')
    app_test_df = app_test_df.merge(previous_agg, on = 'SK_ID_CURR', how = 'left')
    
    del previous_agg, previous_agg_missing, previous
    gc.collect();
    
with timer("Incorporate cash loan features into applications dataframe"):
    cash_agg = pd.merge(
        left = agg_numeric(cash, group_by_var='SK_ID_CURR'), 
        right = count_categorical(cash, group_by_var='SK_ID_CURR'),
        how='inner',
        on='SK_ID_CURR'
    )
    
    app_train_df = app_train_df.merge(cash_agg, on = 'SK_ID_CURR', how = 'left')
    app_test_df = app_test_df.merge(cash_agg, on = 'SK_ID_CURR', how = 'left')
    
    del cash_agg, cash
    gc.collect();
    
with timer("Incorporate installment payments features into applications dataframe"):
    installments_agg = agg_numeric(installments, group_by_var='SK_ID_CURR')
    
    app_train_df = app_train_df.merge(installments_agg, on = 'SK_ID_CURR', how = 'left')
    app_test_df = app_test_df.merge(installments_agg, on = 'SK_ID_CURR', how = 'left')
    
    del installments_agg, installments
    gc.collect();
    
with timer("Incorporate credit card features into applications dataframe"):
    credit_agg = pd.merge(
        left = agg_numeric(credit, group_by_var='SK_ID_CURR'), 
        right = count_categorical(credit, group_by_var='SK_ID_CURR'),
        how='inner',
        on='SK_ID_CURR'
    )

    app_train_df = app_train_df.merge(credit_agg, on = 'SK_ID_CURR', how = 'left')
    app_test_df = app_test_df.merge(credit_agg, on = 'SK_ID_CURR', how = 'left')
    
    del credit_agg, credit
    gc.collect();

##################
Incorporate previous application features into applications dataframe - done in 225s 

##################
Incorporate cash loan features into applications dataframe - done in 18s 

##################
Incorporate installment payments features into applications dataframe - done in 14s 

##################
Incorporate credit card features into applications dataframe - done in 11s 



___

In [11]:
app_train_df = reduce_mem_usage(app_train_df)
app_test_df = reduce_mem_usage(app_test_df)

Memory usage of dataframe is 1287.73 MB
Memory usage after optimization is: 851.94 MB
Decreased by 33.8%
Memory usage of dataframe is 204.08 MB
Memory usage after optimization is: 134.07 MB
Decreased by 34.3%


___

**Preprocess and Label encode categorical features**

In [12]:
app_train_df, app_test_df = deal_with_cat_feats(app_train_df, app_test_df)

In [14]:
app_train_df.select_dtypes("category").columns.to_list()

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE',
 'EMERGENCYSTATE_MODE']

In [17]:
app_train_df.to_csv("app_train_df_preprocessed.csv", index=False)
app_test_df.to_csv("app_test_df_preprocessed.csv", index=False)