In [1]:
import pandas as pd
import numpy as np
import gc

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

import lightgbm as lgb

## Set the random seed, then load the feature matrix created in the preprocessing notebook and the main application table (used to join the target variable to each loan ID).

In [4]:
# Random seed for the notebook.
# NOTE(review): `seed` is not referenced by any cell visible in this section.
seed = 123

# Feature matrix produced by the preprocessing notebook.
# read_pickle can execute arbitrary code stored in the file; only load pickles you created yourself.
X = pd.read_pickle('loan data.pkl')

# TARGET keyed by loan ID, taken from the main application table.
target_by_loan = (
    pd.read_csv("application_train.csv")[['SK_ID_CURR', 'TARGET']]
    .set_index('SK_ID_CURR')['TARGET']
)

# Left join on X's index so the labels follow X's rows.
# Assumes X is indexed by SK_ID_CURR - TODO confirm against the preprocessing notebook.
Y = X.join(target_by_loan, how='left')['TARGET']

## We want to create dummy variables for the categorical variables (listed below), but only for those with more than two categories.

In [6]:
# Names of the text (object-dtype) columns that still need a numeric encoding.
X.select_dtypes(include='object').columns

Index(['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'NAME_TYPE_SUITE',
       'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
       'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START',
       'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE',
       'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY'],
      dtype='object')

## The cell below creates dummy variables for categorical columns with more than two categories, and converts text columns with only two categories (e.g. male, female) to 0/1.

In [None]:
# Encode every text (object-dtype) column numerically:
#   * more than two categories -> one-hot (dummy) columns
#   * two categories or fewer  -> integer codes (0/1), missing values kept as NaN
categcolumns=[]

for col in X.select_dtypes(include=['object']).columns:
    # nunique() ignores missing values, so NaN never counts as a category here.
    if X[col].nunique()>2:
        categcolumns.append(col)
    else:
        codes, _ = pd.factorize(X[col])
        # factorize marks missing values with the sentinel -1. Left as-is, a
        # "binary" column would silently become three-valued (-1/0/1), so the
        # sentinel is mapped back to NaN. Columns without missing values stay integer.
        X[col] = pd.Series(codes, index=X.index).where(codes >= 0)

# Guard the empty case (e.g. the cell is re-run after the text columns are
# already encoded) instead of calling get_dummies on a frame with no columns.
if categcolumns:
    dummies=pd.get_dummies(X[categcolumns])
else:
    dummies=pd.DataFrame(index=X.index)
X=pd.concat([X.drop(categcolumns,axis=1), dummies], axis=1)

In [12]:
# Reorder the columns from most to fewest non-null values.
non_null_counts = X.count()
column_order = non_null_counts.sort_values(ascending=False).index
X = X[column_order]

## Same process for dropping highly correlated columns as in the preprocessing notebook, except now applied to all of the final aggregated features together with the main table features

In [14]:
# Absolute pairwise Pearson correlation between all columns of X
# (the sign is irrelevant when looking for redundant features).
corr_matrix = X.corr(method='pearson').abs()

In [16]:
# Column pairs whose absolute correlation exceeds this value are treated as redundant.
threshold = 0.95

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each pair
# is inspected once and a column is never compared with itself.
# The built-in `bool` replaces the `np.bool` alias, which NumPy deprecated in
# 1.20 and removed in 1.24 (AttributeError on those releases).
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A column is flagged when it is correlated above the threshold with any column
# positioned before it; masked cells are NaN and never compare as greater.
to_drop = upper.columns[(upper > threshold).any(axis=0)].tolist()

print('There are %d columns to remove.' % (len(to_drop)))

There are 2178 columns to remove.


In [17]:
# Preview only the first few flagged columns; displaying the full list floods
# the notebook output with thousands of names.
to_drop[:20]

['SUM(CREDIT_CARD_BALANCE.SK_DPD_DEF/CNT_DRAWINGS_POS_CURRENT)',
 'SUM(CREDIT_CARD_BALANCE.AMT_TOTAL_RECEIVABLE/AMT_RECIVABLE)',
 'SUM(CREDIT_CARD_BALANCE.SK_DPD_DEF/AMT_BALANCE)',
 'SUM(CREDIT_CARD_BALANCE.SK_DPD_DEF/AMT_DRAWINGS_CURRENT)',
 'SUM(CREDIT_CARD_BALANCE.SK_DPD_DEF/MONTHS_BALANCE)',
 'SUM(CREDIT_CARD_BALANCE.AMT_TOTAL_RECEIVABLE/SK_DPD)',
 'SUM(CREDIT_CARD_BALANCE.AMT_TOTAL_RECEIVABLE-AMT_RECIVABLE)',
 'SUM(CREDIT_CARD_BALANCE.AMT_CREDIT_LIMIT_ACTUAL/AMT_RECIVABLE)',
 'SUM(CREDIT_CARD_BALANCE.AMT_CREDIT_LIMIT_ACTUAL/AMT_RECEIVABLE_PRINCIPAL)',
 'SUM(CREDIT_CARD_BALANCE.AMT_CREDIT_LIMIT_ACTUAL*AMT_INST_MIN_REGULARITY)',
 'SUM(CREDIT_CARD_BALANCE.AMT_CREDIT_LIMIT_ACTUAL/AMT_INST_MIN_REGULARITY)',
 'SUM(CREDIT_CARD_BALANCE.AMT_TOTAL_RECEIVABLE/AMT_DRAWINGS_ATM_CURRENT)',
 'SUM(CREDIT_CARD_BALANCE.AMT_TOTAL_RECEIVABLE/CNT_DRAWINGS_CURRENT)',
 'SUM(CREDIT_CARD_BALANCE.AMT_TOTAL_RECEIVABLE/CNT_DRAWINGS_ATM_CURRENT)',
 'SUM(CREDIT_CARD_BALANCE.AMT_TOTAL_RECEIVABLE/CNT_DRAWINGS_PO

In [20]:
# Remove the highly correlated columns from X in place.
X.drop(columns=to_drop, inplace=True)

## Drop all zero-importance features, identified by training a baseline LightGBM model three times (each on a different train/validation split)

In [27]:
def identify_zero_importance_features(train, train_labels, iterations = 2):
    """Find the features a baseline LightGBM classifier never uses.

    The model is fitted ``iterations`` times, each time on a different 75/25
    train/validation split (``random_state`` = 0, 1, ...), with early stopping
    on the validation set. The importances of all runs are averaged, and the
    features whose averaged importance is exactly 0.0 are returned.

    Parameters
    ----------
    train : pandas.DataFrame
        Feature matrix; its column names are used as the feature names.
    train_labels : array-like
        Target values aligned row-for-row with ``train``.
    iterations : int, default 2
        Number of train/validation splits to fit. Must be at least 1.

    Returns
    -------
    list
        Names of the columns whose averaged importance is 0.0.

    Raises
    ------
    ValueError
        If ``iterations`` is less than 1. Without this check no model would be
        fitted and every feature would be reported as zero-importance.
    """
    if iterations < 1:
        raise ValueError('iterations must be at least 1, got %r' % (iterations,))

    # Running average of the importances, one slot per column of `train`
    importance_sum = np.zeros(train.shape[1])

    # Baseline model; n_estimators is only an upper bound because early
    # stopping ends training well before it is reached
    model = lgb.LGBMClassifier(objective='binary', boosting_type = 'goss', 
                               n_estimators = 10000, class_weight = 'balanced')
    
    # Fit on several different splits so the result does not hinge on one
    # particular train/validation partition
    for i in range(iterations):

        # A different random_state per iteration gives a different split
        train_features, valid_features, train_y, valid_y = train_test_split(train, train_labels, 
                                                                            test_size = 0.25, 
                                                                            random_state = i)

        # Train using early stopping on the validation set
        # NOTE(review): `early_stopping_rounds` and `verbose` were removed from
        # fit() in LightGBM 4.0; newer versions need
        # callbacks=[lgb.early_stopping(100), lgb.log_evaluation(200)] instead.
        model.fit(train_features, train_y, early_stopping_rounds=100, 
                  eval_set = [(valid_features, valid_y)], 
                  eval_metric = 'auc', verbose = 200)

        # Accumulate this run's share of the average
        importance_sum += model.feature_importances_ / iterations
    
    importance_table = pd.DataFrame({'feature': list(train.columns), 
                                     'importance': importance_sum}).sort_values('importance', 
                                                                                ascending = False)
    
    # Find the features with zero importance
    zero_features = list(importance_table[importance_table['importance'] == 0.0]['feature'])
    print('\nThere are %d features with 0.0 importance' % len(zero_features))
    
    return zero_features

# Three fits on three different train/validation splits.
zero_features = identify_zero_importance_features(train=X, train_labels=Y, iterations=3)

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[55]	valid_0's auc: 0.765211	valid_0's binary_logloss: 0.578987
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[70]	valid_0's auc: 0.769223	valid_0's binary_logloss: 0.576006
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[59]	valid_0's auc: 0.768828	valid_0's binary_logloss: 0.575613

There are 1278 features with 0.0 importance


In [29]:
# Remove the zero-importance features from X in place.
X.drop(columns=zero_features, inplace=True)

In [33]:
# Persist the reduced feature matrix.
# NOTE(review): "1327" in the file name is presumably the remaining feature count - verify.
X.to_pickle(path='loan data_1327.pkl')