In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import gc

In [2]:
print(os.listdir("data/"))

['application_test.csv', 'application_train.csv', 'bureau.csv', 'bureau_balance.csv', 'credit_card_balance.csv', 'HomeCredit_columns_description.csv', 'installments_payments.csv', 'POS_CASH_balance.csv', 'previous_application.csv', 'sample_submission.csv']


In [3]:
app_train_df = pd.read_csv("data/application_train.csv")
app_test_df = pd.read_csv("data/application_test.csv")
column_description_df = pd.read_csv("data/HomeCredit_columns_description.csv", encoding='ISO-8859-1')
train_labels = app_train_df.iloc[:,:2]
test_id = app_test_df.iloc[:,0]

bureau = pd.read_csv("data/bureau.csv")
bureau_balance = pd.read_csv("data/bureau_balance.csv")

def column_description_lookup(column):
    print(column_description_df[column_description_df['Row']==column].Description.values[0])

___

**Domain only csv**

`app_train_domain` will contain domain knowledge engineered features without polynomial features. These features were inspired by the awesome Kaggle competition notebooks that are available on the Kaggle competition website [such as this one](https://www.kaggle.com/jsaguiar/lightgbm-with-simple-features).

In [4]:
app_train_domain = app_train_df.copy()
app_test_domain = app_test_df.copy()

app_train_domain['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
app_train_domain['CREDIT_INCOME_PERCENT'] = app_train_domain['AMT_CREDIT'] / app_train_domain['AMT_INCOME_TOTAL']
app_train_domain['ANNUITY_INCOME_PERCENT'] = app_train_domain['AMT_ANNUITY'] / app_train_domain['AMT_INCOME_TOTAL']
app_train_domain['DAYS_EMPLOYED_PERCENT'] = app_train_domain['DAYS_EMPLOYED'] / app_train_domain['DAYS_BIRTH']
app_train_domain['PAYMENT_RATE'] = app_train_domain['AMT_ANNUITY'] / app_train_domain['AMT_CREDIT']
app_train_domain['INCOME_PER_PERSON'] = app_train_domain['AMT_INCOME_TOTAL'] / app_train_domain['CNT_FAM_MEMBERS']
app_train_domain['INCOME_CREDIT_PERC'] = app_train_domain['AMT_INCOME_TOTAL'] / app_train_domain['AMT_CREDIT']

app_test_domain['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
app_test_domain['CREDIT_INCOME_PERCENT'] = app_test_domain['AMT_CREDIT'] / app_test_domain['AMT_INCOME_TOTAL']
app_test_domain['ANNUITY_INCOME_PERCENT'] = app_test_domain['AMT_ANNUITY'] / app_test_domain['AMT_INCOME_TOTAL']
app_test_domain['DAYS_EMPLOYED_PERCENT'] = app_test_domain['DAYS_EMPLOYED'] / app_test_domain['DAYS_BIRTH']
app_test_domain['PAYMENT_RATE'] = app_test_domain['AMT_ANNUITY'] / app_test_domain['AMT_CREDIT']
app_test_domain['INCOME_PER_PERSON'] = app_test_domain['AMT_INCOME_TOTAL'] / app_test_domain['CNT_FAM_MEMBERS']
app_test_domain['INCOME_CREDIT_PERC'] = app_test_domain['AMT_INCOME_TOTAL'] / app_test_domain['AMT_CREDIT']


app_train_domain.drop(columns="TARGET", inplace=True)

In [5]:
app_train_domain.to_csv("app_train_domain.csv", index=False)
app_test_domain.to_csv("app_test_domain.csv", index=False)

___

**Combine Poly with Domain and get csv** 

`app_train_poly` will contain the domain knowledge engineered features with the addition of interactions between the `EXT_SOURCE_X` and the `Days_Birth` features. They were created as these features have a relatively high correlation with the target compared to the other features. The idea to create these features was inspired by [this notebook](https://www.kaggle.com/willkoehrsen/start-here-a-gentle-introduction)

In [4]:
# Make a new dataframe for polynomial features
poly_features = app_train_df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']].copy()
poly_features['TARGET'] = train_labels['TARGET'].copy()
poly_features_test = app_test_df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']].copy()

# imputer for handling missing values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'median')

#poly_target = poly_features['TARGET']

poly_features = poly_features.drop(columns = ['TARGET'])

# Need to impute missing values
poly_features = imputer.fit_transform(poly_features)
poly_features_test = imputer.transform(poly_features_test)

from sklearn.preprocessing import PolynomialFeatures
                                  
# Create the polynomial object with specified degree
poly_transformer = PolynomialFeatures(degree = 3)

# Train the polynomial features
poly_transformer.fit(poly_features)

# Transform the features
poly_features = poly_transformer.transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)
print('Polynomial Features shape: ', poly_features.shape)

Polynomial Features shape:  (307511, 35)


In [5]:
# Put train features into dataframe
poly_features = pd.DataFrame(poly_features, 
                             columns = poly_transformer.get_feature_names(['EXT_SOURCE_1', 'EXT_SOURCE_2', 
                                                                           'EXT_SOURCE_3', 'DAYS_BIRTH']))

# Add in the target
#poly_features['TARGET'] = poly_target

# Put test features into dataframe
poly_features_test = pd.DataFrame(poly_features_test, 
                                  columns = poly_transformer.get_feature_names(['EXT_SOURCE_1', 'EXT_SOURCE_2', 
                                                                                'EXT_SOURCE_3', 'DAYS_BIRTH']))

# Merge polynomial features into training dataframe
poly_features['SK_ID_CURR'] = app_train_df['SK_ID_CURR']
app_train_poly = app_train_df.merge(poly_features, on = 'SK_ID_CURR', how = 'left')

# Merge polnomial features into testing dataframe
poly_features_test['SK_ID_CURR'] = app_test_df['SK_ID_CURR']
app_test_poly = app_test_df.merge(poly_features_test, on = 'SK_ID_CURR', how = 'left')

# Align the dataframes
app_train_poly, app_test_poly = app_train_poly.align(app_test_poly, join = 'inner', axis = 1)

# Print out the new shapes
print('Training data with polynomial features shape: ', app_train_poly.shape)
print('Testing data with polynomial features shape:  ', app_test_poly.shape)

Training data with polynomial features shape:  (307511, 156)
Testing data with polynomial features shape:   (48744, 156)


In [7]:
for col in set(app_train_domain.columns) - set(app_train_poly.columns):
    app_train_poly[col] = app_train_domain[col]
    app_test_poly[col] = app_test_domain[col]
print(app_train_poly.shape)

(307511, 164)


In [9]:
app_train_poly.to_csv("app_train_poly_domain.csv", index=False)
app_test_poly.to_csv("app_test_poly_domain.csv", index=False)

<br>
<br>

**Read in initial DF**

In [6]:
# Use domain AND polynomial features
#app_train_df = pd.read_csv("app_train_poly_domain.csv")
#app_test_df = pd.read_csv("app_test_poly_domain.csv")

# Just use domain features
app_train_df = pd.read_csv("app_train_domain.csv")
app_test_df = pd.read_csv("app_test_domain.csv")

app_train_df['SK_ID_CURR'] = app_train_df['SK_ID_CURR'].values.astype(int)
app_test_df['SK_ID_CURR'] = app_test_df['SK_ID_CURR'].values.astype(int)

<br>
<br>

**Incorporate Bureau and Bureau Balance features**

In [8]:
def agg_numeric(df, group_var, df_name):
    """Aggregates the numeric values in a dataframe. This can
    be used to create features for each instance of the grouping variable.
    
    Parameters
    --------
        df (dataframe): 
            the dataframe to calculate the statistics on
        group_var (string): 
            the variable by which to group df
        df_name (string): 
            the variable used to rename the columns
        
    Return
    --------
        agg (dataframe): 
            a dataframe with the statistics aggregated for 
            all numeric columns. Each instance of the grouping variable will have 
            the statistics (mean, min, max, sum; currently supported) calculated. 
            The columns are also renamed to keep track of features created.
    
    """
    # Remove id variables other than grouping variable
    for col in df:
        if col != group_var and 'SK_ID' in col:
            df = df.drop(columns = col)
            
    group_ids = df[group_var].copy()
    numeric_df = df.select_dtypes('number').copy()
    numeric_df[group_var] = group_ids

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(group_var).agg(['mean', 'max', 'min', 'sum']).reset_index()

    # Need to create new column names
    columns = [group_var]

    # Iterate through the variables names
    for var in agg.columns.levels[0]:
        # Skip the grouping variable
        if var != group_var:
            # Iterate through the stat names
            for stat in agg.columns.levels[1][:-1]:
                # Make a new column name for the variable and stat
                columns.append('%s_%s_%s' % (df_name, var, stat))

    agg.columns = columns
    return agg

In [9]:
def count_categorical(df, group_var, df_name):
    """Computes counts and normalized counts for each observation
    of `group_var` of each unique category in every categorical variable
    
    Parameters
    --------
    df : dataframe 
        The dataframe to calculate the value counts for.
        
    group_var : string
        The variable by which to group the dataframe. For each unique
        value of this variable, the final dataframe will have one row
        
    df_name : string
        Variable added to the front of column names to keep track of columns

    
    Return
    --------
    categorical : dataframe
        A dataframe with counts and normalized counts of each unique category in every categorical variable
        with one row for every unique value of the `group_var`.
        
    """
    
    # Select the categorical columns
    categorical = pd.get_dummies(df.select_dtypes('object')).copy()

    # Make sure to put the identifying id on the column
    categorical[group_var] = df[group_var]

    # Groupby the group var and calculate the sum and mean
    categorical = categorical.groupby(group_var).agg(['sum', 'mean'])
    
    column_names = []
    
    # Iterate through the columns in level 0
    for var in categorical.columns.levels[0]:
        # Iterate through the stats in level 1
        for stat in ['count', 'count_norm']:
            # Make a new column name
            column_names.append('%s_%s_%s' % (df_name, var, stat))
    
    categorical.columns = column_names
    categorical.reset_index(level=0, inplace=True)
    
    return categorical

In [10]:
bureau_counts = count_categorical(bureau, group_var = 'SK_ID_CURR', df_name = 'bureau')
bureau_agg = agg_numeric(bureau.drop(columns = ['SK_ID_BUREAU']), group_var = 'SK_ID_CURR', df_name = 'bureau')
bureau_agg = bureau_agg.merge(bureau_counts, how='inner', on='SK_ID_CURR')
del(bureau_counts)

In [11]:
bureau_balance_counts = count_categorical(bureau_balance, group_var = 'SK_ID_BUREAU', df_name = 'bureau_balance')
bureau_balance_agg = agg_numeric(bureau_balance, group_var = 'SK_ID_BUREAU', df_name = 'bureau_balance')
bureau_balance_agg = bureau_balance_agg.merge(bureau_balance_counts, how='inner', on='SK_ID_BUREAU')
del(bureau_balance_counts)

In [12]:
bureau_balance_by_loan = bureau[['SK_ID_CURR', 'SK_ID_BUREAU']].merge(bureau_balance_agg, how='left', on='SK_ID_BUREAU')
bureau_balance_by_applicant = agg_numeric(bureau_balance_by_loan, group_var='SK_ID_CURR', df_name = 'client')
del(bureau_balance_by_loan)

In [13]:
# Merge with the value counts of bureau
app_train_df = app_train_df.merge(bureau_agg, on = 'SK_ID_CURR', how = 'left')
app_test_df = app_test_df.merge(bureau_agg, on = 'SK_ID_CURR', how = 'left')

# Merge with the stats of bureau
app_train_df = app_train_df.merge(bureau_balance_by_applicant, on = 'SK_ID_CURR', how = 'left')
app_test_df = app_test_df.merge(bureau_balance_by_applicant, on = 'SK_ID_CURR', how = 'left')

___

**Drop low populated categorical feature classes and flag missing classes as "MISSING"**

In [14]:
#train = app_train_df.iloc[:,1:].copy()
#test = app_test_df.iloc[:,1:].copy()

train = app_train_df.copy()
test = app_test_df.copy()

In [15]:
# Drop low populated categorical classes
train_label = train_labels.copy()
train_label.drop(train[train['CODE_GENDER'] == 'XNA'].index, inplace=True)
train.drop(train[train['CODE_GENDER'] == 'XNA'].index, inplace=True)

train_label.drop(train[train["NAME_INCOME_TYPE"] == "Maternity leave"].index, inplace=True)
train.drop(train[train["NAME_INCOME_TYPE"] == "Maternity leave"].index, inplace=True)

train_label.drop(train[train["NAME_FAMILY_STATUS"] == "Unknown"].index, inplace=True)
train.drop(train[train["NAME_FAMILY_STATUS"] == "Unknown"].index, inplace=True)

In [16]:
# Get list of categorical variables
binary_cat_feats = train.select_dtypes('object').nunique()[train.select_dtypes('object').nunique()==2].index
cat_feats=train.select_dtypes('object').nunique()[train.select_dtypes('object').nunique()>2].index
total_cat_feats = list(binary_cat_feats) + list(cat_feats)

In [17]:
# Flag missing classes
for feat in total_cat_feats:
    train[feat] = train[feat].fillna("MISSING")
    test[feat] = test[feat].fillna("MISSING")

___

**Detect categories in Training set that are not in Test set**

In [18]:
for cat_feat in total_cat_feats:
    if len(set(train[cat_feat].value_counts().index)) != len(set(test[cat_feat].value_counts().index)):
        print(cat_feat)

___

**Label encode categorical variables**

In [19]:
train_cat_classes = {}
test_cat_classes = {}

for cat_feat in total_cat_feats:
    le = LabelEncoder()
    train[cat_feat] = le.fit_transform(train[cat_feat])
    train_cat_classes[cat_feat] = le.classes_
    
    test[cat_feat] = le.transform(test[cat_feat])
    test_cat_classes[cat_feat] = le.classes_

___

**Replace "MISSING" classes with negative number (treated as missing by LGBM)**

In [20]:
for feat in train_cat_classes:
    if "MISSING" in train_cat_classes[feat]:
        train[feat] = train[feat].replace(list(train_cat_classes[feat]).index("MISSING"), -1)
        test[feat] = test[feat].replace(list(test_cat_classes[feat]).index("MISSING"), -1)

___

**Make categorical columns: "category" type**

In [21]:
for feat in total_cat_feats:
    train[feat] = train[feat].astype('category')
    test[feat] = test[feat].astype('category')

___

**Save to csv**

In [None]:
train.to_csv("app_bureau_train.csv", index=False)
test.to_csv("app_bureau_test.csv", index=False)

___

<br>
<br>

# LGBM with application and bureau features

**Perform CV to find good number for n_estimators and check feature importance**

In [20]:
label = train_label['TARGET'].copy()

cv_n_estimators = []
cv_scores = []
fi = []

skf = StratifiedKFold(n_splits=5)

for train_index, val_index in skf.split(X=train, y=label):
    #X_train, y_train = train[train_index], label[train_index]
    #X_val, y_val = train[val_index], label[val_index]
    
    # Split as DataFrame
    X_train, y_train = train.iloc[train_index,:], label.iloc[train_index]
    X_val, y_val = train.iloc[val_index,:], label.iloc[val_index]
    
    
    
    model = LGBMClassifier(n_estimators=10000, 
                           num_leaves=31,
                           objective = 'binary', 
                           class_weight = 'balanced', 
                           learning_rate = 0.05, 
                           #reg_alpha = 0.1, 
                           #reg_lambda = 0.1, 
                           subsample = 0.8, 
                           colsample_by_tree=0.50,
                           n_jobs = 4, 
                           random_state = 1)
    
    model.fit(X_train, 
              y_train, 
              eval_metric = 'auc',
              eval_set = [(X_val, y_val)],
              eval_names = ['valid'], 
              categorical_feature = 'auto',
              early_stopping_rounds = 100, 
              verbose = 200)
    
    cv_n_estimators.append(model.best_iteration_)
    cv_scores.append(model.best_score_['valid']['auc'])
    fi.append(model.feature_importances_)

Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.770815	valid's binary_logloss: 0.543502
Early stopping, best iteration is:
[247]	valid's auc: 0.771298	valid's binary_logloss: 0.537155
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.772807	valid's binary_logloss: 0.547614
[400]	valid's auc: 0.773049	valid's binary_logloss: 0.523202
Early stopping, best iteration is:
[314]	valid's auc: 0.773706	valid's binary_logloss: 0.532681
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.766202	valid's binary_logloss: 0.544239
[400]	valid's auc: 0.766746	valid's binary_logloss: 0.519297
Early stopping, best iteration is:
[338]	valid's auc: 0.767094	valid's binary_logloss: 0.5261
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.772531	valid's binary_logloss: 0.543583
Early stopping, best iteration is:
[289]	valid's auc: 0.773032	valid's binary_logloss: 0.53157
T

In [21]:
print("Best n_estimators: {}".format(cv_n_estimators))
print("Best n_estimators (AVG): {}".format(np.mean(cv_n_estimators)))
print("Best CV AUCs: {}".format(cv_scores))

Best n_estimators: [247, 314, 338, 289, 343]
Best n_estimators (AVG): 306.2
Best CV AUCs: [0.7712976330279125, 0.7737063220668352, 0.7670938525992196, 0.7730321443427791, 0.7747867034084339]


In [22]:
fi_arr = np.zeros_like(fi[0])
for fi_cv in fi:
    fi_arr += fi_cv

fi_df = {}
for name, fi_feat in zip(list(train.columns), fi_arr):
    fi_df[name] = fi_feat
fi_df = pd.DataFrame(pd.Series(fi_df), columns=['FI'])
fi_df['FI_normalized'] = fi_df['FI'] / fi_df['FI'].sum()
fi_df.reset_index(level=0, inplace=True)
fi_df.rename(columns={'index':'Feature'}, inplace=True)
fi_df.sort_values(by='FI', ascending=False)

Unnamed: 0,Feature,FI,FI_normalized
38,ORGANIZATION_TYPE,6771,0.147420
156,CREDIT_TERM,2801,0.060984
26,OCCUPATION_TYPE,992,0.021598
140,EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3,951,0.020705
39,EXT_SOURCE_1_x,848,0.018463
7,AMT_ANNUITY,761,0.016569
8,AMT_GOODS_PRICE,731,0.015916
18,DAYS_ID_PUBLISH,653,0.014217
175,bureau_DAYS_CREDIT_ENDDATE_max,650,0.014152
6,AMT_CREDIT,649,0.014130


In [23]:
fi_df.to_csv("fi_1.csv", index=False)

- **fi_1** - Poly features included. No OHE encoding. Pandas DF with categorical columns

<br>
<br>

**Get submission**

In [22]:
model = LGBMClassifier(n_estimators=300, 
                           num_leaves=31,
                           objective = 'binary', 
                           class_weight = 'balanced', 
                           learning_rate = 0.05, 
                           #reg_alpha = 0.1, 
                           #reg_lambda = 0.1, 
                           subsample = 0.8, 
                           colsample_by_tree=0.50,
                           n_jobs = 4, 
                           random_state = 1)
model.fit(train, 
          label, 
          eval_metric = 'auc',
          categorical_feature = 'auto',
         )

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_by_tree=0.5, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.05, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=300, n_jobs=4, num_leaves=31, objective='binary',
               random_state=1, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=0.8, subsample_for_bin=200000, subsample_freq=0)

In [23]:
y_pred = model.predict_proba(test)[:,1]
submission_df = pd.DataFrame(test_id)
submission_df['TARGET'] = y_pred

In [24]:
submission_df.to_csv("28082019_5.csv", index=False)

- **0.76906** - "Domain", poly and bureau aggregated features
- **0.76910** - "Domain", poly and bureau aggregated features. Don't OHE categorical features
- **0.76972** - Don't OHE categorical features, colsample_by_tree = 0.5, no reg
- **0.76736** - Don't OHE categorical features, Don't include poly features, colsample_by_tree = 0.5, no reg
- **0.76972** - Don't OHE categorical features, colsample_by_tree = 0.5, no reg

<br>
<br>

**Drop low feature importance**

In [19]:
fi_df = pd.read_csv("fi_1.csv")
low_fi_feats = list(fi_df[fi_df['FI']==0].Feature.values)

train_drop_low_fi = train.copy()
test_drop_low_fi = test.copy()

train_drop_low_fi.drop(columns=low_fi_feats, inplace=True)
test_drop_low_fi.drop(columns=low_fi_feats, inplace=True)

In [26]:
label = train_label['TARGET'].copy()

cv_n_estimators = []
cv_scores = []
fi = []

skf = StratifiedKFold(n_splits=5)

for train_index, val_index in skf.split(X=train_drop_low_fi, y=label):
    #X_train, y_train = train[train_index], label[train_index]
    #X_val, y_val = train[val_index], label[val_index]
    
    # Split as DataFrame
    X_train, y_train = train_drop_low_fi.iloc[train_index,:], label.iloc[train_index]
    X_val, y_val = train_drop_low_fi.iloc[val_index,:], label.iloc[val_index]
    
    
    
    model = LGBMClassifier(n_estimators=10000, 
                           num_leaves=31,
                           objective = 'binary', 
                           class_weight = 'balanced', 
                           learning_rate = 0.05, 
                           #reg_alpha = 0.1, 
                           #reg_lambda = 0.1, 
                           subsample = 0.8, 
                           colsample_by_tree=0.50,
                           n_jobs = 4, 
                           random_state = 2)
    
    model.fit(X_train, 
              y_train, 
              eval_metric = 'auc',
              eval_set = [(X_val, y_val)],
              eval_names = ['valid'], 
              categorical_feature = 'auto',
              early_stopping_rounds = 100, 
              verbose = 200)
    
    cv_n_estimators.append(model.best_iteration_)
    cv_scores.append(model.best_score_['valid']['auc'])
    fi.append(model.feature_importances_)

Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.770815	valid's binary_logloss: 0.543502
Early stopping, best iteration is:
[247]	valid's auc: 0.771298	valid's binary_logloss: 0.537155
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.772807	valid's binary_logloss: 0.547614
[400]	valid's auc: 0.773049	valid's binary_logloss: 0.523202
Early stopping, best iteration is:
[314]	valid's auc: 0.773706	valid's binary_logloss: 0.532681
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.766202	valid's binary_logloss: 0.544239
[400]	valid's auc: 0.766746	valid's binary_logloss: 0.519297
Early stopping, best iteration is:
[338]	valid's auc: 0.767094	valid's binary_logloss: 0.5261
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.772531	valid's binary_logloss: 0.543583
Early stopping, best iteration is:
[289]	valid's auc: 0.773032	valid's binary_logloss: 0.53157
T

In [27]:
print("Best n_estimators: {}".format(cv_n_estimators))
print("Best n_estimators (AVG): {}".format(np.mean(cv_n_estimators)))
print("Best CV AUCs: {}".format(cv_scores))

Best n_estimators: [247, 314, 338, 289, 343]
Best n_estimators (AVG): 306.2
Best CV AUCs: [0.7712976330279125, 0.7737063220668352, 0.7670938525992196, 0.7730321443427791, 0.7747867034084339]


In [28]:
fi_arr = np.zeros_like(fi[0])
for fi_cv in fi:
    fi_arr += fi_cv

fi_df = {}
for name, fi_feat in zip(list(train_drop_low_fi.columns), fi_arr):
    fi_df[name] = fi_feat
fi_df = pd.DataFrame(pd.Series(fi_df), columns=['FI'])
fi_df['FI_normalized'] = fi_df['FI'] / fi_df['FI'].sum()
fi_df.sort_values(by='FI', ascending=False)

Unnamed: 0,FI,FI_normalized
ORGANIZATION_TYPE,6771,0.147420
CREDIT_TERM,2801,0.060984
OCCUPATION_TYPE,992,0.021598
EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3,951,0.020705
EXT_SOURCE_1_x,848,0.018463
AMT_ANNUITY,761,0.016569
AMT_GOODS_PRICE,731,0.015916
DAYS_ID_PUBLISH,653,0.014217
bureau_DAYS_CREDIT_ENDDATE_max,650,0.014152
AMT_CREDIT,649,0.014130


<br>
<br>

**Get submission**

In [20]:
label = train_label['TARGET'].copy()
model = LGBMClassifier(n_estimators=325, 
                           num_leaves=31,
                           objective = 'binary', 
                           class_weight = 'balanced', 
                           learning_rate = 0.05, 
                           #reg_alpha = 0.1, 
                           #reg_lambda = 0.1, 
                           subsample = 0.8, 
                           colsample_by_tree=0.50,
                           n_jobs = 4, 
                           random_state = 2)
    
model.fit(train_drop_low_fi, 
          label, 
          eval_metric = 'auc',
          categorical_feature = 'auto',
         )

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_by_tree=0.5, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.05, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=325, n_jobs=4, num_leaves=31, objective='binary',
               random_state=2, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=0.8, subsample_for_bin=200000, subsample_freq=0)

In [21]:
y_pred = model.predict_proba(test_drop_low_fi)[:,1]
submission_df = pd.DataFrame(test_id)
submission_df['TARGET'] = y_pred

In [22]:
submission_df.to_csv("28082019_8.csv", index=False)

- **0.77098** - Don't OHE categorical features, colsample_by_tree = 0.5, no reg. Drop 0 feature importance features
- **0.76967** - Don't OHE categorical features, colsample_by_tree = 0.5, no reg. Drop 0 feature importance features. Seed=2 (different `FI_1.csv` file)
- **0.77098** - Don't OHE categorical features, colsample_by_tree = 0.5, no reg. Drop 0 feature importance features **Back to previous FI_1.csv**

In [None]:
train_drop_low_fi.to_csv("app_bureau_train_poly.csv", index=False)
test_drop_low_fi.to_csv("app_bureau_test_poly.csv", index=False)
pd.DataFrame(label).to_csv("train_label_dropped.csv", index=False)

<br>
<br>

# Incorporate all other table's features

Because of the one to many relationships between the unique identifiers within the tables, we will simply aggregate features using various aggregation statistics to combine the features into one table, just like how was done with bureau and bureau_balance

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import gc

In [2]:
print(os.listdir("data/"))

['application_test.csv', 'application_train.csv', 'bureau.csv', 'bureau_balance.csv', 'credit_card_balance.csv', 'HomeCredit_columns_description.csv', 'installments_payments.csv', 'POS_CASH_balance.csv', 'previous_application.csv', 'sample_submission.csv']


In [3]:
app_test_df = pd.read_csv("data/application_test.csv")
column_description_df = pd.read_csv("data/HomeCredit_columns_description.csv", encoding='ISO-8859-1')
test_id = app_test_df.iloc[:,0]

def column_description_lookup(column):
    print(column_description_df[column_description_df['Row']==column].Description.values[0])

app_train_df = pd.read_csv("app_bureau_train.csv")
app_test_df = pd.read_csv("app_bureau_test.csv")
train_label = pd.read_csv("train_label_dropped.csv")


total_cat_feats=['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'EMERGENCYSTATE_MODE',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE']

for feat in total_cat_feats:
    app_train_df[feat] = app_train_df[feat].astype('category')
    app_test_df[feat] = app_test_df[feat].astype('category')

previous = pd.read_csv("data/previous_application.csv")
cash = pd.read_csv("data/POS_CASH_balance.csv")
credit = pd.read_csv('data/credit_card_balance.csv')
installments = pd.read_csv('data/installments_payments.csv')

In [4]:
previous['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
previous['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
previous['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
previous['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
previous['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
previous['APP_CREDIT_PERC'] = previous['AMT_APPLICATION'] / previous['AMT_CREDIT']

installments['PAYMENT_PERC'] = installments['AMT_PAYMENT'] / installments['AMT_INSTALMENT']
installments['PAYMENT_DIFF'] = installments['AMT_INSTALMENT'] - installments['AMT_PAYMENT']
installments['DPD'] = installments['DAYS_ENTRY_PAYMENT'] - installments['DAYS_INSTALMENT']
installments['DBD'] = installments['DAYS_INSTALMENT'] - installments['DAYS_ENTRY_PAYMENT']
installments['DPD'] = installments['DPD'].apply(lambda x: x if x > 0 else 0)
installments['DBD'] = installments['DBD'].apply(lambda x: x if x > 0 else 0)

In [5]:
def agg_numeric(df, group_var, df_name):
    """Aggregates the numeric values in a dataframe. This can
    be used to create features for each instance of the grouping variable.
    
    Parameters
    --------
        df (dataframe): 
            the dataframe to calculate the statistics on
        group_var (string): 
            the variable by which to group df
        df_name (string): 
            the variable used to rename the columns
        
    Return
    --------
        agg (dataframe): 
            a dataframe with the statistics aggregated for 
            all numeric columns. Each instance of the grouping variable will have 
            the statistics (mean, min, max, sum; currently supported) calculated. 
            The columns are also renamed to keep track of features created.
    
    """
    # Remove id variables other than grouping variable
    for col in df:
        if col != group_var and 'SK_ID' in col:
            df = df.drop(columns = col)
            
    group_ids = df[group_var].copy()
    numeric_df = df.select_dtypes('number').copy()
    numeric_df[group_var] = group_ids

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(group_var).agg(['mean', 'max', 'min', 'sum']).reset_index()

    # Need to create new column names
    columns = [group_var]

    # Iterate through the variables names
    for var in agg.columns.levels[0]:
        # Skip the grouping variable
        if var != group_var:
            # Iterate through the stat names
            for stat in agg.columns.levels[1][:-1]:
                # Make a new column name for the variable and stat
                columns.append('%s_%s_%s' % (df_name, var, stat))

    agg.columns = columns
    
    return agg

In [6]:
def count_categorical(df, group_var, df_name):
    """Computes counts and normalized counts for each observation
    of `group_var` of each unique category in every categorical variable
    
    Parameters
    --------
    df : dataframe 
        The dataframe to calculate the value counts for.
        
    group_var : string
        The variable by which to group the dataframe. For each unique
        value of this variable, the final dataframe will have one row
        
    df_name : string
        Variable added to the front of column names to keep track of columns

    
    Return
    --------
    categorical : dataframe
        A dataframe with counts and normalized counts of each unique category in every categorical variable
        with one row for every unique value of the `group_var`.
        
    """
    
    # Select the categorical columns
    categorical = pd.get_dummies(df.select_dtypes('object')).copy()

    # Make sure to put the identifying id on the column
    categorical[group_var] = df[group_var]

    # Groupby the group var and calculate the sum and mean
    categorical = categorical.groupby(group_var).agg(['sum', 'mean'])
    
    column_names = []
    
    # Iterate through the columns in level 0
    for var in categorical.columns.levels[0]:
        # Iterate through the stats in level 1
        for stat in ['count', 'count_norm']:
            # Make a new column name
            column_names.append('%s_%s_%s' % (df_name, var, stat))
    
    categorical.columns = column_names
    categorical.reset_index(level=0, inplace=True)
    
    return categorical

In [7]:
previous_counts = count_categorical(previous, group_var = 'SK_ID_CURR', df_name = 'previous')
previous_agg = agg_numeric(previous.drop(columns = ['SK_ID_PREV']), group_var = 'SK_ID_CURR', df_name = 'previous')
previous_agg = previous_agg.merge(previous_counts, how='inner', on='SK_ID_CURR')
del(previous_counts)

previous_agg_missing_df = pd.DataFrame(previous_agg.isna().sum(axis=0).sort_values(ascending=False)).rename(columns={0:"Total"})
previous_agg_missing_df['%'] = np.round(previous_agg_missing_df.Total/len(previous_agg)*100, 2)
to_drop = list(previous_agg_missing_df[previous_agg_missing_df['%']>90].index)
previous_agg.drop(columns=to_drop, inplace=True)

In [8]:
gc.enable()
gc.collect()

118

___

In [9]:
cash_counts = count_categorical(cash, group_var = 'SK_ID_PREV', df_name = 'cash')
cash_agg = agg_numeric(cash, group_var = 'SK_ID_PREV', df_name = 'cash')
cash_agg = cash_agg.merge(cash_counts, how='inner', on='SK_ID_PREV')
del(cash_counts)

cash_by_prev_loan = previous[['SK_ID_CURR', 'SK_ID_PREV']].merge(cash_agg, how='left', on='SK_ID_PREV')
cash_by_applicant = agg_numeric(cash_by_prev_loan, group_var='SK_ID_CURR', df_name = 'client_prev')
del(cash_by_prev_loan)

In [10]:
gc.enable()
gc.collect()

97

___

In [11]:
credit_counts = count_categorical(credit, group_var = 'SK_ID_PREV', df_name = 'credit')
credit_agg = agg_numeric(credit, group_var = 'SK_ID_PREV', df_name = 'credit')
credit_agg = credit_agg.merge(credit_counts, how='inner', on='SK_ID_PREV')
del(credit_counts)

credit_by_prev_loan = previous[['SK_ID_CURR', 'SK_ID_PREV']].merge(credit_agg, how='left', on='SK_ID_PREV')
credit_by_applicant = agg_numeric(credit_by_prev_loan, group_var='SK_ID_CURR', df_name = 'client_credit')
del(credit_by_prev_loan)

In [12]:
gc.enable()
gc.collect()

139

___

In [13]:
#installments_counts = count_categorical(installments, group_var = 'SK_ID_PREV', df_name = 'install')
installments_agg = agg_numeric(installments, group_var = 'SK_ID_PREV', df_name = 'install')
#installments_agg = installments_agg.merge(installments_counts, how='inner', on='SK_ID_PREV')
#del(installments_counts)

installments_by_prev_loan = previous[['SK_ID_CURR', 'SK_ID_PREV']].merge(installments_agg, how='left', on='SK_ID_PREV')
installments_by_applicant = agg_numeric(installments_by_prev_loan, group_var='SK_ID_CURR', df_name = 'client_install')
del(installments_by_prev_loan)

In [14]:
gc.enable()
gc.collect()

104

___

In [15]:
# Merge with the value counts of previous
app_train_df = app_train_df.merge(previous_agg, on = 'SK_ID_CURR', how = 'left')
app_test_df = app_test_df.merge(previous_agg, on = 'SK_ID_CURR', how = 'left')

# Merge with the stats of cash
app_train_df = app_train_df.merge(cash_by_applicant, on = 'SK_ID_CURR', how = 'left')
app_test_df = app_test_df.merge(cash_by_applicant, on = 'SK_ID_CURR', how = 'left')

# Merge with the stats of credit
app_train_df = app_train_df.merge(credit_by_applicant, on = 'SK_ID_CURR', how = 'left')
app_test_df = app_test_df.merge(credit_by_applicant, on = 'SK_ID_CURR', how = 'left')

# Merge with the stats of installments
app_train_df = app_train_df.merge(installments_by_applicant, on = 'SK_ID_CURR', how = 'left')
app_test_df = app_test_df.merge(installments_by_applicant, on = 'SK_ID_CURR', how = 'left')

In [16]:
gc.enable()
gc.collect()

69

<br>
<br>

In [17]:
app_train_df.to_csv("app_ALL_train.csv", index=False)
app_test_df.to_csv("app_ALL_test.csv", index=False)

<br>
<br>

In [18]:
app_train_missing_df = pd.DataFrame(app_train_df.isna().sum(axis=0).sort_values(ascending=False)).rename(columns={0:"Total"})
app_train_missing_df['%'] = np.round(app_train_missing_df.Total/len(app_train_df)*100, 2)
app_train_missing_df

Unnamed: 0,Total,%
client_credit_credit_AMT_PAYMENT_CURRENT_mean_max,254658,82.82
client_credit_credit_AMT_PAYMENT_CURRENT_mean_mean,254658,82.82
client_credit_credit_AMT_PAYMENT_CURRENT_max_mean,254658,82.82
client_credit_credit_AMT_PAYMENT_CURRENT_max_max,254658,82.82
client_credit_credit_AMT_PAYMENT_CURRENT_max_min,254658,82.82
client_credit_credit_AMT_PAYMENT_CURRENT_min_mean,254658,82.82
client_credit_credit_AMT_PAYMENT_CURRENT_min_max,254658,82.82
client_credit_credit_AMT_PAYMENT_CURRENT_min_min,254658,82.82
client_credit_credit_AMT_PAYMENT_CURRENT_mean_min,254658,82.82
client_credit_credit_CNT_DRAWINGS_ATM_CURRENT_mean_mean,254570,82.79


In [19]:
app_test_missing_df = pd.DataFrame(app_test_df.isna().sum(axis=0).sort_values(ascending=False)).rename(columns={0:"Total"})
app_test_missing_df['%'] = np.round(app_test_missing_df.Total/len(app_test_df)*100, 2)
app_test_missing_df

Unnamed: 0,Total,%
client_credit_credit_CNT_DRAWINGS_OTHER_CURRENT_mean_max,39650,81.34
client_credit_credit_CNT_DRAWINGS_OTHER_CURRENT_mean_mean,39650,81.34
client_credit_credit_CNT_DRAWINGS_POS_CURRENT_max_mean,39650,81.34
client_credit_credit_CNT_DRAWINGS_POS_CURRENT_mean_min,39650,81.34
client_credit_credit_CNT_DRAWINGS_POS_CURRENT_mean_max,39650,81.34
client_credit_credit_CNT_DRAWINGS_POS_CURRENT_mean_mean,39650,81.34
client_credit_credit_CNT_DRAWINGS_OTHER_CURRENT_min_min,39650,81.34
client_credit_credit_CNT_DRAWINGS_OTHER_CURRENT_min_max,39650,81.34
client_credit_credit_CNT_DRAWINGS_OTHER_CURRENT_min_mean,39650,81.34
client_credit_credit_AMT_DRAWINGS_ATM_CURRENT_min_min,39650,81.34


In [20]:
app_train_df.shape

(307500, 1349)

<br>
<br>

**CV**

In [42]:
train = app_train_df.iloc[:,1:].copy()
test = app_test_df.iloc[:,1:].copy()
label = train_label['TARGET'].copy()

cv_n_estimators = []
cv_scores = []
fi = []

skf = StratifiedKFold(n_splits=5)

for train_index, val_index in skf.split(X=train, y=label):
    #X_train, y_train = train[train_index], label[train_index]
    #X_val, y_val = train[val_index], label[val_index]
    
    # Split as DataFrame
    X_train, y_train = train.iloc[train_index,:], label.iloc[train_index]
    X_val, y_val = train.iloc[val_index,:], label.iloc[val_index]
    
    
    
    model = LGBMClassifier(device='GPU',
                           silent=False,
                           n_estimators=10000, 
                           num_leaves=31,
                           objective = 'binary', 
                           class_weight = 'balanced', 
                           learning_rate = 0.05, 
                           #reg_alpha = 0.1, 
                           #reg_lambda = 0.1, 
                           subsample = 0.8, 
                           colsample_by_tree=0.50,
                           n_jobs = 4, 
                           random_state = 2)
    
    model.fit(X_train, 
              y_train, 
              eval_metric = 'auc',
              eval_set = [(X_val, y_val)],
              eval_names = ['valid'], 
              categorical_feature = 'auto',
              early_stopping_rounds = 100, 
              verbose = 200)
    
    cv_n_estimators.append(model.best_iteration_)
    cv_scores.append(model.best_score_['valid']['auc'])
    fi.append(model.feature_importances_)

Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.784034	valid's binary_logloss: 0.531399
Early stopping, best iteration is:
[287]	valid's auc: 0.784501	valid's binary_logloss: 0.517955
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.787252	valid's binary_logloss: 0.53047
[400]	valid's auc: 0.788228	valid's binary_logloss: 0.502093
Early stopping, best iteration is:
[316]	valid's auc: 0.788857	valid's binary_logloss: 0.512807
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.780066	valid's binary_logloss: 0.528339
[400]	valid's auc: 0.78166	valid's binary_logloss: 0.499476
Early stopping, best iteration is:
[424]	valid's auc: 0.781833	valid's binary_logloss: 0.496596
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.785341	valid's binary_logloss: 0.52729
[400]	valid's auc: 0.786862	valid's binary_logloss: 0.498157
Early stopping, best iteration is:
[

In [43]:
print("Best n_estimators: {}".format(cv_n_estimators))
print("Best n_estimators (AVG): {}".format(np.mean(cv_n_estimators)))
print("Best CV AUCs: {}".format(cv_scores))

Best n_estimators: [287, 316, 424, 327, 394]
Best n_estimators (AVG): 349.6
Best CV AUCs: [0.7845012690360755, 0.788857323893546, 0.7818329259980382, 0.7870599081940427, 0.7877778250507216]


In [44]:
fi_arr = np.zeros_like(fi[0])
for fi_cv in fi:
    fi_arr += fi_cv

fi_df = {}
for name, fi_feat in zip(list(train.columns), fi_arr):
    fi_df[name] = fi_feat
fi_df = pd.DataFrame(pd.Series(fi_df), columns=['FI'])
fi_df['FI_normalized'] = fi_df['FI'] / fi_df['FI'].sum()
fi_df.reset_index(level=0, inplace=True)
fi_df.rename(columns={'index':'Feature'}, inplace=True)
fi_df.sort_values(by='FI', ascending=False)

Unnamed: 0,Feature,FI,FI_normalized
34,ORGANIZATION_TYPE,6223,0.118669
130,CREDIT_TERM,1453,0.027708
118,EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3,776,0.014798
7,AMT_ANNUITY,695,0.013253
23,OCCUPATION_TYPE,659,0.012567
35,EXT_SOURCE_1_x,622,0.011861
136,bureau_DAYS_CREDIT_max,521,0.009935
6,AMT_CREDIT,436,0.008314
19,OWN_CAR_AGE,420,0.008009
144,bureau_DAYS_CREDIT_ENDDATE_max,418,0.007971


<br>
<br>

In [45]:
fi_df.to_csv("fi_3.csv", index=False)

<br>
<br>

In [46]:
fi_df = pd.read_csv("fi_3.csv")
low_fi_feats = list(fi_df[fi_df['FI']==0].Feature.values)

train_drop_low_fi = train.copy()
test_drop_low_fi = test.copy()

train_drop_low_fi.drop(columns=low_fi_feats, inplace=True)
test_drop_low_fi.drop(columns=low_fi_feats, inplace=True)

In [None]:
label = train_label['TARGET'].copy()

cv_n_estimators = []
cv_scores = []
fi = []

skf = StratifiedKFold(n_splits=5)

for train_index, val_index in skf.split(X=train_drop_low_fi, y=label):
    # Split as DataFrame
    X_train, y_train = train_drop_low_fi.iloc[train_index,:], label.iloc[train_index]
    X_val, y_val = train_drop_low_fi.iloc[val_index,:], label.iloc[val_index]
    
    
    
    model = LGBMClassifier(#device='GPU',
                           #silent=False,
                           n_estimators=10000, 
                           num_leaves=31,
                           objective = 'binary', 
                           class_weight = 'balanced', 
                           learning_rate = 0.05, 
                           #reg_alpha = 0.1, 
                           #reg_lambda = 0.1, 
                           subsample = 0.8, 
                           colsample_by_tree=0.50,
                           n_jobs = 6, 
                           random_state = 2)
    
    model.fit(X_train, 
              y_train, 
              eval_metric = 'auc',
              eval_set = [(X_val, y_val)],
              eval_names = ['valid'], 
              categorical_feature = 'auto',
              early_stopping_rounds = 100, 
              verbose = 200)
    
    cv_n_estimators.append(model.best_iteration_)
    cv_scores.append(model.best_score_['valid']['auc'])
    fi.append(model.feature_importances_)

In [None]:
print("Best n_estimators: {}".format(cv_n_estimators))
print("Best n_estimators (AVG): {}".format(np.mean(cv_n_estimators)))
print("Best CV AUCs: {}".format(cv_scores))

In [None]:
fi_arr = np.zeros_like(fi[0])
for fi_cv in fi:
    fi_arr += fi_cv

fi_df = {}
for name, fi_feat in zip(list(train_drop_low_fi.columns), fi_arr):
    fi_df[name] = fi_feat
fi_df = pd.DataFrame(pd.Series(fi_df), columns=['FI'])
fi_df['FI_normalized'] = fi_df['FI'] / fi_df['FI'].sum()
fi_df.sort_values(by='FI', ascending=False)

<br>
<br>

**Get submission**

In [48]:
model = LGBMClassifier(device='GPU',
                           silent=False,
                           n_estimators=380, 
                           num_leaves=31,
                           objective = 'binary', 
                           class_weight = 'balanced', 
                           learning_rate = 0.05, 
                           #reg_alpha = 0.1, 
                           #reg_lambda = 0.1, 
                           subsample = 0.8, 
                           colsample_by_tree=0.50,
                           n_jobs = 4, 
                           random_state = 2)
    
model.fit(train_drop_low_fi, 
          label, 
          eval_metric = 'auc',
          categorical_feature = 'auto',
         )

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_by_tree=0.5, colsample_bytree=1.0, device='GPU',
               importance_type='split', learning_rate=0.05, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=380, n_jobs=4, num_leaves=31, objective='binary',
               random_state=2, reg_alpha=0.0, reg_lambda=0.0, silent=False,
               subsample=0.8, subsample_for_bin=200000, subsample_freq=0)

In [49]:
y_pred = model.predict_proba(test_drop_low_fi)[:,1]
submission_df = pd.DataFrame(test_id)
submission_df['TARGET'] = y_pred

In [50]:
submission_df.to_csv("29082019_5.csv", index=False)

- **0.78023** - Poly, bureau, previous (cash) features
- **0.77955** - Poly, bureau, previous (cash) features, FI<10
- **0.78018** - Poly, bureau, previous (cash) features
- **0.78025** - Poly, bureau, previous (cash) features, n_estimators=400
- **0.77963** - Poly, bureau, previous (cash) features, n_estimators=500


- **0.77904** - GPU training (colsample_by_tree doesn't work?) Different FI_2.csv.
- **0.77930** - CPU training (colsample_by_tree?) Different FI_2.csv, Different seed


- **0.78453** - ALL features, GPU training
- **0.78416** - ALL features, CPU training
- **0.78526** - ALL features, GPU training, 31 leaves

<br>
<br>

In [51]:
train_drop_low_fi.to_csv("app_ALL_train_poly.csv", index=False)
test_drop_low_fi.to_csv("app_ALL_test_poly.csv", index=False)
#pd.DataFrame(label).to_csv("train_label_dropped.csv", index=False)

<br>
<br>