## Setup and Loading Data

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

In [2]:
train = pd.read_csv("../input/2020aibootacampqualifiers/Train.csv", index_col="Applicant_ID")

train.head(2)

Unnamed: 0_level_0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,...,form_field42,form_field43,form_field44,form_field45,form_field46,form_field47,form_field48,form_field49,form_field50,default_status
Applicant_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Apcnt_1000000,3436.0,0.28505,1.656,0.0,0.0,0.0,10689720.0,252072.0,4272776.0,11333126.0,...,0.392854,2.02,0.711632,0.0,0.0,charge,,1.129518,0.044335,no
Apcnt_1000004,3456.0,0.674,0.2342,0.0,0.0,0.0,898979.0,497531.0,9073814.0,2533168.0,...,0.314281,8.08,0.183584,,0.0,charge,349.80573,1.620483,0.322436,no


In [3]:
test = pd.read_csv("../input/2020aibootacampqualifiers/Test.csv", index_col="Applicant_ID")

test.head(2)

Unnamed: 0_level_0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,...,form_field41,form_field42,form_field43,form_field44,form_field45,form_field46,form_field47,form_field48,form_field49,form_field50
Applicant_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Apcnt_1000032,3236.0,0.34875,10.2006,0.0,0.0,418564.0,418564.0,418564.0,540710.0,0.0,...,,0.825,1.01,0.8,,0.0,charge,,0.0,0.011221
Apcnt_1000048,3284.0,1.2736,2.9606,9.0198,0.0,0.0,9858816.0,49014.0,1510098.0,18308285.0,...,18.8415,0.507694,4.04,0.623248,1.0,0.0,lending,,0.504974,0.043525


## Data Preprocessing

### Checking & Encoding Categorical Columns

In [4]:
train.default_status = train.default_status.map({'no': 0, 'yes': 1})

In [5]:
weight = 300
global_mean = train.default_status.mean()

# compute the number of values and mean of the column
aggs = train.groupby('form_field47')['default_status'].agg(['count', 'mean'])
counts, means = aggs['count'], aggs['mean']

# compute the smoothed means
smooth = (counts * means + weight + global_mean) / (counts + weight)

train.form_field47 = train.form_field47.map(smooth)
test.form_field47 = test.form_field47.map(smooth)

In [6]:
train.head(2)

Unnamed: 0_level_0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,...,form_field42,form_field43,form_field44,form_field45,form_field46,form_field47,form_field48,form_field49,form_field50,default_status
Applicant_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Apcnt_1000000,3436.0,0.28505,1.656,0.0,0.0,0.0,10689720.0,252072.0,4272776.0,11333126.0,...,0.392854,2.02,0.711632,0.0,0.0,0.322506,,1.129518,0.044335,0
Apcnt_1000004,3456.0,0.674,0.2342,0.0,0.0,0.0,898979.0,497531.0,9073814.0,2533168.0,...,0.314281,8.08,0.183584,,0.0,0.322506,349.80573,1.620483,0.322436,0


### Separating the Target from the predictors

In [7]:
y = train.pop("default_status")

train.head(2)

Unnamed: 0_level_0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,...,form_field41,form_field42,form_field43,form_field44,form_field45,form_field46,form_field47,form_field48,form_field49,form_field50
Applicant_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Apcnt_1000000,3436.0,0.28505,1.656,0.0,0.0,0.0,10689720.0,252072.0,4272776.0,11333126.0,...,,0.392854,2.02,0.711632,0.0,0.0,0.322506,,1.129518,0.044335
Apcnt_1000004,3456.0,0.674,0.2342,0.0,0.0,0.0,898979.0,497531.0,9073814.0,2533168.0,...,,0.314281,8.08,0.183584,,0.0,0.322506,349.80573,1.620483,0.322436


### Handling Missing Values

In [8]:
from sklearn.impute import SimpleImputer

zero_imputer = SimpleImputer(strategy='constant', add_indicator=True)

In [9]:
imputed_train = zero_imputer.fit_transform(train)
imputed_test = zero_imputer.transform(test)

## Scaling the data

In [10]:
from sklearn.preprocessing import MinMaxScaler

In [11]:
scaler_mm = MinMaxScaler()

scaled_train = pd.DataFrame(scaler_mm.fit_transform(imputed_train))
scaled_test = pd.DataFrame(scaler_mm.transform(imputed_test))

## Modelling

### Hyperparameter Tuning

XGBoost: learning_rate (0.01 - 0.2), max_depth, min_child_weight, colsample_bytree, subsample, n_estimators

CatBoost: learning_rate, depth, l2-leaf-reg, rsm, iterations

LightGBM: learning_rate, max_depth, num_leaves, min_data_in_leaf(min_data, min_child_samples), feature_fraction, bagging_fraction, num_iterations

In [12]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from rgf.sklearn import RGFClassifier, FastRGFClassifier
from sklearn.ensemble import RandomForestClassifier

In [13]:
rgf_params = {
    'max_leaf': [1000, 1200, 1500, 2000, 2500, 3000, 3500, 4000],
#     'algorithm': ['RGF', 'RGF_Opt', 'RGF_Sib'],
#     'l2': [1.0, 0.1, 0.01, 0.001],
#     'learning_rate': [0.03, 0.001, 0.005, 0.01, 0.1, 0.2, 0.3],
#     'n_iter': [250, 100, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500],
}

rgf_model = RGFClassifier(algorithm='RGF',learning_rate=0.01, l2=0.001)

In [14]:
tune_rgf_1 = RandomizedSearchCV(rgf_model, 
                          param_distributions=rgf_params,
                          scoring='roc_auc')

In [15]:
tune_rgf_1.fit(scaled_train, y)



RandomizedSearchCV(estimator=RGFClassifier(l2=0.001, learning_rate=0.01),
                   param_distributions={'max_leaf': [1000, 1200, 1500, 2000,
                                                     2500, 3000, 3500, 4000]},
                   scoring='roc_auc')

In [16]:
tune_rgf_1.best_score_

0.8390604551316645

Default: 0.8389612679372922

In [17]:
tune_rgf_1.best_params_

{'max_leaf': 1000}

### Local Validation

In [18]:
tuned_rgf_1 = tune_rgf_1.best_estimator_

In [19]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

def skf(model):
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=99)
    local_validation = []
    test_predictions = []

    for i, (train_index, val_index) in enumerate(kf.split(scaled_train, y)):
        cv_train, cv_val = scaled_train.iloc[train_index], scaled_train.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        model.fit(cv_train, y_train)
        y_pred = model.predict_proba(cv_val)[:, 1]
        auc_score = roc_auc_score(y_val, y_pred)
        local_validation.append(auc_score)
        print("*" * 73 + "\nFold" + str(i) + ": " + str(auc_score))

        test_pred = model.predict_proba(scaled_test)[:, 1]
        test_predictions.append(test_pred)
        
    print("AUC mean score: ", np.mean(local_validation))
    return np.mean(test_predictions, axis=0)

In [20]:
test_pred = skf(tuned_rgf_1)

*************************************************************************
Fold0: 0.8436922501000214
*************************************************************************
Fold1: 0.8398410677058109
*************************************************************************
Fold2: 0.8399988645353125
*************************************************************************
Fold3: 0.8363769304785665
*************************************************************************
Fold4: 0.8361064493402773
AUC mean score:  0.8392031124319977


Default: 0.8389612679372922

### Final Steps

In [21]:
submission = pd.read_csv("../input/2020aibootacampqualifiers/SampleSubmission.csv")

In [22]:
submission.default_status = test_pred
submission.to_csv("sub_rgf", index=False)

In [23]:
submission.head(2)

Unnamed: 0,Applicant_ID,default_status
0,Apcnt_1000032,0.314945
1,Apcnt_1000048,0.354779


### Next Steps

How do we improve our model's accuracy?