## Setup and Loading Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv("../input/2020aibootacampqualifiers/Train.csv", index_col="Applicant_ID")

train.head(2)

Unnamed: 0_level_0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,...,form_field42,form_field43,form_field44,form_field45,form_field46,form_field47,form_field48,form_field49,form_field50,default_status
Applicant_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Apcnt_1000000,3436.0,0.28505,1.656,0.0,0.0,0.0,10689720.0,252072.0,4272776.0,11333126.0,...,0.392854,2.02,0.711632,0.0,0.0,charge,,1.129518,0.044335,no
Apcnt_1000004,3456.0,0.674,0.2342,0.0,0.0,0.0,898979.0,497531.0,9073814.0,2533168.0,...,0.314281,8.08,0.183584,,0.0,charge,349.80573,1.620483,0.322436,no


In [3]:
test = pd.read_csv("../input/2020aibootacampqualifiers/Test.csv", index_col="Applicant_ID")

test.head(2)

Unnamed: 0_level_0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,...,form_field41,form_field42,form_field43,form_field44,form_field45,form_field46,form_field47,form_field48,form_field49,form_field50
Applicant_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Apcnt_1000032,3236.0,0.34875,10.2006,0.0,0.0,418564.0,418564.0,418564.0,540710.0,0.0,...,,0.825,1.01,0.8,,0.0,charge,,0.0,0.011221
Apcnt_1000048,3284.0,1.2736,2.9606,9.0198,0.0,0.0,9858816.0,49014.0,1510098.0,18308285.0,...,18.8415,0.507694,4.04,0.623248,1.0,0.0,lending,,0.504974,0.043525


## Data Preprocessing

### Checking & Encoding Categorical Columns

In [4]:
train.default_status = train.default_status.map({'no': 0, 'yes': 1})

In [5]:
weight = 300
global_mean = train.default_status.mean()

# compute the number of values and mean of the column
aggs = train.groupby('form_field47')['default_status'].agg(['count', 'mean'])
counts, means = aggs['count'], aggs['mean']

# compute the smoothed means
smooth = (counts * means + weight + global_mean) / (counts + weight)

train.form_field47 = train.form_field47.map(smooth)
test.form_field47 = test.form_field47.map(smooth)

### Separating the Target from the predictors

In [6]:
train.head(2)

Unnamed: 0_level_0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,...,form_field42,form_field43,form_field44,form_field45,form_field46,form_field47,form_field48,form_field49,form_field50,default_status
Applicant_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Apcnt_1000000,3436.0,0.28505,1.656,0.0,0.0,0.0,10689720.0,252072.0,4272776.0,11333126.0,...,0.392854,2.02,0.711632,0.0,0.0,0.322506,,1.129518,0.044335,0
Apcnt_1000004,3456.0,0.674,0.2342,0.0,0.0,0.0,898979.0,497531.0,9073814.0,2533168.0,...,0.314281,8.08,0.183584,,0.0,0.322506,349.80573,1.620483,0.322436,0


In [7]:
y = train.pop("default_status")

train.head(2)

Unnamed: 0_level_0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,...,form_field41,form_field42,form_field43,form_field44,form_field45,form_field46,form_field47,form_field48,form_field49,form_field50
Applicant_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Apcnt_1000000,3436.0,0.28505,1.656,0.0,0.0,0.0,10689720.0,252072.0,4272776.0,11333126.0,...,,0.392854,2.02,0.711632,0.0,0.0,0.322506,,1.129518,0.044335
Apcnt_1000004,3456.0,0.674,0.2342,0.0,0.0,0.0,898979.0,497531.0,9073814.0,2533168.0,...,,0.314281,8.08,0.183584,,0.0,0.322506,349.80573,1.620483,0.322436


### Handling Missing Values

In [8]:
from sklearn.impute import SimpleImputer

zero_imputer = SimpleImputer(strategy='constant', add_indicator=True)
constant_imputer = SimpleImputer(strategy='constant', fill_value=-999)

In [9]:
imputed_train = zero_imputer.fit_transform(train)
imputed_test = zero_imputer.transform(test)

## Scaling the data

In [10]:
from sklearn.preprocessing import MinMaxScaler

In [11]:
scaler_mm = MinMaxScaler()

scaled_train = pd.DataFrame(scaler_mm.fit_transform(imputed_train))
scaled_test = pd.DataFrame(scaler_mm.transform(imputed_test))

## Modelling

### Local Validation

In [12]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from rgf.sklearn import RGFClassifier

In [13]:
def skf(model):
    kf = RepeatedStratifiedKFold(n_splits=25, n_repeats=3, random_state=99)
    local_validation = []
    test_predictions = []

    for i, (train_index, val_index) in enumerate(kf.split(scaled_train, y)):
        cv_train, cv_val = scaled_train.iloc[train_index], scaled_train.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        model.fit(cv_train, y_train)
        y_pred = model.predict_proba(cv_val)[:, 1]
        auc_score = roc_auc_score(y_val, y_pred)
        local_validation.append(auc_score)
        print("*" * 73 + "\nFold" + str(i) + ": " + str(auc_score))

        test_pred = model.predict_proba(scaled_test)[:, 1]
        test_predictions.append(test_pred)
        
    print("AUC mean score: ", np.mean(local_validation))
    return np.mean(test_predictions, axis=0)

In [14]:
cat_params = {
    'n_estimators': 2500,
    'learning_rate': 0.01,
    'max_depth': 8,
    'objective': 'CrossEntropy',
    'random_seed': 3500,
    'thread_count': -1,
}

In [15]:
model1 = CatBoostClassifier(**cat_params)
model2 = CatBoostClassifier(verbose=False, thread_count=-1, random_state=999)
model3 = LGBMClassifier(boosting_type='dart', n_estimators=800, subsample_freq=20, colsample_bytree=.8, n_jobs=-1, random_state=999)
model4 = RGFClassifier(max_leaf=1500)

In [16]:
clf = VotingClassifier(estimators=[('cat_1', model1), 
                                   ('cat_2', model2),
                                   ('lgbm', model3), 
                                   ('rgf', model4)], 
                       voting='soft', 
                       n_jobs=-1, 
                       verbose=True)

test_pred = skf(clf)

*************************************************************************
Fold0: 0.8494277493054476
*************************************************************************
Fold1: 0.8447222653621165
*************************************************************************
Fold2: 0.8381854929164293
*************************************************************************
Fold3: 0.8384939431588756
*************************************************************************
Fold4: 0.8578777760521821
*************************************************************************
Fold5: 0.8522415488947559
*************************************************************************
Fold6: 0.8391658470086798
*************************************************************************
Fold7: 0.822357465790064
*************************************************************************
Fold8: 0.8451817052337318
*************************************************************************
Fold9: 0.8530752273472416
*

### Final Steps

In [17]:
submission = pd.read_csv("../input/2020aibootacampqualifiers/SampleSubmission.csv")

In [18]:
submission.default_status = test_pred
submission.to_csv("sub_strat2", index=False)

In [19]:
submission.head(2)

Unnamed: 0,Applicant_ID,default_status
0,Apcnt_1000032,0.304892
1,Apcnt_1000048,0.344676
