In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
df = pd.read_csv('train_ctrUa4K.csv')

In [5]:
df.drop(columns = ['Loan_ID'], inplace = True)

In [6]:
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


### Data

In [7]:
X = df.drop('Loan_Status', axis=1)
y = df['Loan_Status']

### Build list of categorical and numerical features

In [8]:
numeric_features = []
categorical_features = []
for col in df.columns:
    if df[col].dtype == np.dtype('O') and col != 'Loan_Status':
        categorical_features.append(col)
    if df[col].dtype != np.dtype('O') and col != 'Loan_Status':
        numeric_features.append(col)

### Numerical features imputation and scaling

In [9]:
num_impute = SimpleImputer(strategy='median')
num_scale = StandardScaler()
num_steps = [('impute', num_impute), ('scale', num_scale)]
num_pipeline = Pipeline(num_steps)

### Categorical feature imputation and one-hot encoder

In [10]:
cat_impute = SimpleImputer(strategy='constant', fill_value='missing')
cat_encode = OneHotEncoder(handle_unknown='ignore')
cat_steps = [('imputer_cat', cat_impute), ('encode', cat_encode)]
cat_pipeline = Pipeline(cat_steps)

### Different piplelines for different features

In [11]:
data_preprocess = ColumnTransformer([('categorical', cat_pipeline, categorical_features),
                                     ('numerical', num_pipeline, numeric_features)])

### Random Forest

In [12]:
rf = RandomForestClassifier()

### Grid search for best preprocessing strategy and best parameters of Random Forest

In [13]:
final_steps = [('peprocess', data_preprocess), ('randomforest', rf)]
final_pipeline = Pipeline(final_steps)

kf = KFold(n_splits=5, shuffle=True)
grid = {'peprocess__numerical__impute__strategy': ['mean', 'median'],
       'randomforest__n_estimators': [50, 100, 200],
       'randomforest__max_depth': [2, 6, 8, 10],
       'randomforest__max_features':['auto', 'sqrt']}
gs = GridSearchCV(final_pipeline, grid, cv=kf)
gs.fit(X, y)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('peprocess',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('categorical',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer_cat',
                                                                                          SimpleImputer(add_indicator=False,
                                                                

In [14]:
gs.best_params_

{'peprocess__numerical__impute__strategy': 'mean',
 'randomforest__max_depth': 6,
 'randomforest__max_features': 'sqrt',
 'randomforest__n_estimators': 200}

In [15]:
gs.best_score_

0.8094462540716613