# Import dataset

In [18]:
import math
import numpy as np
import pandas as pd
import scipy
import scipy.stats as stats

import matplotlib.pyplot as plt
import plotly
import plotly.subplots
import plotly.express as px
import seaborn as sns
import sweetviz as sv

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, confusion_matrix

In [2]:
def read_dataset(train_file, test_file):
    """Load the training and test CSV files and split off the target column.

    Both files are read with pandas and indexed by their ``id`` column.

    Parameters
    ----------
    train_file : str or path-like
        CSV file holding the training rows, including a ``loan_status`` column.
    test_file : str or path-like
        CSV file holding the rows to predict.

    Returns
    -------
    X_train : pandas.DataFrame
        Training frame with the ``loan_status`` column removed.
    X_test : pandas.DataFrame
        Test frame exactly as read (indexed by ``id``).
    y_train : pandas.Series
        The ``loan_status`` column of the training frame.
    """
    # Read from the paths supplied by the caller. The arguments used to be
    # ignored in favour of hard-coded 'data/train.csv' / 'data/test.csv'.
    df_train = pd.read_csv(train_file).set_index('id')
    df_test = pd.read_csv(test_file).set_index('id')

    X_train = df_train.drop(columns=['loan_status'])
    y_train = df_train['loan_status']
    X_test = df_test

    return X_train, X_test, y_train

# Load the raw training features, test features and training target once;
# later cells derive their own splits from these frames.
X_train_0, X_test_0, y_train_0 = read_dataset(
    train_file='data/train.csv',
    test_file='data/test.csv',
)

# Decision Tree

In [3]:
class BaseFitTransformer():
    """Stateless transformer that applies a plain function in ``transform``.

    The object exposes ``fit`` / ``transform`` / ``fit_transform`` so that a
    function can be used as a pipeline step. Nothing is learned in ``fit``:
    every call to ``transform`` simply returns ``fn(X)``.
    """

    def __init__(self, fn):
        """Store ``fn``, the callable applied to the input in ``transform``."""
        self.fn = fn

    def fit(self, X, y=None):
        """Do nothing and return ``self``; ``X`` and ``y`` are ignored."""
        return self

    def transform(self, X):
        """Return the result of calling the wrapped function on ``X``."""
        return self.fn(X)

    def fit_transform(self, X, y=None):
        """Call ``fit`` followed by ``transform`` on the same input."""
        self.fit(X, y)
        return self.transform(X)

    @classmethod
    def from_fn(cls, fn):
        """Alternative constructor: build a transformer wrapping ``fn``.

        Instantiates ``cls`` rather than the hard-coded base class, so calling
        ``from_fn`` on a subclass returns an instance of that subclass.
        """
        return cls(fn)

# Feature pipeline for the decision tree: one-hot encode the frame.
# NOTE(review): the encoding is recomputed on every frame passed to
# `transform`, so the resulting columns presumably depend on the categories
# present in that frame - confirm train/validation/test columns line up.
feature_extractor = make_pipeline(BaseFitTransformer.from_fn(pd.get_dummies))

In [4]:
# Hold out 20% of the training data for validation. The split is seeded so
# that re-running the notebook reproduces the same train/validation rows.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_0, y_train_0, test_size=.2, random_state=42
)
X_train = feature_extractor.fit_transform(X_train)
X_valid = feature_extractor.transform(X_valid)

In [5]:
# Hyper-parameter candidates explored by the grid search (5 x 4 = 20 settings).
params_grid = dict(
    max_depth = [6, 8, 10, 12, 14],
    min_samples_split = [3, 5, 7, 9]
)

# Seed the tree so that ties between equally good splits are broken the same
# way on every run, keeping the reported score reproducible.
model_decision_tree_base = DecisionTreeClassifier(random_state=42)
model_decision_tree = GridSearchCV(
    model_decision_tree_base,
    params_grid,
    cv = 2,
    scoring = 'roc_auc',
    verbose = 3
)
model_decision_tree.fit(X_train, y_train)
# `score` applies the search's `scoring` ('roc_auc') to the held-out rows.
score = model_decision_tree.score(X_valid, y_valid)

print('Model created with ROC score = {}'.format(score))

Fitting 2 folds for each of 20 candidates, totalling 40 fits
[CV 1/2] END ..max_depth=6, min_samples_split=3;, score=0.908 total time=   0.0s
[CV 2/2] END ..max_depth=6, min_samples_split=3;, score=0.906 total time=   0.0s
[CV 1/2] END ..max_depth=6, min_samples_split=5;, score=0.907 total time=   0.0s
[CV 2/2] END ..max_depth=6, min_samples_split=5;, score=0.906 total time=   0.0s
[CV 1/2] END ..max_depth=6, min_samples_split=7;, score=0.908 total time=   0.0s
[CV 2/2] END ..max_depth=6, min_samples_split=7;, score=0.906 total time=   0.0s
[CV 1/2] END ..max_depth=6, min_samples_split=9;, score=0.908 total time=   0.0s
[CV 2/2] END ..max_depth=6, min_samples_split=9;, score=0.906 total time=   0.0s
[CV 1/2] END ..max_depth=8, min_samples_split=3;, score=0.906 total time=   0.0s
[CV 2/2] END ..max_depth=8, min_samples_split=3;, score=0.908 total time=   0.0s
[CV 1/2] END ..max_depth=8, min_samples_split=5;, score=0.909 total time=   0.0s
[CV 2/2] END ..max_depth=8, min_samples_split=5;

In [6]:
# Report the hyper-parameter combination selected by the grid search.
print(model_decision_tree.best_params_)
# Bare last expression: the notebook displays the fitted search object.
model_decision_tree

{'max_depth': 8, 'min_samples_split': 9}


# Random Forest

In [7]:
# Feature pipeline for the random forest: one-hot encode the frame, then
# standardise every resulting column.
feature_extractor_forest = make_pipeline(
    BaseFitTransformer.from_fn(pd.get_dummies),
    StandardScaler(),
)

In [8]:
# Hold out 20% of the training data for validation. The split is seeded so
# that re-running the notebook reproduces the same train/validation rows.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_0, y_train_0, test_size=.2, random_state=42
)
X_train = feature_extractor_forest.fit_transform(X_train)
X_valid = feature_extractor_forest.transform(X_valid)

In [9]:
# Hyper-parameter candidates explored by the grid search; only `max_depth`
# varies, so three settings are evaluated.
# NOTE(review): depths of 500-750 are presumably never reached with
# min_samples_split=25, which would make the three candidates equivalent -
# confirm and consider a smaller range.
params_grid = dict(
    n_estimators = [250],
    criterion = ['entropy'],
    max_depth = [500, 600, 750],
    min_samples_split = [25]
)

# Seed the forest so that bootstrap sampling and feature sub-sampling are
# reproducible across runs.
model_forest_base = RandomForestClassifier(random_state=42)
model_forest = GridSearchCV(
    model_forest_base,
    params_grid,
    cv = 4,
    scoring = 'roc_auc',
    verbose = 3
)
model_forest.fit(X_train, y_train)
# `score` applies the search's `scoring` ('roc_auc') to the held-out rows.
score = model_forest.score(X_valid, y_valid)

print('Model created with ROC score = {}'.format(score))

Fitting 4 folds for each of 3 candidates, totalling 12 fits
[CV 1/4] END criterion=entropy, max_depth=500, min_samples_split=25, n_estimators=250;, score=0.941 total time=  12.7s
[CV 2/4] END criterion=entropy, max_depth=500, min_samples_split=25, n_estimators=250;, score=0.943 total time=  11.3s
[CV 3/4] END criterion=entropy, max_depth=500, min_samples_split=25, n_estimators=250;, score=0.936 total time=  10.9s
[CV 4/4] END criterion=entropy, max_depth=500, min_samples_split=25, n_estimators=250;, score=0.936 total time=  11.1s
[CV 1/4] END criterion=entropy, max_depth=600, min_samples_split=25, n_estimators=250;, score=0.942 total time=  11.1s
[CV 2/4] END criterion=entropy, max_depth=600, min_samples_split=25, n_estimators=250;, score=0.942 total time=  14.8s
[CV 3/4] END criterion=entropy, max_depth=600, min_samples_split=25, n_estimators=250;, score=0.938 total time=   8.6s
[CV 4/4] END criterion=entropy, max_depth=600, min_samples_split=25, n_estimators=250;, score=0.936 total t

In [10]:
# Report the hyper-parameter combination selected by the grid search.
print(model_forest.best_params_)
# Bare last expression: the notebook displays the fitted search object.
model_forest

{'criterion': 'entropy', 'max_depth': 600, 'min_samples_split': 25, 'n_estimators': 250}


# Gradient Boosting

In [19]:
# Feature pipeline for gradient boosting: one-hot encode the frame, then
# standardise every resulting column.
feature_extractor_grad_boost = make_pipeline(
    BaseFitTransformer.from_fn(pd.get_dummies),
    StandardScaler(),
)

In [20]:
# Hold out 20% of the training data for validation. The split is seeded so
# that re-running the notebook reproduces the same train/validation rows.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_0, y_train_0, test_size=.2, random_state=42
)
X_train = feature_extractor_grad_boost.fit_transform(X_train)
X_valid = feature_extractor_grad_boost.transform(X_valid)

In [28]:
# Every list holds a single value, so the search evaluates exactly one
# setting; GridSearchCV is used here for its 4-fold cross-validated scoring.
params_grid = dict(
    n_estimators = [1000],
    loss = ['log_loss'],
    learning_rate = [0.1],
    criterion = ['friedman_mse']
)

# Seed the estimator so that repeated runs build the same ensemble.
model_grad_boost_base = GradientBoostingClassifier(random_state=42)
model_grad_boost = GridSearchCV(
    model_grad_boost_base,
    params_grid,
    cv = 4,
    scoring = 'roc_auc',
    verbose = 3
)
model_grad_boost.fit(X_train, y_train)
# `score` applies the search's `scoring` ('roc_auc') to the held-out rows.
score = model_grad_boost.score(X_valid, y_valid)

print('Model created with ROC score = {}'.format(score))

Fitting 4 folds for each of 3 candidates, totalling 12 fits
[CV 1/4] END criterion=friedman_mse, learning_rate=0.1, loss=log_loss, n_estimators=500;, score=0.958 total time=  30.6s
[CV 2/4] END criterion=friedman_mse, learning_rate=0.1, loss=log_loss, n_estimators=500;, score=0.956 total time=  32.8s
[CV 3/4] END criterion=friedman_mse, learning_rate=0.1, loss=log_loss, n_estimators=500;, score=0.959 total time=  31.0s
[CV 4/4] END criterion=friedman_mse, learning_rate=0.1, loss=log_loss, n_estimators=500;, score=0.957 total time=  31.7s
[CV 1/4] END criterion=friedman_mse, learning_rate=0.1, loss=log_loss, n_estimators=1000;, score=0.959 total time=  57.8s
[CV 2/4] END criterion=friedman_mse, learning_rate=0.1, loss=log_loss, n_estimators=1000;, score=0.958 total time=  59.9s
[CV 3/4] END criterion=friedman_mse, learning_rate=0.1, loss=log_loss, n_estimators=1000;, score=0.959 total time= 1.1min
[CV 4/4] END criterion=friedman_mse, learning_rate=0.1, loss=log_loss, n_estimators=1000;,

In [29]:
# Report the hyper-parameter combination selected by the grid search.
print(model_grad_boost.best_params_)
# Bare last expression: the notebook displays the fitted search object.
model_grad_boost

{'criterion': 'friedman_mse', 'learning_rate': 0.1, 'loss': 'log_loss', 'n_estimators': 1000}


# Logistic Regression

In [11]:
# class LogisticFeatureExtractor:
#     def fit(self, X, y=None):
#         return self

#     def transform(self, X):
#         return self.fn(X)
    
#     def fit_transform(self, X, y=None):
#         self.fit(X, y)
#         return self.transform(X)

In [12]:
def safe_log(x):
    """Natural logarithm of ``x`` shifted by a small offset (1e-5)."""
    return np.log(x + .00001)


def _add_lr_features(df):
    """One-hot encode ``df`` and append log / square-root transformed columns.

    The extra columns are computed from the raw (pre-encoding) numeric columns
    of ``df`` and added to the one-hot encoded frame.
    """
    encoded = pd.get_dummies(df)
    return encoded.assign(
        log_person_income=df['person_income'].apply(safe_log),
        log_loan_amnt=df['loan_amnt'].apply(safe_log),
        sqrt_loan_int_rate=df['loan_int_rate'].apply(np.sqrt),
        sqrt_loan_percent_income=df['loan_percent_income'].apply(np.sqrt),
        log_loan_percent_income=df['loan_percent_income'].apply(safe_log),
    )


# Feature pipeline for logistic regression: engineered features followed by
# standardisation of every column.
feature_extractor_lr = make_pipeline(
    BaseFitTransformer.from_fn(_add_lr_features),
    StandardScaler(),
)

In [13]:
# Hold out 20% of the training data for validation. The split is seeded so
# that re-running the notebook reproduces the same train/validation rows.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_0, y_train_0, test_size=.2, random_state=42
)
X_train = feature_extractor_lr.fit_transform(X_train)
X_valid = feature_extractor_lr.transform(X_valid)

In [14]:
# Candidate settings: only the inverse regularisation strength `C` varies.
params_grid = {
    'solver': ['lbfgs'],
    'max_iter': [5000],
    'C': [.05, .01, .5, .1, 1],
}

model_logistic_regression_base = LogisticRegression()
model_logistic_regression = GridSearchCV(
    estimator=model_logistic_regression_base,
    param_grid=params_grid,
    scoring='roc_auc',
    cv=2,
    verbose=3,
)
model_logistic_regression.fit(X_train, y_train)

# Evaluate the search on the held-out validation rows.
score = model_logistic_regression.score(X_valid, y_valid)
print('Model created with ROC score = {}'.format(score))

Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV 1/2] END C=0.05, max_iter=5000, solver=lbfgs;, score=0.907 total time=   0.0s
[CV 2/2] END C=0.05, max_iter=5000, solver=lbfgs;, score=0.910 total time=   0.0s
[CV 1/2] END C=0.01, max_iter=5000, solver=lbfgs;, score=0.906 total time=   0.0s
[CV 2/2] END C=0.01, max_iter=5000, solver=lbfgs;, score=0.909 total time=   0.0s
[CV 1/2] END C=0.5, max_iter=5000, solver=lbfgs;, score=0.908 total time=   0.0s
[CV 2/2] END C=0.5, max_iter=5000, solver=lbfgs;, score=0.911 total time=   0.1s
[CV 1/2] END C=0.1, max_iter=5000, solver=lbfgs;, score=0.907 total time=   0.1s
[CV 2/2] END C=0.1, max_iter=5000, solver=lbfgs;, score=0.910 total time=   0.1s
[CV 1/2] END ..C=1, max_iter=5000, solver=lbfgs;, score=0.908 total time=   0.1s
[CV 2/2] END ..C=1, max_iter=5000, solver=lbfgs;, score=0.911 total time=   0.2s
Model created with ROC score = 0.9074756105278353


# Submission

In [30]:
# Choose which fitted pipeline / model pair produces the submission.
selected_feature_extractor, selected_model = (
    feature_extractor_grad_boost,
    model_grad_boost,
)

# Apply the selected feature pipeline to the raw test frame.
X_test = selected_feature_extractor.transform(X_test_0)
# Keep the second probability column.
# NOTE(review): presumably the probability of loan_status == 1 - confirm
# against `selected_model.classes_`.
y_test = selected_model.predict_proba(X_test)[:, 1]

In [31]:
# Key the predictions by test-set id so they can be matched to the submission
# template by id rather than by row position; positional assignment silently
# mis-labels rows if the two files are ordered differently.
predictions_by_id = pd.Series(y_test, index=X_test_0.index)

df_submit = pd.read_csv('data/sample_submission.csv')
df_submit['loan_status'] = df_submit['id'].map(predictions_by_id)

# Fail loudly instead of writing a submission with missing predictions.
if df_submit['loan_status'].isna().any():
    raise ValueError('sample_submission.csv contains ids that are missing from the test set')

df_submit.to_csv('submission.csv', index=False)
df_submit

Unnamed: 0,id,loan_status
0,58645,0.999058
1,58646,0.022242
2,58647,0.641216
3,58648,0.008470
4,58649,0.022835
...,...,...
39093,97738,0.032237
39094,97739,0.006773
39095,97740,0.009593
39096,97741,0.249291
