In [23]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
# Preprocessing and pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    auc,
    average_precision_score,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
    roc_curve,
    precision_recall_curve, 
    roc_auc_score,
)
import pickle

# Import datasets

In [2]:
train_df = pd.read_csv('../data/train_df.csv')
val_df = pd.read_csv('../data/val_df.csv')
test_df = pd.read_csv('../data/test_df.csv')

In [3]:
train_df

Unnamed: 0,mg_acc,FSA,origin_date,property_type,purchase_price,amort_period,loan_size,interest_rate,TDS,default,Sample,cust_id,cust_age,cust_gender,cust_income,cust_cr_score,loan_years,lvr
0,733098,V2K,2012-01-12,detached,299000.0,25,239200.0,2.38,0.135672,1.0,Estimation,2277661,58,male,97500.0,900,10.742156,0.80
1,575927,V2T,2015-05-16,apartment,337000.0,25,256120.0,2.00,0.155294,0.0,Estimation,7394851,25,male,58800.0,899,7.401910,0.76
2,613989,V0N,2015-07-08,semi-detached/row/town-houses,249000.0,25,194220.0,1.67,0.117018,0.0,Estimation,1683540,54,female,70000.0,899,7.256801,0.78
3,50824,V1R,2018-03-26,detached,266000.0,25,210140.0,1.65,0.082235,0.0,Estimation,8924568,58,male,167600.0,899,4.540797,0.79
4,7399,V3S,2013-04-27,semi-detached/row/town-houses,568000.0,25,448720.0,2.11,0.294126,0.0,Estimation,936941,47,male,32900.0,898,9.452602,0.79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2098,799786,V9G,2014-08-22,apartment,277000.0,30,218830.0,1.60,0.182272,0.0,Estimation,1057050,42,female,73800.0,387,8.132931,0.79
2099,18329,V8R,2018-03-07,semi-detached/row/town-houses,688000.0,25,550400.0,1.91,0.261726,0.0,Estimation,5908379,46,female,62400.0,385,4.592817,0.80
2100,942489,V1J,2018-05-05,semi-detached/row/town-houses,359000.0,25,287200.0,2.28,0.151774,0.0,Estimation,6920519,47,male,73900.0,369,4.431281,0.80
2101,793213,V2R,2018-10-29,semi-detached/row/town-houses,521000.0,25,416800.0,2.42,0.286790,0.0,Estimation,9485489,30,female,64400.0,360,3.946671,0.80


# Feature transformations

In [4]:
X_train = train_df.drop(columns=["default"])
X_test = test_df.drop(columns=["default"])
X_val = val_df.drop(columns=["default"])

y_train = train_df["default"]
y_val = val_df["default"]


In [24]:
X_train.head()

Unnamed: 0,mg_acc,FSA,origin_date,property_type,purchase_price,amort_period,loan_size,interest_rate,TDS,Sample,cust_id,cust_age,cust_gender,cust_income,cust_cr_score,loan_years,lvr
0,733098,V2K,2012-01-12,detached,299000.0,25,239200.0,2.38,0.135672,Estimation,2277661,58,male,97500.0,900,10.742156,0.8
1,575927,V2T,2015-05-16,apartment,337000.0,25,256120.0,2.0,0.155294,Estimation,7394851,25,male,58800.0,899,7.40191,0.76
2,613989,V0N,2015-07-08,semi-detached/row/town-houses,249000.0,25,194220.0,1.67,0.117018,Estimation,1683540,54,female,70000.0,899,7.256801,0.78
3,50824,V1R,2018-03-26,detached,266000.0,25,210140.0,1.65,0.082235,Estimation,8924568,58,male,167600.0,899,4.540797,0.79
4,7399,V3S,2013-04-27,semi-detached/row/town-houses,568000.0,25,448720.0,2.11,0.294126,Estimation,936941,47,male,32900.0,898,9.452602,0.79


In [25]:
train_df["property_type"].unique()

array(['detached', 'apartment', 'semi-detached/row/town-houses'],
      dtype=object)

In [26]:
train_df["FSA"].unique()

array(['V2K', 'V2T', 'V0N', 'V1R', 'V3S', 'V9R', 'V5H', 'V4B', 'V3X',
       'V9T', 'V2E', 'V3B', 'V2A', 'V1N', 'V9Y', 'V8A', 'V8B', 'V2J',
       'V4K', 'V8J', 'V4V', 'V3R', 'V1V', 'V3H', 'V0J', 'V5B', 'V1S',
       'V5P', 'V0B', 'V2B', 'V9B', 'V7G', 'V5S', 'V1E', 'V8P', 'V8T',
       'V1C', 'V8W', 'V0S', 'V1G', 'V0E', 'V7C', 'V4Z', 'V6X', 'V3A',
       'V5E', 'V3C', 'V2R', 'V4N', 'V5C', 'V3Y', 'V2L', 'V6P', 'V9C',
       'V2N', 'V2G', 'V4L', 'V0T', 'V9L', 'V3T', 'V1B', 'V0C', 'V9N',
       'V4M', 'V3J', 'V8C', 'V2P', 'V4A', 'V4R', 'V8M', 'V6Y', 'V3V',
       'V8Z', 'V9M', 'V9P', 'V1Y', 'V0R', 'V6B', 'V1M', 'V2C', 'V1T',
       'V2Z', 'V3G', 'V3Z', 'V8G', 'V1Z', 'V4C', 'V0P', 'V1X', 'V0X',
       'V5R', 'V5X', 'V9Z', 'V1J', 'V7E', 'V6V', 'V4E', 'V1W', 'V9A',
       'V2Y', 'V2S', 'V3M', 'V3L', 'V8N', 'V4T', 'V4W', 'V6K', 'V8L',
       'V2W', 'V3N', 'V6E', 'V5W', 'V6C', 'V0G', 'V9J', 'V0K', 'V6L',
       'V8V', 'V6Z', 'V9W', 'V9E', 'V9G', 'V9V', 'V9K', 'V5A', 'V3E',
       'V2X', 'V8Y',

# Build the preprocessing pipeline 

The plan:

1. One-hot encode `property_type` and `FSA`. 
2. Drop `mg_acc`,`cust_id`, `origin_date`, `Sample` as they are unique identifiers that is unlikely to be useful.
3. Drop `cust_age` and `cust_gender` because our lending algorithm is NOT allowed to depend on gender and age.
4. Treat the rest as numeric and standardize them.

In [32]:
categorical_features = ["property_type", "FSA"]
drop_features = ["mg_acc","cust_id","origin_date","cust_age","cust_gender", "Sample"]
numeric_features = list(set(train_df.columns)
    - set(["default"])
    - set(categorical_features)
    - set(drop_features))

In [33]:
numeric_features 

['interest_rate',
 'loan_years',
 'lvr',
 'loan_size',
 'TDS',
 'cust_income',
 'cust_cr_score',
 'amort_period',
 'purchase_price']

In [34]:
preprocessor = make_column_transformer(
    (
        make_pipeline(SimpleImputer(),StandardScaler()),
        numeric_features),
    (OneHotEncoder(handle_unknown="ignore", sparse=False), categorical_features),
    ("drop", drop_features),)


In [36]:
preprocessor.fit(X_train)

In [37]:
preprocessor.named_transformers_

{'pipeline': Pipeline(steps=[('simpleimputer', SimpleImputer()),
                 ('standardscaler', StandardScaler())]),
 'onehotencoder': OneHotEncoder(handle_unknown='ignore', sparse=False),
 'drop': 'drop'}

In [40]:
new_columns = (
    numeric_features
    + list(
        preprocessor.named_transformers_["onehotencoder"].get_feature_names_out(
            categorical_features
        )
    )
)

In [42]:
new_columns

['interest_rate',
 'loan_years',
 'lvr',
 'loan_size',
 'TDS',
 'cust_income',
 'cust_cr_score',
 'amort_period',
 'purchase_price',
 'property_type_apartment',
 'property_type_detached',
 'property_type_semi-detached/row/town-houses',
 'FSA_V0A',
 'FSA_V0B',
 'FSA_V0C',
 'FSA_V0E',
 'FSA_V0G',
 'FSA_V0H',
 'FSA_V0J',
 'FSA_V0K',
 'FSA_V0L',
 'FSA_V0M',
 'FSA_V0N',
 'FSA_V0P',
 'FSA_V0R',
 'FSA_V0S',
 'FSA_V0T',
 'FSA_V0X',
 'FSA_V1A',
 'FSA_V1B',
 'FSA_V1C',
 'FSA_V1E',
 'FSA_V1G',
 'FSA_V1H',
 'FSA_V1J',
 'FSA_V1K',
 'FSA_V1L',
 'FSA_V1M',
 'FSA_V1N',
 'FSA_V1P',
 'FSA_V1R',
 'FSA_V1S',
 'FSA_V1T',
 'FSA_V1V',
 'FSA_V1W',
 'FSA_V1X',
 'FSA_V1Y',
 'FSA_V1Z',
 'FSA_V2A',
 'FSA_V2B',
 'FSA_V2C',
 'FSA_V2E',
 'FSA_V2G',
 'FSA_V2H',
 'FSA_V2J',
 'FSA_V2K',
 'FSA_V2L',
 'FSA_V2M',
 'FSA_V2N',
 'FSA_V2P',
 'FSA_V2R',
 'FSA_V2S',
 'FSA_V2T',
 'FSA_V2V',
 'FSA_V2W',
 'FSA_V2X',
 'FSA_V2Y',
 'FSA_V2Z',
 'FSA_V3A',
 'FSA_V3B',
 'FSA_V3C',
 'FSA_V3E',
 'FSA_V3G',
 'FSA_V3H',
 'FSA_V3J',
 'FSA_V3

In [41]:
pd.DataFrame(
    preprocessor.transform(X_train), index=X_train.index, columns=new_columns
)

Unnamed: 0,interest_rate,loan_years,lvr,loan_size,TDS,cust_income,cust_cr_score,amort_period,purchase_price,property_type_apartment,...,FSA_V9N,FSA_V9P,FSA_V9R,FSA_V9S,FSA_V9T,FSA_V9V,FSA_V9W,FSA_V9X,FSA_V9Y,FSA_V9Z
0,0.666542,1.849176,0.646032,-0.888479,-1.287748,0.496924,1.709617,-0.292795,-0.900511,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.163173,0.314573,-1.440919,-0.813624,-1.081751,-0.600262,1.699841,-0.292795,-0.768973,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.883715,0.247905,-0.397444,-1.087474,-1.483580,-0.282730,1.699841,-0.292795,-1.073587,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.927384,-0.999904,0.124294,-1.017043,-1.848740,2.484335,1.699841,-0.292795,-1.014741,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.077008,1.256719,0.124294,0.038454,0.375725,-1.334555,1.690065,-0.292795,0.030640,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2098,-1.036557,0.650424,0.124294,-0.978598,-0.798532,-0.174996,-3.305477,1.808733,-0.976665,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2099,-0.359684,-0.976004,0.646032,0.488295,0.035590,-0.498198,-3.325029,-0.292795,0.446024,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2100,0.448196,-1.050218,0.646032,-0.676123,-1.118709,-0.172161,-3.481446,-0.292795,-0.692819,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2101,0.753881,-1.272862,0.646032,-0.102762,0.298712,-0.441496,-3.569430,-0.292795,-0.132052,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Hyperparameter Tuning

## 1. SVM Training:

In [14]:
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import f1_score
import random 
random.seed(123)

svc_param_grid = {
    "gamma": np.logspace(-3, 2, 6), 
    "C": np.linspace(2, 4, 6),
    "kernel": ["linear",'rbf']
}

In [15]:
data = {'ml':[], "model":[], "val_roc_auc_score":[], "train_roc_auc_score":[]}
for g in random.choices(ParameterGrid(svc_param_grid), k=20):
    pipe_svm = make_pipeline(preprocessor, SVC(**g,class_weight='balanced',random_state=123,probability=True))
    pipe_svm.fit(X_train,y_train)
    data["ml"].append('SVM')
    data["model"].append(g)
    data['val_roc_auc_score'].append(roc_auc_score(y_val, pipe_svm.predict(X_val)))
    data['train_roc_auc_score'].append(roc_auc_score(y_train, pipe_svm.predict(X_train)))

In [16]:
svm = pd.DataFrame(data).sort_values(by="val_roc_auc_score", ascending=False)
svm

Unnamed: 0,ml,model,val_roc_auc_score,train_roc_auc_score
8,SVM,"{'kernel': 'rbf', 'gamma': 0.001, 'C': 4.0}",0.650844,0.631662
0,SVM,"{'kernel': 'rbf', 'gamma': 0.01, 'C': 2.0}",0.648201,0.658002
18,SVM,"{'kernel': 'linear', 'gamma': 100.0, 'C': 2.4}",0.637804,0.691588
15,SVM,"{'kernel': 'linear', 'gamma': 1.0, 'C': 2.0}",0.637804,0.692037
13,SVM,"{'kernel': 'linear', 'gamma': 0.001, 'C': 2.0}",0.637804,0.692037
11,SVM,"{'kernel': 'linear', 'gamma': 0.001, 'C': 2.8}",0.637804,0.691588
1,SVM,"{'kernel': 'linear', 'gamma': 1.0, 'C': 2.0}",0.637804,0.692037
10,SVM,"{'kernel': 'linear', 'gamma': 0.001, 'C': 2.8}",0.637804,0.691588
5,SVM,"{'kernel': 'linear', 'gamma': 0.01, 'C': 2.0}",0.637804,0.692037
19,SVM,"{'kernel': 'linear', 'gamma': 10.0, 'C': 2.8}",0.637804,0.691588


In [17]:
svm_best_params = svm.iloc[0,1]
svm_best = make_pipeline(preprocessor,  SVC(**svm_best_params,class_weight='balanced',random_state=123,probability=True))
svm_best.fit(X_train, y_train)
pickle.dump(svm_best, open('../models/svm.pkl', 'wb'))

## 2. Logistic regression Training:

In [18]:
lr_param_grid = {
    "penalty": ['l1','l2'], 
    "C": np.logspace(-3, 2, 6)
}

In [19]:
data = {'ml':[], "model":[], "val_roc_auc_score":[], "train_roc_auc_score":[]}
for g in random.choices(ParameterGrid(lr_param_grid), k=20):
    pipe_lr = make_pipeline(preprocessor, LogisticRegression(**g, class_weight='balanced',max_iter = 4000, solver='saga',random_state=123))
    pipe_lr.fit(X_train,y_train)
    data["ml"].append('LR')
    data["model"].append(g)
    data['val_roc_auc_score'].append(roc_auc_score(y_val, pipe_lr.predict(X_val)))
    data['train_roc_auc_score'].append(roc_auc_score(y_train, pipe_lr.predict(X_train)))

In [20]:
lr = pd.DataFrame(data).sort_values(by="val_roc_auc_score", ascending=False)
lr

Unnamed: 0,ml,model,val_roc_auc_score,train_roc_auc_score
7,LR,"{'penalty': 'l2', 'C': 0.01}",0.654573,0.631256
16,LR,"{'penalty': 'l2', 'C': 0.01}",0.654573,0.631256
10,LR,"{'penalty': 'l1', 'C': 0.1}",0.652679,0.626825
17,LR,"{'penalty': 'l1', 'C': 0.1}",0.652679,0.626825
8,LR,"{'penalty': 'l1', 'C': 100.0}",0.645952,0.691018
0,LR,"{'penalty': 'l1', 'C': 100.0}",0.645952,0.691018
5,LR,"{'penalty': 'l1', 'C': 100.0}",0.645952,0.691018
6,LR,"{'penalty': 'l1', 'C': 1.0}",0.645893,0.669787
9,LR,"{'penalty': 'l2', 'C': 10.0}",0.645893,0.686651
11,LR,"{'penalty': 'l2', 'C': 10.0}",0.645893,0.686651


In [21]:
lr_best_params = lr.iloc[0,1]
lr_best = make_pipeline(preprocessor, LogisticRegression(**lr_best_params, class_weight='balanced',max_iter = 4000, solver='saga',random_state=123))
lr_best.fit(X_train, y_train)
pickle.dump(lr_best, open('../models/lr.pkl', 'wb'))

In [22]:
coef_df.reset_index()

NameError: name 'coef_df' is not defined

### 3.1 Feature Importance

In [None]:
coefs = lr_best.named_steps["logisticregression"].coef_
data = {
    "coefficient": lr_best.named_steps["logisticregression"].coef_.tolist()[0],
    "magnitude": np.absolute(lr_best.named_steps["logisticregression"].coef_).tolist()[0],
}
coef_df = pd.DataFrame(data, index=new_columns).sort_values(
    "magnitude", ascending=False
)

coef_df[:15].plot.bar(y="magnitude", title="Logistic regression variable importance");

## 3. Random Forest Training:

In [None]:
rf_param_grid = {'bootstrap': [True, False],
               'max_depth': [2,4],
               'max_features': ['sqrt', 'log2'],
               'min_samples_leaf': [1, 2],
               'min_samples_split': [2, 5, 10],
               'n_estimators': [int(x) for x in np.linspace(start = 10, stop = 200, num = 20)]}

In [None]:
data = {'ml':[], "model":[], "val_roc_auc_score":[], "train_roc_auc_score":[]}
for g in random.choices(ParameterGrid(rf_param_grid), k=20):
    pipe_rf = make_pipeline(preprocessor, RandomForestClassifier(**g,class_weight='balanced',random_state=123))
    pipe_rf.fit(X_train,y_train)
    data["ml"].append('RF')
    data["model"].append(g)
    data['val_roc_auc_score'].append(roc_auc_score(y_val, pipe_rf.predict(X_val)))
    data['train_roc_auc_score'].append(roc_auc_score(y_train, pipe_rf.predict(X_train)))

In [None]:
rf = pd.DataFrame(data).sort_values(by="val_roc_auc_score", ascending=False)
rf

In [None]:
rf_best_params = rf.iloc[0,1]
rf_best = make_pipeline(preprocessor, RandomForestClassifier(**rf_best_params,class_weight='balanced',random_state=123))
rf_best.fit(X_train, y_train)
pickle.dump(rf_best, open('../models/rf.pkl', 'wb'))

### 3.1 Feature Importance

In [None]:
data = {
    "Importance": rf_best.named_steps["randomforestclassifier"].feature_importances_,
}
imps = pd.DataFrame(
    data=data,
    index=new_columns,
).sort_values(by="Importance", ascending=False)

imps[:15].plot.bar(y="Importance", title="Random Forest variable importance");

## 4. XGboost Training:

In [None]:
xgb_param_grid = {
 'eta': [0.05,0.1,0.3],
 "n_estimators": [int(x) for x in np.linspace(start = 10, stop = 200, num = 20)],
 "learning_rate" : [0.01,0.1,0.2,0.5,1],
 "max_depth" : [2, 3, 5, 8, 12],
 "gamma": np.logspace(-2, 2, 6),
 "booster": ['gbtree','gblinear'],
}

In [None]:
data = {'ml':[], "model":[], "val_roc_auc_score":[], "train_roc_auc_score":[]}

for g in random.choices(ParameterGrid(xgb_param_grid), k=20):
    pipe_xgb = make_pipeline(preprocessor, XGBClassifier(**g,class_weight='balanced',use_label_encoder =False, verbosity=0, random_state=123))
    pipe_xgb.fit(X_train,y_train)
    data["ml"].append('XGB')
    data["model"].append(g)
    data['val_roc_auc_score'].append(roc_auc_score(y_val, pipe_xgb.predict(X_val)))
    data['train_roc_auc_score'].append(roc_auc_score(y_train, pipe_xgb.predict(X_train)))

In [None]:
xgb = pd.DataFrame(data).sort_values(by="val_roc_auc_score", ascending=False)
xgb

In [None]:
xgb_best_params = xgb.iloc[1,1]
xgb_best = make_pipeline(preprocessor, XGBClassifier(**xgb_best_params,class_weight='balanced',random_state=123))
xgb_best.fit(X_train, y_train)
pickle.dump(xgb_best, open('../models/xgb.pkl', 'wb'))

In [None]:
nn_param_grid = {
    'alpha': [0.05, 0.01, 0.005, 0.001],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'activation':['logistic', 'tanh', 'relu']}

In [None]:
data

In [None]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                   hidden_layer_sizes=(5, 2), random_state=1)

clf.fit(X, y)
MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=1,
              solver='lbfgs')

# Ensemble method (voting ensemble)

In [None]:
from sklearn.ensemble import VotingClassifier
models = {
    "logistic regression": lr_best,
    "random forest": rf_best,
    "svm": svm_best,
    'xgb_boost': xgb_best
}

In [None]:
averaging_model = VotingClassifier(list(models.items()), voting='soft')
averaging_model.fit(X_train, y_train);
data = {'ml':[], "model":[], "val_roc_auc_score":[], "train_roc_auc_score":[]}
data["ml"].append('voting ensemble')
data["model"].append(['svm','lr','rf','xgb_boost'])
data['val_roc_auc_score'].append(roc_auc_score(y_val, averaging_model.predict(X_val)))
data['train_roc_auc_score'].append(roc_auc_score(y_train, averaging_model.predict(X_train)))

In [None]:
pd.DataFrame(data).sort_values(by="val_roc_auc_score", ascending=False)

In [None]:
pickle.dump(averaging_model, open('../models/ensemble.pkl', 'wb'))