In [29]:
import numpy as np
import pandas as pd
from skl2onnx import to_onnx
from sklearn.dummy import DummyClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.metrics import accuracy_score
from sklearn.model_selection import HalvingRandomSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [30]:
RANDOM_STATE = 42
results_collection = dict()

In [31]:
data = pd.read_excel(
    "../data/raw/default of credit card clients.xls", header=[1], index_col="ID"
)

 **Variable Name** | **Role** | **Type** | **Demographic** | **Description**            | **Units** | **Missing Values** 
-------------------|----------|----------|-----------------|----------------------------|-----------|--------------------
 ID                | ID       | Integer  |                 |                            |           | no                 
 X1                | Feature  | Integer  |                 | LIMIT_BAL                  |           | no                 
 X2                | Feature  | Integer  | Sex             | SEX                        |           | no                 
 X3                | Feature  | Integer  | Education Level | EDUCATION                  |           | no                 
 X4                | Feature  | Integer  | Marital Status  | MARRIAGE                   |           | no                 
 X5                | Feature  | Integer  | Age             | AGE                        |           | no                 
 X6                | Feature  | Integer  |                 | PAY_0                      |           | no                 
 X7                | Feature  | Integer  |                 | PAY_2                      |           | no                 
 X8                | Feature  | Integer  |                 | PAY_3                      |           | no                 
 X9                | Feature  | Integer  |                 | PAY_4                      |           | no                 
 X10               | Feature  | Integer  |                 | PAY_5                      |           | no                 
 X11               | Feature  | Integer  |                 | PAY_6                      |           | no                 
 X12               | Feature  | Integer  |                 | BILL_AMT1                  |           | no                 
 X13               | Feature  | Integer  |                 | BILL_AMT2                  |           | no                 
 X14               | Feature  | Integer  |                 | BILL_AMT3                  |           | no                 
 X15               | Feature  | Integer  |                 | BILL_AMT4                  |           | no                 
 X16               | Feature  | Integer  |                 | BILL_AMT5                  |           | no                 
 X17               | Feature  | Integer  |                 | BILL_AMT6                  |           | no                 
 X18               | Feature  | Integer  |                 | PAY_AMT1                   |           | no                 
 X19               | Feature  | Integer  |                 | PAY_AMT2                   |           | no                 
 X20               | Feature  | Integer  |                 | PAY_AMT3                   |           | no                 
 X21               | Feature  | Integer  |                 | PAY_AMT4                   |           | no                 
 X22               | Feature  | Integer  |                 | PAY_AMT5                   |           | no                 
 X23               | Feature  | Integer  |                 | PAY_AMT6                   |           | no                 
 Y                 | Target   | Binary   |                 | default payment next month |           | no                 



### Additional Variable Information

This research employed a binary variable, default payment (Yes = 1, No = 0), as the response variable. This study reviewed the literature and used the following 23 variables as explanatory variables:

X1: Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit.

X2: Gender (1 = male; 2 = female).

X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others).

X4: Marital status (1 = married; 2 = single; 3 = others).

X5: Age (year).

X6 - X11: History of past payment. We tracked the past monthly payment records (from April to September, 2005) as follows: X6 = the repayment status in September, 2005; X7 = the repayment status in August, 2005; . . .;X11 = the repayment status in April, 2005. The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above.

X12-X17: Amount of bill statement (NT dollar). X12 = amount of bill statement in September, 2005; X13 = amount of bill statement in August, 2005; . . .; X17 = amount of bill statement in April, 2005. 

X18-X23: Amount of previous payment (NT dollar). X18 = amount paid in September, 2005; X19 = amount paid in August, 2005; . . .;X23 = amount paid in April, 2005.



In [32]:
data

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
2,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
5,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29996,220000,1,3,1,39,0,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29997,150000,1,3,2,43,-1,-1,-1,-1,0,...,8979,5190,0,1837,3526,8998,129,0,0,0
29998,30000,1,2,2,37,4,3,2,-1,0,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29999,80000,1,3,1,41,1,-1,0,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1


In [33]:
data_train, data_test = train_test_split(
    data,
    test_size=0.25,
    random_state=RANDOM_STATE,
    stratify=data["default payment next month"],
)
assert (
    data_train["default payment next month"].mean()
    == data_test["default payment next month"].mean()
)
data_test.to_csv("../data/raw/test.csv")
del data_test, data

In [34]:
data_train

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27351,200000,1,1,2,32,-1,-1,-1,-2,-2,...,0,0,0,2182,0,0,0,0,0,1
22030,320000,2,1,2,28,0,0,0,0,0,...,25922,22982,20553,13300,12756,1295,10312,8394,8242,0
22167,170000,2,1,2,27,-1,-1,2,-1,-1,...,736,736,736,1536,0,736,736,736,736,0
1881,90000,2,3,1,40,0,0,0,0,0,...,46589,47730,49488,3416,4142,1700,1888,2710,1500,0
1320,250000,2,1,1,43,-1,-1,-1,0,0,...,17371,11481,5922,24890,48394,0,5461,15000,6000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25781,200000,2,2,1,32,0,0,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,0
13922,120000,2,2,2,24,0,0,0,0,0,...,113948,122127,121962,4200,4100,4100,10000,4560,0,0
3795,120000,2,1,2,24,0,0,0,0,0,...,49924,33188,19826,3700,2023,2016,2000,1200,1000,0
27566,360000,1,1,1,57,1,-2,-1,-1,-1,...,246,-46,-46,0,860,246,0,0,0,0


In [35]:
categorical_columns = [
    "SEX",
    "EDUCATION",
    "MARRIAGE",
    "PAY_0",
    "PAY_2",
    "PAY_3",
    "PAY_4",
    "PAY_5",
    "PAY_6",
]
continuous_columns = [
    "LIMIT_BAL",
    "AGE",
    "BILL_AMT1",
    "BILL_AMT2",
    "BILL_AMT3",
    "BILL_AMT4",
    "BILL_AMT5",
    "BILL_AMT6",
    "PAY_AMT1",
    "PAY_AMT2",
    "PAY_AMT3",
    "PAY_AMT4",
    "PAY_AMT5",
    "PAY_AMT6",
]
assert set(data_train.columns) - set(categorical_columns) - set(continuous_columns) == {
    "default payment next month"
}
assert set(categorical_columns).intersection(continuous_columns) == set()

In [36]:
print([data_train.columns.get_loc(c) for c in categorical_columns if c in data_train])
print([data_train.columns.get_loc(c) for c in continuous_columns if c in data_train])

[1, 2, 3, 5, 6, 7, 8, 9, 10]
[0, 4, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]


In [9]:
data_train.to_numpy().dtype

dtype('int64')

In [10]:
one_hot_encoder = OneHotEncoder(sparse_output=False)
one_hot_encoder.fit(data_train[categorical_columns])
onx = to_onnx(one_hot_encoder, data_train[categorical_columns].to_numpy()[:1])
with open("../src/models/one_hot_encoder.onnx", "wb") as f:
    f.write(onx.SerializeToString())
data_train = pd.concat(
    [
        data_train,
        pd.DataFrame(
            one_hot_encoder.transform(data_train[categorical_columns]),
            columns=one_hot_encoder.get_feature_names_out(),
            index=data_train.index,
        ),
    ],
    axis=1,
)
data_train = data_train.drop(columns=categorical_columns)

standard_scaler = StandardScaler()
standard_scaler.fit(data_train[continuous_columns])
onx = to_onnx(standard_scaler, data_train[continuous_columns].to_numpy()[:1])
with open("../src/models/standard_scaler.onnx", "wb") as f:
    f.write(onx.SerializeToString())
data_train = pd.concat(
    [
        data_train,
        pd.DataFrame(
            standard_scaler.transform(data_train[continuous_columns]),
            columns=standard_scaler.get_feature_names_out() + "_scaled",
            index=data_train.index,
        ),
    ],
    axis=1,
)
data_train = data_train.drop(columns=continuous_columns)

data_train

Unnamed: 0_level_0,default payment next month,SEX_1,SEX_2,EDUCATION_0,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,...,BILL_AMT3_scaled,BILL_AMT4_scaled,BILL_AMT5_scaled,BILL_AMT6_scaled,PAY_AMT1_scaled,PAY_AMT2_scaled,PAY_AMT3_scaled,PAY_AMT4_scaled,PAY_AMT5_scaled,PAY_AMT6_scaled
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27351,1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.683559,-0.672524,-0.660784,-0.650122,-0.212932,-0.289115,-0.294472,-0.317762,-0.316895,-0.296845
22030,0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,3.548249,-0.269014,-0.283374,-0.305148,0.466079,0.334530,-0.222035,0.360326,0.239390,0.171699
22167,0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.672821,-0.661067,-0.648697,-0.637769,-0.252385,-0.289115,-0.253303,-0.269365,-0.268119,-0.255004
1881,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.439065,0.052696,0.123036,0.180514,-0.137568,-0.086612,-0.199380,-0.193612,-0.137298,-0.211572
1320,0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.022695,-0.402121,-0.472243,-0.550724,1.173916,2.076882,-0.294472,0.041338,0.677181,0.044245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25781,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.683559,-0.672524,-0.660784,-0.650122,-0.346194,-0.289115,-0.294472,-0.317762,-0.316895,-0.296845
13922,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.001756,1.101228,1.344780,1.396961,-0.089687,-0.088665,-0.065133,0.339810,-0.014696,-0.296845
3795,0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.296671,0.104609,-0.115772,-0.317351,-0.120223,-0.190210,-0.181704,-0.186248,-0.237369,-0.239996
27566,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.671012,-0.668695,-0.661539,-0.650894,-0.346194,-0.247070,-0.280712,-0.317762,-0.316895,-0.296845


In [11]:
X = data_train.drop("default payment next month", axis=1)
y = data_train["default payment next month"]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=RANDOM_STATE,
    stratify=y,
)

In [13]:
X_train

Unnamed: 0_level_0,SEX_1,SEX_2,EDUCATION_0,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,MARRIAGE_0,...,BILL_AMT3_scaled,BILL_AMT4_scaled,BILL_AMT5_scaled,BILL_AMT6_scaled,PAY_AMT1_scaled,PAY_AMT2_scaled,PAY_AMT3_scaled,PAY_AMT4_scaled,PAY_AMT5_scaled,PAY_AMT6_scaled
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3087,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.679182,-0.667854,-0.655857,-0.645087,-0.346194,-0.289115,-0.294472,-0.317762,-0.316895,-0.296845
20861,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.602163,-0.539634,-0.619729,-0.608161,-0.057318,-0.014987,0.188482,-0.153369,-0.316895,-0.296845
7672,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.922155,-0.141323,-0.069084,-0.114945,0.875268,-0.289115,1.607364,-0.153369,-0.316895,2.545572
8833,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,2.791108,3.072844,3.118597,2.676099,0.204930,0.219344,0.264891,1.326167,0.213279,0.157942
18097,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.660609,-0.650731,-0.660784,-0.650122,-0.346194,-0.220669,-0.294472,-0.317762,-0.316895,-0.296845
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24855,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.325064,-0.307525,-0.300831,-0.301741,-0.242553,-0.215535,-0.221475,-0.252005,-0.250623,-0.250286
17897,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.105520,-0.073781,-0.027405,0.535861,-0.346194,-0.288773,-0.154631,-0.219126,-0.217487,-0.127664
12454,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.015851,-0.215637,-0.198867,-0.183326,-0.222215,-0.289115,-0.237753,-0.251939,-0.249298,-0.231924
16462,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.479713,-0.427666,-0.411449,-0.378514,-0.248477,-0.213726,-0.165819,-0.317762,-0.224114,-0.251366


# Dummy

In [14]:
dummy_classifier = DummyClassifier(strategy="most_frequent", random_state=RANDOM_STATE)
dummy_classifier.fit(X_train, y_train)
results_collection["Dummy"] = dummy_classifier

# Decision Tree

In [15]:
decision_tree_classifier = DecisionTreeClassifier(random_state=RANDOM_STATE)

In [16]:
decision_tree_classifier_param_distributions = {
    "criterion": ["gini", "entropy"],
    "max_depth": range(1, 11),
    "min_samples_split": range(2, 21),
    "min_samples_leaf": range(1, 21),
}

In [17]:
decision_tree_classifier_halving_random_search_cv = HalvingRandomSearchCV(
    decision_tree_classifier,
    param_distributions=decision_tree_classifier_param_distributions,
    random_state=RANDOM_STATE,
).fit(X_train, y_train)

In [18]:
decision_tree_classifier_halving_random_search_cv.best_params_

{'min_samples_split': 15,
 'min_samples_leaf': 12,
 'max_depth': 1,
 'criterion': 'gini'}

In [19]:
decision_tree_classifier_halving_random_search_cv.best_score_

0.8121399176954732

In [20]:
data_train.columns[
    decision_tree_classifier_halving_random_search_cv.best_estimator_.feature_importances_.argmax()
]

'PAY_0_1'

In [21]:
results_collection["Decision Tree"] = decision_tree_classifier_halving_random_search_cv

# Selection

In [22]:
selection_results_collection = dict()
results_collection

{'Dummy': DummyClassifier(random_state=42, strategy='most_frequent'),
 'Decision Tree': HalvingRandomSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                       param_distributions={'criterion': ['gini', 'entropy'],
                                            'max_depth': range(1, 11),
                                            'min_samples_leaf': range(1, 21),
                                            'min_samples_split': range(2, 21)},
                       random_state=42)}

In [23]:
for model_type in results_collection.keys():
    model = results_collection[model_type]
    selection_results_collection[model_type] = accuracy_score(
        y_true=y_test, y_pred=model.predict(X_test)
    )

In [24]:
results = pd.DataFrame.from_dict(
    selection_results_collection, orient="index", columns=["accuracy"]
).sort_values(by="accuracy", ascending=False)
results

Unnamed: 0,accuracy
Decision Tree,0.813689
Dummy,0.778844


In [25]:
best_model = results.head(n=1).index.item()
best_model

'Decision Tree'

# Exporting

In [26]:
if type(results_collection[best_model]) is HalvingRandomSearchCV:
    estimator = results_collection[best_model].best_estimator_

In [27]:

onx = to_onnx(estimator, X_train.to_numpy()[:1])
with open("../src/models/best_scikit_learn.onnx", "wb") as f:
    f.write(onx.SerializeToString())

