In [1]:
import pandas as pd
import numpy as np
from sklearn import * 
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
df = pd.read_csv("/data/credit-default.csv")
df.head()

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_length,installment_rate,personal_status,other_debtors,...,property,age,installment_plan,housing,existing_credits,default,dependents,telephone,foreign_worker,job
0,< 0 DM,6,critical,radio/tv,1169,unknown,> 7 yrs,4,single male,none,...,real estate,67,none,own,2,1,1,yes,yes,skilled employee
1,1 - 200 DM,48,repaid,radio/tv,5951,< 100 DM,1 - 4 yrs,2,female,none,...,real estate,22,none,own,1,2,1,none,yes,skilled employee
2,unknown,12,critical,education,2096,< 100 DM,4 - 7 yrs,2,single male,none,...,real estate,49,none,own,1,1,2,none,yes,unskilled resident
3,< 0 DM,42,repaid,furniture,7882,< 100 DM,4 - 7 yrs,2,single male,guarantor,...,building society savings,45,none,for free,1,1,2,none,yes,skilled employee
4,< 0 DM,24,delayed,car (new),4870,< 100 DM,1 - 4 yrs,3,single male,none,...,unknown/none,53,none,for free,2,2,2,none,yes,skilled employee


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
checking_balance        1000 non-null object
months_loan_duration    1000 non-null int64
credit_history          1000 non-null object
purpose                 1000 non-null object
amount                  1000 non-null int64
savings_balance         1000 non-null object
employment_length       1000 non-null object
installment_rate        1000 non-null int64
personal_status         1000 non-null object
other_debtors           1000 non-null object
residence_history       1000 non-null int64
property                1000 non-null object
age                     1000 non-null int64
installment_plan        1000 non-null object
housing                 1000 non-null object
existing_credits        1000 non-null int64
default                 1000 non-null int64
dependents              1000 non-null int64
telephone               1000 non-null object
foreign_worker          1000 non-null object
jo

In [4]:
df.default.value_counts()

1    700
2    300
Name: default, dtype: int64

In [5]:
target = "default"
label_encoder = preprocessing.LabelEncoder()

y = label_encoder.fit_transform(df[target])
X = df.drop(columns=[target])

In [6]:
cat_columns = [field for field in dict(X.dtypes) if X.dtypes[field] == "object"]
cat_columns

['checking_balance',
 'credit_history',
 'purpose',
 'savings_balance',
 'employment_length',
 'personal_status',
 'other_debtors',
 'property',
 'installment_plan',
 'housing',
 'telephone',
 'foreign_worker',
 'job']

In [7]:
num_columns = [field for field in dict(X.dtypes) if X.dtypes[field] != "object"]
num_columns

['months_loan_duration',
 'amount',
 'installment_rate',
 'residence_history',
 'age',
 'existing_credits',
 'dependents']

In [8]:
cat_pipe = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', preprocessing.OneHotEncoder(handle_unknown='error', drop="first"))
]) 

num_pipe = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy='median')),
    ('poly', preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ('scaler', preprocessing.StandardScaler()),
])

preprocessing_pipe = compose.ColumnTransformer([
    ("cat", cat_pipe, cat_columns),
    ("num", num_pipe, num_columns)
])



# Simple logistic regression

In [9]:
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", linear_model.LogisticRegression(random_state=1, solver="liblinear"))
])


param_grid = {
    "est__C": np.random.random(10) + 1
}

gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv = 5
                                , verbose=1, n_jobs=8, scoring="accuracy")

gsearch.fit(X, y)

print("Best score: ", gsearch.best_score_, "Best parameters: ", gsearch.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


Best score:  0.755 Best parameters:  {'est__C': 1.5662755393490793}


[Parallel(n_jobs=8)]: Done  35 out of  50 | elapsed:    1.9s remaining:    0.8s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    2.0s finished


# Ensemble Classifier

In [10]:
log_clf = linear_model.LogisticRegression(C = 1.53, solver= "liblinear", random_state=1) 
rnd_clf = ensemble.RandomForestClassifier(max_depth=6, n_estimators = 30, random_state=1) 
svm_clf = svm.SVC(C = 1.0, gamma = 0.15, random_state=1) 


estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", ensemble.VotingClassifier(voting="hard", estimators=
                                      [('lr', log_clf), 
                                       ('rf', rnd_clf), 
                                       ('svm', svm_clf)
                                      ])
    )
])


param_grid = {
    "est__svm__C": np.linspace(1.0, 20, 10)
}

gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv = 5
                                , verbose=1, n_jobs=8, scoring="accuracy")

gsearch.fit(X, y)

print("Best score: ", gsearch.best_score_, "Best parameters: ", gsearch.best_params_)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best score:  0.765 Best parameters:  {'est__svm__C': 5.222222222222222}


[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    1.1s finished


# AdaBoost Classifier

In [11]:
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", ensemble.AdaBoostClassifier(
          linear_model.LogisticRegression(random_state=1, solver="liblinear")
        , n_estimators=200
        , algorithm="SAMME.R"
        , learning_rate=0.051)

    )
])


param_grid = {
    "est__base_estimator__C": np.random.random(10) + 1
}

gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv = 5
                                , verbose=1, n_jobs=8, scoring="accuracy")

gsearch.fit(X, y)

print("Best score: ", gsearch.best_score_, "Best parameters: ", gsearch.best_params_)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.9s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    3.7s finished


Best score:  0.734 Best parameters:  {'est__base_estimator__C': 1.0258494070869997}


# Bagging classifier

In [12]:
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", ensemble.BaggingClassifier(
                tree.DecisionTreeClassifier(), 
                max_samples= 0.5,
                n_estimators=50,
                bootstrap=True, 
                oob_score=True)
    )
])


param_grid = {
    "est__base_estimator__max_depth": np.arange(5, 15)
}

gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv = 5
                        , verbose=1, n_jobs=8, scoring="accuracy")

gsearch.fit(X, y)

print("Best score: ", gsearch.best_score_, "Best parameters: ", gsearch.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


Best score:  0.757 Best parameters:  {'est__base_estimator__max_depth': 12}


[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    1.1s finished
