### __Credit scoring dataset__

In [93]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [94]:
df = pd.read_csv('CreditScoring.csv')

### __Data cleaning__

In [95]:
df.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


* __Although most of the columns are numerical, some are categorical: status, home, marital [status], records, and job. The values we see in the DataFrame, however, are numbers, not strings. This means that we need to translate them to their actual names.__

In [96]:
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


* __In Pandas, we can use map for converting the numbers to strings. For that, we first define the dictionary with mapping from the current value (number) to the desired value (string):__

In [97]:
status_values = {
    1: 'ok',
    2: 'default',
    0: 'unk'
}

df.status = df.status.map(status_values)
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,1,60,30,2,1,3,73,129,0,0,800,846
1,ok,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,default,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,ok,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,ok,0,1,36,26,1,1,1,46,107,0,0,310,910


* __We repeat the same procedure for all the other columns.__

In [98]:
#home column:
home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unk'
}

df.home = df.home.map(home_values)

#marital, records, and job columns:
marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk'
}

df.marital = df.marital.map(marital_values)

records_values = {
    1: 'no',
    2: 'yes',
    0: 'unk'
}

df.records = df.records.map(records_values)

job_values = {
    1: 'fixed',
    2: 'parttime',
    3: 'freelance',
    4: 'others',
    0: 'unk'
}

df.job = df.job.map(job_values)

df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,default,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107,0,0,310,910


* __NaN = income, assets, and debt.__

In [99]:
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,763317.0,1060341.0,404382.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,8703625.0,10217569.0,6344253.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0


In [100]:
for c in ['income', 'assets', 'debt']:
    df[c] = df[c].replace(to_replace=99999999, value=np.nan)

df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4421.0,4408.0,4437.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,131.0,5403.0,343.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,86.0,11573.0,1246.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3000.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,165.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,959.0,300000.0,30000.0,5000.0,11140.0


* __target variable status:__

In [101]:
df.status.value_counts()

status
ok         3200
default    1254
unk           1
Name: count, dtype: int64

In [102]:
df = df[df.status != 'unk']

### __Dataset preparation__

* Split the dataset into train, validation, and test.
* Handle missing values.
* Use one-hot encoding to encode categorical variables.
* Create the feature matrix X and the target variable y.

___split the data into three parts:___

* Training data (60%)
* Validation data (20%)
* Test data (20%)


In [103]:
from sklearn.model_selection import train_test_split

df_train_full, df_test, = train_test_split(df, test_size=0.2, random_state=11)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=11)

In [104]:
len(df_train), len(df_val), len(df_test)

(2672, 891, 891)

In [105]:
y_train = (df_train.status == 'default').values
y_val = (df_val.status == 'default').values

* __The outcome we want to predict is status (y).__
* __Objective = determine if somebody fails to pay back the loan. The positive class is __default.____

In [106]:
del df_train['status']
del df_val['status']

* __X = feature matrix.__

In [107]:
# Replacing missing values with zero.
df_train = df_train.fillna(0)
df_val = df_val.fillna(0)

* __To use categorical variables, we need to encode them.__
* __In one-hot encoding, each value is encoded as “1” if it’s present (“hot”) or “0” if it’s absent (“cold”).__
* __DictVectorizer needs a list of dictionaries__

In [108]:
dict_train = df_train.to_dict(orient='records')
dict_val = df_val.to_dict(orient='records')

*   __**DictVectorizer**: This is used to convert lists of feature-value mappings (in the form of Python dictionaries) into a format that can be used by machine learning algorithms (a feature matrix).__
*   __**fit_transform**: This method is called on the training data __(dict_train)__. It first learns the vocabulary of all the features present in the data (fitting) and then converts the training data into a feature matrix (transforming).__
*   __**transform**: This method is called on the validation data __(dict_val).__ It uses the vocabulary learned from the training data to transform the validation data into a feature matrix. This ensures that the same features are used for both training and validation.__


In [109]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

X_train = dv.fit_transform(dict_train)
X_val = dv.transform(dict_val)

## __Decision trees__

### __Decision tree classifier__

In [110]:
from sklearn.tree import DecisionTreeClassifier

*   **DecisionTreeClassifier()**: In this project, we are building a model to predict credit risk. We are using a **Decision Tree Classifier**, which is a type of model that makes predictions by learning a set of decision rules from the data. This is like creating a flowchart of questions to ask about a loan applicant to decide whether they are likely to default.

*   **fit(X_train, y_train)**: This is the training step. We are training our **Decision Tree** model on the **X_train** data, which contains the features of the loan applicants in our training set (like their income, age, etc.). The **y_train** variable is the target, which tells the model whether each applicant in the training set has defaulted or not. By calling **fit**, the model learns the patterns in the data that can be used to predict whether a new applicant will default.

In [111]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


* __AUC shows how well a model separates positive examples from negative examples.__
* __it describes the probability that a randomly chosen positive example (“default”) has a higher score than a randomly chosen negative example(“OK”).__
* __This is a relevant metric for the project: we want risky clients to have higher scores than nonrisky ones.__

In [112]:
from sklearn.metrics import roc_auc_score

* __Because we chose AUC as the evaluation metric, we need scores, not hard predictions.__

In [113]:
y_pred = dt.predict_proba(X_train)[:, 1]
roc_auc_score(y_train, y_pred)

1.0

* __Let’s check the score on validation:__

In [114]:
y_pred = dt.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, y_pred)

0.6669128865854544

* __One word: overfitting.__
---
* __The model overfit the training data, meaning it memorized the data instead of learning general patterns.__
* __As a result, it performed poorly on the validation set with unseen data.__
* __The learned rules were too specific to the training data and did not generalize to new customers.__
---
* __A tree with more levels can learn more complex rules. A tree with two levels is less complex than a tree with three levels and, thus, less prone to overfitting.__
---
*   **Each line in the output is a node with a condition.**
*   **We traverse the tree by following the conditions until we reach a final decision.**
*   **A class of True means the prediction is "default"; otherwise, it is "OK".**
*   **The condition records=no > 0.50 is true for customers with no previous records, due to the one-hot encoding scheme used for the records feature.**


In [116]:
dt = DecisionTreeClassifier(max_depth=2)
dt.fit(X_train, y_train)

from sklearn.tree import export_text

tree_text = export_text(dt, feature_names=dv.feature_names_)
print(tree_text)

|--- records=no <= 0.50
|   |--- seniority <= 6.50
|   |   |--- class: True
|   |--- seniority >  6.50
|   |   |--- class: False
|--- records=no >  0.50
|   |--- job=parttime <= 0.50
|   |   |--- class: False
|   |--- job=parttime >  0.50
|   |   |--- class: True



In [117]:
y_pred = dt.predict_proba(X_train)[:, 1]
auc = roc_auc_score(y_train, y_pred)
print('train auc', auc)

y_pred = dt.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, y_pred)
print('validation auc', auc)

train auc 0.7054989859726213
validation auc 0.6685264343319367


### __Decision tree learning algorithm__

*   **The goal of a decision tree is to create pure groups (nodes) where all observations belong to the same class.**
*   **The algorithm finds the best split by trying all possible values and selecting the one that results in the lowest impurity.**
*   **Impurity measures how mixed the classes are in a group. Common impurity measures include misclassification rate, Gini impurity, and entropy.**
*   **The misclassification rate is the percentage of observations in a group that do not belong to the majority class.**
*   **The overall impurity of a split is the weighted average of the impurity of the groups created by the split.**
---
*   **When building a decision tree with multiple features, the algorithm searches through all features and all possible thresholds to find the split that results in the lowest impurity.**
*   **This splitting process is applied recursively to the resulting groups, creating a tree structure.**
*   **To prevent the tree from becoming too complex and overfitting the data, we use stopping criteria to limit its growth.**
*   **Common stopping criteria include the group being pure, reaching the maximum allowed depth (controlled by the max_depth parameter), or the group being too small to split (controlled by the min_samples_leaf parameter).**
*   **By tuning these parameters, we can control the complexity of the decision tree and improve its performance on unseen data.**


### __Parameter tuning for decision tree__

*   **This code performs hyperparameter tuning for the max_depth of a decision tree.**
*   **It iterates through a list of different values for max_depth.**
*   **For each value, it trains a decision tree model and calculates the AUC score on the validation set.**
*   **The goal is to find the max_depth value that results in the best AUC score, which indicates the best model performance.**

In [118]:
for depth in [1, 2, 3, 4, 5, 6, 10, 15, 20, None]:
    dt = DecisionTreeClassifier(max_depth=depth)
    dt.fit(X_train, y_train)
    y_pred = dt.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    print('%4s -> %.3f' % (depth, auc))

   1 -> 0.606
   2 -> 0.669
   3 -> 0.739
   4 -> 0.761
   5 -> 0.767
   6 -> 0.757
  10 -> 0.689
  15 -> 0.674
  20 -> 0.656
None -> 0.654


*   **This code performs hyperparameter tuning for both the max_depth and min_samples_leaf of a decision tree.**
*   **It uses nested loops to iterate through different combinations of values for these two parameters.**
*   **For each combination, it trains a decision tree model and calculates the AUC score on the validation set.**
*   **The goal is to find the combination of max_depth and min_samples_leaf that results in the best AUC score, which indicates the best model performance.**

In [119]:
for m in [4, 5, 6]:
    print('depth: %s' % m)

    for s in [1, 5, 10, 15, 20, 50, 100, 200]:
        dt = DecisionTreeClassifier(max_depth=m, min_samples_leaf=s)
        dt.fit(X_train, y_train)
        y_pred = dt.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        print('%s -> %.3f' % (s, auc))
        
print()

depth: 4
1 -> 0.761
5 -> 0.761
10 -> 0.761
15 -> 0.764
20 -> 0.761
50 -> 0.753
100 -> 0.756
200 -> 0.747
depth: 5
1 -> 0.766
5 -> 0.768
10 -> 0.762
15 -> 0.773
20 -> 0.774
50 -> 0.768
100 -> 0.763
200 -> 0.759
depth: 6
1 -> 0.758
5 -> 0.762
10 -> 0.778
15 -> 0.785
20 -> 0.773
50 -> 0.772
100 -> 0.776
200 -> 0.768



In [120]:
dt = DecisionTreeClassifier(max_depth=6, min_samples_leaf=15)
dt.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,6
,min_samples_split,2
,min_samples_leaf,15
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


## __Random forest__