<a href="https://colab.research.google.com/github/WattNotWhat/ANN-Hands-On-Project/blob/main/Copy_of_Loan_Hyperparameter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# To help with reading and manipulation of data
import numpy as np
import pandas as pd

# To help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# To split the data
from sklearn.model_selection import train_test_split

# To impute missing values
from sklearn.impute import SimpleImputer

# To build a Random forest classifier
from sklearn.ensemble import RandomForestClassifier

# To tune a model
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# To get different performance metrics
import sklearn.metrics as metrics
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    recall_score,
    accuracy_score,
    precision_score,
    f1_score,
)

# To suppress warnings
import warnings

warnings.filterwarnings("ignore")

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
df = pd.read_csv("/content/drive/MyDrive/DSBA/loan.csv")

In [5]:
data = df.copy()

In [6]:
data.head()

Unnamed: 0,customer_id,disbursed_amount,interest,market,employment,time_employed,householder,income,date_issued,target,loan_purpose,number_open_accounts,date_last_payment,number_credit_lines_12
0,0,23201.5,15.484,C,Teacher,<=5 years,RENT,84600.0,2013-06-11,0,Debt consolidation,4.0,2016-01-14,
1,1,7425.0,11.2032,B,Accountant,<=5 years,OWNER,102000.0,2014-05-08,0,Car purchase,13.0,2016-01-25,
2,2,11150.0,8.51,A,Statistician,<=5 years,RENT,69840.0,2013-10-26,0,Debt consolidation,8.0,2014-09-26,
3,3,7600.0,5.8656,A,Other,<=5 years,RENT,100386.0,2015-08-20,0,Debt consolidation,20.0,2016-01-26,
4,4,31960.0,18.7392,E,Bus driver,>5 years,RENT,95040.0,2014-07-22,0,Debt consolidation,14.0,2016-01-11,


In [7]:
data.time_employed.value_counts()

>5 years     5057
<=5 years    4414
Name: time_employed, dtype: int64

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   customer_id             10000 non-null  int64  
 1   disbursed_amount        10000 non-null  float64
 2   interest                10000 non-null  float64
 3   market                  10000 non-null  object 
 4   employment              9389 non-null   object 
 5   time_employed           9471 non-null   object 
 6   householder             10000 non-null  object 
 7   income                  10000 non-null  float64
 8   date_issued             10000 non-null  object 
 9   target                  10000 non-null  int64  
 10  loan_purpose            10000 non-null  object 
 11  number_open_accounts    10000 non-null  float64
 12  date_last_payment       10000 non-null  object 
 13  number_credit_lines_12  238 non-null    float64
dtypes: float64(5), int64(2), object(7)
memo

In [9]:
data.isnull().sum()

customer_id                  0
disbursed_amount             0
interest                     0
market                       0
employment                 611
time_employed              529
householder                  0
income                       0
date_issued                  0
target                       0
loan_purpose                 0
number_open_accounts         0
date_last_payment            0
number_credit_lines_12    9762
dtype: int64

In [10]:
data["employment"].value_counts()

Civil Servant         884
Nurse                 880
Bus driver            874
Other                 863
Teacher               861
Accountant            852
Statistician          852
Secretary             846
Dentist               844
Taxi driver           825
Software developer    808
Name: employment, dtype: int64

In [11]:
x = {">5 years":0, "<=5 years":1}

data["time_employed"] = data["time_employed"].map(x)

In [12]:
x = {"Civil Servant":0, "Nurse":1, "Bus driver":2, "Other":3, "Teacher":4,
                            "Accountant":5, "Statistician":6, "Secretary":7, "Secretary":8, "Dentist":9
                            , "Taxi driver":10, "Software developer":11}

data["employment"] = data["employment"].map(x)

In [13]:
data["target"].value_counts(1)

0    0.9883
1    0.0117
Name: target, dtype: float64

In [14]:
X = data.drop(["target"], axis=1)
y = data["target"]

X = pd.get_dummies(X, drop_first=True)

In [15]:
#Splitting the data into training, validation and test set

#First we split the data into 2 parts, temporary and test set
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=5, stratify=y
)

#then we split the temporary dataset into train and validation
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=5, stratify=y_temp
)

print(X_train.shape, X_val.shape, X_test.shape)

(6400, 2732) (1600, 2732) (2000, 2732)


In [16]:
X_train.head()

Unnamed: 0,customer_id,disbursed_amount,interest,employment,time_employed,income,number_open_accounts,number_credit_lines_12,market_B,market_C,...,date_last_payment_2016-01-18,date_last_payment_2016-01-19,date_last_payment_2016-01-20,date_last_payment_2016-01-21,date_last_payment_2016-01-22,date_last_payment_2016-01-23,date_last_payment_2016-01-24,date_last_payment_2016-01-25,date_last_payment_2016-01-26,date_last_payment_2016-01-27
3404,3404,16320.0,10.99,,,37050.0,11.0,,1,0,...,0,0,0,0,0,0,0,0,0,0
9285,9285,14000.0,16.5158,11.0,0.0,42320.0,10.0,,0,0,...,0,0,0,0,0,0,0,0,0,0
9496,9496,27936.0,12.3234,11.0,0.0,63050.0,7.0,,1,0,...,0,0,0,0,0,0,0,0,0,0
131,131,7760.0,8.4051,2.0,0.0,39200.0,15.0,1.0,1,0,...,0,0,0,0,0,0,0,0,0,0
719,719,18600.0,6.9696,5.0,0.0,73500.0,10.0,,0,0,...,1,0,0,0,0,0,0,0,0,0


In [17]:
#imputing missing values 
imp_median = SimpleImputer(missing_values=np.nan, strategy="median")

#fit the imputer on train data and transform the train data
X_train["employment"] = imp_median.fit_transform(X_train[["income"]])
X_train["time_employed"] = imp_median.fit_transform(X_train[["employment"]])

#transform the validation and test data using the imputer fit on train data
X_val["employment"] = imp_median.transform(X_val[["employment"]])
X_test["employment"] = imp_median.transform(X_test[["employment"]])

X_val["time_employed"] = imp_median.transform(X_val[["time_employed"]])
X_test["time_employed"] = imp_median.transform(X_test[["time_employed"]])

In [18]:
X_train.isnull().sum()

customer_id                     0
disbursed_amount                0
interest                        0
employment                      0
time_employed                   0
                               ..
date_last_payment_2016-01-23    0
date_last_payment_2016-01-24    0
date_last_payment_2016-01-25    0
date_last_payment_2016-01-26    0
date_last_payment_2016-01-27    0
Length: 2732, dtype: int64

In [19]:
print("Target value ratio in y")
print(y.value_counts(1))
print("*" * 80)
print("Target value ratio in y_train")
print(y_train.value_counts(1))
print("*" * 80)
print("Target value ratio in y_val")
print(y_val.value_counts(1))
print("*" * 80)
print("Target value ratio in y_test")
print(y_test.value_counts(1))
print("*" * 80)

Target value ratio in y
0    0.9883
1    0.0117
Name: target, dtype: float64
********************************************************************************
Target value ratio in y_train
0    0.988281
1    0.011719
Name: target, dtype: float64
********************************************************************************
Target value ratio in y_val
0    0.988125
1    0.011875
Name: target, dtype: float64
********************************************************************************
Target value ratio in y_test
0    0.9885
1    0.0115
Name: target, dtype: float64
********************************************************************************


## Model evaluation criterion


**What does a bank want?**
* A bank wants to minimize the loss - it can face 2 types of losses here: 
   * Whenever a bank lends money to a customer, they don't return it.
   * A bank doesn't lend money to a customer thinking a customer will default but in reality, the customer won't - opportunity loss.

**Which loss is greater ?**
* Lending to a customer who wouldn't be able to pay back.

**Since we want to reduce loan defaults we should use Recall as a metric of model evaluation instead of accuracy.**

* Recall - It gives the ratio of True positives to Actual positives, so high Recall implies low false negatives, i.e. low chances of predicting a bad customer as a good customer.


# Hyperparameter Tuning

In [20]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6400 entries, 3404 to 3364
Columns: 2732 entries, customer_id to date_last_payment_2016-01-27
dtypes: float64(7), int64(1), uint8(2724)
memory usage: 17.1 MB


In [21]:
X_train = X_train.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)

In [22]:
y_train = y_train.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)

In [23]:
y_val = y_val.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)
y_test = y_test.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)

In [24]:
X_val = X_val.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)

In [25]:
# model without hyperparameter tuning
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)

RandomForestClassifier(random_state=1)

In [26]:
# Checking recall score on train and validation set
print("Recall on train and validation set")
print(recall_score(y_train, rf.predict(X_train)))
print(recall_score(y_val, rf.predict(X_val)))
print("")

# Checking Precision score on train and validation set
print("Precision on train and validation set")
print(precision_score(y_train, rf.predict(X_train)))
print(precision_score(y_val, rf.predict(X_val)))

print("")

# Checking Accuracy score on train and validation set
print("Accuracy on train and validation set")
print(accuracy_score(y_train, rf.predict(X_train)))
print(accuracy_score(y_val, rf.predict(X_val)))

Recall on train and validation set
0.9866666666666667
0.0

Precision on train and validation set
1.0
0.0

Accuracy on train and validation set
0.99984375
0.988125


## Grid Search CV
* Hyperparameter tuning is also tricky in the sense that there is no direct way to calculate how a change in the hyperparameter value will reduce the loss of your model, so we usually resort to experimentation. i.e we'll use Grid search
* Grid search is a tuning technique that attempts to compute the optimum values of hyperparameters. 
* It is an exhaustive search that is performed on the specific parameter values of a model.
* The parameters of the estimator/model used to apply these methods are optimized by cross-validated grid-search over a parameter grid.

In [27]:
RandomForestClassifier().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [28]:
print(np.arange(0.2, 0.7, 0.1))

print(np.arange(5,10))

[0.2 0.3 0.4 0.5 0.6]
[5 6 7 8 9]


### Let's tune Random forest using Grid Search

In [None]:
%%time

# Choose the type of classifier. 
rf1 = RandomForestClassifier(random_state=1)

# Grid of parameters to choose from
parameters = {"n_estimators": [150,200,250],
    "min_samples_leaf": np.arange(5, 10),
    "max_features": np.arange(0.2, 0.7, 0.1),
    "max_samples": np.arange(0.3, 0.7, 0.1),
    "class_weight" : ['balanced', 'balanced_subsample'],
    "max_depth":np.arange(3,4,5),
    "min_impurity_decrease":[0.001, 0.002, 0.003]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = metrics.make_scorer(metrics.recall_score)

# Run the grid search
grid_obj = GridSearchCV(rf1, parameters, scoring=acc_scorer, cv=2, n_jobs= -1, verbose = 2)
# verbose = 2 tells about the number of fits, which can give an idea of how long will the model take in tuning
# n_jobs = -1 so that all CPU cores can be run parallelly to optimize the Search

grid_obj = grid_obj.fit(X_train, y_train)

# Print the best combination of parameters
grid_obj.best_params_


Fitting 2 folds for each of 1800 candidates, totalling 3600 fits


In [30]:
grid_obj.best_score_

AttributeError: ignored

In [None]:
# Set the clf to the best combination of parameters
rf1_tuned = RandomForestClassifier(
    class_weight="balanced",
    max_features=0.2,
    max_samples=0.6000000000000001,
    min_samples_leaf=5,
    n_estimators=150,
    max_depth=3,
    random_state=1,
    min_impurity_decrease=0.001,
)

# Fit the best algorithm to the data.
rf1_tuned.fit(X_train, y_train)

In [None]:
# Checking recall score on train and validation set
print("Recall on train and validation set")
print(recall_score(y_train, rf1_tuned.predict(X_train)))
print(recall_score(y_val, rf1_tuned.predict(X_val)))
print("")

# Checking precision score on train and validation set
print("Precision on train and validation set")
print(precision_score(y_train, rf1_tuned.predict(X_train)))
print(precision_score(y_val, rf1_tuned.predict(X_val)))
print("")

# Checking accuracy score on train and validation set
print("Accuracy on train and validation set")
print(accuracy_score(y_train, rf1_tuned.predict(X_train)))
print(accuracy_score(y_val, rf1_tuned.predict(X_val)))

## Randomized Search CV
* Random search is a tuning technique that attempts to compute the optimum values of hyperparameters randomly unlike grid search

In [None]:
%%time

# Choose the type of classifier. 
rf2 = RandomForestClassifier(random_state=1)

# Grid of parameters to choose from
parameters = {"n_estimators": [150,200,250],
    "min_samples_leaf": np.arange(5, 10),
    "max_features": np.arange(0.2, 0.7, 0.1), 
    "max_samples": np.arange(0.3, 0.7, 0.1),
    "max_depth":np.arange(3,4,5),
    "class_weight" : ['balanced', 'balanced_subsample'],
    "min_impurity_decrease":[0.001, 0.002, 0.003]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = metrics.make_scorer(metrics.recall_score)

# Run the random search
grid_obj = RandomizedSearchCV(rf2, parameters,n_iter=30, scoring=acc_scorer,cv=5, random_state = 1, n_jobs = -1, verbose = 2)
# using n_iter = 30, so randomized search will try 30 different combinations of hyperparameters
# by default, n_iter = 10

grid_obj = grid_obj.fit(X_train, y_train)

# Print the best combination of parameters
grid_obj.best_params_


In [None]:
grid_obj.best_score_

In [None]:
# Set the clf to the best combination of parameters
rf2_tuned = RandomForestClassifier(
    class_weight="balanced",
    max_features=0.2,
    max_samples=0.5,
    min_samples_leaf=5,
    n_estimators=150,
    random_state=1,
    max_depth=3,
    min_impurity_decrease=0.003,
)

# Fit the best algorithm to the data.
rf2_tuned.fit(X_train, y_train)

In [None]:
# Checking recall score on train and validation set
print("Recall on train and validation set")
print(recall_score(y_train, rf2_tuned.predict(X_train)))
print(recall_score(y_val, rf2_tuned.predict(X_val)))
print("")
print("Precision on train and validation set")
# Checking precision score on train and validation set
print(precision_score(y_train, rf2_tuned.predict(X_train)))
print(precision_score(y_val, rf2_tuned.predict(X_val)))
print("")
print("Accuracy on train and validation set")
# Checking accuracy score on train and validation set
print(accuracy_score(y_train, rf2_tuned.predict(X_train)))
print(accuracy_score(y_val, rf2_tuned.predict(X_val)))

In [None]:
model = rf1_tuned

In [None]:
# Checking recall score on test set
print("Recall on test set")
print(recall_score(y_test, model.predict(X_test)))
print("")

# Checking precision score on test set
print("Precision on test set")
print(precision_score(y_test, model.predict(X_test)))
print("")

# Checking accuracy score on test set
print("Accuracy on test set")
print(accuracy_score(y_test, model.predict(X_test)))