# Problem statement
  In this notebook we will train a **Random Forest** model on our cleaned Prosper Loan data, applying several hyperparameter tuning options in order to reach the highest accuracy.
  Then we will compare the different parameters combinations in order to find the best results

In [None]:
# Install (or upgrade, -U) PyDrive quietly (-q) into the notebook environment.
!pip install -U -q PyDrive

# NOTE(review): google.colab is imported below, so this cell presumably only
# runs inside Google Colab — confirm before running elsewhere.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials


# Authenticate and create the PyDrive client.
# The user is authenticated first; the application-default credentials are then
# attached to the GoogleAuth object that backs the `drive` client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client used later to download the dataset file.
drive = GoogleDrive(gauth)


## Importing the cleaned CSV file

In [None]:
# Sharing link of the cleaned dataset stored on Google Drive.
link = 'https://drive.google.com/file/d/1HsRMSs2A120li2AZgvAtyqXHLaDAiSgn/view?usp=sharing'

import pandas as pd

# The Drive file id is the second-to-last path segment of the sharing link.
# It is stored as `file_id` (not `id`) so the Python builtin `id()` is not
# shadowed for the rest of the kernel session.
file_id = link.split("/")[-2]

# Download the Drive file into the working directory under a local name...
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('xclara.csv')

# ...and load it into the DataFrame used by every following cell.
df = pd.read_csv('xclara.csv')

## Importing the necessary libraries for the modelling

In [None]:
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn import metrics

## Subsampling the data
Since Random Forest has a lot of hyperparameters to tune, using the whole dataset for the tuning phase would consume a lot of time. A common approach is to use a subset of the data to find the best parameters and then apply them to the full dataset. The subset consists of *10000* rows.

In [None]:
# (rows, columns) of the full cleaned dataset
df.shape

(77584, 77)

In [None]:
# Draw a 10,000-row random subset used only for the hyperparameter search.
# A fixed random_state makes the subset (and everything tuned on it)
# reproducible after a kernel restart; 19 matches the seed used for the splits.
sub_df = df.sample(n=10000, random_state=19)

In [None]:
# (rows, columns) of the sampled subset — same columns as df, fewer rows
sub_df.shape

(10000, 77)

## Splitting the Data into features (X) and labels (y) to be predicted 

In [None]:
# Label to predict.
y = sub_df['target']

# Features: every column of the subset except the label itself and the listed
# columns (presumably date / non-numeric fields the forest cannot consume — TODO confirm).
X = sub_df.drop(
    ['ListingCreationDate', 'DateCreditPulled', 'FirstRecordedCreditLine',
     'IncomeRange', 'LoanOriginationDate', 'target', 'ClosedDate'],
    axis=1,
)

## Splitting the Data into a training set (80% of the data) and a test set (20% of the data)

In [None]:
# Hold out 20% of the subset for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=19,
)

In [None]:
# NOTE: unused draft of a search space, kept commented out for reference only.
# `randint` is not imported anywhere in this notebook (presumably
# scipy.stats.randint was intended — confirm before re-enabling).
# param = {'max_depth': [6,9, None], 
#          'n_estimators':[50, 70, 100, 150], 
#           'max_features': randint(1,6),
#           'criterion' : ['gini', 'entropy'],
#           'bootstrap':[True, False],
#           'min_samples_leaf': randint(1,4)}

## Hyperparameter tuning using RandomizedSearchCV
The Random Forest model has several hyperparameters to tune, so running an exhaustive GridSearchCV over all of them first would be too complex and time-consuming. A randomized search evaluates only a fixed number of sampled combinations, which lets us narrow down the promising range of values before the GridSearchCV step.

In [None]:
# Candidate values from which RandomizedSearchCV samples each hyperparameter
rf_grid = dict(
    n_estimators=[100, 500, 1000],
    max_depth=[None, 3, 5, 10],
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[2, 10, 20],
    bootstrap=[True, False],
)

In [None]:
# Applying RandomizedSearchCV on our data subset.
# random_state is fixed on both the forest and the search so that the sampled
# candidates, the fitted trees and therefore best_params_ are reproducible
# after a kernel restart.
rs_rf = RandomizedSearchCV(
    RandomForestClassifier(random_state=19),
    param_distributions=rf_grid,
    n_iter=25,        # number of parameter combinations sampled from rf_grid
    cv=5,             # 5-fold cross-validation for every candidate
    n_jobs=-1,        # use all available CPU cores
    verbose=2,        # log every individual fit
    random_state=19,  # seeds the sampling of candidates
)
# Fit random hyperparameter search model for RandomForest
rs_rf.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=25,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [None, 3, 5, 10],
                                        'min_samples_leaf': [2, 10, 20],
                                        'min_samples_split': [2, 10, 20],
                                        'n_estimators': [100, 500, 1000]},
                   verbose=2)

In [None]:
# Extracting the best parameters from the RandomizedSearchCV model
# (the sampled combination with the best mean cross-validated score)

rs_rf.best_params_

{'bootstrap': False,
 'max_depth': 10,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 100}

In [None]:
# The test accuracy after using the best parameters
# (score() evaluates the search's refit best estimator on the held-out test split)
rs_rf.score(X_test,y_test)

0.9765

## Hyperparameter tuning using GridSearchCV
I used values that are close to the results provided by the random search done previously to see if we can find even better values

In [None]:
# Create a hyperparameter grid for the GridSearchCV.
# Values are centred on the best parameters found by the randomized search.
# min_samples_split must be an integer >= 2 (or a float fraction), so 1 is not
# a valid candidate: every fit that uses it fails and is scored as NaN,
# wasting a third of the grid. Only valid integer values are listed here.
rf2_grid = {"n_estimators": [100, 200],
            "max_depth": [10, 12],
            "min_samples_split": [2, 4, 6],
            "min_samples_leaf": [1, 2, 4],
            }

In [None]:
# Applying GridSearchCV on our data subset.
# bootstrap=False is carried over from the randomized search result, and the
# forest is seeded so the exhaustive search returns the same best_params_ on
# every run.
gs_rf = GridSearchCV(
    RandomForestClassifier(bootstrap=False, random_state=19),
    param_grid=rf2_grid,
    cv=3,        # 3-fold cross-validation for every grid point
    verbose=2,   # verbose to have outputs of what is happening with GridSearchCV
    n_jobs=-1,   # use all available CPU cores
)
# Fit grid hyperparameter search for RandomForest
gs_rf.fit(X_train,y_train)

In [None]:
# Extracting the best parameters from the GridSearchCV model
# (the grid point with the best mean cross-validated score)
gs_rf.best_params_

{'max_depth': 12,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 100}

In [None]:
# The training accuracy after using the best parameters
# (compare with the test accuracy in the next cell to gauge overfitting)
gs_rf.score(X_train, y_train)

0.997375

In [None]:
# The test accuracy after using the best parameters
# (evaluated on the 20% of the subset that was held out from the search)
gs_rf.score(X_test,y_test)


0.978

## Training the model on the whole data set

In [None]:
# Rebuild the label and feature matrix, this time from the complete dataset.
# Note: this rebinds X and y, replacing the subset versions used for tuning.
y = df['target']
X = df.drop(
    [
        'ListingCreationDate',
        'DateCreditPulled',
        'FirstRecordedCreditLine',
        'IncomeRange',
        'LoanOriginationDate',
        'target',
        'ClosedDate',
    ],
    axis="columns",
)

In [None]:
# 80/20 split of the complete dataset, seeded like the subset split.
# Note: this rebinds the train/test variables used during tuning.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=19,
)

In [None]:
# Final model: the hyperparameters reported by the grid search
# (gs_rf.best_params_) plus bootstrap=False from the randomized search,
# trained on the training split of the complete dataset.
# random_state makes the fitted forest and its score reproducible;
# n_jobs=-1 builds the trees in parallel on all available CPU cores.
model = RandomForestClassifier(
    bootstrap=False,
    max_depth=12,
    min_samples_leaf=2,
    min_samples_split=4,
    n_estimators=100,
    random_state=19,
    n_jobs=-1,
)
model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=False, max_depth=12, min_samples_leaf=2,
                       min_samples_split=4)

In [None]:
# The test accuracy of the final model after using the best parameters
# NOTE(review): the tuning subset was sampled from the same df, so some rows of
# this test split may already have been seen during hyperparameter selection.
model.score(X_test,y_test)

0.9794419024295934