### Exercise 9.2

In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

Import the dataset and ensure that it loaded properly.

In [87]:
# Loading data into dataframe.
loan_df = pd.read_csv(r'Data/Loan_Train.csv')

# Checking few rows of dataframe.
loan_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


Prepare the data for modeling by performing the following steps:
Drop the column “Load_ID.”

In [88]:
# Removing 'Loan_ID' column.
loan_df = loan_df.drop(['Loan_ID'], axis=1)

Drop any rows with missing data.

In [89]:
# Removing missing data.
loan_df = loan_df.dropna()

Convert the categorical features into dummy variables.

In [90]:
# Getting categorical columns.
cat_cols = loan_df.select_dtypes(include=['object']).columns

# Using the get_dummies function to create the dummy variables.
loan_df_dummies = pd.get_dummies(loan_df, columns=cat_cols)

# View the first 5 rows of the dataframe
loan_df_dummies.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,...,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_N,Loan_Status_Y
1,4583,1508.0,128.0,360.0,1.0,0,1,0,1,0,...,0,1,0,1,0,1,0,0,1,0
2,3000,0.0,66.0,360.0,1.0,0,1,0,1,1,...,0,1,0,0,1,0,0,1,0,1
3,2583,2358.0,120.0,360.0,1.0,0,1,0,1,1,...,0,0,1,1,0,0,0,1,0,1
4,6000,0.0,141.0,360.0,1.0,0,1,1,0,1,...,0,1,0,1,0,0,0,1,0,1
5,5417,4196.0,267.0,360.0,1.0,0,1,0,1,0,...,0,1,0,0,1,0,0,1,0,1


Split the data into a training and test set, where the “Loan_Status” column is the target.

In [91]:
# Creating x & y arrays.
x = loan_df_dummies.drop(['Loan_Status_N','Loan_Status_Y'], axis=1)
y = loan_df_dummies['Loan_Status_Y']

# Creating training & test datasets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

Create a pipeline with a min-max scaler and a KNN classifier (see section 15.3 in the Machine Learning with Python Cookbook).

In [92]:
# Creating standardizer.
standardizer = StandardScaler()

# Standardizing features.
x_train_s = standardizer.fit_transform(x_train)
x_test_s = standardizer.transform(x_test)

# Creating a KNN classifier.
knn = KNeighborsClassifier(n_neighbors=4, n_jobs=-1)

# Creating a pipeline.
pipe = Pipeline([("standardizer", standardizer), ("knn", knn)])

Fit a default KNN classifier to the data with this pipeline. Report the model accuracy on the test set. Note: Fitting a pipeline model works just like fitting a regular model.

In [93]:
# Fit the pipeline.
pipe.fit(x_train_s, y_train)

# Building predictions.
pipe_pred = pipe.predict(x_test_s)

# Calculating accuracy.
accuracy_score(y_test, pipe_pred)

0.6875

Accuracy score is 68%.

Create a search space for your KNN classifier where your “n_neighbors” parameter varies from 1 to 10. (see section 15.3 in the Machine Learning with Python Cookbook).

In [94]:
# Creating space of candidate values.
search_space = [{"knn__n_neighbors": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}]

Fit a grid search with your pipeline, search space, and 5-fold cross-validation to find the best value for the “n_neighbors” parameter.

In [95]:
# Creating grid search.
classifier = GridSearchCV(pipe, search_space, cv=5, verbose=0).fit(x_train_s, y_train)

In [96]:
# Finding the best value for 'n_neighbors' parameter.
classifier.best_estimator_.get_params()["knn__n_neighbors"]

9

Find the accuracy of the grid search best model on the test set. Note: It is possible that this will not be an improvement over the default model, but likely it will be.

In [97]:
# Building the predictions.
classifier_pred = classifier.predict(x_test_s)

# Calculating the accuracy of the model.
accuracy_score(y_test, classifier_pred)

0.6875

Accuracy score is 68%

Now, repeat steps 6 and 7 with the same pipeline, but expand your search space to include logistic regression and random forest models with the hyperparameter values in section 12.3 of the Machine Learning with Python Cookbook.

In [98]:
# Building new search space.
search_space2 = [{"classifier": [LogisticRegression()], 
                  "classifier__penalty": ['l2'],
                  "classifier__C": np.logspace(0, 4, 10)}, 
                 {"classifier": [RandomForestClassifier()], 
                  "classifier__n_estimators": [10, 100, 1000], 
                  "classifier__max_features": [1, 2, 3]}]

# Building new pipeline.
pipe2 = Pipeline([("classifier", RandomForestClassifier())])

# Fitting the pipeline.
pipe2.fit(x_train_s, y_train)

Pipeline(steps=[('classifier', RandomForestClassifier())])

In [99]:
# Creating grid search.
classifier2 = GridSearchCV(pipe2, search_space2, cv=5, verbose=0).fit(x_train_s, y_train)

What are the best model and hyperparameters found in the grid search? Find the accuracy of this model on the test set.

In [100]:
# Viewing the best model.
classifier2.best_estimator_.get_params()["classifier"]

LogisticRegression()

In [102]:
# Building predictions.
classifier2_pred = classifier2.predict(x_test_s)

# Calculating the accuracy.
accuracy_score(y_test, classifier2_pred)

0.75

Accuracy score is 75%.

Summarize your results

The accuracy of the model did not change much by the hyperparameter tuning the KNN model however it increased significantly (to 75%) by expanding the grid search into other models.