# Best Model Selection and Hyperparameter Tuning

1. Import the dataset and ensure that it loaded properly.

In [170]:
import pandas as pd

In [171]:
loan_df = pd.read_csv('/Users/theranmeadows/Desktop/dsc550datamining/week8/Loan_Train.csv')
loan_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


2. Prepare the data for modeling by performing the following steps:

- Drop the column “Load_ID.”

In [172]:
loan_df = loan_df.drop(['Loan_ID'], axis = 1)
loan_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


- Drop any rows with missing data.

In [173]:
loan_df.isna().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [174]:
loan_df = loan_df.dropna()

In [175]:
loan_df.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

- Convert the categorical features into dummy variables.

In [176]:
cols = list(loan_df.columns)

num_cols = list(loan_df.select_dtypes(include = ['number']).columns)

cat_cols = list(set(cols)- set(num_cols))

In [177]:
loan_dummies = pd.get_dummies(loan_df, columns = cat_cols)
loan_dummies.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Gender_Female,...,Self_Employed_Yes,Loan_Status_N,Loan_Status_Y,Education_Graduate,Education_Not Graduate,Married_No,Married_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
1,4583,1508.0,128.0,360.0,1.0,False,True,False,False,False,...,False,True,False,True,False,False,True,True,False,False
2,3000,0.0,66.0,360.0,1.0,True,False,False,False,False,...,True,False,True,True,False,False,True,False,False,True
3,2583,2358.0,120.0,360.0,1.0,True,False,False,False,False,...,False,False,True,False,True,False,True,False,False,True
4,6000,0.0,141.0,360.0,1.0,True,False,False,False,False,...,False,False,True,True,False,True,False,False,False,True
5,5417,4196.0,267.0,360.0,1.0,False,False,True,False,False,...,True,False,True,True,False,False,True,False,False,True


3. Split the data into a training and test set, where the “Loan_Status” column is the target.

In [178]:
from sklearn.model_selection import train_test_split

In [179]:
X = loan_dummies.drop('Loan_Status_Y', axis = 1)
y = loan_dummies['Loan_Status_Y']

In [180]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

4. Create a pipeline with a min-max scaler and a KNN classifier (see section 15.3 in the Machine Learning with Python Cookbook).

In [181]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV

In [182]:
standardize = StandardScaler()

In [183]:
standard_X_train = standardize.fit_transform(X_train)
standard_X_test = standardize.transform(X_test)

In [184]:
knn = KNeighborsClassifier(n_neighbors = 4, n_jobs = -1)

In [185]:
pipe = Pipeline([("standardize", standardize), ("knn", knn)])

5. Fit a default KNN classifier to the data with this pipeline. Report the model accuracy on the test set. Note: Fitting a pipeline model works just like fitting a regular model.

In [186]:
from sklearn.metrics import accuracy_score

In [187]:
pipe.fit(standard_X_train, y_train)

In [188]:
pipe_predictions = pipe.predict(standard_X_test)

In [189]:
accuracy_score(y_test, pipe_predictions)

0.90625

6. Create a search space for your KNN classifier where your “n_neighbors” parameter varies from 1 to 10. (see section 15.3 in the Machine Learning with Python Cookbook).

In [190]:
search = [{"knn__n_neighbors": [1,2,3,4,5,6,7,8,9,10]}]

7. Fit a grid search with your pipeline, search space, and 5-fold cross-validation to find the best value for the “n_neighbors” parameter.

In [191]:
classify = GridSearchCV(pipe, search, cv = 5, verbose = 0).fit(standard_X_train, y_train)

In [192]:
classify.best_estimator_.get_params()["knn__n_neighbors"]

4

8. Find the accuracy of the grid search best model on the test set. Note: It is possible that this will not be an improvement over the default model, but likely it will be.

In [193]:
classify_predictions = classify.predict(standard_X_test)

In [194]:
accuracy_score(y_test, classify_predictions)

0.90625

9. Now, repeat steps 6 and 7 with the same pipeline, but expand your search space to include logistic regression and random forest models with the hyperparameter values in section 12.3 of the Machine Learning with Python Cookbook.

In [195]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [196]:
search_2 = [{"classify": [LogisticRegression()], 
                  "classify__penalty": ['l2'],
                  "classify__C": np.logspace(0, 4, 10)}, 
                 {"classify": [RandomForestClassifier()], 
                  "classify__n_estimators": [10, 100, 1000], 
                  "classify__max_features": [1, 2, 3]}]

In [197]:
pipe_2 = Pipeline([("classify", RandomForestClassifier())])

In [198]:
pipe_2.fit(standard_X_train, y_train)

In [199]:
classify_2 = GridSearchCV(pipe_2, search_2, cv = 5, verbose = 0).fit(standard_X_train, y_train)

10. What are the best model and hyperparameters found in the grid search? Find the accuracy of this model on the test set.

In [203]:
classify_2.best_estimator_.get_params()['classify']

In [201]:
classify_2_predictions = classify_2.predict(standard_X_test)

In [110]:
accuracy_score(y_test, classify_2_predictions)

1.0

11. Summarize your Results

The hyperparameter tuning of the KNN model really didn't improve the accuracy all that much. Running it through several times always had a slight decrease in accuracy by 3-6% or was exactly the same. 