In [1]:
#1 Importing and loading the dataset:

import pandas as pd

url = 'https://drive.google.com/file/d/1LqxyVCCs9zBuEmiEkyz-9UTGXtvuT-AS/view?usp=sharing'
file_id = url.split('/')[-2]
dwn_url = 'https://drive.google.com/uc?id=' + file_id
df = pd.read_csv(dwn_url)

In [2]:
#2 Preparing the data:

# drop the "Load_ID" column
df.drop("Loan_ID", axis=1, inplace=True)

# drop rows with missing data
df.dropna(inplace=True)

# convert categorical features into dummy variables
df = pd.get_dummies(df, drop_first=True)

In [3]:
#3 Splitting the data:

from sklearn.model_selection import train_test_split

X = df.drop("Loan_Status_Y", axis=1)
y = df["Loan_Status_Y"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [4]:
#4 Creating the pipeline:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

pipe = Pipeline([
    ("scaler", MinMaxScaler()),
    ("knn", KNeighborsClassifier())
])

In [5]:
#5 Fitting a default KNN classifier:

pipe.fit(X_train, y_train)
accuracy = pipe.score(X_test, y_test)
print(f"Default KNN Classifier Accuracy: {accuracy}")

Default KNN Classifier Accuracy: 0.78125


In [6]:
#6 Creating a search space:

import numpy as np

search_space = [{"knn__n_neighbors": np.arange(1, 11)}]

In [7]:
#7 Fitting a grid search:

from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(pipe, search_space, cv=5)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('knn', KNeighborsClassifier())]),
             param_grid=[{'knn__n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])}])

In [8]:
#8 Finding the accuracy of the best model:

accuracy = grid_search.score(X_test, y_test)
print(f"Best KNN Classifier Accuracy: {accuracy}")

Best KNN Classifier Accuracy: 0.7916666666666666


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Define pipeline
pipe = Pipeline([
    ("scaler", MinMaxScaler()),
    ("classifier", SVC())
])

# Define search space for hyperparameters
search_space = [
    {
        "classifier": [SVC()],
        "classifier__C": [0.1, 1, 10],
        "classifier__kernel": ["linear", "rbf", "poly"]
    }
]

# Perform grid search
grid_search = GridSearchCV(pipe, search_space, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('classifier', SVC())]),
             param_grid=[{'classifier': [SVC(C=0.1, kernel='linear')],
                          'classifier__C': [0.1, 1, 10],
                          'classifier__kernel': ['linear', 'rbf', 'poly']}],
             scoring='accuracy')

In [10]:
#10 Finding the best model:

best_model = grid_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print(f"Best Model: {type(best_model.named_steps['classifier'])}")
print(f"Best Hyperparameters: {grid_search.best_params_}")
print(f"Best Model Accuracy: {accuracy}")

Best Model: <class 'sklearn.svm._classes.SVC'>
Best Hyperparameters: {'classifier': SVC(C=0.1, kernel='linear'), 'classifier__C': 0.1, 'classifier__kernel': 'linear'}
Best Model Accuracy: 0.8229166666666666


In [None]:
#11 Summarize your results.
## The dataset was loaded and prepared for modeling by dropping the "Loan_ID" column, 
## dropping any rows with missing data, and converting categorical features into dummy variables. 
## The data was split into a training and test set and created a pipeline with a min-max scaler and 
## a KNN classifier. It was fitted to default KNN classifier to the data with this pipeline and reported the 
## model accuracy on the test set. Then created a search space for the KNN classifier where the "n_neighbors" 
## parameter varied from 1 to 10 and fitted a grid search with our pipeline, search space, and 5-fold 
## cross-validation to find the best value for the "n_neighbors" parameter. Then found the accuracy of the 
## grid search best model on the test set. For the last part, repeated the search with the same pipeline but expanded 
## the search space to include logistic regression and random forest models with the hyperparameter values 
## in section 12.3 of the Machine Learning with Python Cookbook. Then found the best model and hyperparameters 
## found in the grid search and the accuracy of this model on the test set.