In [17]:
#importing important libraries
import pandas as pd
import numpy as np
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix, f1_score

In [2]:
#import warnings to ignore unnecessary warmings
import warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")


In [3]:
#Loading the dataset
data = pd.read_excel("sample_data.xlsx")
data.head()

Unnamed: 0,Income,LoanAmount,CreditScore,InterestRate,Target
0,85994,50587,520,15.23,0
1,50432,124440,458,4.81,0
2,84208,129188,451,21.17,1
3,31713,44799,743,7.07,0
4,20437,9139,633,6.51,0


In [5]:
#Seperating features and target variable from the dataset
X = data.drop(columns=["Target"])
y = data["Target"]

In [6]:
#Seperate numeric and categorical data
numeric_features = X.select_dtypes(include=['int64','float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [8]:
#Create transformers for data preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='mean'))])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer,numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [9]:
#Create a pipeline with KNN model
base_pipeline = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('classifier',KNeighborsClassifier())
])

In [11]:
#Perform Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2,stratify=y, random_state = 42)

In [12]:
#Tran and evaluate the base model
base_pipeline.fit(X_train,y_train)

In [14]:
#Predictions and eval matrics
print("EVAL METRICS FOR BASE MODEL")
y_train_pred = base_pipeline.predict(X_train)
y_test_pred = base_pipeline.predict(X_test)

def evaluate_model(y_true, y_pred, data_type="Train"):
    print(f"Evaluation metrics for {data_type}data")
    print(f"Accuracy :{accuracy_score(y_true,y_pred)}")
    print(f"F1_score :{f1_score(y_true,y_pred)}")
    print(f"Classification Report")
    print(classification_report(y_true, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true,y_pred))
    print("\n")

evaluate_model(y_train, y_train_pred, "Train")
evaluate_model(y_test, y_test_pred, "Test")
    

EVAL METRICS FOR BASE MODEL
Evaluation metrics for Traindata
Accuracy :0.8975
F1_score :0.046511627906976744
Classification Report
              precision    recall  f1-score   support

           0       0.90      1.00      0.95       359
           1       0.50      0.02      0.05        41

    accuracy                           0.90       400
   macro avg       0.70      0.51      0.50       400
weighted avg       0.86      0.90      0.85       400

Confusion Matrix:
[[358   1]
 [ 40   1]]


Evaluation metrics for Testdata
Accuracy :0.9
F1_score :0.0
Classification Report
              precision    recall  f1-score   support

           0       0.90      1.00      0.95        90
           1       0.00      0.00      0.00        10

    accuracy                           0.90       100
   macro avg       0.45      0.50      0.47       100
weighted avg       0.81      0.90      0.85       100

Confusion Matrix:
[[90  0]
 [10  0]]




In [18]:
#Hyperparameter tuning 
param_grid = {
    'classifier__n_neighbors':[3,5,7,9],  #Different number of neighbors
    'classifier__weights':['uniform','distance'], #weight options
    'classifier__metric':['euclidean','manhattan'] #Distance Metrics
}

grid_search = GridSearchCV(base_pipeline, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f"Best Hyperparameters: {grid_search.best_params_}")

Best Hyperparameters: {'classifier__metric': 'euclidean', 'classifier__n_neighbors': 5, 'classifier__weights': 'uniform'}


In [19]:
#Build and evaluate the best model
print("")
print("-----Eval Metrics For Best Model-----")
print("")

best_pipeline= grid_search.best_estimator_

y_train_best_pred = best_pipeline.predict(X_train)
y_test_best_pred = best_pipeline.predict(X_test)

evaluate_model(y_train_best_pred, y_train, "Train(Best Model)")
evaluate_model(y_test_best_pred, y_test, "Test(Best Model)")


-----Eval Metrics For Best Model-----

Evaluation metrics for Train(Best Model)data
Accuracy :0.8975
F1_score :0.046511627906976744
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.90      0.95       398
           1       0.02      0.50      0.05         2

    accuracy                           0.90       400
   macro avg       0.51      0.70      0.50       400
weighted avg       0.99      0.90      0.94       400

Confusion Matrix:
[[358  40]
 [  1   1]]


Evaluation metrics for Test(Best Model)data
Accuracy :0.9
F1_score :0.0
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.90      0.95       100
           1       0.00      0.00      0.00         0

    accuracy                           0.90       100
   macro avg       0.50      0.45      0.47       100
weighted avg       1.00      0.90      0.95       100

Confusion Matrix:
[[90 10]
 [ 0  0]]




In [21]:
#Compare metrics
metrics_comparison = pd.DataFrame({
    'Metric': ['Accuracy', 'F1 Score'],
    'BaseModelTrain':[accuracy_score(y_train,y_train_pred),f1_score(y_train,y_train_pred)],
    'BaseModelTest':[accuracy_score(y_test,y_test_pred), f1_score(y_test,y_test_pred)],
    'BestModelTrain':[accuracy_score(y_train,y_train_best_pred),f1_score(y_train,y_train_best_pred)]
})
print(metrics_comparison)


     Metric  BaseModelTrain  BaseModelTest  BestModelTrain
0  Accuracy        0.897500            0.9        0.897500
1  F1 Score        0.046512            0.0        0.046512


In [22]:
# Train the final model
final_model = best_pipeline.fit(X,y)

In [23]:
#Save the final model
joblib.dump(final_model,'final_KNN_classifier_model.pkl')

print("")
print("Final model saved as 'final_KNN_classifier_model.pkl")


Final model saved as 'final_KNN_classifier_model.pkl
