In [1]:
import os
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from joblib import dump

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Datasets Paths
X_train_resampled_path = os.path.join('..','datasets','prepared_data','X_train_resampled.csv') # SMOTE applied  
X_train_reduced_path = os.path.join('..','datasets','prepared_data','X_train_reduced.csv')     # PCA applied (It was resampled before)
X_test_path = os.path.join('..','datasets','prepared_data','X_test_transformed.csv') 
X_test_reduced_path = os.path.join('..','datasets','prepared_data','X_test_reduced.csv') 
y_train_path = os.path.join('..','datasets','prepared_data','y_train_resampled.csv') # SMOTE applied
y_test_path = os.path.join('..','datasets','prepared_data','y_test.csv')

# Load Datasets
X_train_resampled = pd.read_csv(X_train_resampled_path) # PCA Reduced Dataset
X_train_reduced = pd.read_csv(X_train_reduced_path)     # PCA applied (It was resampled before)
X_test_reduced = pd.read_csv(X_test_reduced_path)               # PCA
X_test = pd.read_csv(X_test_path)             
y_train = pd.read_csv(y_train_path)
y_test = pd.read_csv(y_test_path)

# Ensure y are 1D arrays
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

In [3]:
# Define the base model
knn = KNeighborsClassifier()

# Define the hyperparameters to search
param_grid = {
    'n_neighbors': [3, 5, 7, 9],                        # Number of neighbors to consider
    'weights': ['uniform', 'distance'],                 # Weighting of neighbors (equal or distance-based)
    'metric': ['euclidean', 'manhattan', 'minkowski'],  # Distance metrics to use
    'p': [1, 2]                                         # Parameter for Minkowski metric (1 = Manhattan, 2 = Euclidean)
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='recall',  # Change to 'f1' or 'accuracy' if better suited for the problem
    cv=5,               # Number of cross-validation folds
    n_jobs=-1,          # Use all available CPU cores
    verbose=1           # Display progress details
)

# Run GridSearchCV
grid_search.fit(X_train_resampled, y_train)

# Display the best hyperparameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Get the best model
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters: {'metric': 'euclidean', 'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
Best score: 0.8782251857496666


In [4]:
# Predict
y_pred = best_model.predict(X_test)

In [5]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.73      0.79      1552
           1       0.48      0.69      0.57       561

    accuracy                           0.72      2113
   macro avg       0.68      0.71      0.68      2113
weighted avg       0.77      0.72      0.74      2113



In [6]:
# Save trained model
dump(best_model, os.path.join('..','trained_models','knn.joblib'))

['..\\trained_models\\knn.joblib']

### With reduced Dimesionality

In [7]:
# Define the base model
knn = KNeighborsClassifier()

# Define the hyperparameters to search
param_grid = {
    'n_neighbors': [3, 5, 7, 9],                        # Number of neighbors to consider
    'weights': ['uniform', 'distance'],                 # Weighting of neighbors (equal or distance-based)
    'metric': ['euclidean', 'manhattan', 'minkowski'],  # Distance metrics to use
    'p': [1, 2]                                         # Parameter for Minkowski metric (1 = Manhattan, 2 = Euclidean)
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='recall',  # Change to 'f1' or 'accuracy' if better suited for the problem
    cv=5,               # Number of cross-validation folds
    n_jobs=-1,          # Use all available CPU cores
    verbose=1           # Display progress details
)

# Run GridSearchCV
grid_search.fit(X_train_reduced, y_train)

# Display the best hyperparameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Get the best model
best_model_w_reduced = grid_search.best_estimator_

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters: {'metric': 'euclidean', 'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
Best score: 0.8669022670984949


In [8]:
# Predict# Predict
y_pred = best_model_w_reduced.predict(X_test_reduced)

In [9]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.74      0.80      1552
           1       0.49      0.68      0.57       561

    accuracy                           0.72      2113
   macro avg       0.68      0.71      0.68      2113
weighted avg       0.76      0.72      0.74      2113

