In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import mahalanobis
from collections import Counter
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [None]:
class KNNMahalanobis:
    """k-nearest-neighbours classifier that ranks neighbours by Mahalanobis distance.

    The distance between a query row ``x`` and a training row ``t`` is
    ``sqrt((x - t) @ S_inv @ (x - t))`` where ``S_inv`` is the (pseudo-)inverse
    of the covariance matrix of the training features. The predicted label is
    the most frequent label among the ``k`` closest training rows.
    """

    def __init__(self, k=3):
        """Store the number of neighbours used for voting.

        Raises:
            ValueError: if ``k`` is not a positive integer.
        """
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError("k must be a positive integer")
        self.k = k

    def fit(self, X_train, y_train):
        """Memorise the training set and derive the inverse covariance.

        Args:
            X_train: 2-D array-like of numeric features, one row per sample.
            y_train: 1-D array-like of labels, one per row of ``X_train``.

        Raises:
            ValueError: if the number of rows and labels differ, or fewer
                than two rows are supplied (covariance is undefined).
        """
        self.X_train = np.asarray(X_train, dtype=float)
        self.y_train = np.asarray(y_train)
        if len(self.X_train) != len(self.y_train):
            raise ValueError("X_train and y_train must have the same number of rows")
        if len(self.X_train) < 2:
            raise ValueError("at least two training rows are required")

        # atleast_2d: np.cov returns a 0-d array when there is a single feature.
        cov_matrix = np.atleast_2d(np.cov(self.X_train, rowvar=False))
        # The pseudo-inverse equals the inverse for a well-conditioned matrix
        # but also copes with a singular covariance (collinear features, e.g.
        # a complete set of one-hot dummy columns), where inv() would raise
        # LinAlgError or return numerically meaningless values.
        self.S_inv = np.linalg.pinv(cov_matrix)

    def _mahalanobis_distances(self, x):
        """Return the distances from ``x`` to every training row as a 1-D array."""
        diff = self.X_train - x
        # Row-wise quadratic form diff_i @ S_inv @ diff_i, computed in one shot.
        squared = np.einsum("ij,ij->i", diff @ self.S_inv, diff)
        # Rounding can push a zero distance slightly negative; clip so that
        # sqrt never yields NaN (NaN would corrupt the neighbour ordering).
        return np.sqrt(np.maximum(squared, 0.0))

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Args:
            X_test: 2-D array-like with the same feature columns as the
                training data.

        Returns:
            A list with one predicted label per row of ``X_test``.
        """
        X_test = np.asarray(X_test, dtype=float)
        predictions = []
        for x in X_test:
            distances = self._mahalanobis_distances(x)
            # Stable sort: equidistant neighbours are taken in training order.
            k_indices = np.argsort(distances, kind="stable")[:self.k]
            k_labels = self.y_train[k_indices]
            # On a tied vote Counter keeps first-seen order, so the label of
            # the closest tied neighbour wins.
            most_common = Counter(k_labels).most_common(1)[0][0]
            predictions.append(most_common)
        return predictions


K=7
Confusion Matrix:
[[2814  230]
 [ 707  249]]
Accuracy: 0.7658, Precision: 0.5198, Recall: 0.2605, F1-score: 0.3470



In [4]:
# Read the train / validation / test splits from the One-Hot/Scaled directory.
df_train, df_valid, df_test = (
    pd.read_csv('One-Hot/Scaled/Train_Scaled_Cont_OH.csv'),
    pd.read_csv('One-Hot/Scaled/Valid_Scaled_Cont_OH.csv'),
    pd.read_csv('One-Hot/Scaled/Test_Scaled_Cont_OH.csv'),
)

In [7]:
# Summarise the test split: row count, column names, non-null counts and dtypes.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 51 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Age                             4000 non-null   float64
 1   AnnualIncome                    4000 non-null   float64
 2   CreditScore                     4000 non-null   float64
 3   Experience                      4000 non-null   float64
 4   LoanAmount                      4000 non-null   float64
 5   LoanDuration                    4000 non-null   float64
 6   NumberOfDependents              4000 non-null   float64
 7   MonthlyDebtPayments             4000 non-null   float64
 8   CreditCardUtilizationRate       4000 non-null   float64
 9   NumberOfOpenCreditLines         4000 non-null   float64
 10  NumberOfCreditInquiries         4000 non-null   float64
 11  DebtToIncomeRatio               4000 non-null   float64
 12  BankruptcyHistory               40

In [8]:
class KNNMahalanobis:
    """k-NN classifier on Mahalanobis distance with a cached distance matrix.

    Workflow: ``fit`` -> ``compute_distance_matrix`` -> ``predict(k)``.
    The test-to-train distances are computed once and reused, so several
    values of ``k`` can be evaluated without recomputing them.
    """

    def __init__(self):
        """Create an unfitted classifier; all state is filled in later."""
        self.X_train = None
        self.y_train = None
        self.S_inv = None
        self.distance_matrix = None

    def fit(self, X_train, y_train):
        """Memorise the training set and derive the inverse covariance.

        Args:
            X_train: 2-D array-like of numeric features, one row per sample.
            y_train: 1-D array-like of labels, one per row of ``X_train``.

        Raises:
            ValueError: if the number of rows and labels differ, or fewer
                than two rows are supplied (covariance is undefined).
        """
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train)
        if len(X_train) != len(y_train):
            raise ValueError("X_train and y_train must have the same number of rows")
        if len(X_train) < 2:
            raise ValueError("at least two training rows are required")

        self.X_train = X_train
        self.y_train = y_train
        # atleast_2d: np.cov returns a 0-d array when there is a single feature.
        cov_matrix = np.atleast_2d(np.cov(self.X_train, rowvar=False))
        # The pseudo-inverse equals the inverse for a well-conditioned matrix
        # but also copes with a singular covariance (collinear features, e.g.
        # a complete set of one-hot dummy columns), where inv() would raise
        # LinAlgError or return numerically meaningless values.
        self.S_inv = np.linalg.pinv(cov_matrix)
        # Any cached distances refer to the previous training set.
        self.distance_matrix = None

    def compute_distance_matrix(self, X_test):
        """Cache the distance from every test row to every training row.

        Fills ``self.distance_matrix`` with shape ``(len(X_test), len(X_train))``.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.S_inv is None:
            raise RuntimeError("fit() must be called before compute_distance_matrix()")

        X_test = np.asarray(X_test, dtype=float)
        distance_matrix = np.empty((len(X_test), len(self.X_train)))

        # One vectorised pass per test row instead of a Python call per
        # (test, train) pair; only a (n_train, n_features) temporary is live.
        for i, test_point in enumerate(X_test):
            diff = self.X_train - test_point
            squared = np.einsum("ij,ij->i", diff @ self.S_inv, diff)
            # Clip rounding noise below zero so sqrt never produces NaN.
            distance_matrix[i] = np.sqrt(np.maximum(squared, 0.0))

        self.distance_matrix = distance_matrix

    def predict(self, k):
        """Predict one label per cached test row by majority vote of ``k`` neighbours.

        Args:
            k: number of nearest training rows that vote; positive integer.

        Returns:
            A list of predicted labels, in the row order of the cached matrix.

        Raises:
            RuntimeError: if ``compute_distance_matrix`` has not been called.
            ValueError: if ``k`` is smaller than 1.
        """
        if self.distance_matrix is None:
            raise RuntimeError("compute_distance_matrix() must be called before predict()")
        if k < 1:
            raise ValueError("k must be a positive integer")

        predictions = []
        for dist_row in self.distance_matrix:
            # Stable sort: equidistant neighbours are taken in training order.
            k_indices = np.argsort(dist_row, kind="stable")[:k]
            k_labels = self.y_train[k_indices]
            # On a tied vote Counter keeps first-seen order, so the label of
            # the closest tied neighbour wins.
            most_common = Counter(k_labels).most_common(1)[0][0]
            predictions.append(most_common)
        return predictions

# Features are every column except the last two; the label is the
# second-to-last column.
# NOTE(review): presumably the final two columns are both targets — confirm
# against the CSV schema.
X_train_orig, y_train_orig = df_train.iloc[:, :-2], df_train.iloc[:, -2]
X_test_orig, y_test_orig = df_test.iloc[:, :-2], df_test.iloc[:, -2]

# Fit once and cache the test-to-train distances so each k below reuses them.
knn = KNNMahalanobis()
knn.fit(X_train_orig, y_train_orig)
knn.compute_distance_matrix(X_test_orig)

k_values = [1, 3, 5, 7, 11]
accuracy_scores = []

# Report the confusion matrix and summary metrics on the test split per k.
for k in k_values:
    y_pred = knn.predict(k)

    cm = confusion_matrix(y_test_orig, y_pred)
    acc = accuracy_score(y_test_orig, y_pred)
    # zero_division=0 makes an undefined metric evaluate to 0.
    precision, recall, f1 = (
        metric(y_test_orig, y_pred, zero_division=0)
        for metric in (precision_score, recall_score, f1_score)
    )
    accuracy_scores.append(acc)

    print(f"K={k}")
    print("Confusion Matrix:")
    print(cm)
    print(f"Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}\n")


K=1
Confusion Matrix:
[[2564  480]
 [ 499  457]]
Accuracy: 0.7552, Precision: 0.4877, Recall: 0.4780, F1-score: 0.4828

K=3
Confusion Matrix:
[[2812  232]
 [ 532  424]]
Accuracy: 0.8090, Precision: 0.6463, Recall: 0.4435, F1-score: 0.5261

K=5
Confusion Matrix:
[[2875  169]
 [ 554  402]]
Accuracy: 0.8193, Precision: 0.7040, Recall: 0.4205, F1-score: 0.5265

K=7
Confusion Matrix:
[[2909  135]
 [ 584  372]]
Accuracy: 0.8203, Precision: 0.7337, Recall: 0.3891, F1-score: 0.5085

K=11
Confusion Matrix:
[[2971   73]
 [ 627  329]]
Accuracy: 0.8250, Precision: 0.8184, Recall: 0.3441, F1-score: 0.4845



In [3]:
# Switch to the robust-scaled variant of the train / test splits.
df_train, df_test = (
    pd.read_csv("./One-Hot/RobustScaled/train_OH_Rscaled.csv"),
    pd.read_csv("./One-Hot/RobustScaled/test_OH_Rscaled.csv"),
)

class KNNMahalanobis:
    """k-NN classifier on Mahalanobis distance with a cached distance matrix.

    Workflow: ``fit`` -> ``compute_distance_matrix`` -> ``predict(k)``.
    The test-to-train distances are computed once and reused, so several
    values of ``k`` can be evaluated without recomputing them.
    """

    def __init__(self):
        """Create an unfitted classifier; all state is filled in later."""
        self.X_train = None
        self.y_train = None
        self.S_inv = None
        self.distance_matrix = None

    def fit(self, X_train, y_train):
        """Memorise the training set and derive the inverse covariance.

        Args:
            X_train: 2-D array-like of numeric features, one row per sample.
            y_train: 1-D array-like of labels, one per row of ``X_train``.

        Raises:
            ValueError: if the number of rows and labels differ, or fewer
                than two rows are supplied (covariance is undefined).
        """
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train)
        if len(X_train) != len(y_train):
            raise ValueError("X_train and y_train must have the same number of rows")
        if len(X_train) < 2:
            raise ValueError("at least two training rows are required")

        self.X_train = X_train
        self.y_train = y_train
        # atleast_2d: np.cov returns a 0-d array when there is a single feature.
        cov_matrix = np.atleast_2d(np.cov(self.X_train, rowvar=False))
        # The pseudo-inverse equals the inverse for a well-conditioned matrix
        # but also copes with a singular covariance (collinear features, e.g.
        # a complete set of one-hot dummy columns), where inv() would raise
        # LinAlgError or return numerically meaningless values.
        self.S_inv = np.linalg.pinv(cov_matrix)
        # Any cached distances refer to the previous training set.
        self.distance_matrix = None

    def compute_distance_matrix(self, X_test):
        """Cache the distance from every test row to every training row.

        Fills ``self.distance_matrix`` with shape ``(len(X_test), len(X_train))``.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.S_inv is None:
            raise RuntimeError("fit() must be called before compute_distance_matrix()")

        X_test = np.asarray(X_test, dtype=float)
        distance_matrix = np.empty((len(X_test), len(self.X_train)))

        # One vectorised pass per test row instead of a Python call per
        # (test, train) pair; only a (n_train, n_features) temporary is live.
        for i, test_point in enumerate(X_test):
            diff = self.X_train - test_point
            squared = np.einsum("ij,ij->i", diff @ self.S_inv, diff)
            # Clip rounding noise below zero so sqrt never produces NaN.
            distance_matrix[i] = np.sqrt(np.maximum(squared, 0.0))

        self.distance_matrix = distance_matrix

    def predict(self, k):
        """Predict one label per cached test row by majority vote of ``k`` neighbours.

        Args:
            k: number of nearest training rows that vote; positive integer.

        Returns:
            A list of predicted labels, in the row order of the cached matrix.

        Raises:
            RuntimeError: if ``compute_distance_matrix`` has not been called.
            ValueError: if ``k`` is smaller than 1.
        """
        if self.distance_matrix is None:
            raise RuntimeError("compute_distance_matrix() must be called before predict()")
        if k < 1:
            raise ValueError("k must be a positive integer")

        predictions = []
        for dist_row in self.distance_matrix:
            # Stable sort: equidistant neighbours are taken in training order.
            k_indices = np.argsort(dist_row, kind="stable")[:k]
            k_labels = self.y_train[k_indices]
            # On a tied vote Counter keeps first-seen order, so the label of
            # the closest tied neighbour wins.
            most_common = Counter(k_labels).most_common(1)[0][0]
            predictions.append(most_common)
        return predictions

# Features are every column except the last two; the label is the
# second-to-last column.
# NOTE(review): presumably the final two columns are both targets — confirm
# against the CSV schema.
X_train_orig, y_train_orig = df_train.iloc[:, :-2], df_train.iloc[:, -2]
X_test_orig, y_test_orig = df_test.iloc[:, :-2], df_test.iloc[:, -2]

# Fit once and cache the test-to-train distances so each k below reuses them.
knn = KNNMahalanobis()
knn.fit(X_train_orig, y_train_orig)
knn.compute_distance_matrix(X_test_orig)

k_values = [1, 3, 5, 7, 11]
accuracy_scores = []

# Report the confusion matrix and summary metrics on the test split per k.
for k in k_values:
    y_pred = knn.predict(k)

    cm = confusion_matrix(y_test_orig, y_pred)
    acc = accuracy_score(y_test_orig, y_pred)
    # zero_division=0 makes an undefined metric evaluate to 0.
    precision, recall, f1 = (
        metric(y_test_orig, y_pred, zero_division=0)
        for metric in (precision_score, recall_score, f1_score)
    )
    accuracy_scores.append(acc)

    print(f"K={k}")
    print("Confusion Matrix:")
    print(cm)
    print(f"Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}\n")


K=1
Confusion Matrix:
[[2550  494]
 [ 495  461]]
Accuracy: 0.7528, Precision: 0.4827, Recall: 0.4822, F1-score: 0.4825

K=3
Confusion Matrix:
[[2790  254]
 [ 540  416]]
Accuracy: 0.8015, Precision: 0.6209, Recall: 0.4351, F1-score: 0.5117

K=5
Confusion Matrix:
[[2870  174]
 [ 555  401]]
Accuracy: 0.8177, Precision: 0.6974, Recall: 0.4195, F1-score: 0.5238

K=7
Confusion Matrix:
[[2902  142]
 [ 599  357]]
Accuracy: 0.8147, Precision: 0.7154, Recall: 0.3734, F1-score: 0.4907

K=11
Confusion Matrix:
[[2957   87]
 [ 624  332]]
Accuracy: 0.8223, Precision: 0.7924, Recall: 0.3473, F1-score: 0.4829

