In [57]:
import numpy as np
import pandas as pd

In [58]:
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
       
        self.X_train = np.array(X)
        self.y_train = np.array(y)

    def predict(self, X_test):
        
        probs = []
        for x in X_test:
            distances = self.compute_distance(self.X_train, x)
            k_idx = np.argsort(distances)[:self.k]
            neighbor_labels = self.y_train[k_idx]
            prob = sum(neighbor_labels) / self.k  
            probs.append(prob)
        
        return probs

    def compute_distance(self, X1, X2):
    
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError(f"Unknown distance metric")

In [59]:
def preprocess_data(train_path, test_path):
    # Load datasets
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Feature lists
    cat_feats = ['Geography', 'Gender']
    num_feats = ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']
    ord_feats = ['NumOfProducts', 'HasCrCard', 'IsActiveMember']

    # Standardize numerical features using training data statistics
    num_train = train_df[num_feats].values
    num_mean = num_train.mean(axis=0)
    num_std = num_train.std(axis=0)
    train_num = (num_train - num_mean) / num_std
    test_num = (test_df[num_feats].values - num_mean) / num_std

    # Standardize ordinal features using training data statistics
    ord_train = train_df[ord_feats].values
    ord_mean = ord_train.mean(axis=0)
    ord_std = ord_train.std(axis=0)
    train_ord = (ord_train - ord_mean) / ord_std
    test_ord = (test_df[ord_feats].values - ord_mean) / ord_std

    # One-hot encode categorical features using training data mappings
    train_cat_arrays = []
    test_cat_arrays = []
    for feat in cat_feats:
        vals = train_df[feat].unique()
        mapping = {val: idx for idx, val in enumerate(vals)}
        # Training data encoding
        train_encoded = train_df[feat].map(mapping).values
        train_one_hot = np.zeros((train_encoded.size, len(vals)))
        train_one_hot[np.arange(train_encoded.size), train_encoded] = 1
        train_cat_arrays.append(train_one_hot)
        # Test data encoding
        test_encoded = test_df[feat].map(mapping).fillna(-1).astype(int)
        test_one_hot = np.zeros((test_encoded.size, len(vals)))
        valid_idx = test_encoded >= 0
        test_one_hot[np.arange(test_encoded.size)[valid_idx], test_encoded[valid_idx]] = 1
        test_cat_arrays.append(test_one_hot)

    train_cat = np.hstack(train_cat_arrays)
    test_cat = np.hstack(test_cat_arrays)

    # Combine features
    train_X = np.hstack([train_num, train_ord, train_cat])
    test_X = np.hstack([test_num, test_ord, test_cat])

    # Extract labels
    train_y = train_df['Exited'].values

    return train_X, train_y, test_X

In [60]:
def cross_validate(X, y, knn, n_splits=5):

    idx = np.random.permutation(len(X))
    fold_size = len(X) // n_splits
    auc_list = []

    for i in range(n_splits):
        test_idx = idx[i * fold_size:(i + 1) * fold_size]
        train_idx = np.setdiff1d(idx, test_idx)
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        knn.fit(X_train, y_train)

        y_scores = []
        for x in X_test:
            dists = knn.compute_distance(knn.X_train, x)
            k_idx = np.argsort(dists)[:knn.k]
            k_labels = knn.y_train[k_idx]
            prob = np.sum(k_labels) / knn.k
            y_scores.append(prob)

        # Compute ROC AUC score
        y_true = np.array(y_test)
        y_score = np.array(y_scores)
        sort_idx = np.argsort(y_score)[::-1]
        y_true = y_true[sort_idx]
        y_score = y_score[sort_idx]

        diffs = np.where(np.diff(y_score))[0]
        last_idx = np.array([len(y_true) - 1])
        thresh = np.hstack((diffs, last_idx))

        tps = np.cumsum(y_true)[thresh]
        fps = (1 + thresh) - tps

        tpr = tps / tps[-1]
        fpr = fps / fps[-1]

        zero = np.array([0])
        tpr_diff = np.hstack((np.diff(tpr), zero))
        fpr_diff = np.hstack((np.diff(fpr), zero))
        auc = np.dot(tpr, fpr_diff) + np.dot(tpr_diff, fpr_diff) / 2
        auc_list.append(auc)

    return np.mean(auc_list)

 

In [61]:
X, y, X_test = preprocess_data('train.csv','test.csv')

best_score = 0
best_k = 0
best_metric = ''

for k in range(45,70,5):  
    for metric in ['euclidean','manhattan']: 
        print(f"\nEvaluating: k={k}, metric={metric}")

        knn = KNN(k=k, distance_metric=metric)  
        cv_score = cross_validate(X, y, knn, n_splits=5)  

        print(f"CV Score for k={k}, metric={metric}: {cv_score:.4f}")

        if cv_score > best_score:
            best_score = cv_score
            best_k = k
            best_metric = metric

print(f"\nBest KNN Parameters: k={best_k}, Metric={best_metric} with ROC AUC Score: {best_score:.4f}")
best_knn = KNN(k=best_k, distance_metric=best_metric)
best_knn.fit(X, y)
test_predictions = best_knn.predict(X_test)

submission_df = pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions})
submission_df.to_csv('submission.csv', index=False)


Evaluating: k=45, metric=euclidean
CV Score for k=45, metric=euclidean: 0.9120

Evaluating: k=45, metric=manhattan
CV Score for k=45, metric=manhattan: 0.9095

Evaluating: k=50, metric=euclidean
CV Score for k=50, metric=euclidean: 0.9130

Evaluating: k=50, metric=manhattan
CV Score for k=50, metric=manhattan: 0.9103

Evaluating: k=55, metric=euclidean
CV Score for k=55, metric=euclidean: 0.9119

Evaluating: k=55, metric=manhattan
CV Score for k=55, metric=manhattan: 0.9091

Evaluating: k=60, metric=euclidean
CV Score for k=60, metric=euclidean: 0.9131

Evaluating: k=60, metric=manhattan
CV Score for k=60, metric=manhattan: 0.9098

Evaluating: k=65, metric=euclidean
CV Score for k=65, metric=euclidean: 0.9122

Evaluating: k=65, metric=manhattan
CV Score for k=65, metric=manhattan: 0.9100

Best KNN Parameters: k=60, Metric=euclidean with ROC AUC Score: 0.9131
