In [224]:
import numpy as np
import pandas as pd
from collections import Counter

In [225]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean', weights='uniform'):
        self.k = k
        self.distance_metric = distance_metric
        self.weights = weights

    def fit(self, X, y):
        self.x_train = X
        self.y_train = y


    def predict(self, X):

        predictions = []
        for x in X:
            distances = self.compute_distance(x, self.x_train)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = [self.y_train[i] for i in k_indices]
            most_common = np.argmax(np.bincount(k_nearest_labels))
            predictions.append(most_common)
        return np.array(predictions)


    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")


In [226]:


# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Transform categorical variables, scale features, Should also handle potential missing values?

    # Dropping uneeded data
    train_data = train_data.drop(columns=['id', 'CustomerId', 'Surname', "Geography", "HasCrCard", "IsActiveMember"], errors='ignore')
    test_data = test_data.drop(columns=['id', 'CustomerId', 'Surname', "Geography", "HasCrCard", "IsActiveMember"], errors='ignore')
    X_train = train_data.drop('Exited', axis=1)
    y_train = train_data['Exited']


    # Trasnform categorical variables
    X_train = pd.get_dummies(X_train, drop_first=True)
    X_test = pd.get_dummies(test_data, drop_first=True)

    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    # scale values
    X_train = (X_train - X_train.mean()) / X_train.std()
    X_test = (X_test - X_train.mean()) / X_train.std()

    return X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy()

In [227]:
def roc_auc(y_true, y_pred_prob):
    sorted_indices = np.argsort(-y_pred_prob)
    y_true_sorted = y_true[sorted_indices]

    tpr = np.cumsum(y_true_sorted == 1) / np.sum(y_true == 1)
    fpr = np.cumsum(y_true_sorted == 0) / np.sum(y_true == 0)

    auc = np.trapz(tpr, fpr)

    return auc

def cross_validate(X, y, knn, n_splits=5):
    indices = np.arange(len(X))
    np.random.shuffle(indices)

    fold_size = len(X) // n_splits
    roc_auc_scores = []

    for fold in range(n_splits):
        start = fold * fold_size
        end = (fold + 1) * fold_size
        val_idx = indices[start:end]
        train_idx = np.concatenate([indices[:start], indices[end:]])

        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        knn.fit(X_train, y_train)

        # Get predicted probabilities
        y_pred_prob = knn.predict(X_val)

        # Calculate ROC AUC score
        roc_score = roc_auc(y_val, y_pred_prob)
        roc_auc_scores.append(roc_score)

    return np.mean(roc_auc_scores)

In [228]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparamters tuning
best_k = None
best_distance = None
best_score = 0
for k in range(1, 16):
    for distance in ['euclidean', 'manhattan']:
        knn = KNN(k=k, distance_metric=distance)
        score = cross_validate(X, y, knn)
        print(f"k={k}, distance={distance}, score={score:.4f}")
        if score > best_score:
            best_score = score
            best_k = k
            best_distance = distance

print("Best k:", best_k)
print("Best distance:", best_distance)
print("Best score:", best_score)

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric=best_distance)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Cross-validation scores: 0.766565086914876
k=1, distance=euclidean, score=0.7384
k=1, distance=manhattan, score=0.7390
k=2, distance=euclidean, score=0.7007
k=2, distance=manhattan, score=0.6955
k=3, distance=euclidean, score=0.7549
k=3, distance=manhattan, score=0.7658
k=4, distance=euclidean, score=0.7306
k=4, distance=manhattan, score=0.7327
k=5, distance=euclidean, score=0.7763
k=5, distance=manhattan, score=0.7695
k=6, distance=euclidean, score=0.7466
k=6, distance=manhattan, score=0.7447
k=7, distance=euclidean, score=0.7653
k=7, distance=manhattan, score=0.7661
k=8, distance=euclidean, score=0.7524
k=8, distance=manhattan, score=0.7444
k=9, distance=euclidean, score=0.7673
k=9, distance=manhattan, score=0.7668
k=10, distance=euclidean, score=0.7521
k=10, distance=manhattan, score=0.7538
k=11, distance=euclidean, score=0.7720
k=11, distance=manhattan, score=0.7686
k=12, distance=euclidean, score=0.7583
k=12, distance=manhattan, score=0.7528
k=13, distance=euclidean, score=0.7653
