In [61]:
import numpy as np
import pandas as pd

In [62]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # TODO: Implement the fit method
        self.X = X
        self.y = y
        pass

    def predict(self, X):
        # TODO: Implement the predict method

        predictions = []

        for row in X:

            distances = np.apply_along_axis(self.compute_distance, 1, self.X, row)
            dis_classifications = np.array([[distances[i], self.y[i]] for i in range(len(self.y))]) 
            sort_dis_class = dis_classifications[dis_classifications[:, 0].argsort()]

            topk = sort_dis_class[:self.k]
            classification = self.neighbor_majority(topk)

            predictions.append(classification)

        return np.array(predictions)

    def neighbor_majority(self, topk):

        #class_counts = {1:0, 0:0}
        inv_sq_sums = {1:0, 0:0}
        total_inv_sq = 0
        for dis, classification in topk:
            #class_counts[classification] += 1
            total_inv_sq += 1 / (dis ** 2) 
            inv_sq_sums[classification] += 1 / (dis ** 2)

        return inv_sq_sums[1] / total_inv_sq


        

    def compute_distance(self, X1, X2):
        # TODO: Implement distance computation based on self.distance_metric
        # Hint: Use numpy operations for efficient computation

        return np.linalg.norm(X1 - X2)

In [63]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    labels = train_data["Exited"].to_numpy(dtype=int)
    
    train_dropped = train_data.drop(columns=["id", "CustomerId", "Surname", "Geography", "Exited"])
    test_dropped = test_data.drop(columns=["id", "CustomerId", "Surname", "Geography"])

    train_encoded = pd.get_dummies(train_dropped, columns=["Gender"], dtype=int)
    test_encoded = pd.get_dummies(test_dropped, columns=["Gender"], dtype=int)

    for col_name in train_encoded.columns:
        max_val = max(train_encoded[col_name].max(), test_encoded[col_name].max())
        if max_val != 0:
            train_encoded[col_name] = train_encoded[col_name] / max_val
            test_encoded[col_name] = test_encoded[col_name] / max_val

    train_np = train_encoded.to_numpy(dtype=float)
    test_np = test_encoded.to_numpy(dtype=float)

    train_rank = np.linalg.matrix_rank(train_np)
    reduced_rank = train_rank - 2
    u, s, vt = np.linalg.svd(train_np, full_matrices=False)
    s = np.real(s[:train_rank])
    sig = np.diag(s)
    u = u[:, :reduced_rank]
    sig = sig[:reduced_rank,:reduced_rank]
    vt = vt[:reduced_rank]
    train_np = u @ sig @ vt

    test_rank = np.linalg.matrix_rank(test_np)
    reduced_rank = test_rank - 2
    u, s, vt = np.linalg.svd(test_np, full_matrices=False)
    s = np.real(s[:test_rank])
    sig = np.diag(s)
    u = u[:, :reduced_rank]
    sig = sig[:reduced_rank,:reduced_rank]
    vt = vt[:reduced_rank]
    test_np = u @ sig @ vt


    # TODO: Implement data preprocessing
    # Handle categorical variables, scale features, etc.
    return train_np, labels, test_np

In [64]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores

    data_splits = np.array_split(X, n_splits)
    class_splits = np.array_split(y, n_splits)

    scores = []

    for i in range(n_splits):
        curr_test_data = data_splits[i]
        curr_test_classes = class_splits[i]

        
        curr_train_data = [data_splits[j] for j in range(n_splits) if j != i]
        curr_train_classes = [class_splits[i] for j in range(n_splits) if j != i]

        train = np.concatenate(curr_train_data)
        train_classes = np.concatenate(curr_train_classes)


        knn.fit(train, train_classes)

        test_prediction = knn.predict(curr_test_data)
        weighted_sum = 0
        for pred_prob, curr_class in zip(test_prediction, curr_test_classes):
            if curr_class == 1:
                weighted_sum += pred_prob
            else:
                weighted_sum += 1 - pred_prob
        scores.append(weighted_sum / len(curr_test_classes))

    return sum(scores) / float(len(scores))



In [65]:
# Load and preprocess data
X, y, X_test = preprocess_data('data/train.csv', 'data/test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparamters tuning
for k_value in range(5, 21):
    print(f"Value for k {k_value}")
    knn = KNN(k=k_value, distance_metric="euclidean")
    score = cross_validate(X, y, knn)
    print(f"Score: {score}")



# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=17, distance_metric="euclidean")
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('data/test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Cross-validation scores: 0.6788788414268625
Value for k 5
Score: 0.6788788414268625
Value for k 6
Score: 0.6793145734829527
Value for k 7
Score: 0.6794747754031979
Value for k 8
Score: 0.6789836228934245
Value for k 9
Score: 0.6788864558171425
Value for k 10
Score: 0.6788128044799515
Value for k 11
Score: 0.6786773960072263
Value for k 12
Score: 0.6787141838119941
Value for k 13
Score: 0.6786492880139507
Value for k 14
Score: 0.6786093151572491
Value for k 15
Score: 0.67869339480633
Value for k 16
Score: 0.6786704801898862
Value for k 17
Score: 0.6785957255638295
Value for k 18
Score: 0.6787262732050319
Value for k 19
Score: 0.6785945808376195
Value for k 20
Score: 0.6784685056140416
