In [None]:
import numpy as np
import pandas as pd

In [None]:
class KNN:
    """Brute-force k-nearest-neighbours classifier.

    The training set is memorised by ``fit``; every query is compared against
    all stored points with the configured distance metric.

    Parameters:
    k (int): Number of neighbours consulted per query. If it exceeds the number
        of training points, all training points are used.
    distance_metric (str): Either 'euclidean' or 'manhattan'.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Store a copy of the training features and labels.

        Raises:
        ValueError: If X and y do not contain the same number of samples.
        """
        # Features are cast to float so later subtraction cannot wrap around
        # (unsigned ints) or fail outright (booleans).
        X_train = np.array(X, dtype=float)
        y_train = np.array(y)
        if len(X_train) != len(y_train):
            raise ValueError(
                f"X and y have inconsistent lengths: {len(X_train)} != {len(y_train)}"
            )
        self.X_train = X_train
        self.y_train = y_train

    def compute_distance(self, X1, X2):
        """Return the distance from the single point X1 to every row of X2.

        Raises:
        ValueError: If ``self.distance_metric`` is not a supported metric.
        """
        query = np.asarray(X1, dtype=float).reshape(1, -1)
        points = np.asarray(X2, dtype=float)
        diff = points - query

        if self.distance_metric == 'euclidean':
            return np.linalg.norm(diff, axis=1)
        if self.distance_metric == 'manhattan':
            return np.abs(diff).sum(axis=1)
        raise ValueError(f"Unsupported distance metric: {self.distance_metric}")

    def _k_nearest_labels(self, x_test):
        """Return the training labels of the k points closest to x_test."""
        if self.k < 1:
            # A non-positive k would otherwise slice to an empty or to an
            # "all but the farthest" neighbourhood without any warning.
            raise ValueError(f"k must be a positive integer, got {self.k}")
        distances = self.compute_distance(x_test, self.X_train)
        # Stable sort: equidistant points are ranked by training order, which
        # keeps tie-breaking deterministic.
        nearest = np.argsort(distances, kind='stable')[:self.k]
        return self.y_train[nearest]

    def predict(self, X):
        """Predict the majority label among the k nearest neighbours of each row of X.

        Ties between labels are resolved in favour of the smallest label.

        Returns:
        ndarray: One predicted label per query row.
        """
        predictions = []
        # Converting first makes DataFrames iterate by row instead of by column label.
        for x_test in np.asarray(X, dtype=float):
            labels, counts = np.unique(self._k_nearest_labels(x_test), return_counts=True)
            # np.unique returns labels sorted, so argmax picks the smallest
            # label among equally frequent ones.
            predictions.append(labels[counts.argmax()])
        return np.array(predictions)

    def predict_proba(self, X):
        """
        Predicts the probability of class 1 (churn) using the nearest neighbors.

        The value is the mean of the neighbours' labels, so it is a probability
        only when the training labels are encoded as 0/1.

        Returns:
        ndarray: One value per query row (1-D array).
        """
        return np.array(
            [np.mean(self._k_nearest_labels(x_test)) for x_test in np.asarray(X, dtype=float)]
        )


In [None]:
def preprocess_data(train_path, test_path):
    """Load the train/test CSV files and turn them into numeric feature matrices.

    Steps: drop identifier and excluded categorical columns, split off the
    'Exited' target, one-hot encode any remaining categorical columns, align the
    test columns to the training columns, coerce everything to numeric and
    impute missing values with the training-set column means.

    Parameters:
    train_path (str): Path to the training CSV; must contain an 'Exited' column.
    test_path (str): Path to the test CSV.

    Returns:
    X_train (ndarray): Training features as a float array.
    y_train (ndarray): Integer target labels.
    X_test (ndarray): Test features as a float array with the same columns as X_train.
    """
    # Identifiers carry no signal; Geography and Gender are excluded from the
    # feature set on purpose. 'id' is dropped as well (the test file is known
    # to carry it); errors='ignore' keeps this a no-op when a column is absent.
    dropped_columns = ['id', 'CustomerId', 'Surname', 'Geography', 'Gender']

    train_data = pd.read_csv(train_path).drop(columns=dropped_columns, errors='ignore')
    test_data = pd.read_csv(test_path).drop(columns=dropped_columns, errors='ignore')

    # Separate features and target in the training data
    X_train = train_data.drop(columns=['Exited'])
    y_train = train_data['Exited'].astype(int)  # Ensure target labels are integers

    # One-hot encode whatever categorical columns remain
    X_train = pd.get_dummies(X_train, drop_first=True)
    X_test = pd.get_dummies(test_data, drop_first=True)

    # The training columns define the feature space: test-only columns are
    # discarded and columns missing from the test set are filled with 0.
    X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

    # Ensure all columns are numeric (unparseable entries become NaN)
    X_train = X_train.apply(pd.to_numeric, errors='coerce')
    X_test = X_test.apply(pd.to_numeric, errors='coerce')

    # Impute both sets with the *training* means so train and test are
    # transformed identically and no test statistics leak into the features.
    # A column that is entirely NaN has no mean; fall back to 0 for it.
    fill_values = X_train.mean().fillna(0)
    X_train = X_train.fillna(fill_values)
    X_test = X_test.fillna(fill_values)

    # Explicit float conversion avoids an object array when dtypes are mixed
    return X_train.to_numpy(dtype=float), y_train.to_numpy(), X_test.to_numpy(dtype=float)


In [None]:
def cross_validate(X, y, knn, n_splits=5, random_state=None):
    """
    Perform basic K-fold cross-validation using numpy.

    The samples are shuffled once and split into ``n_splits`` folds whose sizes
    differ by at most one, so every sample is validated exactly once.

    Parameters:
    X (array-like): Feature matrix, one row per sample.
    y (array-like): Target labels, one per sample.
    knn: Model exposing ``fit(X, y)`` and ``predict(X)``.
    n_splits (int): Number of folds; must satisfy 2 <= n_splits <= n_samples.
    random_state (int, optional): Seed for a private shuffle generator. When
        None, the shuffle draws from NumPy's global random state, so
        ``np.random.seed`` still controls it.

    Returns:
    float: Mean validation accuracy over the folds.

    Raises:
    ValueError: If X and y differ in length or n_splits is out of range.
    """
    # Positional indexing below needs arrays; a DataFrame would be indexed by column.
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)

    if len(y) != n_samples:
        raise ValueError(f"X and y have inconsistent lengths: {n_samples} != {len(y)}")
    if not 2 <= n_splits <= n_samples:
        raise ValueError(
            f"n_splits must be between 2 and the number of samples ({n_samples}), got {n_splits}"
        )

    indices = np.arange(n_samples)
    if random_state is None:
        np.random.shuffle(indices)
    else:
        np.random.default_rng(random_state).shuffle(indices)

    # array_split spreads any remainder over the first folds instead of
    # leaving the leftover samples permanently in the training part.
    folds = np.array_split(indices, n_splits)

    accuracy_scores = []
    for i, val_indices in enumerate(folds):
        train_indices = np.concatenate(folds[:i] + folds[i + 1:])

        X_train, X_val = X[train_indices], X[val_indices]
        y_train, y_val = y[train_indices], y[val_indices]

        # Fit the model and make predictions
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_val)

        # Compute accuracy
        accuracy_scores.append(np.mean(y_pred == y_val))

    return np.mean(accuracy_scores)

In [None]:
def tune_hyperparameters(X, y, k_values, distance_metrics, n_splits=5):
    """
    Tune hyperparameters k (number of neighbors) and distance metrics using cross-validation.

    Every (k, metric) combination is scored with ``cross_validate`` and the
    first combination reaching the highest score is kept.

    Parameters:
    X (ndarray): Feature matrix (NumPy array).
    y (ndarray): Target labels (NumPy array).
    k_values (list): List of k values to try.
    distance_metrics (list): List of distance metrics to try.
    n_splits (int): Number of cross-validation splits.

    Returns:
    best_k (int): The best k value (None if no combination was evaluated).
    best_metric (str): The best distance metric (None if no combination was evaluated).
    best_score (float): The highest cross-validation score achieved.
    """
    best_k = None
    best_metric = None
    best_score = -1  # Initialize with a very low score

    # cross_validate shuffles with NumPy's global RNG. Rewinding that RNG to
    # the same state before each candidate scores all of them on identical
    # folds, so score differences reflect the hyperparameters, not the split.
    rng_state = np.random.get_state()

    for k in k_values:
        for metric in distance_metrics:
            np.random.set_state(rng_state)
            knn = KNN(k=k, distance_metric=metric)
            # Forward the caller's n_splits (previously hard-coded to 5)
            score = cross_validate(X, y, knn, n_splits=n_splits)
            print(f"K: {k}, Metric: {metric}, Cross-Validation Score: {score}")

            # Use accuracy (or another metric) for hyperparameter comparison
            if score > best_score:
                best_score = score
                best_k = k
                best_metric = metric

    print(f"Best K: {best_k}, Best Metric: {best_metric}, Best Score: {best_score}")
    return best_k, best_metric, best_score

In [None]:
# Configuration: data locations and the seed that makes the run repeatable
TRAIN_PATH = '/content/train.csv'
TEST_PATH = '/content/test.csv'
SUBMISSION_PATH = 'submissions.csv'
SEED = 42

# cross_validate shuffles with NumPy's global RNG; seeding it makes the
# reported scores reproducible under Restart & Run All.
np.random.seed(SEED)

# Load and preprocess data
X_train, y_train, X_test = preprocess_data(TRAIN_PATH, TEST_PATH)

# Standardise features with training-set statistics. Nearest-neighbour
# distances are otherwise dominated by whichever column has the largest
# numeric range. Constant columns get a divisor of 1 to avoid division by zero.
feature_mean = np.asarray(X_train, dtype=float).mean(axis=0)
feature_std = np.asarray(X_train, dtype=float).std(axis=0)
feature_std[feature_std == 0] = 1.0
X_train_scaled = (X_train - feature_mean) / feature_std
X_test_scaled = (X_test - feature_mean) / feature_std

# Create and evaluate a baseline model (cv_scores is the mean accuracy over the folds)
knn = KNN(k=5, distance_metric='euclidean')
cv_scores = cross_validate(X_train_scaled, y_train, knn)
print("Cross-validation scores:", cv_scores)

k_values = [3, 5, 7, 9, 11, 15, 21, 30, 60]
distance_metrics = ['euclidean', 'manhattan']
# Perform hyperparameter tuning
best_k, best_metric, best_score = tune_hyperparameters(X_train_scaled, y_train, k_values, distance_metrics)

# Train on the full dataset with the best hyperparameters and score the test set
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X_train_scaled, y_train)
test_predictions = knn.predict_proba(X_test_scaled)

# Save test predictions; only the 'id' column is needed from the raw test file
test_ids = pd.read_csv(TEST_PATH, usecols=['id'])['id']
pd.DataFrame({'id': test_ids, 'Exited': test_predictions}).to_csv(SUBMISSION_PATH, index=False)

Cross-validation scores: 0.7610666666666666
K: 3, Metric: euclidean, Cross-Validation Score: 0.7394
K: 3, Metric: manhattan, Cross-Validation Score: 0.7410666666666667
K: 5, Metric: euclidean, Cross-Validation Score: 0.7617333333333333
K: 5, Metric: manhattan, Cross-Validation Score: 0.762
K: 7, Metric: euclidean, Cross-Validation Score: 0.7740666666666667
K: 7, Metric: manhattan, Cross-Validation Score: 0.7769333333333333
K: 9, Metric: euclidean, Cross-Validation Score: 0.7814
K: 9, Metric: manhattan, Cross-Validation Score: 0.7822
K: 11, Metric: euclidean, Cross-Validation Score: 0.7863333333333333
K: 11, Metric: manhattan, Cross-Validation Score: 0.7880666666666667
K: 15, Metric: euclidean, Cross-Validation Score: 0.791
K: 15, Metric: manhattan, Cross-Validation Score: 0.7903333333333332
K: 21, Metric: euclidean, Cross-Validation Score: 0.7950666666666667
K: 21, Metric: manhattan, Cross-Validation Score: 0.7939999999999998
K: 30, Metric: euclidean, Cross-Validation Score: 0.79699999