In [16]:
import numpy as np
import pandas as pd

In [17]:
class KNN:
    def __init__(self, k=3, distance_metric='minkowski', p=2):
        self.k = k
        self.distance_metric = distance_metric
        self.p = p

    def fit(self, X, y):
        self.X_train = np.asarray(X, dtype=float)
        self.y_train = np.asarray(y, dtype=int)

    def compute_distance(self, X_train, x):
        X_train = np.asarray(X_train, dtype=float)
        x = np.asarray(x, dtype=float)
        if self.distance_metric == 'euclidean':
            distances = np.sqrt(np.sum((X_train - x) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            distances = np.sum(np.abs(X_train - x), axis=1)
        else:
            raise ValueError("Unknown distance metric")
        return distances

    def predict(self, X):
        y_pred = []
        for x in X:
            x = np.asarray(x, dtype=float)
            distances = self.compute_distance(self.X_train, x)
            k_indices = distances.argsort()[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            counts = np.bincount(k_nearest_labels)
            y_pred.append(np.argmax(counts))
        return np.array(y_pred)

    def predict_proba(self, X):
        y_proba = []
        for x in X:
            x = np.asarray(x, dtype=float)
            # Compute distances to all training points
            distances = self.compute_distance(self.X_train, x)

            # Get indices of the k nearest neighbors
            k_indices = distances.argsort()[:self.k]

            # Get labels of the k nearest neighbors
            k_nearest_labels = self.y_train[k_indices]

            # Get distances of the k nearest neighbors
            k_nearest_distances = distances[k_indices]

            # Compute weights based on distances (inverse of distances)
            weights = 1 / (k_nearest_distances + 1e-5)

            # For binary classification, calculate probabilities of each class
            class_1_weighted_sum = np.sum(weights[k_nearest_labels == 1])
            class_0_weighted_sum = np.sum(weights[k_nearest_labels == 0])
            total_weighted_sum = class_1_weighted_sum + class_0_weighted_sum

            # Probability of class 1 (positive class)
            proba_class_1 = class_1_weighted_sum / total_weighted_sum

            # Append the probability of class 1
            y_proba.append(proba_class_1)

        return np.array(y_proba)


In [18]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def preprocess_data(train_path, test_path):
    # Load datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Save test IDs for future use
    test_ids = test_data['id'].values

    # Drop unwanted columns
    train_data.drop(['CustomerId', 'Surname'], axis=1, inplace=True)
    test_data.drop(['CustomerId', 'Surname'], axis=1, inplace=True)

    # Combine the two datasets to apply transformations consistently
    combined_data = pd.concat([train_data, test_data], keys=['train', 'test'])

    # One-hot encode categorical features and drop the first category
    combined_data = pd.get_dummies(combined_data, columns=['Geography', 'Gender'], drop_first=True)

    # Separate the data again into training and test datasets
    train_data_processed = combined_data.xs('train', level=0)
    test_data_processed = combined_data.xs('test', level=0)

    # Identify numerical features for scaling
    numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
                          'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

    # Fill missing numerical values with the mean
    train_data_processed[numerical_features] = train_data_processed[numerical_features].fillna(train_data_processed[numerical_features].mean())
    test_data_processed[numerical_features] = test_data_processed[numerical_features].fillna(test_data_processed[numerical_features].mean())

    # Standardize the numerical features (scaling based on training data statistics)
    scaler = StandardScaler()
    train_data_processed[numerical_features] = scaler.fit_transform(train_data_processed[numerical_features])
    test_data_processed[numerical_features] = scaler.transform(test_data_processed[numerical_features])

    # Separate target variable 'Exited' from training data
    y_train = train_data_processed['Exited'].values.astype(int)
    X_train = train_data_processed.drop(['id', 'Exited'], axis=1).values

    # Prepare test data for prediction (drop 'Exited' column if it exists in the test data)
    X_test = test_data_processed.drop(['id', 'Exited'], axis=1, errors='ignore').values

    return X_train, y_train, X_test, test_ids


In [19]:
def cross_validate(X, y, knn, n_splits=5):
    fold_size = len(y) // n_splits
    indices = np.arange(len(y))
    np.random.shuffle(indices)

    auc_scores = []

    for fold in range(n_splits):
        val_indices = indices[fold * fold_size:(fold + 1) * fold_size]
        train_indices = np.setdiff1d(indices, val_indices)
        X_train_fold, X_val_fold = X[train_indices], X[val_indices]
        y_train_fold, y_val_fold = y[train_indices], y[val_indices]
        knn.fit(X_train_fold, y_train_fold)
        y_val_proba = knn.predict_proba(X_val_fold)
        auc = roc_auc_manual(y_val_fold, y_val_proba)
        auc_scores.append(auc)

    return auc_scores


def roc_auc_manual(y_true, y_pred):

    sorted_indices = np.argsort(y_pred)[::-1]
    y_true_sorted = y_true[sorted_indices]


    pos = np.sum(y_true == 1)
    neg = np.sum(y_true == 0)


    tp_cumsum = np.cumsum(y_true_sorted)
    fp_cumsum = np.cumsum(1 - y_true_sorted)

    auc = np.sum(tp_cumsum[y_true_sorted == 0]) / (pos * neg)

    return auc

In [20]:
X, y, X_test, test_ids = preprocess_data('train.csv', 'test.csv')
k_values = [22]
distance_metrics = ['euclidean', 'manhattan']
best_score = 0
for k in k_values:
    for distance_metric in distance_metrics:
            knn = KNN(k=k, distance_metric=distance_metric)
            cv_scores = cross_validate(X, y, knn)
            mean_score = np.mean(cv_scores)
            print(f"k={k}, distance_metric={distance_metric}, CV Score={mean_score:.4f}")
            if mean_score > best_score:
                best_score = mean_score
                best_params = {'k': k, 'distance_metric': distance_metric}

print("\nBest Parameters:", best_params)
print(f"Best CV Score: {best_score:.4f}")


knn = KNN(k=best_params['k'], distance_metric=best_params['distance_metric'])

knn.fit(X, y)

test_predictions = knn.predict_proba(X_test)

submission = pd.DataFrame({
    'id': test_ids,
    'Exited': test_predictions
})
submission.to_csv('submissions.csv', index=False)
submission_csv = pd.read_csv('submissions.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_processed[numerical_features] = train_data_processed[numerical_features].fillna(train_data_processed[numerical_features].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_processed[numerical_features] = test_data_processed[numerical_features].fillna(test_data_processed[numerical_features].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/panda

k=20, distance_metric=euclidean, CV Score=0.9101
k=20, distance_metric=manhattan, CV Score=0.9089
k=21, distance_metric=euclidean, CV Score=0.9091
k=21, distance_metric=manhattan, CV Score=0.9087
k=22, distance_metric=euclidean, CV Score=0.9117
k=22, distance_metric=manhattan, CV Score=0.9094
k=23, distance_metric=euclidean, CV Score=0.9114
k=23, distance_metric=manhattan, CV Score=0.9106

Best Parameters: {'k': 22, 'distance_metric': 'euclidean'}
Best CV Score: 0.9117
