In [19]:
import numpy as np
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from collections import Counter

In [8]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """ Store the training data (X and y) """
        self.X_train = np.array(X)
        self.y_train = np.array(y)

    def predict(self, X):
        """ Predict the class labels for the provided data points """
        X = np.array(X)
        predictions = [self._predict_single_point(x) for x in X]
        return np.array(predictions)

    def _predict_single_point(self, x):
        """ Predict the class label for a single data point x """
        # Compute distances between x and all training data
        distances = [self.compute_distance(x, x_train) for x_train in self.X_train]
        
        # Sort the distances and get the indices of the k closest points
        k_indices = np.argsort(distances)[:self.k]
        
        # Find the labels of the k closest points
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        
        # Return the most common label
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

    def compute_distance(self, X1, X2):
        """ Compute the distance between two points X1 and X2 """
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2))
        # You can add more distance metrics l


In [9]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Drop columns that are not useful for prediction
    train_data = train_data.drop(columns=['id', 'CustomerId', 'Surname'])
    test_data = test_data.drop(columns=['id', 'CustomerId', 'Surname'])

    # Concatenate train and test data to ensure consistent dummies
    combined_data = pd.concat([train_data, test_data], axis=0)

    # Handle categorical variables (Geography and Gender)
    combined_data = pd.get_dummies(combined_data, columns=['Geography', 'Gender'], drop_first=True)

    # Split the data back into train and test sets
    train_data = combined_data.iloc[:len(train_data)]
    test_data = combined_data.iloc[len(train_data):]

    # Split features and target
    X_train = train_data.drop(columns=['Exited'])
    y_train = train_data['Exited']
    X_test = test_data.drop(columns=['Exited'], errors='ignore')

    # Scale features using StandardScaler
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, y_train, X_test


In [10]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    
    # Initialize KFold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    roc_auc_scores = []

    # Perform cross-validation
    for train_index, test_index in kf.split(X):
        # Split the data into training and validation sets
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit the KNN model on the training data
        knn.fit(X_train, y_train)

        # Predict on the test set
        y_pred = knn.predict(X_test)

        # Compute the ROC AUC score for the current fold
        roc_auc = roc_auc_score(y_test, y_pred)
        roc_auc_scores.append(roc_auc)

    # Calculate the average ROC AUC score across all folds
    avg_roc_auc = np.mean(roc_auc_scores)
    
    return avg_roc_auc


In [20]:
# Load and preprocess data
X, y, X_test = preprocess_data('/Users/tianxiaoshan/Downloads/train.csv', '/Users/tianxiaoshan/Downloads/test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparamters tuning
knn_model = KNeighborsClassifier()

# Set up the hyperparameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 9],  # Trying different values of k
    'metric': ['euclidean', 'manhattan']  # Trying different distance metrics
}

# Set up GridSearchCV to automatically handle cross-validation and hyperparameter tuning
grid_search = GridSearchCV(knn_model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)

# Perform cross-validation and hyperparameter tuning
grid_search.fit(X, y)

# Output the best hyperparameters found
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation ROC AUC score: {grid_search.best_score_}")


# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
# Use the best model obtained from GridSearchCV
knn = grid_search.best_estimator_
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('/Users/tianxiaoshan/Downloads/test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Cross-validation scores: 0.7662875380177789
Best parameters: {'metric': 'manhattan', 'n_neighbors': 9}
Best cross-validation ROC AUC score: 0.8905169011239197
