In [21]:
import numpy as np
import pandas as pd

In [22]:
class KNN:
    """
    Minimal k-nearest-neighbours classifier implemented with NumPy.

    Parameters
    ----------
    k : int
        Number of nearest training samples that vote on each prediction.
        Must be >= 1 when ``predict`` is called.
    distance_metric : {'euclidean', 'manhattan'}
        Metric used to rank training samples.
    weights : {'uniform', 'distance'}
        'uniform' is a plain majority vote; 'distance' weights each vote by
        the inverse of the neighbour's distance.
    """

    def __init__(self, k=3, distance_metric='euclidean', weights='uniform'):
        self.k = k
        self.distance_metric = distance_metric
        self.weights = weights

    def fit(self, X, y):
        """
        Stores the training data and labels (as NumPy array copies).
        """
        self.X_train = np.array(X)  # Ensure X_train is a NumPy array
        self.y_train = np.array(y)  # Ensure y_train is a NumPy array

    def compute_distance(self, X1, X2):
        """
        Computes the distance between each row in X1 (training data) and X2 (a single test sample).

        Returns a 1-D array with one distance per row of X1.
        Raises ValueError if ``distance_metric`` is not supported.
        """
        # np.asarray only copies when a dtype conversion is needed, so a float64
        # training matrix is no longer duplicated for every single query.
        X1 = np.asarray(X1, dtype=np.float64)
        X2 = np.asarray(X2, dtype=np.float64)

        # Promote a scalar sample to a one-element vector.
        if X2.ndim == 0:
            X2 = X2.reshape(1)

        query = X2.reshape(1, -1)  # broadcast the sample against every training row

        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - query) ** 2, axis=1))
        if self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - query), axis=1)
        raise ValueError("Unsupported distance metric")

    def predict(self, X):
        """
        Predicts the class labels for the given test data.

        Parameters
        ----------
        X : array-like
            Test samples, one per row.

        Returns
        -------
        numpy.ndarray
            One predicted label per row of X.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1, or the distance metric / weighting
            scheme is not supported.
        """
        # A negative k would make the [:k] slice below silently keep n-|k|
        # neighbours, and k == 0 fails later with an obscure empty-vote error.
        if self.k < 1:
            raise ValueError("k must be a positive integer")

        X = np.array(X, dtype=np.float64)
        # Convert the training matrix once instead of once per query row.
        train = np.asarray(self.X_train, dtype=np.float64)

        predictions = []
        for x in X:
            distances = self.compute_distance(train, x)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_indices]

            if self.weights == 'uniform':
                # Majority vote; np.unique sorts labels, so ties go to the smallest label.
                unique, counts = np.unique(k_nearest_labels, return_counts=True)
                predictions.append(unique[np.argmax(counts)])
            elif self.weights == 'distance':
                # Inverse-distance vote; the epsilon avoids division by zero on exact matches.
                k_nearest_distances = distances[k_indices]
                inverse_distances = 1 / (k_nearest_distances + 1e-5)
                weighted_vote = {}
                for label, vote in zip(k_nearest_labels, inverse_distances):
                    if label in weighted_vote:
                        weighted_vote[label] += vote
                    else:
                        weighted_vote[label] = vote
                predictions.append(max(weighted_vote, key=weighted_vote.get))
            else:
                raise ValueError("Unsupported weighting scheme")

        return np.array(predictions)

In [23]:
def preprocess_data(train_data, test_data):
    """
    One-hot encode, impute and standardise the train and test tables.

    The two frames are concatenated only so that one-hot encoding produces the
    same dummy columns for both. Imputation medians and scaling statistics are
    computed from the training rows alone, so no information from the test set
    leaks into the training features.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain the raw feature columns, 'Geography', 'Gender',
        'CustomerId' and the target column 'Exited'.
    test_data : pandas.DataFrame
        Same columns as ``train_data`` except that 'Exited' is not read.

    Returns
    -------
    X : numpy.ndarray
        Scaled training features, one row per training sample.
    y : numpy.ndarray
        Integer 'Exited' labels from ``train_data``.
    X_test : numpy.ndarray
        Scaled test features.
    customer_ids : numpy.ndarray
        'CustomerId' values of the test rows.
    features : list of str
        Column names, in the column order of X and X_test.
    """
    n_train = len(train_data)

    # ignore_index gives the combined frame a unique index; the original row
    # labels of both inputs overlap, which makes label-aligned assignment fragile.
    all_data = pd.concat([train_data, test_data], axis=0, sort=False, ignore_index=True)

    # Handle categorical variables (float dummies regardless of pandas version)
    all_data = pd.get_dummies(all_data, columns=['Geography', 'Gender'], drop_first=True, dtype=float)

    # Select features for model
    features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
                'IsActiveMember', 'EstimatedSalary', 'Geography_Germany', 'Geography_Spain', 'Gender_Male']

    feature_frame = all_data[features].astype(float)

    # Impute missing values with the median of the training rows only
    train_medians = feature_frame.iloc[:n_train].median()
    feature_frame = feature_frame.fillna(train_medians)

    # Standardise with training-row statistics only
    train_rows = feature_frame.iloc[:n_train]
    center = train_rows.mean()
    scale = train_rows.std()
    # A constant (or single-row) training column has std 0 / NaN; dividing by it
    # would fill the feature with NaN/inf, so leave such columns merely centred.
    scale = scale.replace(0.0, 1.0).fillna(1.0)
    feature_frame = (feature_frame - center) / scale

    # Split back into train and test
    X = feature_frame.iloc[:n_train].values
    y = train_data['Exited'].astype(int).values
    X_test = feature_frame.iloc[n_train:].values
    customer_ids = all_data['CustomerId'].iloc[n_train:].values

    return X, y, X_test, customer_ids, features

In [24]:
def cross_validate(X, y, knn, n_splits=5, random_state=42):
    """
    Estimate accuracy with shuffled k-fold cross-validation.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Labels, same length as X.
    knn : object
        Model exposing ``fit(X, y)`` and ``predict(X)``; it is refitted on
        every fold.
    n_splits : int
        Number of folds (2 <= n_splits <= number of samples).
    random_state : int
        Seed for the shuffle. The default reproduces the permutation that
        ``np.random.seed(42)`` followed by ``np.random.permutation`` yields.

    Returns
    -------
    (float, list)
        Mean accuracy over the folds and the list of per-fold accuracies.

    Raises
    ------
    ValueError
        If X and y differ in length or n_splits is out of range.
    """
    # Accept lists / DataFrames as well as arrays; fancy indexing below needs ndarrays.
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)

    if len(y) != n_samples:
        raise ValueError("X and y must contain the same number of samples")
    # Fewer than 2 folds leaves no training data; more folds than samples
    # produces empty validation folds whose accuracy is NaN.
    if n_splits < 2 or n_splits > n_samples:
        raise ValueError("n_splits must be between 2 and the number of samples")

    # A private generator keeps the shuffle reproducible without resetting the
    # process-wide NumPy random state as a side effect.
    rng = np.random.RandomState(random_state)
    indices = rng.permutation(n_samples)
    X = X[indices]
    y = y[indices]

    # Determine fold size; the last fold absorbs the remainder
    fold_size = n_samples // n_splits
    accuracies = []

    for i in range(n_splits):
        start = i * fold_size
        end = (i + 1) * fold_size if i < n_splits - 1 else n_samples

        # Create validation and training sets
        X_val = X[start:end]
        y_val = y[start:end]

        X_train = np.concatenate([X[:start], X[end:]], axis=0)
        y_train = np.concatenate([y[:start], y[end:]], axis=0)

        # Fit on the training fold, score on the held-out fold
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_val)

        accuracies.append(np.mean(np.asarray(y_pred) == y_val))

    # Return the average accuracy and all accuracies across folds
    return np.mean(accuracies), accuracies

In [25]:

def hyperparameter_tuning(X, y, k_values=None, distance_metrics=None, weights_options=None, n_splits=10):
    """
    Grid-search k, distance metric and weighting scheme by cross-validated accuracy.

    Parameters
    ----------
    X, y : array-like
        Training features and labels, passed unchanged to ``cross_validate``.
    k_values : iterable of int, optional
        Neighbour counts to try. Defaults to [15, 18, 20, 23, 25].
    distance_metrics : iterable of str, optional
        Distance metrics to try. Defaults to ['euclidean', 'manhattan'].
    weights_options : iterable of str, optional
        Weighting schemes to try. Defaults to ['uniform', 'distance'].
    n_splits : int
        Number of cross-validation folds used for each combination.

    Returns
    -------
    (best_k, best_metric, best_weight, best_accuracy, results)
        The best configuration, its mean accuracy, and a list of
        ``(k, metric, weight, accuracy)`` tuples for every combination tried.
        With an empty grid the best_* values stay None and best_accuracy is -1.
    """
    # Search grid: fall back to the defaults when the caller does not supply one.
    if k_values is None:
        k_values = [15, 18, 20, 23, 25]
    if distance_metrics is None:
        distance_metrics = ['euclidean', 'manhattan']
    if weights_options is None:
        weights_options = ['uniform', 'distance']

    best_k = None
    best_metric = None
    best_weight = None
    best_accuracy = -1
    results = []

    # Iterate over each combination of k, distance metric, and weighting
    for k in k_values:
        for metric in distance_metrics:
            for weight in weights_options:
                print(f"Testing k={k}, distance_metric={metric}, weight={weight}...")  # Print current step

                knn = KNN(k=k, distance_metric=metric, weights=weight)
                average_accuracy, _ = cross_validate(X, y, knn, n_splits=n_splits)  # Perform cross-validation

                # Print the accuracy for this configuration
                print(f"Accuracy for k={k}, distance_metric={metric}, weight={weight}: {average_accuracy:.4f}")

                # Append results for each combination
                results.append((k, metric, weight, average_accuracy))

                # Strict '>' keeps the first configuration seen when accuracies tie
                if average_accuracy > best_accuracy:
                    best_accuracy = average_accuracy
                    best_k = k
                    best_metric = metric
                    best_weight = weight
                    print(f"New best accuracy: {best_accuracy:.4f} with k={k}, distance_metric={metric}, weight={weight}")

    return best_k, best_metric, best_weight, best_accuracy, results




In [26]:
# Load the raw training and test tables from the working directory
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Build the scaled feature matrices; the call also returns the test rows'
# CustomerId values and the list of feature names used as columns
X, y, X_test, test_customer_ids, features = preprocess_data(train_df, test_df)

# Grid-search k, distance metric and weighting scheme.
# Every combination is cross-validated, so this is the slow step of the notebook.
best_k, best_metric, best_weight, best_accuracy, results = hyperparameter_tuning(X, y)

print("Best k:", best_k)
print("Best Distance Metric:", best_metric)
print("Best Weighting Scheme:", best_weight)
print("Best Accuracy:", best_accuracy)

# Train the final model on the full training set with the best hyperparameters
knn_optimal = KNN(k=best_k, distance_metric=best_metric, weights=best_weight)
knn_optimal.fit(X, y)

# Make final predictions on the test data
test_predictions = knn_optimal.predict(X_test)

# Create a submission DataFrame keyed by CustomerId and write it to disk.
# NOTE(review): a later cell overwrites 'submissions.csv' keyed by the 'id'
# column instead — confirm which identifier the submission format expects.
submission_df = pd.DataFrame({'CustomerId': test_customer_ids, 'Exited': test_predictions})
submission_df.to_csv('submissions.csv', index=False)
print("Test predictions saved to 'submissions.csv'.")

Testing k=15, distance_metric=euclidean, weight=uniform...
Accuracy for k=15, distance_metric=euclidean, weight=uniform: 0.8765
New best accuracy: 0.8765 with k=15, distance_metric=euclidean, weight=uniform
Testing k=15, distance_metric=euclidean, weight=distance...
Accuracy for k=15, distance_metric=euclidean, weight=distance: 0.8780
New best accuracy: 0.8780 with k=15, distance_metric=euclidean, weight=distance
Testing k=15, distance_metric=manhattan, weight=uniform...
Accuracy for k=15, distance_metric=manhattan, weight=uniform: 0.8779
Testing k=15, distance_metric=manhattan, weight=distance...
Accuracy for k=15, distance_metric=manhattan, weight=distance: 0.8788
New best accuracy: 0.8788 with k=15, distance_metric=manhattan, weight=distance
Testing k=18, distance_metric=euclidean, weight=uniform...
Accuracy for k=18, distance_metric=euclidean, weight=uniform: 0.8761
Testing k=18, distance_metric=euclidean, weight=distance...
Accuracy for k=18, distance_metric=euclidean, weight=dist

In [27]:
# Refit on the full training set with ALL tuned hyperparameters. Leaving out
# `weights` would silently fall back to uniform voting even when the search
# selected distance weighting.
knn = KNN(k=best_k, distance_metric=best_metric, weights=best_weight)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Reuse the test table that is already in memory instead of re-reading the CSV
test_ids = test_df['id']

# Create a DataFrame with the 'id' and 'Exited' predictions
submission_df = pd.DataFrame({'id': test_ids, 'Exited': test_predictions})

# Save the predictions to a CSV file for submission
submission_df.to_csv('submissions.csv', index=False)

# Print confirmation
print("Test predictions saved to 'submissions.csv'.")

Test predictions saved to 'submissions.csv'.


In [28]:
# Refuse to write a submission that lists the same id more than once.
# The message names the column that is actually checked ('id').
if submission_df['id'].duplicated().any():
    raise ValueError("Duplicate id values found in submission!")

# Save the submission file
submission_df.to_csv('submissions.csv', index=False)
print("Test predictions saved to 'submissions.csv'.")

Test predictions saved to 'submissions.csv'.
