In [31]:
import numpy as np
import pandas as pd

class KNN:
    """
    Distance-weighted k-nearest-neighbours scorer.

    For every query row, ``predict`` returns the inverse-distance-weighted
    average of the labels of the ``k`` closest training rows. With 0/1 labels
    that value lies in [0, 1] and can be used as a score for class 1.
    """

    # Added to every distance before inversion so an exact match (distance 0)
    # does not cause a division by zero.
    _EPSILON = 1e-5

    def __init__(self, k=3, distance_metric='euclidean'):
        """
        Initializes the KNN class with the number of neighbors (k)
        and the distance metric to use ('euclidean' or 'manhattan').

        The metric name is checked when a distance is computed, not here.
        """
        self.neighbors = k
        self.distance_metric = distance_metric

    def fit(self, X_train, y_train):
        """
        Stores the training data and labels.

        Parameters:
            X_train: 2-D array-like (DataFrame, ndarray, nested list) of
                numeric features, one row per sample.
            y_train: array-like of numeric labels, one per row of X_train.
                A single-column 2-D input (e.g. a one-column DataFrame) is
                flattened to 1-D.

        Returns:
            self, so calls can be chained.
        """
        # Convert to float64 once here so predict() does not re-copy the whole
        # training matrix for every query row.
        self.X_train = np.asarray(X_train, dtype=np.float64)

        labels = np.asarray(y_train)
        if labels.ndim == 2 and labels.shape[1] == 1:
            labels = labels.ravel()
        self.y_train = labels
        return self

    def compute_distance(self, sample1, sample2):
        """
        Computes the distance between samples based on the selected distance metric.

        The reduction runs over the last axis, so both of these work:
          * two 1-D samples           -> a scalar distance
          * a 1-D sample vs. a matrix -> one distance per matrix row

        Raises:
            ValueError: if ``self.distance_metric`` is not 'euclidean' or
                'manhattan'.
        """
        sample1 = np.asarray(sample1, dtype=np.float64)
        sample2 = np.asarray(sample2, dtype=np.float64)
        difference = sample1 - sample2

        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum(difference ** 2, axis=-1))
        if self.distance_metric == 'manhattan':
            return np.sum(np.abs(difference), axis=-1)

        # Previously an unknown metric silently returned None.
        raise ValueError(
            f"Unsupported distance_metric {self.distance_metric!r}; "
            "expected 'euclidean' or 'manhattan'."
        )

    def predict(self, X_test):
        """
        Predicts, for each test sample, the inverse-distance-weighted mean of
        the labels of its k nearest training samples.

        Parameters:
            X_test: 2-D array-like of query rows, or a single 1-D sample.

        Returns:
            1-D numpy array with one score per query row.
        """
        X_test = np.asarray(X_test, dtype=np.float64)

        # A single sample is treated as a one-row matrix.
        if X_test.ndim == 1:
            X_test = X_test.reshape(1, -1)

        predictions = []

        for test_sample in X_test:
            distances = self.compute_distance(test_sample, self.X_train)
            k_nearest_indices = np.argsort(distances)[:self.neighbors]
            k_nearest_labels = self.y_train[k_nearest_indices]
            k_nearest_distances = distances[k_nearest_indices]
            # Closer neighbours get larger weights.
            weights = 1 / (k_nearest_distances + self._EPSILON)
            probability = np.sum(weights * k_nearest_labels) / np.sum(weights)
            predictions.append(probability)

        return np.array(predictions)

In [32]:
class SimpleStandardScaler:
    """
    Minimal column-wise standardizer: subtracts the per-column mean and divides
    by the per-column sample standard deviation (ddof=1).
    """

    def __init__(self):
        """
        Initializes the SimpleStandardScaler class, which will store the mean and scale (standard deviation)
        for standardizing the data. Both are None until fit() is called.
        """
        self.mean_ = None
        self.scale_ = None

    def fit(self, X):
        """
        Computes and stores the mean and standard deviation of the data.

        Parameters:
            X: DataFrame, Series or array-like; statistics are taken along axis 0.

        Returns:
            self, so that fit(...).transform(...) can be chained.
        """
        X = self._check_input(X)
        self.mean_ = np.mean(X, axis=0)
        scale = np.std(X, axis=0, ddof=1)
        # A constant column has zero standard deviation; dividing by it would
        # turn the whole column into NaN/inf. Use 1 instead so the centred
        # column simply becomes all zeros.
        self.scale_ = np.where(scale == 0, 1.0, scale)
        return self

    def transform(self, X):
        """
        Transforms the data by subtracting the stored mean and dividing by the
        stored scale.

        Returns:
            numpy array with the same shape as the (converted) input.
        """
        X = self._check_input(X)
        return (X - self.mean_) / self.scale_

    def fit_transform(self, X):
        """
        Fits the scaler to the data and then transforms it in one step.
        """
        return self.fit(X).transform(X)

    def _check_input(self, X):
        """
        Helper function to ensure the input is a numpy array. Converts pandas DataFrame or Series to numpy array.
        """
        if isinstance(X, (pd.DataFrame, pd.Series)):
            return X.values
        return np.asarray(X)

In [33]:
def _zscore_outlier_mask(df, features, threshold=3):
    """
    Returns a boolean numpy array (one entry per row of df) that is True where
    the absolute z-score of any column in `features` exceeds `threshold`.
    """
    flagged = np.zeros(len(df), dtype=bool)
    for feature in features:
        column = df[feature]
        z_scores = np.abs((column - np.mean(column)) / np.std(column))
        flagged |= (z_scores > threshold).to_numpy()
    return flagged


def preprocess_data(train_path, test_path):
    """
    Preprocesses the train and test data by cleaning, encoding, detecting outliers, and scaling.

    Parameters:
        train_path: path of the training CSV; must contain the 'Exited' target.
        test_path: path of the test CSV.

    Returns:
        (X, y, X_test) where X and X_test are float DataFrames with identical
        columns in identical order, and y is the 'Exited' column of the
        training rows that survived outlier removal.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Drop identifier columns
    id_columns = ['CustomerId', 'Surname', 'id']
    train_data = train_data.drop(columns=id_columns)
    test_data = test_data.drop(columns=id_columns)

    # Store the flag columns as 'object' so the numeric-dtype selection below
    # skips them (they are neither outlier-checked nor scaled).
    flag_columns = ['HasCrCard', 'IsActiveMember']
    for frame in (train_data, test_data):
        for column in flag_columns:
            frame[column] = frame[column].astype('object')

    # Pick the numeric features BEFORE one-hot encoding. The dtype that
    # get_dummies gives the indicator columns differs between pandas versions
    # (integer vs. bool), so selecting afterwards could pull the indicators
    # into outlier detection and scaling depending on the installed version.
    numerical_features = train_data.select_dtypes(include=[np.number]).columns.tolist()
    if 'Exited' in numerical_features:
        numerical_features.remove('Exited')

    # One-hot encode categorical variables. Both frames share the category
    # list learned from the training data, so they always produce the same
    # dummy columns and drop the same reference level.
    categorical_columns = ['Geography', 'Gender']
    for column in categorical_columns:
        categories = pd.Categorical(train_data[column]).categories
        train_data[column] = pd.Categorical(train_data[column], categories=categories)
        test_data[column] = pd.Categorical(test_data[column], categories=categories)
    train_data = pd.get_dummies(train_data, columns=categorical_columns, drop_first=True)
    test_data = pd.get_dummies(test_data, columns=categorical_columns, drop_first=True)

    # Remove training rows with an extreme value in any numeric feature.
    # .copy() makes the filtered frame independent, so the column assignment
    # below does not write into a slice of the original frame.
    outlier_mask = _zscore_outlier_mask(train_data, numerical_features)
    train_data = train_data[~outlier_mask].copy()

    # Scale numerical features with statistics learned from the training rows only
    scaler = SimpleStandardScaler()
    train_data[numerical_features] = scaler.fit_transform(train_data[numerical_features])
    test_data[numerical_features] = scaler.transform(test_data[numerical_features])

    # Separate features and target
    X = train_data.drop(columns='Exited').astype('float')
    y = train_data['Exited']
    # Same columns, same order and same dtype as X, so positional distance
    # computations line up feature by feature.
    X_test = test_data[X.columns].astype('float')

    return X, y, X_test


In [34]:
class StratifiedKFoldCV:
    """
    Stratified k-fold splitter: every class is divided as evenly as possible
    across the folds, so each fold keeps roughly the overall class proportions.
    """

    def __init__(self, num_folds=5, shuffle_data=True, seed=None):
        """
        Initializes StratifiedKFoldCV with the number of folds, shuffle option, and random seed.

        Raises:
            ValueError: if num_folds is smaller than 2 (a single fold would
                leave an empty training set).
        """
        if num_folds < 2:
            raise ValueError(f"num_folds must be at least 2, got {num_folds}")
        self.num_folds = num_folds
        self.shuffle_data = shuffle_data
        self.seed = seed

    def split(self, features, labels):
        """
        Splits the dataset into training and validation indices, ensuring class proportions are maintained.

        Parameters:
            features: not used by the split itself.
            labels: 1-D array-like of class labels, one per sample.

        Yields:
            (train_indices, val_indices) integer index arrays, one pair per fold.
        """
        # A private generator gives the same shuffles as seeding the global
        # legacy RNG with this seed, but leaves np.random's global state alone.
        rng = np.random.RandomState(self.seed)
        labels = np.asarray(labels)
        unique_classes, label_indices = np.unique(labels, return_inverse=True)

        per_fold_chunks = [[] for _ in range(self.num_folds)]

        for cls in range(len(unique_classes)):
            cls_indices = np.flatnonzero(label_indices == cls)
            if self.shuffle_data:
                rng.shuffle(cls_indices)
            # array_split hands the remainder to the first folds: the first
            # (n % num_folds) chunks get one extra element.
            for fold, chunk in enumerate(np.array_split(cls_indices, self.num_folds)):
                per_fold_chunks[fold].append(chunk)

        # Keep an integer dtype even for empty folds so the result is always
        # usable as an index array.
        folds = [
            np.concatenate(chunks) if chunks else np.empty(0, dtype=np.intp)
            for chunks in per_fold_chunks
        ]

        for fold in range(self.num_folds):
            val_indices = folds[fold]
            train_indices = np.concatenate(
                [folds[other] for other in range(self.num_folds) if other != fold]
            )
            yield train_indices, val_indices

# Compute ROC AUC scores
def calculate_roc_auc(true_labels, predicted_scores):
    """
    Computes the ROC AUC score given true labels and predicted probabilities.

    Parameters:
        true_labels: array-like of labels, one per sample. The computation
            sums the labels to count positives, so they are expected to be
            0/1 (or boolean).
        predicted_scores: array-like of numeric scores, higher meaning
            "more likely positive"; same length as true_labels.

    Returns:
        The area under the ROC curve, or NaN when the labels do not contain
        both a positive and a negative sample (the curve is undefined then).

    Raises:
        ValueError: if the two inputs do not have the same number of elements.
    """
    # Work on plain 1-D arrays so lists and pandas objects (whatever their
    # index) are handled positionally.
    true_labels = np.asarray(true_labels).ravel()
    predicted_scores = np.asarray(predicted_scores, dtype=np.float64).ravel()
    if true_labels.shape != predicted_scores.shape:
        raise ValueError(
            "true_labels and predicted_scores must have the same length, got "
            f"{true_labels.size} and {predicted_scores.size}"
        )

    positives = np.sum(true_labels)
    negatives = true_labels.size - positives
    if positives == 0 or negatives == 0:
        # Avoids the 0/0 division below, which produced NaN plus a warning.
        return np.nan

    # Sort samples from highest to lowest score.
    desc_score_indices = np.argsort(-predicted_scores)
    true_labels = true_labels[desc_score_indices]
    predicted_scores = predicted_scores[desc_score_indices]

    # One threshold per distinct score value (last index of each run of ties),
    # plus the final sample.
    distinct_value_indices = np.where(np.diff(predicted_scores))[0]
    threshold_idxs = np.r_[distinct_value_indices, true_labels.size - 1]

    # Cumulative true/false positive counts at each threshold.
    tps = np.cumsum(true_labels)[threshold_idxs]
    fps = 1 + threshold_idxs - tps

    # Prepend the (0, 0) starting point of the curve.
    tps = np.r_[0, tps]
    fps = np.r_[0, fps]

    fpr = fps / fps[-1]
    tpr = tps / tps[-1]

    # np.trapz was renamed to np.trapezoid in NumPy 2.0; use whichever exists.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz
    auc_value = integrate(tpr, fpr)
    return auc_value

# Define cross-validation function
def _select_rows(data, indices):
    """Positionally selects rows from a pandas object (index reset) or an array-like."""
    if hasattr(data, 'iloc'):
        return data.iloc[indices].reset_index(drop=True)
    return np.asarray(data)[indices]


def cross_validation(features, labels, model, num_folds=5, seed=42):
    """
    Performs cross-validation, computes ROC AUC for each fold, and returns the scores.

    Parameters:
        features: DataFrame or array-like of samples (rows).
        labels: Series or array-like of labels, one per row of features.
        model: object with fit(X, y) and predict(X); it is re-fitted on every
            fold, so it is left fitted on the last fold's training part.
        num_folds: number of stratified folds.
        seed: seed for the fold shuffling. The default of 42 keeps the folds
            identical between calls, so different models are compared on the
            same splits.

    Returns:
        List with one ROC AUC value per fold. The per-fold values and their
        mean are also printed.
    """
    stratified_kfold = StratifiedKFoldCV(num_folds=num_folds, shuffle_data=True, seed=seed)
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(stratified_kfold.split(features, labels), 1):
        X_train, X_val = _select_rows(features, train_idx), _select_rows(features, val_idx)
        y_train, y_val = _select_rows(labels, train_idx), _select_rows(labels, val_idx)

        model.fit(X_train, y_train)
        y_pred_proba = model.predict(X_val)

        fold_score = calculate_roc_auc(np.asarray(y_val), np.asarray(y_pred_proba))
        fold_scores.append(fold_score)
        print(f"Fold {fold} ROC AUC: {fold_score}")

    avg_score = np.mean(fold_scores)
    print(f"Mean ROC AUC: {avg_score}")
    return fold_scores

In [35]:
# Build the feature matrices and the target from the raw CSV files
X, y, X_test = preprocess_data(train_path='train.csv', test_path='test.csv')

# Baseline model: 20 neighbours, Euclidean distance
knn = KNN(distance_metric='euclidean', k=20)

# Score the baseline with the default stratified 5-fold cross-validation
cv_scores = cross_validation(features=X, labels=y, model=knn)

print("Cross-validation scores:", cv_scores)

Fold 1 ROC AUC: 0.9205457498336564
Fold 2 ROC AUC: 0.9021309856837256
Fold 3 ROC AUC: 0.9185245361887313
Fold 4 ROC AUC: 0.9182808126462625
Fold 5 ROC AUC: 0.9174605951452302
Mean ROC AUC: 0.9153885358995211
Cross-validation scores: [0.9205457498336564, 0.9021309856837256, 0.9185245361887313, 0.9182808126462625, 0.9174605951452302]


In [36]:
# Hyperparameter tuning: grid-search k and the distance metric, ranking each
# combination by its mean cross-validated ROC AUC.
k_values = [5, 10, 20, 30, 40, 50, 100, 200]
distance_metrics = ['euclidean', 'manhattan']

best_k = None
best_metric = None
# Start below any possible score so the first valid result is always accepted.
best_score = -np.inf

# Iterate through each k value and distance metric to find the best combination
for k in k_values:
    for metric in distance_metrics:
        knn = KNN(k, distance_metric=metric)
        scores = cross_validation(X, y, knn)
        mean_score = np.mean(scores)

        # Update the best parameters if the current score is higher.
        # A NaN mean never compares greater, so such runs are skipped.
        if mean_score > best_score:
            best_score = mean_score
            best_k = k
            best_metric = metric

# Stop here instead of training a model with k=None / metric=None when no
# combination produced a usable score.
if best_metric is None:
    raise RuntimeError("Hyperparameter search produced no valid ROC AUC score.")

# Print the best hyperparameters and corresponding mean ROC AUC score
print(f"Optimal hyperparameters found: k = {best_k}, Distance Metric = {best_metric}")
print(f"Highest Score: {best_score:.4f}")

# Train the model on the full dataset using the best hyperparameters
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)

# Make predictions on the test set
test_predictions = knn.predict(X_test)

# Save the predictions in the submission file. Only the id column is needed
# from the raw test file, so only that column is read.
test_ids = pd.read_csv('test.csv', usecols=['id'])['id']
submission = pd.DataFrame({'id': test_ids, 'Exited': test_predictions})
submission.to_csv('submissions.csv', index=False)

Fold 1 ROC AUC: 0.8833012570561847
Fold 2 ROC AUC: 0.8739552553820178
Fold 3 ROC AUC: 0.8932686526568941
Fold 4 ROC AUC: 0.8878768411875373
Fold 5 ROC AUC: 0.8718876187307852
Mean ROC AUC: 0.8820579250026839
Fold 1 ROC AUC: 0.8826666475878401
Fold 2 ROC AUC: 0.8708773636877465
Fold 3 ROC AUC: 0.8952162305313789
Fold 4 ROC AUC: 0.8807630317074289
Fold 5 ROC AUC: 0.871514792593952
Mean ROC AUC: 0.8802076132216692
Fold 1 ROC AUC: 0.9112208541113678
Fold 2 ROC AUC: 0.8964359559565289
Fold 3 ROC AUC: 0.91154911532295
Fold 4 ROC AUC: 0.9041306268067727
Fold 5 ROC AUC: 0.9011473007387694
Mean ROC AUC: 0.9048967705872777
Fold 1 ROC AUC: 0.9033215044608682
Fold 2 ROC AUC: 0.8912471113464167
Fold 3 ROC AUC: 0.9115512625973431
Fold 4 ROC AUC: 0.8999076538337998
Fold 5 ROC AUC: 0.9019051415592163
Mean ROC AUC: 0.9015865347595288
Fold 1 ROC AUC: 0.9205457498336564
Fold 2 ROC AUC: 0.9021309856837256
Fold 3 ROC AUC: 0.9185245361887313
Fold 4 ROC AUC: 0.9182808126462625
Fold 5 ROC AUC: 0.9174605951452