In [111]:
import numpy as np
import pandas as pd

In [112]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = np.array(X, dtype=np.float64)
        self.y_train = np.array(y, dtype=np.float64)

    def predict(self, X):
        predictions = []
        for x in X:
            np.array(x, dtype=np.float64)
            distances = self.compute_distance(x, self.X_train)
            nn_indices = np.argsort(distances)[:self.k]
            nn_labels = self.y_train[nn_indices]
            # Probability is the mean of the neighbor labels
            prob = np.mean(nn_labels)
            predictions.append(prob)
        return np.array(predictions)

    def compute_distance(self, X1, X2):
        # TODO: Implement distance computation based on self.distance_metric
        # # Hint: Use numpy operations for efficient computation
        # print("Type of X1:", type(X1))
        # print("Type of X2:", type(X2))
        # print("Shape of X1:", np.shape(X1))
        # print("Shape of X2:", np.shape(X2))
        # print("X1 dtype:", X1.dtype)
        # print("X2 dtype:", X2.dtype)
        # print("First element of X1:", X1[0], "Type:", type(X1[0]))
        # print("First row of X2:", X2[0], "Types:", [type(x) for x in X2[0]])
        if self.distance_metric == 'euclidean':
            distances = np.sqrt(np.sum((X2 - X1) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            distances = np.sum(np.abs(X2 - X1), axis=1)
        elif self.distance_metric == 'minkowski':
            distances = np.sum(np.abs(X1 - X2) ** 1.5, axis=1) ** (1/1.5)
        else:
            raise ValueError("Unsupported distance metric")
        return distances

In [113]:
def preprocess_data(train_path, test_path):
    import numpy as np
    import pandas as pd

    # Load the datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Combine train and test data for consistent preprocessing
    train_data['is_train'] = 1
    test_data['is_train'] = 0
    data = pd.concat([train_data, test_data], sort=False).reset_index(drop=True)

    # Drop unnecessary columns
    data.drop(['CustomerId', 'Surname'], axis=1, inplace=True)

    # Handle missing values in categorical columns
    for col in ['Gender', 'Geography']:
        data[col].fillna(data[col].mode()[0], inplace=True)

    # Encode categorical variables
    data['Gender'] = data['Gender'].map({'Female': 0, 'Male': 1})

    # One-hot encode 'Geography'
    data = pd.get_dummies(data, columns=['Geography'], drop_first=True)

    # Convert any boolean columns to integers
    bool_cols = data.select_dtypes(include=['bool']).columns.tolist()
    if bool_cols:
        data[bool_cols] = data[bool_cols].astype(int)

    # Convert all columns to float64
    data = data.astype('float64')

    # Handle any remaining missing values after conversion
    data.fillna(0, inplace=True)

    # Feature scaling (standardization)
    numerical_cols = data.columns.drop(['Exited', 'is_train'])
    for feature in numerical_cols:
        mean = data[feature].mean()
        std = data[feature].std()
        if std != 0:
            data[feature] = (data[feature] - mean) / std
        else:
            data[feature] = 0.0

    # Split the data back into train and test sets
    train_data_processed = data[data['is_train'] == 1].drop('is_train', axis=1)
    test_data_processed = data[data['is_train'] == 0].drop(['is_train', 'Exited'], axis=1)

    # Extract features and target variable
    X_train = train_data_processed.drop('Exited', axis=1).values
    y_train = train_data_processed['Exited'].values
    X_test = test_data_processed.values

    return X_train, y_train, X_test


In [114]:
def stratified_k_fold_split(X, y, n_splits):
    """
    Generate indices to split data into training and test set while preserving the percentage of samples for each class.
    """
    indices = np.arange(len(y))
    unique_classes, y_indices = np.unique(y, return_inverse=True)
    class_counts = np.bincount(y_indices)
    fold_counts = (class_counts * n_splits) // len(y)
    folds = [[] for _ in range(n_splits)]

    for cls in unique_classes:
        cls_indices = indices[y == cls]
        np.random.shuffle(cls_indices)
        cls_folds = np.array_split(cls_indices, n_splits)
        for i in range(n_splits):
            folds[i].extend(cls_folds[i])
    return folds


In [115]:
def compute_roc_auc(y_true, y_scores):
    """
    Compute ROC AUC score from true labels and predicted scores.
    """
    # Sort scores and corresponding true labels
    y_true = np.array(y_true)
    y_scores = np.array(y_scores)

    # Sort scores and corresponding true labels
    desc_score_indices = np.argsort(y_scores)[::-1]
    y_true = y_true[desc_score_indices]
    y_scores = y_scores[desc_score_indices]

    # Compute True Positive Rate (TPR) and False Positive Rate (FPR)
    thresholds = np.unique(y_scores)
    tpr = []
    fpr = []
    P = np.sum(y_true)
    N = len(y_true) - P

    for thresh in thresholds:
        TP = np.sum((y_scores >= thresh) & (y_true == 1))
        FP = np.sum((y_scores >= thresh) & (y_true == 0))
        TPR = TP / P if P != 0 else 0
        FPR = FP / N if N != 0 else 0
        tpr.append(TPR)
        fpr.append(FPR)

    # Sort FPR and TPR
    fpr = np.array(fpr)
    tpr = np.array(tpr)
    sorted_indices = np.argsort(fpr)
    fpr = fpr[sorted_indices]
    tpr = tpr[sorted_indices]

    # Compute AUC using the trapezoidal rule
    auc = np.trapz(tpr, fpr)
    return auc


In [116]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores
    n_samples = len(y)
    indices = np.arange(n_samples)

    # Shuffle indices
    np.random.shuffle(indices)

    # Stratify the data
    unique_classes = np.unique(y)
    folds = [[] for _ in range(n_splits)]
    class_indices = {cls: np.where(y == cls)[0] for cls in unique_classes}

    # Distribute samples to folds
    for cls in unique_classes:
        cls_indices = class_indices[cls]
        np.random.shuffle(cls_indices)
        cls_fold_sizes = np.full(n_splits, len(cls_indices) // n_splits)
        cls_fold_sizes[:len(cls_indices) % n_splits] += 1
        current = 0
        for fold_idx, fold_size in enumerate(cls_fold_sizes):
            folds[fold_idx].extend(cls_indices[current:current + fold_size])
            current += fold_size

    auc_scores = []

    for i in range(n_splits):
        test_indices = np.array(folds[i])
        train_indices = np.array([idx for fold in folds[:i] + folds[i+1:] for idx in fold])

        X_train_cv, X_val_cv = X[train_indices], X[test_indices]
        y_train_cv, y_val_cv = y[train_indices], y[test_indices]

        # Create a new instance of knn to avoid data leakage
        knn_cv = KNN(k=knn.k, distance_metric=knn.distance_metric)
        knn_cv.fit(X_train_cv, y_train_cv)

        # Predict probabilities
        y_pred_prob = knn_cv.predict(X_val_cv)

        # Compute ROC AUC score
        auc = compute_roc_auc(y_val_cv, y_pred_prob)
        auc_scores.append(auc)

    return auc_scores

In [117]:
# Load and preprocess data
X, y, X_test = preprocess_data('sample_data/train.csv', 'sample_data/test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')
print("Type of np:", type(np))

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparamters tuning
best_auc = 0
best_k = 0
best_metric = ''
k_values = [3, 5, 7, 9, 11]
distance_metrics = ['euclidean', 'manhattan','minkowski']

for metric in distance_metrics:
    for k in k_values:
        # Create a knn object with current hyperparameters
        knn = KNN(k=k, distance_metric=metric)
        auc_scores = cross_validate(X, y, knn, n_splits=5)
        mean_auc = np.mean(auc_scores)
        print(f"Metric: {metric}, k: {k}, Mean ROC AUC: {mean_auc}")
        if mean_auc > best_auc:
            best_auc = mean_auc
            best_k = k
            best_metric = metric

print(f"\nBest Metric: {best_metric}, Best k: {best_k}, Best ROC AUC: {best_auc}")

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k= best_k, distance_metric=best_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('sample_data/test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)


Type of np: <class 'module'>
Cross-validation scores: [0.8546740960033251, 0.8720108205714727, 0.8711460045120619, 0.8670431084061185, 0.8780925940483727]
Metric: euclidean, k: 3, Mean ROC AUC: 0.842372603382076
Metric: euclidean, k: 5, Mean ROC AUC: 0.8715665256018461
Metric: euclidean, k: 7, Mean ROC AUC: 0.8801769920105503
Metric: euclidean, k: 9, Mean ROC AUC: 0.8871519896910639
Metric: euclidean, k: 11, Mean ROC AUC: 0.891472734268105
Metric: manhattan, k: 3, Mean ROC AUC: 0.840912733640679
Metric: manhattan, k: 5, Mean ROC AUC: 0.8687442170202301
Metric: manhattan, k: 7, Mean ROC AUC: 0.8799701314356312
Metric: manhattan, k: 9, Mean ROC AUC: 0.8880040269526587
Metric: manhattan, k: 11, Mean ROC AUC: 0.8950077220278301
Metric: minkowski, k: 3, Mean ROC AUC: 0.8367313747197331
Metric: minkowski, k: 5, Mean ROC AUC: 0.8697336130310875
Metric: minkowski, k: 7, Mean ROC AUC: 0.8826949067491972
Metric: minkowski, k: 9, Mean ROC AUC: 0.8889128265387571
Metric: minkowski, k: 11, Mean ROC