In [302]:
import numpy as np
import pandas as pd

In [303]:
threshhold = 0.365

In [304]:
class KNN:
    def __init__(self, k=3,distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # Store the training data
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = []
        for test_sample in X:
            distances = self.compute_distance(test_sample, self.X_train)
            k_nearest_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_nearest_indices].astype(int)
            k_nearest_distances = distances[k_nearest_indices]
            
            weights = 1 / (k_nearest_distances + 1e-5)
            
            # Weighted sum for each class
            class_0_weight = np.sum(weights[k_nearest_labels == 0])
            class_1_weight = np.sum(weights[k_nearest_labels == 1])

            # Predict 1 if class_1_weight exceeds a threshold
            if class_1_weight / (class_1_weight + class_0_weight) > threshhold:
                predictions.append(1)
            else:
                predictions.append(0)
        
        return np.array(predictions)

    def compute_distance(self, X1, X2):
        X1 = np.array(X1).reshape(1, -1)
        X2 = np.array(X2)

        # print("X1:", X1)
        # print("X2:", X2)

        if self.distance_metric == 'euclidean':
            # Debug print
            # print("Calculating Euclidean distance.")
            return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            # Debug print
            # print("Calculating Manhattan distance.")
            return np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError(f"Unknown distance metric: {self.distance_metric}")



In [305]:
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Drop irrelevant columns
    train_data = train_data.drop(columns=['id', 'CustomerId', 'Surname'])
    test_data = test_data.drop(columns=['id', 'CustomerId', 'Surname'])

    # Separate the target variable from the training data
    y = train_data['Exited']
    train_data = train_data.drop(columns='Exited')

    # Concatenate train and test data for consistent one-hot encoding
    combined_data = pd.concat([train_data, test_data], axis=0)

    # Handle categorical variables: One-Hot Encoding for Geography and Gender
    combined_data = pd.get_dummies(combined_data, columns=['Geography', 'Gender'], drop_first=True)

    # Split the combined data back into train and test sets
    train_data = combined_data.iloc[:len(train_data)]
    test_data = combined_data.iloc[len(train_data):]

    # Ensure that the test set has the same columns as the train set
    test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

    # Handle missing values (fill numeric columns with median)
    numeric_columns = train_data.select_dtypes(include=[np.number]).columns
    train_data.loc[:, numeric_columns] = train_data[numeric_columns].fillna(train_data[numeric_columns].median())
    test_data.loc[:, numeric_columns] = test_data[numeric_columns].fillna(test_data[numeric_columns].median())

    # Convert all features to numeric types
    train_data.loc[:, numeric_columns] = train_data[numeric_columns].apply(pd.to_numeric, errors='coerce')
    test_data.loc[:, numeric_columns] = test_data[numeric_columns].apply(pd.to_numeric, errors='coerce')

    # Normalize numeric features
    for column in numeric_columns:
        min_val = train_data[column].min()
        max_val = train_data[column].max()
        train_data.loc[:, column] = (train_data[column] - min_val) / (max_val - min_val)
        test_data.loc[:, column] = (test_data[column] - min_val) / (max_val - min_val)

    # Convert boolean columns to integers
    boolean_columns = train_data.select_dtypes(include=['bool']).columns
    train_data.loc[:, boolean_columns] = train_data[boolean_columns].astype(bool).astype(int)
    test_data.loc[:, boolean_columns] = test_data[boolean_columns].astype(bool).astype(int)


    # Separate features (X) and target (y) for train data
    X = train_data.values 

    return X, y.values, test_data.values


In [306]:
def cross_validate(X, y, knn, n_splits=5):
    # Shuffle the data indices
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    
    # Split the data into n_splits parts
    fold_size = len(X) // n_splits
    accuracy_scores = []

    for i in range(n_splits):
        # Create the training and validation sets for the current fold
        validation_indices = indices[i * fold_size: (i + 1) * fold_size]
        train_indices = np.concatenate([indices[:i * fold_size], indices[(i + 1) * fold_size:]])
        
        X_train, y_train = X[train_indices], y[train_indices]
        X_val, y_val = X[validation_indices], y[validation_indices]

        # Fit the KNN classifier on the training data
        knn.fit(X_train, y_train)
        
        # Make predictions on the validation data
        y_pred = knn.predict(X_val)

        # Count TP, TN, FP, FN
        tp = np.sum((y_pred == 1) & (y_val == 1))  # True Positives
        tn = np.sum((y_pred == 0) & (y_val == 0))  # True Negatives
        fp = np.sum((y_pred == 1) & (y_val == 0))  # False Positives
        fn = np.sum((y_pred == 0) & (y_val == 1))  # False Negatives
        
        # print("tp = ", tp)
        # print("tn = ", tn)
        # print("fp = ", fp)
        # print("fn = ", fn)

        # Compute accuracy
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        accuracy_scores.append(accuracy)

    # Return the mean accuracy across all folds
    return np.mean(accuracy_scores)


In [307]:
def hyperparameter_tuning(X, y, k_values, distance_metrics):
    best_acc = 0
    best_params = {}
    
    for k in k_values:
        for distance_metric in distance_metrics:
            knn = KNN(k=k, distance_metric=distance_metric)
            acc_score = cross_validate(X, y, knn)
            # print(f"k={k}, distance_metric={distance_metric}, ROC acc={acc_score}")
            
            # Update best parameters if current acc is better
            if acc_score > best_acc:
                best_acc = acc_score
                best_params = {'k': k, 'distance_metric': distance_metric}
    
    print(f"Best Hyperparameters: k={best_params['k']}, distance_metric={best_params['distance_metric']}, score={best_acc}")
    return best_params


In [308]:
# Load and preprocess data
X, y, X_test = preprocess_data('./train.csv', './test.csv')
# print("X data type:", X.dtype)  # Should be numeric type (e.g., float64)
# print("y data type:", y.dtype)  # Should be int or float
# print("X_test data type:", X_test.dtype)  # Should be numeric type (e.g., float64)
# print("after preprocessing")
# print("X shape:", X.shape)
# print("X_test shape:", X_test.shape)

# # Display the first few values of X and X_test
# print("Sample of X:")
# print(X[:5])  # Shows the first 5 rows

# print("Sample of X_test:")
# print(X_test[:5])  # Shows the first 5 rows

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparamters tuning
# k_values = [3, 5, 7]
# distance_metrics = ['euclidean', 'manhattan']
k_values = [5]
distance_metrics = ['manhattan']

best_params = hyperparameter_tuning(X, y, k_values, distance_metrics)
threshhold = 0.395
# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_params['k'], distance_metric=best_params['distance_metric'])
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('./test.csv')['id'], 'Exited': test_predictions}).to_csv(f'{threshhold}submissions{best_params['k']}{best_params['distance_metric'][:1]}.csv', index=False)

  train_data.loc[:, boolean_columns] = train_data[boolean_columns].astype(bool).astype(int)
  train_data.loc[:, boolean_columns] = train_data[boolean_columns].astype(bool).astype(int)
  train_data.loc[:, boolean_columns] = train_data[boolean_columns].astype(bool).astype(int)
  test_data.loc[:, boolean_columns] = test_data[boolean_columns].astype(bool).astype(int)
  test_data.loc[:, boolean_columns] = test_data[boolean_columns].astype(bool).astype(int)
  test_data.loc[:, boolean_columns] = test_data[boolean_columns].astype(bool).astype(int)


Cross-validation scores: 0.859
Best Hyperparameters: k=5, distance_metric=manhattan, score=0.8555333333333334
