In [8]:
import numpy as np
import pandas as pd

In [None]:
# Define the KNN class

class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # Store the training data
        self.X = np.array(X)
        self.y = np.array(y)

    def predict(self, X):
        """Make predictions based on the nearest neighbors' weighted average."""
        X = np.array(X)
        
        # Compute the distance matrix using the specified metric
        distances = self.compute_distance(X, self.X)
        
        predictions = []
        # Loop through each test point to make predictions
        for i in range(X.shape[0]):
            # Get indices of k nearest neighbors
            nearest_neighbors_indices = np.argsort(distances[i])[:self.k]
            nearest_labels = self.y[nearest_neighbors_indices]
            nearest_distances = distances[i][nearest_neighbors_indices]
    
            # Compute weights as the inverse square of the distance
            weights = 1 / (nearest_distances ** 2 + 1e-10)  # Add small value to avoid division by zero
            
            # Calculate the weighted average of the neighbors' labels
            weighted_average = np.sum(nearest_labels * weights) / np.sum(weights)
            
            # Append the weighted average as the prediction
            predictions.append(weighted_average)
        
        return np.array(predictions)


    def compute_distance(self, X1, X2):
        """Compute distance matrix based on the selected distance metric."""
        if self.distance_metric == 'euclidean':
            # Step 1: Compute squared sum of each row in X_test and X_train
            X_test_squared = np.sum(np.square(X1), axis=1).reshape(-1, 1)
            X_train_squared = np.sum(np.square(X2), axis=1).reshape(1, -1)
        
            # Step 2: Compute the dot product between X_test and X_train
            cross_term = np.dot(X1, X2.T)
        
            # Step 3: Compute the full distance matrix
            distances = np.sqrt(X_test_squared + X_train_squared - 2 * cross_term)
        
            return distances
        
        elif self.distance_metric == 'manhattan':
            # Manhattan distance computation
            return np.sum(np.abs(X1[:, np.newaxis] - X2), axis=2)

        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")


In [30]:
import pandas as pd
import numpy as np

# Imputation and scaling helper functions
def impute_mean(column):
    """Replace missing numerical values with the column mean."""
    return column.fillna(column.mean())

def min_max_scale(df):
    """Apply Min-Max scaling to the DataFrame."""
    return (df - df.min()) / (df.max() - df.min())

def impute_most_frequent(column):
    """Replace missing categorical values with the most frequent value."""
    return column.fillna(column.mode()[0])

def one_hot_encode(df, categorical_features):
    """Perform one-hot encoding for categorical features."""
    for feature in categorical_features:
        one_hot = pd.get_dummies(df[feature], prefix=feature)
        df = pd.concat([df.drop(columns=[feature]), one_hot], axis=1)
    return df

# SVD function from scratch
def svd(X, n_components):
    """Apply Singular Value Decomposition (SVD) for dimensionality reduction."""
    X_centered = X - np.mean(X, axis=0)
    U, S, Vt = np.linalg.svd(X_centered, full_matrices=False)
    X_reduced = np.dot(U[:, :n_components], np.diag(S[:n_components]))
    return X_reduced

# Preprocess data
def preprocess_data(train_path, test_path, n_components=None):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate features and target variable in training data
    X_train = train_data.drop(columns=['Exited'])
    y_train = train_data['Exited']

    # Define numerical, categorical, and binary columns
    numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    categorical_features = ['Geography']
    binary_features = ['HasCrCard', 'IsActiveMember', 'Gender']

    # Convert binary features to binary (0 or 1) encoding for Gender
    X_train['Gender'] = X_train['Gender'].map({'Male': 0, 'Female': 1})
    test_data['Gender'] = test_data['Gender'].map({'Male': 0, 'Female': 1})

    # Preprocess numerical features (Impute missing values)
    X_train[numerical_features] = X_train[numerical_features].apply(impute_mean)
    test_data[numerical_features] = test_data[numerical_features].apply(impute_mean)

    # Scale numerical features
    X_train[numerical_features] = min_max_scale(X_train[numerical_features])
    test_data[numerical_features] = min_max_scale(test_data[numerical_features])

    # One-hot encode categorical features (e.g., Geography)
    X_train = one_hot_encode(X_train, categorical_features)
    test_data = one_hot_encode(test_data, categorical_features)

    # Align train and test columns (to handle any discrepancies after encoding)
    X_train, test_data = X_train.align(test_data, join='left', axis=1, fill_value=0)

    # Ensure all values are numeric before applying SVD
    X_train = X_train.select_dtypes(include=[np.number])
    test_data = test_data.select_dtypes(include=[np.number])

    # Apply SVD if n_components is specified
    if n_components is not None:
        X_train = svd(X_train.values, n_components)
        test_data = svd(test_data.values, n_components)

    return X_train, y_train, test_data

# Example usage
X_train, y_train, X_test = preprocess_data('train.csv', 'test.csv', n_components=10)


In [48]:
def roc_auc_score(y_true, y_pred_proba):
        """
        Compute ROC AUC score for binary classification from scratch.
    
        Parameters:
        - y_true: True binary labels (0 or 1)
        - y_pred_proba: Predicted probabilities for the positive class (1)
    
        Returns:
        - auc: Calculated ROC AUC score
        """
        # Convert y_true and y_pred_proba to numpy arrays to avoid index issues
        y_true = np.array(y_true)
        y_pred_proba = np.array(y_pred_proba)
    
        # Sort by predicted probabilities
        sorted_indices = np.argsort(y_pred_proba)
        y_true_sorted = y_true[sorted_indices]
        y_pred_proba_sorted = y_pred_proba[sorted_indices]
    
        # Initialize variables
        tpr_values = []
        fpr_values = []
        
        n_pos = np.sum(y_true == 1)  # Number of positives (class 1)
        n_neg = len(y_true) - n_pos  # Number of negatives (class 0)
        
        tp = 0
        fp = 0
    
        # Loop through sorted probabilities and calculate TPR and FPR at each threshold
        for i in range(len(y_true_sorted)):
            if y_true_sorted[i] == 1:
                tp += 1
            else:
                fp += 1
            tpr_values.append(tp / n_pos)
            fpr_values.append(fp / n_neg)
    
        # Compute the area under the curve (AUC) using the trapezoidal rule
        auc = 0.0
        for i in range(1, len(tpr_values)):
            auc += (fpr_values[i] - fpr_values[i - 1]) * (tpr_values[i] + tpr_values[i - 1]) / 2
    
        return auc

def cross_validate(X, y, knn, n_splits=5):
    """
    Perform cross-validation on the KNN classifier and calculate the ROC AUC score.

    Parameters:
    - X: Feature matrix (numpy array)
    - y: Labels (numpy array)
    - knn: KNN model
    - n_splits: Number of folds for cross-validation

    Returns:
    - mean_auc_score: The average ROC AUC score over the cross-validation folds.
    """
    n_samples = len(X)
    fold_size = n_samples // n_splits
    auc_scores = []

    for i in range(n_splits):
        # Create validation set
        X_val = X[i * fold_size:(i + 1) * fold_size]
        y_val = y[i * fold_size:(i + 1) * fold_size]

        # Create training set (everything except the validation fold)
        X_train = np.concatenate((X[:i * fold_size], X[(i + 1) * fold_size:]), axis=0)
        y_train = np.concatenate((y[:i * fold_size], y[(i + 1) * fold_size:]), axis=0)

        # Fit the KNN model on the training data
        knn.fit(X_train, y_train)

        # Get predicted probabilities for the validation set
        y_pred_proba = knn.predict(X_val)

        # Compute the ROC AUC score for the validation set
        auc_score = roc_auc_score(y_val, y_pred_proba)
        auc_scores.append(auc_score)

    # Return the average AUC score over all splits
    mean_auc_score = np.mean(auc_scores)
    return mean_auc_score

    


In [53]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation to evaluate model
cv_scores = cross_validate(X, y, knn)
print("Cross-validation scores:", cv_scores)

# Hyperparameter tuning (you can adjust k and evaluate different values)
best_k = None
best_score = 0

for k in range(1, 21):  # Try different values of k
    knn = KNN(k=k, distance_metric='euclidean')
    cv_score = cross_validate(X, y, knn)
    print(f"k={k}, Cross-validation score: {cv_score}")
    if cv_score > best_score:
        best_score = cv_score
        best_k = k

print(f"Best k: {best_k} with cross-validation score: {best_score}")

# Train on full dataset with optimal hyperparameters (best k) and make predictions on test set
knn = KNN(k=best_k, distance_metric='euclidean')
knn.fit(X, y)

# Make predictions on test set
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)


Cross-validation scores: 0.49109208049619324
k=1, Cross-validation score: 0.49465755138647066
k=2, Cross-validation score: 0.49359352995948924
k=3, Cross-validation score: 0.4902041294579652
k=4, Cross-validation score: 0.4893628624238439
k=5, Cross-validation score: 0.49109208049619324
k=6, Cross-validation score: 0.4895181237570547
k=7, Cross-validation score: 0.4888734439414154
k=8, Cross-validation score: 0.4902231140059793
k=9, Cross-validation score: 0.48913943721881636
k=10, Cross-validation score: 0.4864620588807008
k=11, Cross-validation score: 0.48614168061593566
k=12, Cross-validation score: 0.48393160614225206
k=13, Cross-validation score: 0.485177353294221
k=14, Cross-validation score: 0.4876273110620885
k=15, Cross-validation score: 0.48728428934379703
k=16, Cross-validation score: 0.4904829280234182
k=17, Cross-validation score: 0.48846165270426695
k=18, Cross-validation score: 0.4877013666901059
k=19, Cross-validation score: 0.4871657864931664
k=20, Cross-validation sco