In [3]:
import numpy as np
import pandas as pd

In [105]:
# Define the KNN class

class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # Store the training data
        self.X = np.array(X)
        self.y = np.array(y)

    def predict(self, X):
        """Make predictions based on the nearest neighbors."""
        X = np.array(X)
    
        # Ensure X is 2D (in case a single sample is passed)
        if X.ndim == 1:
            X = X.reshape(1, -1)
        
        # Ensure self.X (training data) is 2D
        if self.X.ndim == 1:
            self.X = self.X.reshape(1, -1)
    
        # Compute distances between test set and training set
        distances = self.compute_distance(X, self.X)
        
        probabilities = []
        for i in range(X.shape[0]):
            # Get indices of k nearest neighbors
            nearest_neighbors_indices = np.argsort(distances[i])[:self.k]
            nearest_labels = self.y[nearest_neighbors_indices]
            
            # Calculate the probability as the fraction of neighbors that are class 1 (churn)
            probability = np.mean(nearest_labels)  # The proportion of neighbors that are '1'
            probabilities.append(probability)
        
        return np.array(probabilities)  # Return probabiliti

        
    def compute_distance(self, X1, X2):
        """
        Compute distance matrix based on the selected distance metric.
        This function computes pairwise distances between each row in X1 and each row in X2.
        """
        # Ensure inputs are numpy arrays and convert them to float
        X1 = np.array(X1).astype(float)
        X2 = np.array(X2).astype(float)
    
    
        # Step 1: Compute squared sum of each row in X1 and X2
        X1_squared = np.sum(np.square(X1), axis=1).reshape(-1, 1)  # Shape: (m_test, 1)
        X2_squared = np.sum(np.square(X2), axis=1).reshape(1, -1)  # Shape: (1, m_train)
    
        # Step 2: Compute the dot product between X1 and X2
        cross_term = np.dot(X1, X2.T)  # Shape: (m_test, m_train)
    
        # Step 3: Compute the full distance matrix using broadcasting
        distance_matrix = X1_squared + X2_squared - 2 * cross_term
    
        # Safety check to ensure no negative values under the square root
        distance_matrix = np.maximum(distance_matrix, 0)
    
        # Ensure the input to np.isnan is numeric and safe to check
        if np.isnan(distance_matrix).any():
            raise ValueError("NaN values found in distance matrix")
    
        # Element-wise square root to get the final Euclidean distances
        distances = np.sqrt(distance_matrix)
    
    
        return distances



In [119]:
# Tune the decision threshold for predicting churn
def tune_threshold(y_true, y_pred_probs):
    thresholds = np.linspace(0, 1, 101)
    best_threshold = 0.5
    best_score = 0
    
    for threshold in thresholds:
        y_pred = (y_pred_probs >= threshold).astype(int)
        score = np.mean(y_true == y_pred)  # Accuracy score
        if score > best_score:
            best_score = score
            best_threshold = threshold
    
    return best_threshold


def handle_class_imbalance(X, y):
    data = pd.DataFrame(X)
    data['target'] = y
    majority_class = data[data['target'] == 0]
    minority_class = data[data['target'] == 1]
    
    # Oversample the minority class by replicating rows
    minority_upsampled = minority_class.sample(n=len(majority_class), replace=True, random_state=42)
    upsampled_data = pd.concat([majority_class, minority_upsampled])
    upsampled_data = upsampled_data.sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Separate X and y again
    X_resampled = upsampled_data.drop(columns=['target']).to_numpy()
    y_resampled = upsampled_data['target'].to_numpy()
    
    return X_resampled, y_resampled

# Add feature engineering (e.g., age groups, balance bins)
def feature_engineering(df):
    bins = [18, 30, 45, 60, 100]
    labels = ['18-30', '31-45', '46-60', '60+']
    df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)
    df['BalanceBin'] = np.where(df['Balance'] == 0, 'Zero', 'Non-zero')
    return df

def impute_mean(column):
    """Replace missing numerical values with the column mean."""
    return column.fillna(column.mean())

def min_max_scale(df):
    """Apply Min-Max scaling to the DataFrame."""
    return (df - df.min()) / (df.max() - df.min())

def impute_most_frequent(column):
    """Replace missing categorical values with the most frequent value."""
    return column.fillna(column.mode()[0])

def one_hot_encode(df, categorical_features):
    """Perform one-hot encoding for categorical features."""
    for feature in categorical_features:
        one_hot = pd.get_dummies(df[feature], prefix=feature)
        df = pd.concat([df.drop(columns=[feature]), one_hot], axis=1)
    return df

def standard_scale(X, mean=None, std=None):
    """Standardize features by removing the mean and scaling to unit variance."""
    if mean is None and std is None:
        mean = np.mean(X, axis=0)
        std = np.std(X, axis=0)
    
    # Avoid division by zero for features with zero variance
    std[std == 0] = 1
    
    return (X - mean) / std


def pca(X, n_components):
    # Step 1: Center the data
    X_centered = X - np.mean(X, axis=0)
    covariance_matrix = np.cov(X_centered, rowvar=False)
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
    sorted_indices = np.argsort(eigenvalues)[::-1]
    top_eigenvectors = eigenvectors[:, sorted_indices[:n_components]]
    X_reduced = np.dot(X_centered, top_eigenvectors)
    
    # Return reduced data and the components (eigenvectors)
    return X_reduced, top_eigenvectors, np.mean(X, axis=0)



def preprocess_data(train_path, test_path, scaling_type='standard', apply_pca=False, n_components=None):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Handle NaN values in a pandas-friendly way (pandas' fillna)
    train_data.fillna(0, inplace=True)  # Replace NaNs with 0
    test_data.fillna(0, inplace=True)   # Replace NaNs with 0

    # Feature engineering (age groups, balance bins)
    train_data = feature_engineering(train_data)
    test_data = feature_engineering(test_data)

    # Drop unnecessary columns
    X_train = train_data.drop(columns=['Surname', 'id', 'CustomerId', 'Exited'])
    y_train = train_data['Exited']
    X_test = test_data.drop(columns=['Surname', 'id', 'CustomerId'])

    # Define numerical, categorical, and binary columns
    numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    categorical_features = ['Geography', 'AgeGroup', 'BalanceBin']
    binary_features = ['HasCrCard', 'IsActiveMember', 'Gender']

    # Convert binary features to binary (0 or 1) encoding for Gender
    X_train['Gender'] = X_train['Gender'].map({'Male': 0, 'Female': 1})
    X_test['Gender'] = X_test['Gender'].map({'Male': 0, 'Female': 1})

    # Preprocess numerical features (Impute missing values)
    X_train[numerical_features] = X_train[numerical_features].apply(impute_mean, axis=0)
    X_test[numerical_features] = X_test[numerical_features].apply(impute_mean, axis=0)

    # Apply scaling
    if scaling_type == 'standard':
        X_train[numerical_features] = standard_scale(X_train[numerical_features])
        X_test[numerical_features] = standard_scale(X_test[numerical_features])
    elif scaling_type == 'minmax':
        X_train[numerical_features] = min_max_scale(X_train[numerical_features])
        X_test[numerical_features] = min_max_scale(X_test[numerical_features])

    # Apply one-hot encoding to categorical features
    X_train = one_hot_encode(X_train, categorical_features)
    X_test = one_hot_encode(X_test, categorical_features)

    # Convert bool columns to int after one-hot encoding
    X_train = X_train.astype({col: 'int' for col in X_train.select_dtypes(include=['bool']).columns})
    X_test = X_test.astype({col: 'int' for col in X_test.select_dtypes(include=['bool']).columns})

    # Align train and test columns
    X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

    # Apply PCA if specified
    if apply_pca and n_components is not None:
        X_train, pca_components, train_mean = pca(X_train.to_numpy(), n_components=n_components)
        X_test = np.dot(X_test - train_mean, pca_components)

    return X_train, X_test, y_train


In [93]:
def calculate_roc_auc(y_true, y_pred_probs):
    """
    Compute ROC and AUC score from scratch.
    
    Parameters:
    - y_true: Actual binary labels (0 or 1)
    - y_pred_probs: Predicted probabilities for the positive class (1)
    
    Returns:
    - auc: Area under the ROC curve
    """
    y_true = np.array(y_true)
    y_pred_probs = np.array(y_pred_probs)
    
    # Step 1: Sort predictions and corresponding true labels by predicted probabilities
    sorted_indices = np.argsort(y_pred_probs)[::-1]  # Sort in descending order
    y_true_sorted = y_true[sorted_indices]
    
    # Step 2: Initialize variables to compute TPR and FPR
    tpr = []  # True positive rate
    fpr = []  # False positive rate
    
    # Count positives and negatives
    P = np.sum(y_true == 1)  # Number of positive examples
    N = np.sum(y_true == 0)  # Number of negative examples
    
    # Initialize true positives and false positives counts
    tp = 0  # True positives
    fp = 0  # False positives
    
    # Step 3: Compute TPR and FPR at different threshold levels
    for i in range(len(y_true_sorted)):
        if y_true_sorted[i] == 1:
            tp += 1
        else:
            fp += 1
        
        tpr.append(tp / P)  # True positive rate
        fpr.append(fp / N)  # False positive rate
    
    # Step 4: Calculate the AUC using the trapezoidal rule
    auc = np.trapz(tpr, fpr)  # Trapezoidal approximation for AUC
    return auc


def cross_validate(X, y, knn, n_splits=5):
    # Convert y to a numpy array and reshape it for concatenation
    y = np.array(y).reshape(-1, 1)

    # Combine X and y so they can be shuffled together
    data = np.hstack((X, y))

    # Shuffle the data randomly
    np.random.shuffle(data)

    # Split X and y back
    X = data[:, :-1]
    y = data[:, -1]

    # Determine fold size
    fold_size = len(X) // n_splits

    # List to store the AUC scores for each fold
    auc_scores = []
    for i in range(n_splits):
        # Create validation set
        X_val = X[i * fold_size:(i + 1) * fold_size]
        y_val = y[i * fold_size:(i + 1) * fold_size]


        # Create training set (everything except the validation set)
        X_train = np.concatenate((X[:i * fold_size], X[(i + 1) * fold_size:]), axis=0)
        y_train = np.concatenate((y[:i * fold_size], y[(i + 1) * fold_size:]), axis=0)


        # Fit the KNN model on the training data
        knn.fit(X_train, y_train)

        # Get predicted probabilities for the validation set
        y_pred_probs = knn.predict(X_val)

        # Compute the ROC AUC score for the validation set
        auc_score = calculate_roc_auc(y_val, y_pred_probs)
        auc_scores.append(auc_score)

    # Return the average AUC score over all splits
    return np.mean(auc_scores)


In [132]:
# Load and preprocess data


best_k = 7
best_score = 0

# Loop to test different values of n_components (for PCA) and k (for KNN)
#for i in range(7, 11):
 #   X_train, X_test, y_train = preprocess_data('train.csv', 'test.csv', scaling_type='standard', apply_pca=True, n_components=i)
X_train, X_test, y_train = preprocess_data('train.csv', 'test.csv', scaling_type='standard',apply_pca = True, n_components = 21)
X_train_resampled, y_train_resampled = handle_class_imbalance(X_train, y_train)
for k in range(18, 29, 2):  # Try different values of k
    knn = KNN(k=k, distance_metric='euclidean')
    cv_score = cross_validate(X_train, y_train, knn)
    print(f"Cross-validation AUC score: {cv_score} for n_comp: {k}")
    if cv_score > best_score:
        best_score = cv_score
        best_k = k
# Handle class imbalance

print(f"Best score: {best_score}, Best k: {best_k}")

# Train the KNN model on the resampled training data with the best k (after handling class imbalance)
knn = KNN(k=best_k, distance_metric='euclidean')
knn.fit(X_train_resampled, y_train_resampled)

# Predict probabilities for the test set (using the trained model)
y_test_pred_probs = knn.predict(X_test)  # Get probabilities for the test set

# Save the test predictions as probabilities
test_ids = pd.read_csv('test.csv')['id']  # Extract the 'id' column from the test.csv file

# Create the submission DataFrame with 'id' and predicted 'Exited' probabilities
#submission = pd.DataFrame({'id': test_ids, 'Exited': y_test_pred_probs})
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': y_test_pred_probs}).to_csv('submissions.csv', index=False) 

print("Test predictions saved to 'submissions.csv'")


Cross-validation AUC score: 0.909906692368782 for n_comp: 18
Cross-validation AUC score: 0.9126118608904441 for n_comp: 20
Cross-validation AUC score: 0.9142143490920855 for n_comp: 22
Cross-validation AUC score: 0.9133032057333843 for n_comp: 24
Cross-validation AUC score: 0.914146932290615 for n_comp: 26
Cross-validation AUC score: 0.9133039229332454 for n_comp: 28
Best score: 0.9142143490920855, Best k: 22
Test predictions saved to 'submissions.csv'
