In [219]:
import numpy as np
import pandas as pd

In [225]:
class CustomKNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict_proba(self, X):
        probabilities = []
        for x in X.values:
            distances = self.compute_distance(x, self.X_train.values)
            k_indices = np.argpartition(distances, self.k)[:self.k]
            k_nearest_labels = self.y_train.iloc[k_indices]
            prob = np.mean(k_nearest_labels)
            probabilities.append(prob)
        return np.array(probabilities)

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X2 - X1)**2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X2 - X1), axis=1)
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")

In [226]:

def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    
    combined_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
    
    categorical_columns = ['Geography', 'Gender']
    numerical_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
    
    # Handle missing values
    for col in numerical_columns:
        combined_data[col].fillna(combined_data[col].median(), inplace=True)
    for col in categorical_columns:
        combined_data[col].fillna('missing', inplace=True)
    
    combined_data = pd.get_dummies(combined_data, columns=categorical_columns, drop_first=True)
    
    feature_columns = numerical_columns + [col for col in combined_data.columns if col.startswith(tuple(categorical_columns))]
    
    # Standard scaling
    combined_data[feature_columns] = (combined_data[feature_columns] - combined_data[feature_columns].mean()) / combined_data[feature_columns].std()
    
    train_size = len(train_data)
    X = combined_data.loc[:train_size-1, feature_columns]
    y = train_data['Exited']
    X_test = combined_data.loc[train_size:, feature_columns]

    return X, y, X_test


In [227]:
def train_test_split(X, y, test_size=0.2, random_state=None):
    if random_state is not None:
        np.random.seed(random_state)
    
    indices = np.random.permutation(len(X))
    test_size = int(len(X) * test_size)
    test_indices = indices[:test_size]
    train_indices = indices[test_size:]
    
    return X.iloc[train_indices], X.iloc[test_indices], y.iloc[train_indices], y.iloc[test_indices]

def roc_auc_score(y_true, y_pred):
    positive_indices = np.where(y_true == 1)[0]
    negative_indices = np.where(y_true == 0)[0]
    
    positive_predictions = y_pred[positive_indices]
    negative_predictions = y_pred[negative_indices]
        
    correct_order = np.sum(positive_predictions[:, None] > negative_predictions) 
    return correct_order / (len(positive_indices) * len(negative_indices))

def cross_validate(X, y, model, n_splits=5):
    scores = []
    
    for _ in range(n_splits):
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=np.random.randint(0, 1000))
        
        model.fit(X_train, y_train)
        y_pred_proba = model.predict_proba(X_val)
        
        score = roc_auc_score(y_val, y_pred_proba)
        scores.append(score)
    
    return scores

In [228]:
X, y, X_test = preprocess_data('train.csv', 'test.csv')
    
# Initial model evaluation
initial_model = CustomKNN(k=5, distance_metric='euclidean')
cv_scores = cross_validate(X, y, initial_model)
print("Initial cross-validation scores:", cv_scores)
print("Mean ROC AUC score:", np.mean(cv_scores))

    # Hyperparameter tuning
k_values = [3, 5, 7, 9, 12]
distance_metrics = ['euclidean', 'manhattan', 'minkowski']
best_score = 0
best_params = {}

for k in k_values:
    for metric in distance_metrics:
        model = CustomKNN(k=k, distance_metric=metric)
        scores = cross_validate(X, y, model)
        mean_score = np.mean(scores)
            
        if mean_score > best_score:
            best_score = mean_score
            best_params = {'k': k, 'distance_metric': metric}

print("Best parameters:", best_params)
print("Best ROC AUC score:", best_score)

    # Train on full dataset with optimal hyperparameters and make predictions on test set
final_model = CustomKNN(**best_params)
final_model.fit(X, y)
test_predictions = final_model.predict_proba(X_test)

    # Round predictions to 2 decimal points
rounded_predictions = np.round(test_predictions, 2)

    # Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': rounded_predictions}).to_csv('submissions.csv', index=False)

Cross-validation scores: [np.float64(0.7827393099966703), np.float64(0.819753998373543), np.float64(0.8114446802742773), np.float64(0.7861315496098105), np.float64(0.8229182427660009)]
Mean ROC AUC score: 0.8045975562040603
Best parameters: {'k': 11, 'distance_metric': 'manhattan'}
Best ROC AUC score: 0.8611916316802983
