In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

def preprocess_data(train_path, test_path):
    # Loading the datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Dropping 'id' or any irrelevant columns
    X_train = train_data.drop(['Exited', 'CustomerId', 'Surname'], axis=1)
    y_train = train_data['Exited']
    X_test = test_data.drop(['CustomerId', 'Surname'], axis=1)

    # Handling categorical features
    X_train = pd.get_dummies(X_train, drop_first=True)
    X_test = pd.get_dummies(X_test, drop_first=True)

    # Aligning columns of test data to match training data
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    # Scaling numerical features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, y_train, X_test_scaled

# Loading and preprocessing data
train_path = '/content/train.csv'
test_path = '/content/test.csv'
X_train, y_train, X_test = preprocess_data(train_path, test_path)


In [2]:
import numpy as np
from collections import Counter

class KNN:
    def __init__(self, k=5, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = [self._predict(x) for x in X]
        return np.array(predictions)

    def _predict(self, x):
        # Computing distances between x and all examples in the training set
        distances = [self.compute_distance(x, x_train) for x_train in self.X_train]

        # Getting the nearest k neighbors
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]

        # Majority vote - most common class label
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

    def compute_distance(self, x1, x2):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((x1 - x2) ** 2))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(x1 - x2))

# Initializing and training KNN model
knn = KNN(k=5)
knn.fit(X_train, y_train)

# Making predictions
y_pred = knn.predict(X_test)


In [4]:
import numpy as np
from sklearn.metrics import roc_auc_score

def custom_kfold_split(X, y, n_splits=5):
    # Ensuring X and y are NumPy arrays for easier splitting
    X = np.array(X)
    y = np.array(y)

    fold_size = len(X) // n_splits
    indices = np.arange(len(X))
    np.random.shuffle(indices)

    for fold in range(n_splits):
        val_start = fold * fold_size
        val_end = (fold + 1) * fold_size if fold != n_splits - 1 else len(X)

        val_indices = indices[val_start:val_end]
        train_indices = np.concatenate([indices[:val_start], indices[val_end:]])

        X_train, X_val = X[train_indices], X[val_indices]
        y_train, y_val = y[train_indices], y[val_indices]

        yield X_train, X_val, y_train, y_val

def cross_validate(X, y, knn, n_splits=5):
    auc_scores = []

    # Performing cross-validation manually
    for X_train, X_val, y_train, y_val in custom_kfold_split(X, y, n_splits):
        knn.fit(X_train, y_train)
        y_val_pred = knn.predict(X_val)
        auc = roc_auc_score(y_val, y_val_pred)
        auc_scores.append(auc)

    return np.mean(auc_scores)

# Testing different values of k
best_k = 1
best_auc = 0
for k in range(1, 20):
    knn = KNN(k=k)
    auc = cross_validate(X_train, y_train, knn)
    print(f"k = {k}, AUC = {auc}")
    if auc > best_auc:
        best_auc = auc
        best_k = k

print(f"Best k: {best_k}, Best AUC: {best_auc}")


k = 1, AUC = 0.7417638080565435
k = 2, AUC = 0.7504740852717854
k = 3, AUC = 0.7619539987607553
k = 4, AUC = 0.7587645038075749
k = 5, AUC = 0.759270810682519
k = 6, AUC = 0.7606251501154073
k = 7, AUC = 0.7551508685429162
k = 8, AUC = 0.7601381363535723
k = 9, AUC = 0.7575642886065711
k = 10, AUC = 0.7524296937910233
k = 11, AUC = 0.7518209615557992
k = 12, AUC = 0.7568359068323491
k = 13, AUC = 0.7530050944500521
k = 14, AUC = 0.7513272818854506
k = 15, AUC = 0.750523858613963
k = 16, AUC = 0.7501495257741972
k = 17, AUC = 0.7447135159523322
k = 18, AUC = 0.7488751725701078
k = 19, AUC = 0.7460276925797371
Best k: 3, Best AUC: 0.7619539987607553


In [None]:
# Training final model with best k
knn = KNN(k=best_k)
knn.fit(X_train, y_train)

# Making predictions on test data
y_test_pred = knn.predict(X_test)

# Saving the predictions
submission = pd.DataFrame({'CustomerId': pd.read_csv(test_path)['CustomerId'], 'Exited': y_test_pred})
submission.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' has been created!")


The above submissions.csv did not have the 'id' column as required in the kaggle format. Producing Submissions.csv that has the correct format for Kaggle submission

In [16]:
import pandas as pd

# Load the test data
test_data = pd.read_csv('/content/test.csv')

# Check the columns in test data
print(test_data.columns)


Index(['id', 'CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender',
       'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary'],
      dtype='object')


In [17]:
# Predictions from KNN model (above)
test_predictions = knn.predict(X_test)

# Keeping the 'id' column from the test data for Kaggle submission format
test_ids = pd.read_csv('/content/test.csv')['id']

# Creating the submission DataFrame in the same format as required on Kaggle
submission = pd.DataFrame({
    'id': test_ids,           # This column should be named 'id'
    'Exited': test_predictions  # The predicted labels or probabilities
})

# Saving to CSV file
submission.to_csv('submission1.csv', index=False)


The final submissions1.csv is in the format required by Kaggle