In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from IPython.display import Markdown
import numpy as np

# Data loading and preprocessing
data_train = pd.read_csv("train.csv")
data_test = pd.read_csv("test.csv")

# Feature selection
features = ["Age", "Fare"]

# Imputation
imputer = SimpleImputer(strategy="mean")
data_train[features] = imputer.fit_transform(data_train[features])
data_test[features] = imputer.transform(data_test[features])

# Scaling
scaler = StandardScaler()
scaler.fit(data_train[features])
data_train[features] = scaler.transform(data_train[features])
data_test[features] = scaler.transform(data_test[features])

# Split data
X_train, X_test, y_train, _ = train_test_split(
    data_train[features], data_train["Survived"], test_size=0.2, random_state=42
)

# KNN Model and Cross-Validation

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
model = KNeighborsClassifier()

# Option 1: Filter results after adjusting k values
k_range = range(1, 21)
accuracies = []
adjusted_k_values = []
for train_indices, test_indices in kfold.split(X_train):
    X_train_fold, X_test_fold = X_train.iloc[train_indices], X_train.iloc[test_indices]
    y_train_fold, y_test_fold = y_train.iloc[train_indices], y_train.iloc[test_indices]

    n_samples = X_train_fold.shape[0]
    adjusted_k = min(k, n_samples)
    if adjusted_k in k_range:
        model.set_params(n_neighbors=adjusted_k)
        model.fit(X_train_fold, y_train_fold)
        predictions = model.predict(X_test_fold)
        accuracies.append(accuracy_score(y_test_fold, predictions))
        adjusted_k_values.append(adjusted_k)

best_k = adjusted_k_values[np.argmax(accuracies)]

# Option 2: Use a dictionary
# k_accuracies = {}
# for train_indices, test_indices in kfold.split(X_train):
#     X_train_fold, X_test_fold = X_train.iloc[train_indices], X_train.iloc[test_indices]
#     y_train_fold, y_test_fold = y_train.iloc[train_indices], y_train.iloc[test_indices]
#
#     n_samples = X_train_fold.shape[0]
#     adjusted_k = min(k, n_samples)
#     k_accuracies[adjusted_k] = accuracy_score(y_test_fold, model.fit(X_train_fold, y_train_fold).predict(X_test_fold))
#
# best_k = max(k_accuracies, key=k_accuracies.get)

# Report results
print(f"Best K value: {best_k}")
print(f"Mean Accuracy: {np.mean(accuracies):.4f}")
print(f"Standard Deviation: {np.std(accuracies):.4f}")

# Confusion matrix
confusion_matrix(y_train, model.fit(X_train, y_train).predict(X_train))

# Model accuracy explanation
model_explanation = Markdown(
    """
The KNN model achieved a mean accuracy of [mean_accuracy] with a standard deviation of [std_deviation] across 5 folds, suggesting good generalizability. However, further analysis like analyzing the confusion matrix and tuning hyperparameters is recommended for a complete picture.
"""
)

print(model_explanation)


Best K value: 19
Mean Accuracy: 0.6602
Standard Deviation: 0.0459
<IPython.core.display.Markdown object>
