In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
def k_fold_cross_validation(X, y, k, model):
    """
    Perform K-fold cross-validation.

    Parameters:
    X: np.array, features
    y: np.array, labels
    k: int, number of folds
    model: machine learning model

    Returns:
    mean_accuracy: float, mean accuracy over all folds
    """
    fold_size = len(X) // k
    accuracies = []

    for fold in range(k):
        start = fold * fold_size
        end = (fold + 1) * fold_size

        X_train = np.concatenate([X[:start], X[end:]])
        y_train = np.concatenate([y[:start], y[end:]])
        X_val = X[start:end]
        y_val = y[start:end]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred)
        accuracies.append(accuracy)

        print(f"Fold {fold + 1}, Accuracy: {accuracy}")

    mean_accuracy = np.mean(accuracies)
    return mean_accuracy

In [6]:
file_path = 'breast cancer.csv'
breast_cancer_data = pd.read_csv(file_path)

breast_cancer_data.replace('?', np.nan, inplace=True)
breast_cancer_data.fillna(breast_cancer_data.median(), inplace=True)

X = breast_cancer_data.iloc[:, :-1].values
y = breast_cancer_data.iloc[:, -1].values

knn = KNeighborsClassifier(n_neighbors=3)

k = 5
mean_accuracy = k_fold_cross_validation(X, y, k, knn)
print(f"Mean Accuracy: {mean_accuracy}")

Fold 1, Accuracy: 0.539568345323741
Fold 2, Accuracy: 0.539568345323741
Fold 3, Accuracy: 0.5683453237410072
Fold 4, Accuracy: 0.6115107913669064
Fold 5, Accuracy: 0.6690647482014388
Mean Accuracy: 0.5856115107913669
