## Индивидуальное задание

In [46]:
import warnings
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

with warnings.catch_warnings():
    # Silence every warning raised inside this experiment so that the printed
    # results table is not interleaved with library warnings.
    warnings.simplefilter("ignore")

    data = pd.read_csv("./heartDisease/processed.switzerland.csv", delimiter=',')

    # "age" is used as the class label; all remaining columns are features.
    X = data.drop("age", axis=1)
    y = data["age"]

    k_values = range(1, 31)
    test_sizes = [0.1, 0.2, 0.3, 0.4, 0.5]

    # Cross-validation is run on the full (X, y), so its score depends only on
    # k and not on the hold-out size. Evaluate it once per k instead of once
    # per (test_size, k) pair. cross_val_score clones the estimator, so an
    # unfitted classifier gives the same result as a previously fitted one.
    cv_accuracy_by_k = {
        k: cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y).mean()
        for k in k_values
    }

    # Collect one record per (test_size, k) and build the frame once at the
    # end; concatenating inside the loop re-copies all earlier rows each time.
    rows = []
    for test_size in test_sizes:
        # With a fixed random_state the split depends only on test_size, so it
        # is computed once here rather than once per k.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
        for k in k_values:
            knn = KNeighborsClassifier(n_neighbors=k)
            knn.fit(X_train, y_train)
            y_predicted = knn.predict(X_test)
            accuracy = accuracy_score(y_test, y_predicted)
            rows.append({
                'test_size': test_size,
                'k': k,
                'accuracy': accuracy,
                'cv_accuracy': cv_accuracy_by_k[k],
            })

    results_df = pd.DataFrame(rows, columns=['test_size', 'k', 'accuracy', 'cv_accuracy'])

    # Best hold-out accuracy first. Rows with equal accuracy come out in no
    # guaranteed order, because the default sort algorithm is not stable.
    results_df = results_df.sort_values(by='accuracy', ascending=False)
    print(results_df)

    test_size   k  accuracy  cv_accuracy
2         0.1   3  0.230769     0.056667
10        0.1  11  0.230769     0.080667
9         0.1  10  0.230769     0.080667
8         0.1   9  0.230769     0.080667
7         0.1   8  0.230769     0.072667
..        ...  ..       ...          ...
84        0.3  25  0.027027     0.080333
81        0.3  22  0.027027     0.056333
80        0.3  21  0.027027     0.056333
83        0.3  24  0.027027     0.064333
82        0.3  23  0.027027     0.072667

[150 rows x 4 columns]


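Таким образом, самый точный результат (accuracy ≈ 0.23) выдаёт размер тестового набора 10% и k=3; при этом, судя по таблице, такую же точность дают и другие значения k (например, 8–11), поэтому выбор k=3 среди них не единственный.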