## Preprocessing and Random-Forest


In [4]:
import numpy as np

import openml
from collections import Counter


In [8]:
def euclidean_distance(
        example_1: np.ndarray,
        example_2: np.ndarray,
) -> float:
    """Return the Euclidean (L2) distance between two feature vectors.

    Args:
        example_1: 1-D array holding the feature values of the first example.
        example_2: 1-D array holding the feature values of the second
            example; must have the same shape as ``example_1``.

    Returns:
        The square root of the sum of squared feature differences, as a
        NumPy float64 scalar (which is also a Python ``float``).

    Raises:
        ValueError: If the two examples do not have the same shape.
    """
    # Compute in float64 so that integer inputs (in particular unsigned or
    # narrow dtypes) cannot wrap around or overflow when subtracted and squared.
    vector_1 = np.asarray(example_1, dtype=np.float64)
    vector_2 = np.asarray(example_2, dtype=np.float64)

    # Fail loudly instead of silently broadcasting or truncating features.
    if vector_1.shape != vector_2.shape:
        raise ValueError(
            f'examples must have the same shape, got {vector_1.shape} and {vector_2.shape}'
        )

    difference = vector_1 - vector_2
    return np.sqrt(np.sum(np.square(difference)))


def knn(
    query_example: np.ndarray,
    X_train: np.ndarray,
    y_train: np.ndarray,
    k: int,
) -> int:
    """Classify ``query_example`` by majority vote among its k nearest neighbours.

    The distance from the query to every training example is computed with
    ``euclidean_distance``; the labels of the ``k`` closest examples are then
    counted and the most frequent one is returned.

    Args:
        query_example: Feature vector of the example to classify.
        X_train: Training examples, one feature vector per entry.
        y_train: Labels of the training examples, aligned with ``X_train``.
        k: Number of nearest neighbours that take part in the vote. If it
            exceeds the number of training examples, all of them vote.

    Returns:
        The most frequent label among the k nearest neighbours, exactly as it
        is stored in ``y_train``. When several labels are equally frequent,
        the one met first while walking the neighbours from nearest to
        farthest wins.

    Raises:
        ValueError: If ``k`` is smaller than 1, if the training set is empty,
            or if ``X_train`` and ``y_train`` differ in length.
    """
    if k < 1:
        # A non-positive k would either leave no voters or, through negative
        # slicing, silently drop neighbours from the far end instead.
        raise ValueError(f'k must be a positive integer, got {k}')
    if len(X_train) != len(y_train):
        raise ValueError(
            f'X_train and y_train must have the same length, '
            f'got {len(X_train)} and {len(y_train)}'
        )
    if len(X_train) == 0:
        raise ValueError('the training set must contain at least one example')

    # Pair every training label with the distance of its example to the query.
    distances = [
        (euclidean_distance(query_example, train_example), train_label)
        for train_example, train_label in zip(X_train, y_train)
    ]

    # Sort by distance only; the sort is stable, so equally distant examples
    # keep their training-set order.
    distances.sort(key=lambda pair: pair[0])

    nearest_labels = [train_label for _, train_label in distances[:k]]

    # Majority vote over the labels of the k nearest neighbours.
    return Counter(nearest_labels).most_common(1)[0][0]

# Evaluate the k-nearest-neighbour classifier on the diabetes dataset (OpenML task 267).
NR_NEIGHBORS = 10

task = openml.tasks.get_task(267)
train_indices, test_indices = task.get_train_test_split_indices()
dataset = task.get_dataset()

# Fetch features and target as arrays; the fourth return value is not needed.
X, y, categorical_indicator, _ = dataset.get_data(
    dataset_format='array',
    target=dataset.default_target_attribute,
)

# Split features and labels according to the indices provided by the task.
X_train, y_train = X[train_indices], y[train_indices]
X_test, y_test = X[test_indices], y[test_indices]

# Classify every test example and count how many predictions match the label.
nr_correct_pred = 0
for example, label in zip(X_test, y_test):
    y_pred = knn(example, X_train, y_train, k=NR_NEIGHBORS)
    nr_correct_pred += int(y_pred == label)

# Fraction of correctly classified test examples.
accuracy = nr_correct_pred / len(X_test)
print(f'The accuracy of the k-nearest neighbor algorithm with k = {NR_NEIGHBORS} is {accuracy}')


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  after removing the cwd from sys.path.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


The accuracy of the k-nearest neighbor algorithm with k = 10 is 0.7193675889328063
