In [98]:
import numpy as np
import pandas as pd
# from sklearn import neighbors
# from sklearn.model_selection import train_test_split

In [99]:
def train_test_split(x, y, test_size, random_state=None):
    """
    Randomly split features and labels into train and test subsets.

    Parameters:
    - x (array-like): Feature rows; split along the first axis.
    - y (array-like): Labels, one per row of x.
    - test_size (float): Fraction of the samples, in [0, 1], placed in the
      test subset. The test count is int(len(x) * test_size), i.e. it is
      rounded down.
    - random_state (int or None): Seed for a reproducible shuffle. When None
      (the default) NumPy's global random state is used, so the result is
      still controlled by np.random.seed().

    Returns:
    - tuple: (x_train, x_test, y_train, y_test), each an ndarray.

    Raises:
    - ValueError: If x and y differ in length or test_size is outside [0, 1].
    """
    # Coerce to ndarrays so that lists and DataFrames support the
    # integer-array indexing used below.
    x = np.asarray(x)
    y = np.asarray(y)

    n_samples = len(x)
    if len(y) != n_samples:
        raise ValueError(
            f"x and y must have the same length, got {n_samples} and {len(y)}"
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be in [0, 1], got {test_size!r}")

    # A dedicated generator is created only when a seed is supplied; otherwise
    # the module-level generator is used.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    indices = np.arange(n_samples)
    rng.shuffle(indices)

    # The first test_count shuffled indices form the test set, the rest train.
    test_count = int(n_samples * test_size)
    test_indices = indices[:test_count]
    train_indices = indices[test_count:]

    return x[train_indices], x[test_indices], y[train_indices], y[test_indices]

In [100]:
import numpy as np

class neighbors:
    """
    Namespace class so callers can write ``neighbors.KNeighborsClassifier``
    as they would with a module of that name.
    """

    class KNeighborsClassifier:
        """
        Brute-force k-nearest-neighbours classifier using Minkowski distance.

        Every prediction computes the distance from the query point to all
        stored training points and takes a majority vote among the closest
        n_neighbors labels.
        """

        def __init__(self, n_neighbors=7, p=2):
            """
            Initialize the KNeighborsClassifier.

            Parameters:
            - n_neighbors (int): Number of neighbors to consider.
            - p (int): The power parameter for the Minkowski distance.

            Raises:
            - ValueError: If n_neighbors is below 1 or p is not positive.
            """
            if n_neighbors < 1:
                raise ValueError(f"n_neighbors must be >= 1, got {n_neighbors!r}")
            if p <= 0:
                raise ValueError(f"p must be > 0, got {p!r}")
            self.n_neighbors = n_neighbors
            self.p = p
            self.x_train = None
            self.y_train = None

        def fit(self, x_train, y_train):
            """
            Store the training data.

            Parameters:
            - x_train (ndarray): Training data features, one row per sample.
              A 1-D input is treated as samples with a single feature.
            - y_train (ndarray): Training data labels, one per sample.

            Returns:
            - KNeighborsClassifier: This instance, so calls can be chained.

            Raises:
            - ValueError: If x_train is empty or not 2-D, or if the number of
              labels differs from the number of samples.
            """
            features = np.asarray(x_train, dtype=float)
            if features.ndim == 1:
                features = features.reshape(-1, 1)
            if features.ndim != 2:
                raise ValueError(
                    f"x_train must be 2-D, got {features.ndim} dimensions"
                )

            # Flatten so that column-vector labels of shape (n, 1) are accepted.
            labels = np.asarray(y_train).ravel()
            if len(features) == 0:
                raise ValueError("x_train must contain at least one sample")
            if len(labels) != len(features):
                raise ValueError(
                    "x_train and y_train must have the same length, "
                    f"got {len(features)} and {len(labels)}"
                )

            self.x_train = features
            self.y_train = labels
            return self

        def _minkowski_distance(self, x1, x2):
            """
            Compute the Minkowski distance between two points.

            The reduction runs over the last axis, so either argument may also
            be a 2-D array of points; the result then holds one distance per
            row.

            Parameters:
            - x1 (ndarray): First point.
            - x2 (ndarray): Second point, or an array of points.

            Returns:
            - float or ndarray: Minkowski distance(s).
            """
            x1 = np.asarray(x1, dtype=float)
            x2 = np.asarray(x2, dtype=float)
            # atleast_1d keeps scalar inputs valid for the axis=-1 reduction.
            diff = np.atleast_1d(np.abs(x1 - x2))
            return np.sum(diff ** self.p, axis=-1) ** (1 / self.p)

        def predict(self, x_test):
            """
            Predict the labels for the test data.

            Parameters:
            - x_test (ndarray): Test data features. A 1-D input is treated as
              a single sample.

            Returns:
            - ndarray: Predicted labels, one per test sample. When several
              labels tie for the majority, the smallest label wins because
              np.unique returns labels sorted and np.argmax picks the first
              maximum.

            Raises:
            - ValueError: If fit() has not been called, or if the number of
              features differs from the training data.
            """
            if self.x_train is None or self.y_train is None:
                raise ValueError(
                    "This KNeighborsClassifier is not fitted yet; call fit() first."
                )

            queries = np.atleast_2d(np.asarray(x_test, dtype=float))
            if queries.shape[1] != self.x_train.shape[1]:
                raise ValueError(
                    f"x_test has {queries.shape[1]} features, "
                    f"but the classifier was fitted with {self.x_train.shape[1]}"
                )

            predictions = []
            for test_point in queries:
                # One vectorized call gives the distance to every training row.
                distances = self._minkowski_distance(test_point, self.x_train)

                # A stable sort breaks distance ties by training-row order,
                # which keeps the selected neighbors deterministic.
                nearest = np.argsort(distances, kind="stable")[: self.n_neighbors]

                # Majority vote among the labels of the nearest neighbors.
                labels, counts = np.unique(self.y_train[nearest], return_counts=True)
                predictions.append(labels[np.argmax(counts)])

            return np.array(predictions)

        def score(self, x_test, y_test):
            """
            Compute the accuracy of the classifier.

            Parameters:
            - x_test (ndarray): Test data features.
            - y_test (ndarray): True labels for the test data.

            Returns:
            - float: Fraction of test samples whose prediction equals the
              true label.

            Raises:
            - ValueError: If the number of labels differs from the number of
              predictions.
            """
            predictions = self.predict(x_test)
            # Flatten so an (n, 1) label column is compared element-wise rather
            # than broadcast against the (n,) predictions into an (n, n) matrix.
            y_true = np.asarray(y_test).ravel()
            if y_true.shape != predictions.shape:
                raise ValueError(
                    f"y_test has {y_true.shape[0]} labels, "
                    f"but {predictions.shape[0]} predictions were made"
                )
            return float(np.mean(predictions == y_true))

In [101]:
# For your assignment, you have to implement the function "train_test_split" and
# the module "neighbors" on your own. This means you will have to make the code work
# without any modification to cells 2-6.

# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('breast-cancer-wisconsin.csv')
# Replace every '?' cell with the numeric sentinel -99999.
# NOTE(review): '?' presumably marks missing values in this file — confirm.
# The sentinel is presumably far outside the real feature range, so affected
# rows would sit very far from all others under a distance-based model.
df.replace('?', -99999, inplace=True)
# Remove the 'id' column so it is not used as a feature.
df.drop(['id'], axis=1, inplace=True)

# Features: every remaining column except 'class'. Labels: the 'class' column.
x = np.array(df.drop(['class'], axis=1))
y = np.array(df['class'])

In [102]:
# Hold out 20% of the samples for testing. The split is shuffled randomly, so
# the arrays (and every result below) change from run to run unless NumPy's
# global random state is seeded first.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [103]:
# 7 nearest neighbours with Minkowski power p=2, i.e. Euclidean distance.
h = neighbors.KNeighborsClassifier(n_neighbors=7, p=2)
# fit keeps the training data on the classifier for use at prediction time.
h.fit(x_train, y_train)

In [104]:
# A single hand-written sample with 9 feature values.
new_x = np.array([4,6,5,6,7,8,4,9,1])
# reshape(1, -1) turns the 1-D sample into a 1 x 9 array (one row per sample).
result = h.predict(new_x.reshape(1, -1))
print(result)

[4]


In [105]:
# Accuracy on the held-out split: the fraction of predictions equal to y_test.
print(h.score(x_test, y_test))

0.9784172661870504
