# A Custom K-NN implementation 
A k-nn algorithm that can:
1. Handle missing values
2. Handle class imbalance 
3. Handle different scales of data 
4. Automatically picks the best value for k 
5. Is close to the same speed as sklearn's 
6. Should run in parralel
7. Regression or Classification 

## Planning 
1. Basic implementation 
2. First get it to run faster with the optimisations
3. Just use simulated data

In [302]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification, make_regression
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score, r2_score, recall_score
import collections

In [336]:
class Custom_Knn(BaseEstimator, ClassifierMixin):
    """
    A custom k-nearest neighbors (KNN) classifier and regressor that handles missing values.
    
    Parameters:
    - neighbors (int): Number of nearest neighbors to consider.
    - optimise (bool): Whether to optimize hyperparameters.
    - method (str): 'c' for classification, 'r' for regression.
    """
    def __init__(self, neighbors=None, optimise=False, method='c', imbalance=False):
        self.neighbors = neighbors
        self.optimise = optimise
        self.method = method
        self.imbalance = imbalance
    
    def fit(self, X, y):
        """
        Fit the model with training data.
        
        Parameters:
        - X (array-like): Training feature matrix.
        - y (array-like): Target values.
        """
        self.X_ = X
        self.y_ = y

        self.masks_ = self._compute_masks(X)
    
    def predict(self, X):
        """
        Predict target values for the given input data.
        
        Parameters:
        - X (array-like): Input feature matrix.
        
        Returns:
        - list: Predicted values.
        """
        self.test_mask_ = self._compute_masks(X)
        if self.method == 'c':
            predictions = [self._predict_single_classification(x, test_mask) for x, test_mask in zip(X, self.test_mask_)]
        else:
            predictions = [self._predict_single_regression(x, test_mask) for x, test_mask in zip(X, self.test_mask_)]
        return predictions
    
    def _predict_single_regression(self, x, test_mask):
        """
        Predict a single regression instance.
        """
        distances = [self._distance(x, x_train, train_mask, test_mask) for x_train, train_mask in zip(self.X_, self.masks_)]
        nearest_neigbor_indices = sorted(range(len(distances)), key=lambda sub: distances[sub])[:self.neighbors]
        pred = self._mean([self.y_[index] for index in nearest_neigbor_indices])
        return pred

    def _predict_single_classification(self, x, test_mask):
        """
        Predict a single classification instance.
        """
        distances = [self._distance(x, x_train, train_mask, test_mask) for x_train, train_mask in zip(self.X_, self.masks_)]
        nearest_neigbor_indices = sorted(range(len(distances)), key=lambda sub: distances[sub])[:self.neighbors]
        if self.imbalance: 
            pred = self._sheppards_method(distances=[distances[index] for index in nearest_neigbor_indices], labels=[self.y_[index] for index in nearest_neigbor_indices])
        else: pred = self._mode([self.y_[index] for index in nearest_neigbor_indices])
        return pred
    
    def _distance(self, a, b, train_mask, test_mask):
        """
        Compute Euclidean distance between two feature vectors, considering masks.
        """
        mask = train_mask & test_mask
        diff = (a[mask] - b[mask]) ** 2
        return np.sqrt(np.sum(diff))
    
    def _mean(self, ls):
        """
        Compute the mean of a list.
        """
        return sum(ls) / len(ls)
    
    def _mode(self, ls):
        """
        Compute the most common value in a list.
        """
        return collections.Counter(ls).most_common()[0][0]
    
    def _sheppards_method(self, distances, labels):

        weights = 1 / (np.array(distances) + 1e-5)
        weighted_votes = {}
        for label, weight in zip(labels, weights):
            label = str(label)
            if label in weighted_votes:
                weighted_votes[label] += weight
            else:
                weighted_votes[label] = weight
        
        predicted_class = max(weighted_votes, key=weighted_votes.get)
        return predicted_class
    
    def _compute_masks(self, values):
        """
        Compute boolean masks to identify non-missing values.
        """
        return ~pd.isna(values)

In [326]:
# Generate classification dataset
X, y = make_classification(n_samples=100, n_features=5, n_informative=4, 
                           n_redundant=0, n_classes=2, random_state=42)

# Introduce missing values
missing_fraction = 0.05  # 5% missing values
num_missing = int(missing_fraction * X.size)

# Randomly select indices to introduce NaNs
np.random.seed(42)  # For reproducibility
missing_indices = np.random.choice(X.size, num_missing, replace=False)

# Flatten X, introduce NaNs, and reshape back
X_flattened = X.flatten()
X_flattened[missing_indices] = np.nan
X = X_flattened.reshape(X.shape)

# Introduce class imbalance by resampling
def introduce_class_imbalance(X, y, majority_class_ratio=0.8):
    # Get indices of both classes
    class_0_indices = np.where(y == 0)[0]
    class_1_indices = np.where(y == 1)[0]

    # Introduce imbalance by keeping only a fraction of one class
    np.random.seed(42)
    if len(class_0_indices) > len(class_1_indices):
        class_0_indices = np.random.choice(class_0_indices, int(len(class_0_indices) * majority_class_ratio), replace=True)
    else:
        class_1_indices = np.random.choice(class_1_indices, int(len(class_1_indices) * majority_class_ratio), replace=True)

    # Combine the new indices
    selected_indices = np.concatenate([class_0_indices, class_1_indices])

    # Subset the data
    X_imbalanced = X[selected_indices]
    y_imbalanced = y[selected_indices]

    return X_imbalanced, y_imbalanced

# Introduce class imbalance (e.g., 80% of majority class remains)
X_imbalanced, y_imbalanced = introduce_class_imbalance(X, y, majority_class_ratio=1)

# Print class distribution before and after imbalance
print("Original class distribution:", np.bincount(y))
print("Imbalanced class distribution:", np.bincount(y_imbalanced))


Original class distribution: [50 50]
Imbalanced class distribution: [50 50]


In [327]:
X_reg, y_reg = make_regression(n_samples=100, n_features=5, n_informative=5)
missing_fraction = 0.05  # 10% missing values

# Calculate number of missing values
num_missing = int(missing_fraction * X_reg.size)

# Randomly select indices to introduce NaNs
np.random.seed(42)  # For reproducibility
missing_indices = np.random.choice(X_reg.size, num_missing, replace=False)

# Flatten X, introduce NaNs, and reshape back
X_flattened = X_reg.flatten()
X_flattened[missing_indices] = np.nan
X_reg = X_flattened.reshape(X_reg.shape)

# Model 
1. sklearn model format 

In [329]:
c_knn_class = Custom_Knn(neighbors=5, method='c')
c_knn_class_imbalance_not_set = Custom_Knn(neighbors=5, method='c')
c_knn_class_imbalance = Custom_Knn(neighbors=5, method='c', imbalance=True)
c_knn_regg = Custom_Knn(neighbors=1, method='r')


In [330]:
c_knn_regg.fit(X_reg, y_reg)
c_knn_class.fit(X, y)
c_knn_class_imbalance.fit(X_imbalanced, y_imbalanced)
c_knn_class_imbalance_not_set.fit(X_imbalanced, y_imbalanced)


In [331]:
pred_regg = c_knn_regg.predict(X=X_reg)

pred_class = c_knn_class.predict(X=X)

pred_class_imbalance = c_knn_class_imbalance.predict(X=X_imbalanced)

pred_class_imbalance_not_set = c_knn_class_imbalance_not_set.predict(X=X_imbalanced)


In [332]:
print(accuracy_score(y, pred_class))

print(r2_score(y_reg, pred_regg))

0.88
1.0


In [333]:
pred_class_imbalance = list(map(int, pred_class_imbalance))

In [334]:
print(accuracy_score(y, pred_class))
print(recall_score(y, pred_class))


0.88
0.84


In [335]:
print(accuracy_score(y_imbalanced, pred_class_imbalance))
print(recall_score(y_imbalanced, pred_class_imbalance))

print(accuracy_score(y_imbalanced, pred_class_imbalance_not_set))
print(recall_score(y_imbalanced, pred_class_imbalance_not_set))

1.0
1.0
0.88
0.86
