# K-Random-Neighbors

In [100]:
import pandas as pd
from sklearn.utils.extmath import weighted_mode
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
import numpy as np
import time
from sklearn.datasets import load_digits
from sklearn.datasets import fetch_openml
import cProfile




## K-Random-Neighbors klasse

In [101]:
class KRandomNeighbors:
    
    def __init__(self):
        self.fit_map = None
        self.stddev = None
        self.X_shape = None
        self.XTrain_view = None
        self.stddev_target_scaler = 0.5
        
    def scaled_sigmoid(self, x, a, b, c):
        return a / (1 + np.exp(c * (x - b)))
    
    def fit(self, XTrain: np.ndarray, YTrain):
        # create a hashmap with feature tuples as keys and target values as values
        self.fit_map = {tuple(x): y for x, y in zip(XTrain, YTrain)}
        
        # compute the standard deviation for each column in XTrain
        self.stddev = np.std(XTrain, axis=0)
        
        # store the shape of the training data
        self.X_shape = XTrain.shape
        
        # convert the training data of shape n*m into an ndarray of shape n*1 with tuple-like elements of form 1*m
        self.XTrain_view = XTrain.view([('', XTrain.dtype)] * XTrain.shape[1])
        
        # TODO: optimize stddev_target_scaler
        
    def predict(self, X, k, num_points=1000):
        hits = list()
        
        while len(hits) < k:
            # generate data points
            rng = np.random.default_rng()
            points: np.ndarray = rng.normal(X, self.stddev * self.stddev_target_scaler, size=(num_points, self.X_shape[1])).round(1)
            
            # convert points into structured arrays (treat rows as tuples)
            point_view = points.view([('', points.dtype)] * points.shape[1])
            
            # determine the intersection between generated points and training data
            intersection: np.ndarray = np.intersect1d(self.XTrain_view, point_view)
            print(points[0, :], end='\r', flush=True)
            
            if intersection.shape[0] > 0:
                hits.extend(intersection.tolist())
                
        # determine class for each hit
        classes = list()
        for hit in hits:
            classes.append(self.fit_map[hit])
            
        classes = np.array(classes)
        
        # TODO: assign weights for each hit, use scaled_sigmoid(), to accomplish this
        weights = np.ones(shape=classes.shape)
        
        # determine the mode (most frequently occurring class)
        prediction = weighted_mode(classes, weights)
        
        return prediction[0][0].astype(int)



## Daten Aufbereitung (Iris)



In [102]:


# Load the data from the file
df = pd.read_csv('iris.data', header=None)

# Encode plant names as numbers
le = LabelEncoder()
df[4] = le.fit_transform(df[4])

# Save as CSV
df.to_csv('iris_numeric.csv', header=False, index=False)



## Datenaufbereitung (Digits)


In [103]:
# Load the digits dataset
digits = load_digits()

# Features (pixel values of the images)
X = digits.data

# Labels (the digits)
y = digits.target

# Combine the data and labels into a DataFrame
df = pd.DataFrame(X)
df['label'] = y

# Save as CSV
df.to_csv('digits_numeric.csv', index=False, header=False)


## Datenaufbereitung (Diabetes)

In [104]:
# Load the dataset from OpenML
diabetes = fetch_openml(data_id=37, as_frame=True)  # data_id=37 for Pima Indian Diabetes

# Convert to a DataFrame
df = diabetes.frame

# Encode classes as numbers
le = LabelEncoder()
df["class"] = le.fit_transform(df["class"])

# Remove or round columns with inconvenient decimal places
df.pop("pedi")
df["mass"] = df["mass"].round(0).astype(int)

df.pop("skin")
df.pop("preg")
df.pop("pres")

# Save as CSV
df.to_csv('diabetes_numeric.csv', index=False, header=False)


## Datenaufbereitung (Banknoten)

In [105]:
# Load the dataset
banknote = fetch_openml(data_id=1462, as_frame=True)  # Banknote dataset ID

# Convert to DataFrame
df = banknote.frame

# Round to a uniform number of decimal places
df["V1"] = df["V1"].round(1)
df["V2"] = df["V2"].round(1)
df["V3"] = df["V3"].round(1)
df["V4"] = df["V4"].round(1)

# Save as CSV
df.to_csv('banknotes_numeric.csv', index=False, header=False)


## Test funktion

In [106]:
def test_knn(XTrain: np.ndarray, YTrain: np.ndarray, XTest: np.ndarray, YTest: np.ndarray, k=30):
    """
    Tests a k-NN model and prints confusion matrix as well as precision, recall, and F1-score.

    Parameters:
    - XTrain, YTrain: Training data
    - XTest, YTest: Test data
    - k: Number of neighbors
    """
    # 1) Train the KNN
    knn = KRandomNeighbors()
    knn.fit(XTrain, YTrain)
    
    # Time measurement: Total time
    start_total = time.time()

    # 2) Predictions for all test samples
    predictions = []
    total_prediction_time = 0

    for i in range(XTest.shape[0]):
        start_pred = time.time()
        pred = knn.predict(XTest[i], k)
        end_pred = time.time()

        predictions.append(pred)
        
        pred_time = end_pred - start_pred
        total_prediction_time += pred_time
        print(f"Predicted class {pred} for {XTest[i]} in {round(pred_time, 3)} sec")
    
    end_total = time.time()

    # get all classes
    classes = np.unique(np.concatenate([YTrain, YTest]))

    # 3) Confusion matrix
    cm = confusion_matrix(YTest, predictions, labels=classes)
    print("Confusion Matrix:\n", cm)

    # 4) Precision, Recall, F1-score
    precision, recall, f1, support = precision_recall_fscore_support(
        YTest, predictions, labels=classes, zero_division=0
    )

    print("\nMetrics per class:")
    for idx, label in enumerate(classes):
        print(f"Class {label}:")
        print(f"  Precision: {precision[idx]:.2f}")
        print(f"  Recall:    {recall[idx]:.2f}")
        print(f"  F1-Score:  {f1[idx]:.2f}")
        print(f"  Support:   {support[idx]} samples\n")
        
    # Average time per prediction
    avg_prediction_time = total_prediction_time / XTest.shape[0]

    # Time per feature
    num_features = XTest.shape[1]
    avg_time_per_feature = avg_prediction_time / num_features

    print("Total time:", round(end_total - start_total, 3), "seconds")
    print("Avg. time per prediction:", round(avg_prediction_time, 3), "seconds")
    print("Avg. time per feature:", round(avg_time_per_feature, 3), "seconds")

    return cm, precision, recall, f1, support


## Test

In [107]:
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [108]:
# load the numeric data as an ndarray
dataset = np.loadtxt('iris_numeric.csv', delimiter=',')
# dataset = np.loadtxt('banknotes_numeric.csv', delimiter=',')

# select features and target data
X = dataset[:, :-1]  # all columns except the last
Y = dataset[:, -1]   # last column is the class/label

allIdx = np.arange(X.shape[0])  # all indices of the data

# randomly choose 80% of the indices for training
rng = np.random.default_rng()
train_idx = rng.choice(allIdx, size=int(allIdx.shape[0] * 0.8), replace=False)
test_idx = np.delete(allIdx, train_idx)  # remaining 20% for testing

XTrain = X[train_idx]
YTrain = Y[train_idx]

XTest = X[test_idx]
YTest = Y[test_idx]
print(XTest)

# test the k-NN model with k=3
def test_predict():
    for i in range(500):
        test_knn(XTrain, YTrain, XTest, YTest, k=11)



[[5.1 3.5 1.4 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.8 3.  1.4 0.1]
 [5.7 4.4 1.5 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [4.6 3.6 1.  0.2]
 [4.9 3.1 1.5 0.1]
 [5.  3.2 1.2 0.2]
 [5.2 2.7 3.9 1.4]
 [6.  2.2 4.  1. ]
 [6.7 3.1 4.4 1.4]
 [5.6 3.  4.5 1.5]
 [6.2 2.2 4.5 1.5]
 [6.1 2.8 4.7 1.2]
 [5.5 2.4 3.8 1.1]
 [6.3 2.3 4.4 1.3]
 [5.  2.3 3.3 1. ]
 [5.7 3.  4.2 1.2]
 [5.7 2.9 4.2 1.3]
 [6.3 2.9 5.6 1.8]
 [7.6 3.  6.6 2.1]
 [6.4 2.7 5.3 1.9]
 [5.6 2.8 4.9 2. ]
 [6.3 2.7 4.9 1.8]
 [6.1 3.  4.9 1.8]
 [5.8 2.7 5.1 1.9]
 [6.8 3.2 5.9 2.3]]


In [109]:
%lprun -f KRandomNeighbors.predict test_predict()

Predicted class 0 for [5.1 3.5 1.4 0.2] in 0.012 sec
Predicted class 0 for [4.6 3.1 1.5 0.2] in 0.026 sec
Predicted class 0 for [5.  3.4 1.5 0.2] in 0.036 sec
Predicted class 0 for [4.4 2.9 1.4 0.2] in 0.061 sec
Predicted class 0 for [4.8 3.  1.4 0.1] in 0.037 sec
Predicted class 0 for [5.7 4.4 1.5 0.4] in 0.294 sec
Predicted class 0 for [5.1 3.5 1.4 0.3] in 0.016 sec
Predicted class 0 for [5.7 3.8 1.7 0.3] in 0.034 sec
Predicted class 0 for [4.6 3.6 1.  0.2] in 0.031 sec
Predicted class 0 for [4.9 3.1 1.5 0.1] in 0.029 sec
Predicted class 0 for [5.  3.2 1.2 0.2] in 0.026 sec
Predicted class 1 for [5.2 2.7 3.9 1.4] in 0.042 sec
Predicted class 1 for [6.  2.2 4.  1. ] in 0.111 sec
Predicted class 1 for [6.7 3.1 4.4 1.4] in 0.038 sec
Predicted class 1 for [5.6 3.  4.5 1.5] in 0.045 sec
Predicted class 2 for [6.2 2.2 4.5 1.5] in 0.133 sec
Predicted class 1 for [6.1 2.8 4.7 1.2] in 0.026 sec
Predicted class 1 for [5.5 2.4 3.8 1.1] in 0.067 sec
Predicted class 1 for [6.3 2.3 4.4 1.3] in 0.1

Timer unit: 1e-07 s

Total time: 1726 s
File: C:\Users\svenr\AppData\Local\Temp\ipykernel_34128\193846564.py
Function: predict at line 28

Line #      Hits         Time  Per Hit   % Time  Line Contents
    28                                               def predict(self, X, k, num_points=1000):
    29     15000     240126.0     16.0      0.0          hits = list()
    30                                           
    31    404500    3287214.0      8.1      0.0          while len(hits) < k:
    32                                                       # generate data points
    33    389500  271511778.0    697.1      1.6              rng = np.random.default_rng()
    34    389500  808344312.0   2075.3      4.7              points: np.ndarray = rng.normal(X, self.stddev * self.stddev_target_scaler, size=(num_points, self.X_shape[1])).round(1)
    35                                           
    36                                                       # convert points into structured arr

### Auswertung Banknotes

#### Confusion Matrix:
|                          | Vorhergesagt: Klasse 1.0 | Vorhergesagt: Klasse 2.0 |
|--------------------------|--------------------------|--------------------------|
| Tatsächlich: Klasse 1.0  | 141                      | 11                       |
| Tatsächlich: Klasse 2.0  | 0                        | 123                      |

#### Metriken pro Klasse:
Klasse 1.0:
- Präzision: 1.00
- Recall:    0.93
- F1-Score:  0.96
- Support:   152 Beispiele

Klasse 2.0:
- Präzision: 0.92
- Recall:    1.00
- F1-Score:  0.96
- Support:   123 Beispiele

#### Zeitmessungen:
- Gesamtzeit: 295.178 Sekunden
- Ø Zeit pro Vorhersage: 1.073 Sekunden
- Ø Zeit pro Feature: 0.268 Sekunden

# Auswertung Iris

## Konfusionsmatrix

|               | Vorhergesagt: Klasse 0 | Vorhergesagt: Klasse 1 | Vorhergesagt: Klasse 2 |
|---------------|------------------------|------------------------|------------------------|
| **Tatsächliche Klasse 0** | 11                     | 0                      | 0                      |
| **Tatsächliche Klasse 1** | 0                      | 9                      | 2                      |
| **Tatsächliche Klasse 2** | 0                      | 2                      | 6                      |



## Metriken pro Klasse

### Klasse 0.0
- **Precision:** 1.00  
- **Recall:** 1.00  
- **F1-Score:** 1.00  
- **Support:** 11 Samples  

### Klasse 1.0
- **Precision:** 0.82  
- **Recall:** 0.82  
- **F1-Score:** 0.82  
- **Support:** 11 Samples  

### Klasse 2.0
- **Precision:** 0.75  
- **Recall:** 0.75  
- **F1-Score:** 0.75  
- **Support:** 8 Samples  

## Zeitmessung

- **Gesamtdauer:** 0.409 Sekunden  
- **Ø  Zeit pro Vorhersage:** 0.014 Sekunden  
- **Ø  Zeit pro Feature:** 0.003 Sekunden


# Idee für bessere Skalierbarkeit mit Dimensionen

Divide and Conquer: Wenn mehr als 4 Features, daten in mehrere Tabellen aufteilen und in jeder Tabelle getrennt von einander Schießen. Anschließend die Trefferklassen kombinieren. ACHTUNG: wirkt sich auch auf potentiellen Bias aus.
