# K-Random-Neighbors

In [118]:
import pandas as pd
from sklearn.utils.extmath import weighted_mode
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
import numpy as np
import time
from sklearn.datasets import load_digits
from sklearn.datasets import fetch_openml
from itertools import combinations
import multiprocessing


## K-Random-Neighbors klasse

In [119]:
class KRandomNeighbors:
    """Classifier that votes over training samples hit by random points.

    For a query, random points are drawn from a normal distribution centred
    on the query, rounded, and compared for exact equality against the
    training rows. The classes of the training rows that were hit are
    combined with a (currently unweighted) mode.
    """

    def __init__(self):
        # feature tuple -> target value, filled by fit()
        self.fit_map = None
        # per-column standard deviation of the training data
        self.stddev = None
        # shape of the training matrix passed to fit()
        self.X_shape = None
        # training rows as an (n, 1) structured array, one record per row
        self.XTrain_view = None
        # factor applied to stddev to obtain the sampling scale in predict()
        self.stddev_target_scaler = 0.5

    def scaled_sigmoid(self, x, a, b, c):
        """Return ``a / (1 + exp(c * (x - b)))``."""
        return a / (1 + np.exp(c * (x - b)))

    def fit(self, XTrain: np.ndarray, YTrain):
        """Memorise the training data.

        Parameters:
        - XTrain: 2-D array of shape (n_samples, n_features)
        - YTrain: iterable with one target value per row of XTrain

        Rows with identical features share one key in the lookup table, so
        the target of the last such row wins.
        """
        # create a hashmap with feature tuples as keys and target values as values
        self.fit_map = {tuple(row): target for row, target in zip(XTrain, YTrain)}

        # compute the standard deviation for each column in XTrain
        self.stddev = np.std(XTrain, axis=0)

        # store the shape of the training data
        self.X_shape = XTrain.shape

        # Convert the n*m training data into an n*1 array of m-field records.
        # ndarray.view() to a wider dtype requires a contiguous last axis,
        # which column-subset or Fortran-ordered inputs do not have, so force
        # a C-contiguous layout first (no copy if it already is one).
        contiguous = np.ascontiguousarray(XTrain)
        self.XTrain_view = contiguous.view(
            [('', contiguous.dtype)] * contiguous.shape[1]
        )

        # TODO: optimize stddev_target_scaler

    def predict(self, X, k, num_points=1000, decimals=1):
        """Predict the class of a single sample.

        Parameters:
        - X: feature vector of the sample (length n_features)
        - k: minimum number of hits to collect before voting
        - num_points: number of random points generated per round
        - decimals: number of decimals the random points are rounded to;
          hits require exact equality, so this should match the precision
          of the training data

        Returns the most frequent class among the hits, cast to int.

        Note: rounds are repeated until at least k hits were collected, so
        the call does not return if no training row can ever be hit.
        """
        # one generator for all rounds instead of a new one per iteration
        rng = np.random.default_rng()
        scale = self.stddev * self.stddev_target_scaler
        n_features = self.X_shape[1]

        hits = []
        while len(hits) < k:
            # generate data points around the query
            points: np.ndarray = rng.normal(
                X, scale, size=(num_points, n_features)
            ).round(decimals)

            # convert points into structured arrays (treat rows as tuples)
            point_view = points.view([('', points.dtype)] * n_features)

            # training rows that coincide with at least one generated point;
            # intersect1d returns each such row once per round
            intersection: np.ndarray = np.intersect1d(self.XTrain_view, point_view)

            # extending with an empty list is a no-op, so no emptiness check
            hits.extend(intersection.tolist())

        # determine class for each hit
        classes = np.array([self.fit_map[hit] for hit in hits])

        # TODO: assign weights for each hit, use scaled_sigmoid(), to accomplish this
        weights = np.ones(shape=classes.shape)

        # determine the mode (most frequently occurring class)
        prediction = weighted_mode(classes, weights)

        return prediction[0][0].astype(int)


# K-Random-Ensemble

In [None]:
class KRandomEnsable:
    """Ensemble of KRandomNeighbors workers, each trained on a feature subset."""

    def __init__(self):
        # number of features each worker is trained on (may be lowered by
        # get_feature_combinations when the data has too few columns)
        self.COMBINED_FEATURES: int = 4
        # trained workers, one per chosen feature combination
        self.WORKERS: list[KRandomNeighbors] = list()

    def get_feature_combinations(self, dims):
        """Return all index combinations of COMBINED_FEATURES out of dims features.

        Parameters:
        - dims: number of feature columns available

        Returns a 2-D integer array with one combination per row.
        Side effect: COMBINED_FEATURES is reduced to dims when dims is
        smaller, and to dims - 1 when both are equal.
        """
        # make sure that feature combinations are not longer than existing features
        if dims < self.COMBINED_FEATURES:
            self.COMBINED_FEATURES = dims
        elif dims == self.COMBINED_FEATURES:
            self.COMBINED_FEATURES -= 1

        # all combinations of feature indices
        return np.array(list(combinations(range(dims), self.COMBINED_FEATURES)))

    def fit(self, XTrain: np.ndarray, YTrain):
        """Train one KRandomNeighbors worker per randomly chosen feature subset.

        Parameters:
        - XTrain: 2-D array of shape (n_samples, n_features)
        - YTrain: target values, one per row of XTrain

        Trained workers are appended to WORKERS.
        """
        # get possible combinations of features
        combs = self.get_feature_combinations(XTrain.shape[1])

        # use as many combinations as there are cores, unless there are
        # fewer combinations than cores
        worker_count = min(multiprocessing.cpu_count(), combs.shape[0])

        # choose the combinations on which the workers are trained
        rng = np.random.default_rng()
        comb_idxs = rng.choice(combs.shape[0], worker_count, replace=False)
        combs = combs[comb_idxs]

        # train each worker with one combination
        for current_combo in combs:
            # Training data restricted to the columns of this combination.
            # Made C-contiguous because the worker reinterprets rows as
            # records, which needs a contiguous last axis.
            X_subset = np.ascontiguousarray(XTrain[:, current_combo])

            # Workers are plain KRandomNeighbors models; creating another
            # ensemble here would call this method again without end.
            worker = KRandomNeighbors()
            worker.fit(X_subset, YTrain)

            # TODO: evaluate trustworthiness of the worker
            # NOTE(review): current_combo is not stored with the worker; a
            # later prediction step would need it to select the same columns.

            self.WORKERS.append(worker)
        

## Testfunktion

In [121]:
def test_knn(XTrain: np.ndarray, YTrain: np.ndarray, XTest: np.ndarray, YTest: np.ndarray, k=30):
    """
    Fit a KRandomNeighbors model, classify every test sample and score the result.

    Parameters:
    - XTrain, YTrain: training features and labels
    - XTest, YTest: test features and labels
    - k: value forwarded as ``k`` to every ``predict`` call

    Returns:
    A tuple ``(cm, precision, recall, f1, support, total_time,
    time_per_prediction, time_per_feature)``. The first five entries come
    from the sklearn metric functions, computed over all labels found in
    YTrain and YTest; the three timings are seconds rounded to 3 decimals.
    Fitting is not included in the timings.
    """
    model = KRandomNeighbors()
    model.fit(XTrain, YTrain)

    predicted = []
    summed_prediction_time = 0

    # wall-clock time of the whole prediction loop, bookkeeping included
    wall_start = time.time()
    for sample in XTest:
        tick = time.time()
        label = model.predict(sample, k)
        tock = time.time()

        predicted.append(label)
        summed_prediction_time += tock - tick
    wall_end = time.time()

    # every label that occurs in either split, so both metrics share one label order
    labels = np.unique(np.concatenate([YTrain, YTest]))

    cm = confusion_matrix(YTest, predicted, labels=labels)
    precision, recall, f1, support = precision_recall_fscore_support(
        YTest, predicted, labels=labels, zero_division=0
    )

    # mean duration of one predict() call, and that mean divided by the feature count
    mean_per_prediction = summed_prediction_time / XTest.shape[0]
    mean_per_feature = mean_per_prediction / XTest.shape[1]

    return (
        cm,
        precision,
        recall,
        f1,
        support,
        round(wall_end - wall_start, 3),
        round(mean_per_prediction, 3),
        round(mean_per_feature, 3),
    )


## Test

In [122]:
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [123]:
# Load the numeric data as an ndarray (comma-separated text file).
dataset = np.loadtxt('iris_synthetic.csv', delimiter=',')
# dataset = np.loadtxt('banknotes_numeric.csv', delimiter=',')

# Split into features and target.
X = dataset[:, :-1]  # all columns except the last
Y = dataset[:, -1]   # last column is the class/label

allIdx = np.arange(X.shape[0])  # all row indices of the data

# Seeded generator so the train/test draws below are reproducible.
rng = np.random.default_rng(42)
# Maps training-set size i -> result tuple returned by test_knn.
tests = dict()

# NOTE(review): range(30, 30, 30) is empty, so the loop body never runs and
# `tests` stays empty -- confirm the intended start/stop/step.
for i in range(30, 30, 30):
    # train_idx = rng.choice(allIdx, size=int(allIdx.shape[0] * 0.2), replace=False)
    # draw i distinct row indices for training
    train_idx = rng.choice(allIdx, size=int(i), replace=False)
    test_idx = np.delete(allIdx, train_idx)  # all indices not drawn for training
    test_idx = rng.choice(test_idx, size=100, replace=False)  # keep 100 of them for testing

    XTrain = X[train_idx]
    YTrain = Y[train_idx]

    XTest = X[test_idx]
    YTest = Y[test_idx]


    # %lprun -f KRandomNeighbors.predict test_knn(XTrain, YTrain, XTest, YTest, k=11)

    # run the evaluation and print only the three timing values
    tests[i] = test_knn(XTrain, YTrain, XTest, YTest, k=11)
    print(i, list(tests[i][-3:]))

### Auswertung Banknotes

#### Confusion Matrix:
|                          | Vorhergesagt: Klasse 1.0 | Vorhergesagt: Klasse 2.0 |
|--------------------------|--------------------------|--------------------------|
| Tatsächlich: Klasse 1.0  | 141                      | 11                       |
| Tatsächlich: Klasse 2.0  | 0                        | 123                      |

#### Metriken pro Klasse:
Klasse 1.0:
- Präzision: 1.00
- Recall:    0.93
- F1-Score:  0.96
- Support:   152 Beispiele

Klasse 2.0:
- Präzision: 0.92
- Recall:    1.00
- F1-Score:  0.96
- Support:   123 Beispiele

#### Zeitmessungen:
- Gesamtzeit: 295.178 Sekunden
- Ø Zeit pro Vorhersage: 1.073 Sekunden
- Ø Zeit pro Feature: 0.268 Sekunden

# Auswertung Iris

## Konfusionsmatrix

|               | Vorhergesagt: Klasse 0 | Vorhergesagt: Klasse 1 | Vorhergesagt: Klasse 2 |
|---------------|------------------------|------------------------|------------------------|
| **Tatsächliche Klasse 0** | 11                     | 0                      | 0                      |
| **Tatsächliche Klasse 1** | 0                      | 9                      | 2                      |
| **Tatsächliche Klasse 2** | 0                      | 2                      | 6                      |



## Metriken pro Klasse

### Klasse 0.0
- **Precision:** 1.00  
- **Recall:** 1.00  
- **F1-Score:** 1.00  
- **Support:** 11 Samples  

### Klasse 1.0
- **Precision:** 0.82  
- **Recall:** 0.82  
- **F1-Score:** 0.82  
- **Support:** 11 Samples  

### Klasse 2.0
- **Precision:** 0.75  
- **Recall:** 0.75  
- **F1-Score:** 0.75  
- **Support:** 8 Samples  

## Zeitmessung

- **Gesamtdauer:** 0.409 Sekunden  
- **Ø  Zeit pro Vorhersage:** 0.014 Sekunden  
- **Ø  Zeit pro Feature:** 0.003 Sekunden


# Idee für bessere Skalierbarkeit mit Dimensionen

Divide and Conquer: Bei mehr als 4 Features die Daten in mehrere Tabellen aufteilen und in jeder Tabelle getrennt voneinander „schießen“ (Zufallspunkte erzeugen). Anschließend die Trefferklassen kombinieren. ACHTUNG: Das wirkt sich auch auf einen potenziellen Bias aus.


In [124]:
# Train an ensemble on a fresh random split of the data loaded above
# (uses the notebook globals rng, allIdx, X and Y).
g = KRandomEnsable()

# train_idx = rng.choice(allIdx, size=int(allIdx.shape[0] * 0.2), replace=False)
# draw 150 distinct row indices for training
train_idx = rng.choice(allIdx, size=150, replace=False)
test_idx = np.delete(allIdx, train_idx)  # all indices not drawn for training
test_idx = rng.choice(test_idx, size=100, replace=False)  # keep 100 of them for testing

XTrain = X[train_idx]
YTrain = Y[train_idx]

XTest = X[test_idx]
YTest = Y[test_idx]


# only fitting is done here; XTest/YTest are prepared but not used in this cell
g.fit(XTrain, YTrain)

[5.9 3.1 4.3]
[5.9 3.1 4.3 1.6]
[5.9 3.1 1.6]
[5.9 3.1 4.3 1.6]
[3.1 4.3 1.6]
[5.9 3.1 4.3 1.6]
[5.9 4.3 1.6]
[5.9 3.1 4.3 1.6]
