In [None]:
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the dataset from the interim data directory; the path is relative to
# the notebook's working directory.
df = pd.read_csv("../data/interim/complete_data.csv")

In [None]:
class CustomKNNClassifier:
    """
    Thin wrapper around scikit-learn's KNeighborsClassifier that bundles
    feature scaling, label encoding and stratified cross-validation scoring.
    """

    def __init__(self, metric: str = "euclidean"):
        """
        Initialize KNN classifier with specified distance metric

        Args:
            metric: Distance metric ('euclidean' or 'cosine'); forwarded
                unchanged to KNeighborsClassifier in `cross_validate`.
        """
        self.metric = metric
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()

    def preprocess_data(self, X: np.ndarray, y: list) -> tuple[np.ndarray, np.ndarray]:
        """
        Preprocess the data by scaling features and encoding labels

        Both the scaler and the label encoder are (re)fitted on the data
        passed in, so every call overwrites any previously learned state.

        Args:
            X: Feature matrix.
            y: Class labels, one per row of X.

        Returns:
            Tuple of (scaled features, integer-encoded labels).

        NOTE(review): when the output is fed to `cross_validate`, the scaler
        has already seen the rows that later become validation folds —
        confirm this is acceptable for the reported scores.
        """
        return self.scaler.fit_transform(X), self.label_encoder.fit_transform(y)

    def cross_validate(
        self, X: np.ndarray, y: np.ndarray, k: int, n_folds: int = 10
    ) -> dict[str, float]:
        """
        Perform k-fold cross-validation and calculate metrics

        Args:
            X: Feature matrix; indexed with integer arrays, so it must
                support NumPy-style fancy indexing.
            y: Labels aligned with X. The top-3 metric compares labels with
                probability column indices, so labels are presumably the
                0..n_classes-1 codes produced by `preprocess_data`.
            k: Number of neighbours for the KNN model.
            n_folds: Number of stratified folds (shuffled, seed 42).

        Returns:
            Mean over folds of "f1" (weighted), "accuracy", "auc"
            (one-vs-rest) and "top_3_accuracy".

        NOTE(review): the full probability matrix is passed to
        roc_auc_score with multi_class="ovr"; this looks written for a
        multiclass target — confirm behaviour if labels are binary.
        """
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
        metrics = defaultdict(list)

        for train_idx, val_idx in skf.split(X, y):
            knn = KNeighborsClassifier(n_neighbors=k, metric=self.metric).fit(
                X[train_idx],
                y[train_idx],
            )

            X_val, y_val = X[val_idx], y[val_idx]
            y_pred = knn.predict(X_val)
            y_prob = knn.predict_proba(X_val)

            metrics["f1"].append(f1_score(y_val, y_pred, average="weighted"))
            metrics["accuracy"].append(accuracy_score(y_val, y_pred))
            metrics["auc"].append(roc_auc_score(y_val, y_prob, multi_class="ovr"))
            metrics["top_3_accuracy"].append(
                self.calculate_top_k_accuracy(y_prob, y_val, k=3)
            )

        return {key: np.mean(values) for key, values in metrics.items()}

    @staticmethod
    def calculate_top_k_accuracy(
        y_prob: np.ndarray, y_true: np.ndarray, k: int
    ) -> float:
        """
        Calculate top-k accuracy

        A sample counts as correct when its true label equals the column
        index of one of the k largest probabilities in its row.

        Args:
            y_prob: 2-D array of per-class scores, one row per sample.
            y_true: True labels expressed as column indices of y_prob, one
                per row of y_prob.
            k: Number of highest-scoring classes to consider; must be >= 1.
                Values larger than the number of columns consider all of them.

        Returns:
            Fraction of samples whose true label is among the top k.

        Raises:
            ValueError: If k is smaller than 1.
            ZeroDivisionError: If y_true is empty.
        """
        # `-k:` with k == 0 is `0:` and would select every column, silently
        # reporting (near-)perfect accuracy; negative k is equally meaningless.
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k}")

        y_true = np.asarray(y_true)
        top_k_predictions = np.argsort(y_prob, axis=1)[:, -k:]
        # Row-wise membership test: is the true label among that row's top k?
        hits = (top_k_predictions == y_true[:, None]).any(axis=1)
        return int(np.count_nonzero(hits)) / len(y_true)


def bayesian_optimize_knn(X: np.ndarray, y: np.ndarray, metric: str) -> dict:
    """
    Perform Bayesian optimization for KNN hyperparameters

    Searches the number of neighbours `k` that maximizes the mean
    cross-validated weighted F1-score of a CustomKNNClassifier.

    Args:
        X: Raw (unscaled) feature matrix.
        y: Raw class labels aligned with X.
        metric: Distance metric passed to CustomKNNClassifier.

    Returns:
        Dict with "best_k" (int) and "best_score" (the best mean F1 found).
    """
    # Scaling and label encoding depend only on X and y, not on k, so do
    # them once instead of on every objective evaluation.
    classifier = CustomKNNClassifier(metric=metric)
    X_scaled, y_encoded = classifier.preprocess_data(X, y)

    def objective(k):
        # The optimizer proposes continuous values; truncate to an integer
        # neighbour count.
        k = int(k)
        metrics = classifier.cross_validate(X_scaled, y_encoded, k)
        return metrics["f1"]  # Optimize for F1-score

    optimizer = BayesianOptimization(
        f=objective, pbounds={"k": (1, 15)}, random_state=42
    )

    optimizer.maximize(n_iter=20)

    return {
        # Same truncation as in `objective`, so this is the k that was scored.
        "best_k": int(optimizer.max["params"]["k"]),
        "best_score": optimizer.max["target"],
    }

In [None]:
def compare_metrics(euclidean_results: dict, cosine_results: dict) -> pd.DataFrame:
    """
    Tabulate the scores of both distance metrics side by side.

    Args:
        euclidean_results: Mapping of metric name to score for Euclidean distance.
        cosine_results: Mapping of metric name to score for cosine distance.

    Returns:
        DataFrame indexed by metric name ("accuracy", "f1", "auc",
        "top_3_accuracy") with one column per distance metric.
    """
    metric_names = ["accuracy", "f1", "auc", "top_3_accuracy"]

    columns = {}
    for column_label, results in (
        ("Euclidean", euclidean_results),
        ("Cosine", cosine_results),
    ):
        columns[column_label] = [results[name] for name in metric_names]

    return pd.DataFrame(columns, index=metric_names)


def plot_results(euclidean_results: dict, cosine_results: dict) -> None:
    """
    Draw a grouped bar chart comparing both distance metrics.

    For each of "accuracy", "f1", "auc" and "top_3_accuracy" two adjacent
    bars are drawn (Euclidean on the left, cosine on the right) and the
    figure is shown.

    Args:
        euclidean_results: Mapping of metric name to score for Euclidean distance.
        cosine_results: Mapping of metric name to score for cosine distance.
    """
    metric_names = ["accuracy", "f1", "auc", "top_3_accuracy"]
    positions = np.arange(len(metric_names))
    bar_width = 0.35

    _, ax = plt.subplots(figsize=(10, 6))

    # Each series is shifted half a bar width off the tick so the pair
    # straddles it.
    series = (
        (-bar_width / 2, euclidean_results, "Euclidean"),
        (bar_width / 2, cosine_results, "Cosine"),
    )
    for offset, results, series_label in series:
        heights = [results[name] for name in metric_names]
        ax.bar(positions + offset, heights, bar_width, label=series_label)

    ax.set_ylabel("Score")
    ax.set_title("Metric Comparison: Euclidean vs Cosine Distance")
    ax.set_xticks(positions)
    ax.set_xticklabels(metric_names)
    ax.legend()

    plt.tight_layout()
    plt.show()