In [1]:
import logging
from collections.abc import Callable
from datetime import datetime
from json import load
from typing import Final, Literal

from numpy import ndarray, where
from pandas import DataFrame, Series, concat, read_csv, set_option
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import SGDOneClassSVM
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

# Show every column when a DataFrame is displayed.
set_option("display.max_columns", None)

# Number of best-ranked candidates kept when refining a search.
NUM_TRIALS: Final[int] = 20
# Destination file for the search logs.
LOGS_PATH: Final[str] = "../reports/logs_grid.log"
# JSON file with one configuration entry per model name.
CONF_PATH: Final[str] = "../conf/model_configs.json"

# Unfitted estimators, keyed by the names used in the JSON configuration.
MODELS = dict(
    SGDOneClassSVM=SGDOneClassSVM(),
    OneClassSVM=OneClassSVM(kernel="rbf"),
    IsolationForest=IsolationForest(),
    # Set novelty=True for training/prediction workflow
    LocalOutlierFactor=LocalOutlierFactor(novelty=True),
)
logging.basicConfig(level=logging.INFO, filename=LOGS_PATH)

with open(CONF_PATH, "r") as f:
    MODEL_CONFIGS = load(f)

# Attach the matching estimator instance to every configuration entry.
for model in MODEL_CONFIGS:
    MODEL_CONFIGS[model]["model"] = MODELS[model]

# From here on MODELS maps model name -> configuration (including "model").
MODELS = MODEL_CONFIGS

In [2]:
# Load the pre-split PAMAP2 features and labels (same order as the names).
X_train, X_test, y_train, y_test = map(
    read_csv,
    (
        "../data/PAMAP2/x_train_data.csv",
        "../data/PAMAP2/x_test_data.csv",
        "../data/PAMAP2/y_train_data.csv",
        "../data/PAMAP2/y_test_data.csv",
    ),
)

# Keep the label next to the features so rows can be filtered per activity.
X_train["activity"] = y_train  # First 80% of the data
X_test["activity"] = y_test  # Last 20% of the data

# Module-level state shared with the scoring/search helpers; the four
# data variables are only declared here and assigned inside the search loops.
models: dict[int, dict] = {}
training_data: DataFrame
testing_data: DataFrame
train_targets: Series
test_targets: Series

RESULTS: dict[int, dict[str, float | int]] = {}
# Size of the least represented activity in the training split.
MIN_SAMPLES = X_train["activity"].value_counts(ascending=True).iloc[0]
# False until the first (full) search of a loop has been run.
MAXIMAZED = False

In [3]:
def score_function(
    model: OneClassSVM | SGDOneClassSVM | IsolationForest | LocalOutlierFactor,
    Train: DataFrame,  # only for API compliance
    test: Series,  # only for API compliance
) -> float:
    """
    Objective function to maximize: F1 score of ``model`` on the novelty set.

    The signature follows scikit-learn's ``scorer(estimator, X, y)`` callable
    convention, but the fold handed in by the search is ignored. The model is
    always evaluated on the module-level ``testing_data`` / ``test_targets``.
    Every evaluation is also written to the log.

    Args:
        model: Fitted estimator to evaluate; a prediction of ``-1`` is
            counted as a novelty.
        Train (DataFrame): Fold features, unused (only for API compliance).
        test (Series): Fold targets, unused (only for API compliance).

    Returns:
        float: F1 score on the module-level novelty set.
    """
    # The comparison already yields the boolean "is novelty" prediction.
    predicted_novelty = model.predict(testing_data) == -1
    f1 = f1_score(test_targets, predicted_novelty)
    logging.info(
        {
            "target": f1,
            "params": model.get_params(),
            # "%S" is seconds; the previous "%s" is a non-portable epoch code.
            "datetime": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }
    )
    return float(f1)


def update_train_vars(
    i: int, activities: ndarray
) -> tuple[DataFrame, Series, DataFrame, Series]:
    """
    Build the training and evaluation sets for the ``i``-th incremental step.

    Activities ``activities[:i]`` are treated as known and ``activities[i]``
    as the novelty to detect.

    Args:
        i (int): Number of known activities; also the index of the novel one.
        activities (ndarray): Activity labels in the order they are introduced.

    Returns:
        tuple: ``(training features, training targets, evaluation features,
        evaluation targets)``. Training targets are all ``False``. The
        evaluation set holds the novel activity (``True``) plus a 15% sample
        of the training rows (``False``).

    NOTE(review): only ``isNovelty`` is dropped from the returned features, so
    the ``activity`` label column is still passed to the models. This looks
    like label leakage; confirm whether it is intended.
    """
    known_mask = X_train["activity"].isin(activities[:i])
    # .assign returns a new frame, so no write lands on a filtered slice
    # (the previous .loc assignment triggered SettingWithCopyWarning).
    training = (  # picks the first n samples of each class
        X_train[known_mask]
        .groupby("activity")
        .head(MIN_SAMPLES)
        .assign(isNovelty=False)
    )
    testing = (
        X_test[X_test["activity"] == activities[i]]
        .head(MIN_SAMPLES)
        .assign(isNovelty=True)
    )
    novelty = concat(
        [testing, training.sample(n=int(0.15 * len(training)), random_state=42)]
    )
    return (
        training.drop(columns=["isNovelty"]),
        training["isNovelty"],
        # current activity (as novelty) plus a sample of known rows
        novelty.drop(columns=["isNovelty"]),
        novelty["isNovelty"],
    )


def train_search_method(
    seach_type: Literal["grid", "random"],
    params: dict[str, list],
    scoring: Callable,
    n_iter: int = 100,
    cv: int = 4,
    verbose: int = 1,
    random_state: int = 42,  # used only if seach_type == "random"
) -> RandomizedSearchCV | GridSearchCV:
    """
    Fit a hyperparameter search for an RBF ``OneClassSVM``.

    The search is fitted on the module-level ``training_data`` and
    ``train_targets``.

    Args:
        seach_type: ``"random"`` selects ``RandomizedSearchCV``; any other
            value selects ``GridSearchCV``.
        params: Candidate values per hyperparameter.
        scoring: Scorer callable handed to the search.
        n_iter: Number of sampled candidates (random search only).
        cv: Number of cross-validation folds.
        verbose: Verbosity level of the search.
        random_state: Seed for candidate sampling (random search only).

    Returns:
        The fitted search object.
    """
    shared_kwargs = dict(
        estimator=OneClassSVM(kernel="rbf"),
        scoring=scoring,
        cv=cv,
        verbose=verbose,
        error_score="raise",  # surface failing fits instead of scoring NaN
    )
    if seach_type == "random":
        search = RandomizedSearchCV(
            param_distributions=params,
            n_iter=n_iter,
            random_state=random_state,
            **shared_kwargs,
        )
    else:
        search = GridSearchCV(param_grid=params, **shared_kwargs)
    return search.fit(training_data, train_targets)

## Hyperparameters Description:

- tol: Defines the tolerance for the optimization solver (SMO). The algorithm stops iterating when the improvement in the objective function is smaller than this value.
- nu: Controls the fraction of the dataset that is considered anomalies/outliers. It must be in the range (0, 1].   
  - Aggressive anomaly detection for nu > 0.5.
- gamma: Defines how much influence a single training example has. This is the parameter for the Gaussian kernel
  - $ K(x_i, x_j) = \exp(-\gamma \lVert x_i - x_j \rVert^2) $

In [4]:
def update_params_grid(cv_results: dict[str, list]) -> dict[str, list]:
    """
    Derive a narrowed grid from the best candidates of a previous search.

    Keeps the ``NUM_TRIALS`` best-ranked candidates and returns, for each of
    ``gamma``, ``nu`` and ``tol``, their distinct values in ranking order.

    Args:
        cv_results: ``cv_results_`` of a fitted search; must contain
            ``rank_test_score``, ``param_gamma``, ``param_nu``, ``param_tol``.

    Returns:
        dict[str, list]: Hyperparameter name -> list of unique values.
    """
    hyperparams = ["gamma", "nu", "tol"]
    candidate_rows = zip(
        cv_results["rank_test_score"],
        cv_results["param_gamma"],
        cv_results["param_nu"],
        cv_results["param_tol"],
    )
    ranked = DataFrame(candidate_rows, columns=["rank_test_score", *hyperparams])
    best = ranked.sort_values("rank_test_score").head(NUM_TRIALS)
    values_by_param = best.drop(columns="rank_test_score").to_dict(orient="list")

    narrowed: dict[str, list] = {}
    for name in hyperparams:
        # dict.fromkeys removes duplicates while keeping first-seen order.
        narrowed[name] = list(dict.fromkeys(values_by_param[name]))
    return narrowed


# Incremental grid search: the first step explores the full grid, every later
# step only explores the values of the best NUM_TRIALS previous candidates.

# Reset up front as well, so a run interrupted mid-loop cannot leave the flag
# set and make a re-run skip the full grid or reuse a stale `grid_search`.
MAXIMAZED = False

for i in range(1, len((activities := X_train["activity"].unique()))):
    # Publish the data of this step to the globals read by the helpers.
    training_data, train_targets, testing_data, test_targets = update_train_vars(
        i, activities
    )
    print(f"Training for activities {activities[:i]}")
    if not MAXIMAZED:
        grid_search = train_search_method(
            "grid",
            {
                "tol": [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
                # Tolerance for Stopping Criterion
                "nu": [0.01, 0.05, 0.1, 0.25, 0.5],
                # Upper Bound on Outliers & Lower Bound on Support Vectors
                "gamma": [0.001, 0.01, 0.1, 1],
                # Kernel Coefficient for RBF Kernel
            },
            score_function,
        )
        MAXIMAZED = True
    else:
        print(f"Already maximized, sugesting new {NUM_TRIALS} points")
        grid_search = train_search_method(
            "grid",
            update_params_grid(grid_search.cv_results_),  # type: ignore
            score_function,
        )
    print("Grid Search Best Params:", grid_search.best_params_)

# Leave the flag cleared for the cells that follow.
MAXIMAZED = False

Training for activities [1]
Fitting 4 folds for each of 100 candidates, totalling 400 fits
Grid Search Best Params: {'gamma': 0.001, 'nu': 0.01, 'tol': 1e-05}
Training for activities [1 2]
Already maximized, sugesting new 20 points
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Grid Search Best Params: {'gamma': 0.001, 'nu': 0.01, 'tol': 0.001}
Training for activities [1 2 3]
Already maximized, sugesting new 20 points
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Grid Search Best Params: {'gamma': 0.001, 'nu': 0.01, 'tol': 0.0001}
Training for activities [1 2 3 4]
Already maximized, sugesting new 20 points
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Grid Search Best Params: {'gamma': 0.001, 'nu': 0.01, 'tol': 0.001}
Training for activities [1 2 3 4 5]
Already maximized, sugesting new 20 points
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Grid Search Best Params: {'gamma': 0.001, 'nu': 0.05, 'tol': 1e-05}
Training for act

In [None]:
# Incremental random search: the first step samples the default number of
# candidates, every later step samples only NUM_TRIALS candidates.
params = [
    "random",
    {
        # 0.075 replaces 0.75, which broke the ascending sequence and looked
        # like a typo (it also lies above the other candidates' 0.5 maximum).
        "nu": [0.01, 0.025, 0.05, 0.075, 0.1, 0.2, 0.3, 0.4, 0.5],
        "gamma": [0.001, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1],
        "tol": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    },
    score_function,
]

# Reset up front as well, so a run interrupted mid-loop cannot leave the flag
# set and make a re-run skip the initial, larger search.
MAXIMAZED = False

for i in range(1, len((activities := X_train["activity"].unique()))):
    # Publish the data of this step to the globals read by the helpers.
    training_data, train_targets, testing_data, test_targets = update_train_vars(
        i, activities
    )
    print(f"Training for activities {activities[:i]}")
    if not MAXIMAZED:
        random_search = train_search_method(*params)
        MAXIMAZED = True
    else:
        print(f"Already maximized, sugesting new {NUM_TRIALS} points")
        random_search = train_search_method(*params, n_iter=NUM_TRIALS)
    print(
        f"Random Search. Best Params: {random_search.best_params_} Best Score: {random_search.best_score_}"
    )

# Leave the flag cleared for the cells that follow.
MAXIMAZED = False

Training for activities [1]
Fitting 4 folds for each of 100 candidates, totalling 400 fits
Random Search. Best Params: {'tol': 0.01, 'nu': 0.01, 'gamma': 0.001} Best Score: 0.9813838151877367
Training for activities [1 2]
Already maximized, sugesting new 20 points
Fitting 4 folds for each of 20 candidates, totalling 80 fits
Random Search. Best Params: {'tol': 0.0001, 'nu': 0.05, 'gamma': 0.01} Best Score: 0.957762225276991
Training for activities [1 2 3]
Already maximized, sugesting new 20 points
Fitting 4 folds for each of 20 candidates, totalling 80 fits
Random Search. Best Params: {'tol': 0.0001, 'nu': 0.05, 'gamma': 0.01} Best Score: 0.9387220334280943
Training for activities [1 2 3 4]
Already maximized, sugesting new 20 points
Fitting 4 folds for each of 20 candidates, totalling 80 fits
Random Search. Best Params: {'tol': 0.001, 'nu': 0.05, 'gamma': 0.01} Best Score: 0.8220964658921611
Training for activities [1 2 3 4 5]
Already maximized, sugesting new 20 points
Fitting 4 folds f