In [None]:
import logging
from datetime import datetime
from json import load
from typing import Final

from numpy import ndarray, where
from pandas import DataFrame, Series, concat, read_csv, set_option
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import SGDOneClassSVM
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

# Show every column when a wide DataFrame is displayed.
set_option("display.max_columns", None)

NUM_TRIALS: Final[int] = 5
LOGS_PATH: Final[str] = "../reports/logs_normal.log"
CONF_PATH: Final[str] = "../conf/model_configs.json"

# One default estimator instance per model name used in the JSON config.
MODELS = {
    "SGDOneClassSVM": SGDOneClassSVM(),
    "OneClassSVM": OneClassSVM(kernel="rbf"),
    "IsolationForest": IsolationForest(),
    # novelty=True is set for the training/prediction workflow
    "LocalOutlierFactor": LocalOutlierFactor(novelty=True),
}
logging.basicConfig(filename=LOGS_PATH, level=logging.INFO)

# Read the per-model configuration; top-level keys are looked up in MODELS.
with open(CONF_PATH, "r") as f:
    MODEL_CONFIGS = load(f)

# Attach the estimator instance to its configuration entry under "model".
for model, model_config in MODEL_CONFIGS.items():
    model_config["model"] = MODELS[model]

# From here on MODELS is the same object as the enriched MODEL_CONFIGS.
MODELS = MODEL_CONFIGS

In [2]:
# Targets are read first so they can be attached while loading the features.
y_train = read_csv("../data/PAMAP2/y_train_data.csv")
y_test = read_csv("../data/PAMAP2/y_test_data.csv")

# Each feature frame carries its label in an extra "activity" column.
X_train = read_csv("../data/PAMAP2/x_train_data.csv").assign(
    activity=y_train  # First 80% of the data
)
X_test = read_csv("../data/PAMAP2/x_test_data.csv").assign(
    activity=y_test  # Last 20% of the data
)

# Module-level slots that the search loops rebind on every iteration and that
# score_function reads as globals.
training_data: DataFrame
train_targets: Series
testing_data: DataFrame
test_targets: Series

models: dict[int, dict] = {}
RESULTS: dict[int, dict[str, float | int]] = {}

# Row count of the least frequent activity in the training split.
MIN_SAMPLES = X_train["activity"].value_counts().min()
# Flag flipped to True once the first (wide) search has been run.
MAXIMAZED = False

In [3]:
def score_function(
    model: OneClassSVM | SGDOneClassSVM | IsolationForest | LocalOutlierFactor,
    Train: DataFrame,  # only for API compliance
    test: Series,  # only for API compliance
) -> float:
    """
    Objective function to maximize: F1 score of the novelty predictions.

    Has the ``scorer(estimator, X, y)`` shape so it can be passed as the
    ``scoring`` argument of the scikit-learn search classes. The two data
    arguments are ignored; the model is always evaluated on the module-level
    ``testing_data`` / ``test_targets`` globals.

    Args:
        model: Fitted model to evaluate. A prediction of -1 counts as novelty.
        Train (DataFrame): Unused, only for API compliance.
        test (Series): Unused, only for API compliance.

    Returns:
        float: F1 score of ``model`` on ``testing_data``.

    Side effects:
        Logs one INFO record with the score, the model parameters and a
        timestamp.
    """
    # predict() == -1 is already a boolean mask; no need for where(..., True, False).
    is_novelty = model.predict(testing_data) == -1
    f1 = f1_score(test_targets, is_novelty)
    logging.info(
        {
            "target": f1,
            "params": model.get_params(),
            # %S is seconds (00-59); the previous lowercase %s is not a
            # standard strftime code and its result depends on the platform.
            "datetime": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }
    )
    return float(f1)


def update_train_vars(
    i: int, activities: ndarray
) -> tuple[DataFrame, Series, DataFrame, Series]:
    """
    Build the training and evaluation sets for step ``i`` of the experiment.

    The first ``i`` activities are the known classes and ``activities[i]`` is
    the novelty. Reads the module-level ``X_train``, ``X_test`` and
    ``MIN_SAMPLES``.

    Args:
        i: Number of known activities; also the index of the novel activity.
        activities: Activity labels, in the order they are introduced.

    Returns:
        A 4-tuple ``(training, training_flags, evaluation, evaluation_flags)``:
        - training: first ``MIN_SAMPLES`` rows of each known activity.
        - training_flags: ``isNovelty`` for those rows (all ``False``).
        - evaluation: up to ``MIN_SAMPLES`` rows of the novel activity plus a
          15% sample of the training rows.
        - evaluation_flags: ``isNovelty`` for the evaluation rows.
    """
    # NOTE(review): only "isNovelty" is dropped below, so the "activity" label
    # column is still part of both returned feature frames - confirm intended.

    # .assign() returns a new frame, so the flag is never written into a slice
    # of X_train / X_test (which is what triggers chained-assignment warnings).
    training = (  # picks the first n samples of each class
        X_train[X_train["activity"].isin(activities[:i])]
        .groupby("activity")
        .head(MIN_SAMPLES)
        .assign(isNovelty=False)
    )
    testing = (
        X_test[X_test["activity"] == activities[i]]
        .head(MIN_SAMPLES)
        .assign(isNovelty=True)
    )
    # Evaluation set: the novel activity plus a fixed-seed 15% sample of the
    # training rows, which act as the "not novelty" examples.
    novelty = concat(
        [testing, training.sample(n=int(0.15 * len(training)), random_state=42)]
    )
    return (
        training.drop(columns=["isNovelty"]),
        training["isNovelty"],
        novelty.drop(columns=["isNovelty"]),
        novelty["isNovelty"],
    )

## Hyperparameters Description:

- tol: Defines the tolerance for the optimization solver (SMO). The algorithm stops iterating when the improvement in the objective function is smaller than this value.
- nu: Controls the fraction of the dataset that is considered anomalies/outliers. It must be in the range (0, 1].   
  - Aggressive anomaly detection for nu > 0.5.
- gamma: Defines how much influence a single training example has. It is the coefficient of the Gaussian (RBF) kernel:
  - $K(x_i, x_j) = \exp(-\gamma \lVert x_i - x_j \rVert^2)$

In [None]:
# Print, for every configured model, the estimator and its search grid.
separator = "=" * 50
for model_name, config in MODEL_CONFIGS.items():
    print(f"\n{separator}\nRunning grid search for {model_name}\n{separator}")
    print("CONFIGS: ", config["model"])
    print("PARAMS: ", config["grid_params"])


Running grid search for SGDOneClassSVM
CONFIGS:  SGDOneClassSVM()
PARAMS:  {'nu': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5], 'learning_rate': ['optimal', 'constant', 'invscaling', 'adaptive'], 'alpha': [0.0001, 0.001, 0.01, 0.1]}

Running grid search for OneClassSVM
CONFIGS:  OneClassSVM()
PARAMS:  {'nu': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5], 'gamma': [0.001, 0.01, 0.1, 0.5, 1]}

Running grid search for IsolationForest
CONFIGS:  IsolationForest()
PARAMS:  {'n_estimators': [50, 100, 200], 'contamination': [0.01, 0.05, 0.1, 0.2, 0.3], 'max_samples': ['auto', 100, 500, 1000]}

Running grid search for LocalOutlierFactor
CONFIGS:  LocalOutlierFactor(novelty=True)
PARAMS:  {'n_neighbors': [5, 10, 20, 50], 'contamination': [0.01, 0.05, 0.1, 0.2, 0.3], 'metric': ['euclidean', 'manhattan']}


In [None]:
def update_params_grid(cv_results: dict[str, list]) -> dict[str, list]:
    data = (
        DataFrame(
            zip(
                cv_results["rank_test_score"],
                cv_results["param_gamma"],
                cv_results["param_nu"],
                cv_results["param_tol"],
            ),
            columns=["rank_test_score", "gamma", "nu", "tol"],
        )
        .sort_values("rank_test_score")
        .head(NUM_TRIALS)
        .drop("rank_test_score", axis=1)
        .to_dict(orient="list")
    )
    return {col: list(dict.fromkeys(data[col])) for col in ["gamma", "nu", "tol"]}


def train_grid_optimizer(grid: dict[str, list]) -> GridSearchCV:
    """
    Run an exhaustive search over ``grid`` for an RBF-kernel ``OneClassSVM``.

    The search is fitted on the module-level ``training_data`` /
    ``train_targets`` and scored with ``score_function``.

    Args:
        grid: Parameter name -> candidate values.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    search = GridSearchCV(
        OneClassSVM(kernel="rbf"),
        grid,
        scoring=score_function,
        # n_jobs=-1,
        cv=4,
        verbose=1,
        error_score="raise",  # fail loudly instead of scoring a failed fit
    )
    search.fit(training_data, train_targets)
    return search


# Start from a clean flag so this cell always begins with the wide grid, even
# if another search cell (or a previous run of this one) left it set to True.
MAXIMAZED = False

activities = X_train["activity"].unique()
for i in range(1, len(activities)):
    # Rebind the globals that train_grid_optimizer and score_function read.
    training_data, train_targets, testing_data, test_targets = update_train_vars(
        i, activities
    )
    print(f"Training for activities {activities[:i]}")
    if not MAXIMAZED:
        # First step: search the full grid.
        grid_search = train_grid_optimizer(
            {
                "tol": [1e-2, 1e-3, 1e-4, 1e-5],
                # Tolerance for Stopping Criterion
                "nu": [0.01, 0.05, 0.1, 0.25, 0.5],
                # Upper Bound on Outliers & Lower Bound on Support Vectors
                "gamma": [0.001, 0.01, 0.1, 0.5, 1],
                # Kernel Coefficient for RBF Kernel
            }
        )
        MAXIMAZED = True
    else:
        # Later steps: search only the values of the previous best candidates.
        print(f"Already maximized, sugesting new {NUM_TRIALS} points")
        grid_search = train_grid_optimizer(update_params_grid(grid_search.cv_results_))  # type: ignore
    print("Grid Search Best Params:", grid_search.best_params_)

# Leave the flag cleared so the next search cell starts with its own wide search.
MAXIMAZED = False

In [None]:
def train_optimizer(iters: int) -> RandomizedSearchCV:
    """
    Run a randomized search with ``iters`` sampled candidates for an
    RBF-kernel ``OneClassSVM``.

    The search is fitted on the module-level ``training_data`` /
    ``train_targets`` and scored with ``score_function``.

    Args:
        iters: Number of parameter settings to sample.

    Returns:
        The fitted ``RandomizedSearchCV`` object.
    """
    candidate_values = {
        "nu": [0.01, 0.05, 0.1, 0.2, 0.3, 0.5],
        "gamma": [0.001, 0.01, 0.1, 0.5, 1],
        "tol": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    }
    search = RandomizedSearchCV(
        OneClassSVM(kernel="rbf"),
        candidate_values,
        n_iter=iters,
        scoring=score_function,
        # n_jobs=-1,
        cv=4,
        verbose=1,
        random_state=42,  # fixed seed: the same candidates are drawn each run
        error_score="raise",
    )
    search.fit(training_data, train_targets)
    return search


# Start from a clean flag: an earlier search cell may have left it set to True,
# in which case the 100-iteration search below would never run.
MAXIMAZED = False

activities = X_train["activity"].unique()
for i in range(1, len(activities)):
    # Rebind the globals that train_optimizer and score_function read.
    training_data, train_targets, testing_data, test_targets = update_train_vars(
        i, activities
    )
    print(f"Training for activities {activities[:i]}")
    if not MAXIMAZED:
        # First step: wide search.
        random_search = train_optimizer(100)
        MAXIMAZED = True
    else:
        # Later steps: only NUM_TRIALS sampled candidates.
        print(f"Already maximized, sugesting new {NUM_TRIALS} points")
        random_search = train_optimizer(NUM_TRIALS)
    print(
        f"Random Search. Best Params: {random_search.best_params_} Best Score: {random_search.best_score_}"
    )

MAXIMAZED = False