In [76]:
import logging
from typing import Final

from numpy import where
from pandas import DataFrame, Series, concat, read_csv, set_option
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import OneClassSVM

# Notebook-wide constants.
LOGS_PATH: Final[str] = "../reports/logs_normal.log"
NUM_TRIALS: Final[int] = 5

# Do not truncate columns when a DataFrame is displayed.
set_option("display.max_columns", None)
# Route root-logger records of level INFO and above to the log file.
logging.basicConfig(level=logging.INFO, filename=LOGS_PATH)

In [77]:
# Features and labels are stored in separate CSV files, one pair per split.
X_train = read_csv("../data/PAMAP2/x_train_data.csv")
y_train = read_csv("../data/PAMAP2/y_train_data.csv")
X_test = read_csv("../data/PAMAP2/x_test_data.csv")
y_test = read_csv("../data/PAMAP2/y_test_data.csv")

# Attach the labels to the feature frames so rows can be filtered by activity.
X_train["activity"] = y_train  # First 80% of the data
X_test["activity"] = y_test  # Last 20% of the data

# Row count of the least represented activity in the training split; used
# as the per-activity cap when sampling.
MIN_SAMPLES = X_train["activity"].value_counts(ascending=True).iloc[0]
MAXIMAZED = False

# Containers filled in by later cells.
RESULTS: dict[int, dict[str, float | int]] = {}
models: dict[int, dict] = {}

# Module-level slots assigned inside the search loops; score_function reads
# testing_data and test_targets as globals.
training_data: DataFrame
train_targets: Series
testing_data: DataFrame
test_targets: Series

In [82]:
def score_function(model: OneClassSVM, Train: DataFrame, test: Series) -> float:
    """
    Objective function to maximize: F1 score of the novelty class on the
    module-level hold-out set.

    The signature follows scikit-learn's scorer-callable protocol
    ``scorer(estimator, X, y)`` so the function can be passed directly as
    ``scoring=``. The ``Train`` and ``test`` arguments are ignored: the model
    is evaluated on the module-level ``testing_data`` / ``test_targets``,
    which must be assigned before the search is fitted.

    Every evaluation is also written to the log as a dict holding the score
    and the model's ``gamma`` / ``nu``.

    Args:
        model (OneClassSVM): Fitted model to evaluate.
        Train (DataFrame): Features passed in by the caller (unused).
        test (Series): Targets passed in by the caller (unused).

    Returns:
        float: F1 score with ``True`` (novelty) as the positive class.
    """
    params = model.get_params()
    # predict() labels outliers as -1; an outlier counts as a predicted novelty.
    predicted_novelty = model.predict(testing_data) == -1
    f1 = f1_score(test_targets, predicted_novelty, pos_label=1)
    logging.info(
        {
            "target": f1,
            "params": {"gamma": params["gamma"], "nu": params["nu"]},
        }
    )
    return f1

In [84]:
# Activities in order of first appearance in the training split.
activities = X_train["activity"].unique()

# Grow the set of known activities one at a time: the model is trained on
# activities[:i] and activities[i] plays the role of the unseen novelty.
for i in range(1, len(activities)):
    # .assign() returns a new frame, so the filtered slices of X_train /
    # X_test are never written to (no SettingWithCopyWarning).
    training = (  # picks the first n samples of each class
        X_train[X_train["activity"].isin(activities[:i])]
        .groupby("activity")
        .head(MIN_SAMPLES)
        .assign(isNovelty=False)
    )
    testing = (
        X_test[X_test["activity"] == activities[i]]
        .head(MIN_SAMPLES)
        .assign(isNovelty=True)
    )
    # Hold-out set: the novel activity plus a fixed 15% sample of the
    # training rows as known (non-novel) examples.
    novelty = concat(
        [testing, training.sample(n=int(0.15 * len(training)), random_state=42)]
    )
    # NOTE(review): only "isNovelty" is dropped, so the "activity" label column
    # is still passed to the model as a feature - confirm this is intended.
    training_data = training.drop(columns=["isNovelty"])
    train_targets = training["isNovelty"]
    # testing_data / test_targets are read as globals by score_function.
    testing_data = novelty.drop(columns=["isNovelty"])
    test_targets = novelty["isNovelty"]

    print(
        f"Training for activities {activities[:i]} with {len(training_data)} points"
    )

    grid_search = GridSearchCV(
        estimator=OneClassSVM(kernel="rbf"),
        param_grid={
            "nu": [0.01, 0.05, 0.1, 0.2, 0.3, 0.5],
            "gamma": [0.001, 0.01, 0.1, 0.5, 1],
        },
        # score_function already has the (estimator, X, y) scorer signature.
        scoring=score_function,
        cv=4,
        verbose=1,
        error_score="raise",
    ).fit(training_data, train_targets)
    print("Grid Search Best Params:", grid_search.best_params_)

Training for activities [1] with 3483 points
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Grid Search Best Params: {'gamma': 0.001, 'nu': 0.01}
Training for activities [1 2] with 6966 points
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Grid Search Best Params: {'gamma': 0.001, 'nu': 0.01}
Training for activities [1 2 3] with 10449 points
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Grid Search Best Params: {'gamma': 0.001, 'nu': 0.01}
Training for activities [1 2 3 4] with 13932 points
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Grid Search Best Params: {'gamma': 0.001, 'nu': 0.01}
Training for activities [1 2 3 4 5] with 17415 points
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Grid Search Best Params: {'gamma': 0.001, 'nu': 0.05}
Training for activities [1 2 3 4 5 6] with 20898 points
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Grid Search Best Params: {'gamma': 0.001, 'nu': 0.05}


In [None]:
# Activities in order of first appearance in the training split.
activities = X_train["activity"].unique()

# Grow the set of known activities one at a time: the model is trained on
# activities[:i] and activities[i] plays the role of the unseen novelty.
for i in range(1, len(activities)):
    # .assign() returns a new frame, so the filtered slices of X_train /
    # X_test are never written to (no SettingWithCopyWarning).
    training = (  # picks the first n samples of each class
        X_train[X_train["activity"].isin(activities[:i])]
        .groupby("activity")
        .head(MIN_SAMPLES)
        .assign(isNovelty=False)
    )
    testing = (
        X_test[X_test["activity"] == activities[i]]
        .head(MIN_SAMPLES)
        .assign(isNovelty=True)
    )
    # Hold-out set: the novel activity plus a fixed 15% sample of the
    # training rows as known (non-novel) examples.
    novelty = concat(
        [testing, training.sample(n=int(0.15 * len(training)), random_state=42)]
    )
    # NOTE(review): only "isNovelty" is dropped, so the "activity" label column
    # is still passed to the model as a feature - confirm this is intended.
    training_data = training.drop(columns=["isNovelty"])
    train_targets = training["isNovelty"]
    # testing_data / test_targets are read as globals by score_function.
    testing_data = novelty.drop(columns=["isNovelty"])
    test_targets = novelty["isNovelty"]

    random_search = RandomizedSearchCV(
        estimator=OneClassSVM(kernel="rbf"),
        param_distributions={
            "nu": [0.01, 0.025, 0.05, 0.1, 0.2, 0.3, 0.5],
            "gamma": [0.001, 0.01, 0.025, 0.1, 0.25, 0.5, 1],
        },
        n_iter=30,
        # score_function already has the (estimator, X, y) scorer signature, so
        # it is passed as-is. make_scorer() is for (y_true, y_pred) metrics and
        # would call it with the wrong arguments.
        scoring=score_function,
        # NOTE(review): with n_jobs=-1 the scorer may run in worker processes;
        # confirm its logging.info records still reach the log file.
        n_jobs=-1,
        cv=5,
        verbose=3,
        random_state=42,
    # Fit on the known activities only; fitting on all of X_train would show
    # the model the activity that is supposed to be novel.
    ).fit(training_data, train_targets)
    print("Random Search Best Params:", random_search.best_params_)