In [17]:
import json
import logging
from datetime import datetime

from pandas import DataFrame, Series, concat, read_csv
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import OneClassSVM

logging.basicConfig(filename="../reports/log_normal.log", level=logging.INFO)

In [18]:
X_train = read_csv("../data/PAMAP2/x_train_data.csv")
X_test = read_csv("../data/PAMAP2/x_test_data.csv")
y_train = read_csv("../data/PAMAP2/y_train_data.csv")
y_test = read_csv("../data/PAMAP2/y_test_data.csv")

X_train["activity"] = y_train  # First 80% of the data
X_test["activity"] = y_test  # Last 20% of the data

MIN_SAMPLES = X_train["activity"].value_counts().sort_values().iloc[0]

In [19]:
def score_function(model: OneClassSVM, X_test: DataFrame, y_true: Series) -> float:
    """
    Objective function to maximize, calcs the F1 score on the test set.
    follows the format needed by scikit-learn's API.

    Args:
        model (OneClassSVM): Model to eval
        X_test (DataFrame): train data
        y_true (Series): true targets

    Returns:
        float: F1 score
    """
    params = model.get_params()
    nu, gamma = params["nu"], params["gamma"]
    f1 = f1_score(len(y_true), model.predict(X_test), pos_label=1)
    logging.info(
        json.dumps(
            {
                "target": f1,
                "params": {"gamma": gamma, "nu": nu},
                "datetime": datetime.now().strftime(format="%Y-%m-%d %H:%M:%S"),
            }
        )
    )
    return float(f1)

In [16]:
for i in range(1, len((activities := X_train["activity"].unique()))):
    training = (  # picks the first n samples of each class
        X_train[X_train["activity"].isin(activities[:i])]
        .groupby("activity")
        .head(MIN_SAMPLES)
        .copy()
    )
    testing = X_test[X_test["activity"] == activities[i]].head(MIN_SAMPLES)
    training.loc[:, "isNovelty"], testing.loc[:, "isNovelty"] = False, True
    novelty = concat(
        [testing, training.sample(n=int(0.15 * len(training)), random_state=42)]
    )
    training_data = training.drop(columns=["isNovelty"])
    train_targets = training["isNovelty"]
    # only current activity (as novelty)
    testing_data = novelty.drop(columns=["isNovelty"])
    test_targets = novelty["isNovelty"]

    grid_search: GridSearchCV[OneClassSVM] = GridSearchCV(
        estimator=OneClassSVM(kernel="rbf"),
        param_grid={
            "nu": [0.01, 0.05, 0.1, 0.2, 0.3, 0.5],
            "gamma": [0.001, 0.01, 0.1, 0.5, 1],
        },
        scoring=make_scorer(score_func=score_function),
        n_jobs=-1,
        verbose=3,
        cv=4,
    ).fit(X_train)
    print("Grid Search Best Params:", grid_search.best_params_)

Fitting 4 folds for each of 30 candidates, totalling 120 fits


KeyboardInterrupt: 

In [None]:
for i in range(1, len((activities := X_train["activity"].unique()))):
    training = (  # picks the first n samples of each class
        X_train[X_train["activity"].isin(activities[:i])]
        .groupby("activity")
        .head(MIN_SAMPLES)
        .copy()
    )
    testing = X_test[X_test["activity"] == activities[i]].head(MIN_SAMPLES)
    training.loc[:, "isNovelty"], testing.loc[:, "isNovelty"] = False, True
    novelty = concat(
        [testing, training.sample(n=int(0.15 * len(training)), random_state=42)]
    )
    training_data = training.drop(columns=["isNovelty"])
    train_targets = training["isNovelty"]
    # only current activity (as novelty)
    testing_data = novelty.drop(columns=["isNovelty"])
    test_targets = novelty["isNovelty"]

    random_search = RandomizedSearchCV(
        estimator=OneClassSVM(kernel="rbf"),
        param_distributions={
            "nu": [0.01, 0.025, 0.05, 0.1, 0.2, 0.3, 0.5],
            "gamma": [0.001, 0.01, 0.025, 0.1, 0.25, 0.5, 1],
        },
        n_iter=30,
        scoring=make_scorer(score_func=score_function),
        n_jobs=-1,
        cv=5,
        verbose=3,
        random_state=42,
    ).fit(X_train)
    print("Random Search Best Params:", random_search.best_params_)