In [32]:
from json import dump
from random import sample

from numpy import where
from pandas import DataFrame, concat, read_csv
from sklearn.metrics import (
    accuracy_score,
    auc,
    f1_score,
    matthews_corrcoef,
    precision_recall_curve,
    roc_auc_score,
)
from sklearn.svm import OneClassSVM

# Metric store, nested as: search strategy -> pollution level -> metric name -> score.
# Every strategy starts with its own empty mapping that the experiment loop fills in.
RESULTS: dict[str, dict[float, dict[str, float]]] = {
    strategy: {}
    for strategy in ("Grid Search", "Random Search", "Bayesian Optimization")
}

In [33]:
# Training split (first 80% of the data): features plus the activity label
# attached as an extra column.
X_train = read_csv("../data/PAMAP2/x_train_data.csv")
y_train = read_csv("../data/PAMAP2/y_train_data.csv")
X_train["activity"] = y_train

# Test split (last 20% of the data), labelled the same way.
X_test = read_csv("../data/PAMAP2/x_test_data.csv")
y_test = read_csv("../data/PAMAP2/y_test_data.csv")
X_test["activity"] = y_test

In [34]:
def filter_train_test(
    data: DataFrame, n_classes: int = 6
) -> tuple[list, DataFrame, DataFrame]:
    """Split ``data`` into rows of randomly chosen "known" activities and the rest.

    ``n_classes`` distinct values of the ``activity`` column are drawn with
    ``random.sample``; rows whose activity is among them form the first frame,
    all remaining rows form the second.

    Args:
        data: Frame containing an ``activity`` column.
        n_classes: How many distinct activities to treat as known (default 6).

    Returns:
        A tuple ``(classes, known, novel)``: the sorted list of chosen
        activities, the rows belonging to them, and the rows that do not.
        Both frames are independent copies, so callers can add columns to
        them without writing through to ``data``.

    Raises:
        KeyError: If ``data`` has no ``activity`` column.
        ValueError: If ``data`` holds fewer than ``n_classes`` distinct
            activities (raised by ``random.sample``).
    """
    # Draw the candidate classes from the frame that was passed in rather than
    # from a notebook-level global, so the function works on any input frame.
    available = data["activity"].value_counts().index.to_list()
    classes: list[int] = sorted(sample(available, n_classes))

    is_known = data["activity"].isin(classes)
    return classes, data[is_known].copy(), data[~is_known].copy()

In [35]:
def train_ocsvm_with_pollution(
    training: DataFrame,
    testing: DataFrame,
    params: dict[str, float],
    pollution_percent: float,
    label_columns: tuple[str, ...] = ("isNovelty", "activity"),
) -> dict[str, float]:
    """Fit an RBF One-Class SVM on ``training`` and score it on a polluted test set.

    The evaluation set is ``testing`` plus a random sample of ``training`` rows
    (the "pollution"). Note that those pollution rows were also seen during
    fitting.

    Args:
        training: Rows used to fit the model; must contain ``isNovelty``.
        testing: Rows to detect as novel; must contain ``isNovelty``.
        params: Hyper-parameters; the keys ``"nu"`` and ``"gamma"`` are read.
        pollution_percent: Fraction of ``len(training)`` to sample into the
            evaluation set (e.g. ``0.2``). Values above 1 make
            ``DataFrame.sample`` raise because sampling is without replacement.
        label_columns: Columns that are labels rather than features. They are
            removed before fitting and predicting; names that are absent from
            a frame are ignored.

    Returns:
        A dict with ``f1``, ``mcc``, ``acc``, ``pr_auc`` and ``roc_auc`` as
        plain floats.
    """

    def _features(frame: DataFrame) -> DataFrame:
        # Strip every label column so that neither the ground truth nor the
        # activity id that defines it is fed to the model as a feature.
        return frame.drop(columns=list(label_columns), errors="ignore")

    oc_svm = OneClassSVM(kernel="rbf", nu=params["nu"], gamma=params["gamma"]).fit(
        _features(training)
    )

    novelty = concat(
        [
            testing,
            training.sample(n=int(pollution_percent * len(training)), random_state=42),
        ]
    )
    truth = novelty["isNovelty"].astype(bool)
    eval_features = _features(novelty)

    # predict() yields -1 for outliers; the comparison already gives booleans.
    preds = oc_svm.predict(eval_features) == -1
    # decision_function() is positive for inliers, so negate it to obtain a
    # score that grows with "novelty". Ranking metrics need this continuous
    # score: hard 0/1 predictions collapse each curve to one operating point.
    scores = -oc_svm.decision_function(eval_features)

    precision, recall, _ = precision_recall_curve(truth, scores)

    return {
        "f1": float(f1_score(truth, preds)),
        "mcc": float(matthews_corrcoef(truth, preds)),
        "acc": float(accuracy_score(truth, preds)),
        "pr_auc": float(auc(recall, precision)),
        "roc_auc": float(roc_auc_score(truth, scores)),
    }

In [36]:
# Run the Grid Search configuration at each pollution level and record its metrics.
# The combined frame does not change between iterations, so build it once.
full_data = concat([X_train, X_test])

for pollution in [0.2, 0.55, 0.8]:
    # NOTE(review): a fresh random class split is drawn for every pollution
    # level, so levels are not compared on the same known/novel classes -
    # confirm this is intended.
    classes, training, testing = filter_train_test(full_data)
    # assign() returns new frames, so the label column is never written into
    # a filtered slice of the combined frame.
    training = training.assign(isNovelty=False)
    testing = testing.assign(isNovelty=True)
    print(
        f"Results for Grid Search:"
        # Percent formatting avoids float noise such as 55.00000000000001.
        f"\nTraining with: {classes} with {pollution:.0%} pollution"
    )
    result = train_ocsvm_with_pollution(
        training, testing, {"nu": 0.01, "gamma": 0.01}, pollution
    )
    RESULTS["Grid Search"].update({pollution: result})
    print(f"{result}", "=" * 40)
    # TODO: repeat the block above for "Random Search" and
    # "Bayesian Optimization" once each has its own tuned nu/gamma; until then
    # those RESULTS entries stay empty.

Results for Grid Search:
Training with: [1, 2, 5, 7, 16, 17] with 20.0% pollution
Results for Grid Search:
Training with: [1, 2, 4, 5, 6, 24] with 55.00000000000001% pollution
Results for Grid Search:
Training with: [5, 6, 7, 13, 16, 17] with 80.0% pollution


In [38]:
# Persist the collected metrics as JSON in the reports directory.
with open("../reports/result_all.json", "w") as report_file:
    dump(RESULTS, fp=report_file)