In [2]:
from json import dump, load
from random import sample

import matplotlib.pyplot as plt
from joblib import Parallel, delayed, dump
from numpy import ndarray, where
from pandas import DataFrame, concat, read_csv
from sklearn.metrics import (
    accuracy_score,
    auc,
    f1_score,
    matthews_corrcoef,
    precision_recall_curve,
    roc_auc_score,
)
from sklearn.svm import OneClassSVM

TECHNIQUES = ["Grid Search", "Random Search", "Bayesian Opt"]
POLLUTIONS = [0.2, 0.4, 0.75, 0.9]
RESULTS: dict[str, dict[float, dict[str, float]]] = {
    technique: {} for technique in TECHNIQUES
}

In [3]:
X_train = read_csv("../data/PAMAP2/x_train_data.csv")
X_test = read_csv("../data/PAMAP2/x_test_data.csv")
y_train = read_csv("../data/PAMAP2/y_train_data.csv")
y_test = read_csv("../data/PAMAP2/y_test_data.csv")

X_train["activity"] = y_train  # First 80% of the data
X_test["activity"] = y_test  # Last 20% of the data

In [4]:
def filter_train_test(data: DataFrame) -> tuple[list, DataFrame, DataFrame]:
    classes: list[int] = [
        num for num in sample(X_train["activity"].value_counts().index.to_list(), 6)
    ]
    classes.sort()
    return (
        classes,
        data[data["activity"].isin(classes)],
        data[~data["activity"].isin(classes)],
    )


def test_ocsvm_with_pollution(
    train: DataFrame, test: DataFrame, model: OneClassSVM, percent: float
) -> dict[str, float]:
    novelty = concat([test, train.sample(n=int(percent * len(train)), random_state=42)])
    preds = where(model.predict(novelty.drop(columns=["isNovelty"])) == -1, True, False)
    precision, recall, _ = precision_recall_curve(novelty["isNovelty"], preds)

    return {
        "f1": float(f1_score(novelty["isNovelty"], preds)),
        "mcc": float(matthews_corrcoef(novelty["isNovelty"], preds)),
        "acc": float(accuracy_score(novelty["isNovelty"], preds)),
        "pr_auc": float(auc(recall, precision)),
        "roc_auc": float(roc_auc_score(novelty["isNovelty"], preds)),
    }

In [None]:
classes, training, testing = filter_train_test(concat([X_train, X_test]))
training.loc[:, "isNovelty"], testing.loc[:, "isNovelty"] = False, True

print(f"Training with: {classes}")
with open("../conf/top_results.json", "r") as f:
    top_results = load(f)

# Extract training data once as numpy array (lighter to share)
X_training = training.drop(columns=["isNovelty"]).values


def create_ocsvm(params: dict, technique: str, train_data: ndarray = X_training):
    print(f"fitting model for {technique}")
    return OneClassSVM(
        kernel="rbf", gamma=params["gamma"][0], tol=params["tol"][0], nu=params["nu"][0]
    ).fit(train_data)


models = {
    technique: result
    for technique, result in zip(
        TECHNIQUES,
        Parallel(n_jobs=-1)(
            delayed(create_ocsvm)(top_results[technique], technique, X_training)
            for technique in TECHNIQUES
        ),
    )
}

In [None]:
for technique, model in models.items():
    with open(f"../models/{technique.replace(' ', '_')}.joblib", "wb") as f:
        dump(model, f)  # type: ignore

FileNotFoundError: [Errno 2] No such file or directory: '../models/Grid_Search.joblib'

In [None]:
for pollution in POLLUTIONS:
    print(f"Testing with {pollution * 100}% pollution")
    for name, model in models.items():
        result = {
            pollution: test_ocsvm_with_pollution(training, testing, model, pollution)
        }
        RESULTS[name].update(result)
        print(f"Results for {name}:\n{result[pollution]}")

with open("../conf/result_polution.json", "w") as fp:
    dump(RESULTS, fp)

Testing with 20.0% pollution




In [None]:
fig, axes = plt.subplots(1, 3, figsize=(20, 8), sharey=True)

for i, (technique_name, df) in enumerate(
    (
        (name, DataFrame(RESULTS[name]).transpose())
        for name in ["Grid Search", "Random Search", "Bayesian Opt"]
    )
):
    for metric in df.columns.tolist():
        axes[i].plot(df.index.tolist(), df[metric], marker="o", label=metric)

    axes[i].set_ylabel("Resultados" if i == 0 else "")
    axes[i].set_xlabel("% de novidade")
    axes[i].set_title(technique_name)
    axes[i].tick_params(axis="x", rotation=0)
    axes[i].legend(loc="best", fontsize=18)

fig.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()