In [None]:
import re
from ast import literal_eval
from datetime import datetime
from json import dump as json_dump
from json import load as json_load
from json import loads
from os import environ, makedirs
from os.path import isfile, join
from random import sample

import matplotlib.pyplot as plt
from joblib import Parallel, delayed, dump, load
from numpy import ndarray, where
from pandas import DataFrame, concat, json_normalize, read_csv, to_datetime
from sklearn.metrics import (
    accuracy_score,
    auc,
    f1_score,
    matthews_corrcoef,
    precision_recall_curve,
    roc_auc_score,
)
from sklearn.svm import OneClassSVM

# Regexes applied to the optimisation logs. `json_pattern` is not referenced by
# any code visible in this notebook; `log_pattern` is used by `extract_result`.
json_pattern = re.compile(r"\[INFO\] (.+)$")
# Greedy on purpose: captures from the first "{" to the last "}" on the line.
log_pattern = re.compile(r"({.*})")

# Opt-in switch for reusing fitted models stored on disk. Reading it with
# `.get` keeps a fresh kernel runnable when the variable is not exported: the
# notebook then simply retrains the models instead of raising KeyError.
MODELS_UPDATED = environ.get("MODELS_UPDATED", "false").strip().lower() == "true"
# Names of the hyper-parameter search strategies being compared.
TECHNIQUES = ["Grid Search", "Random Search", "Bayesian Opt"]
# Fractions of the training set mixed into the evaluation set.
POLLUTIONS = [0.2, 0.4, 0.75, 0.9]
# technique -> pollution fraction -> metric name -> value (filled in later).
RESULTS: dict[str, dict[float, dict[str, float]]] = {
    technique: {} for technique in TECHNIQUES
}

In [None]:
# Load the pre-split PAMAP2 features and labels.
X_train = read_csv("../data/PAMAP2/x_train_data.csv")
X_test = read_csv("../data/PAMAP2/x_test_data.csv")
y_train = read_csv("../data/PAMAP2/y_train_data.csv")
y_test = read_csv("../data/PAMAP2/y_test_data.csv")

# Tick labels for the per-class plot: the sorted label values found in
# `y_test`, rendered as strings, with the smallest one skipped.
x_label = [
    f"{num[0]}"
    for num in y_test.value_counts().index.sort_values().to_list()[1:]
]

# Attach the labels to the feature frames so rows can be filtered by activity.
X_train["activity"] = y_train  # First 80% of the data
X_test["activity"] = y_test  # Last 20% of the data

In [None]:
def _parse_log_payload(raw_data: str):
    """Decode one ``{...}`` payload captured from a log line.

    JSON is tried first; text that is not valid JSON (for example a Python
    ``dict`` repr with single-quoted keys) falls back to ``literal_eval``.

    Raises:
        ValueError: if the payload is neither valid JSON nor a Python literal.
    """
    try:
        return loads(raw_data)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        pass
    try:
        return literal_eval(raw_data)
    except (ValueError, SyntaxError) as error:
        raise ValueError(f"Unparseable log payload: {raw_data!r}") from error


def extract_result(file_name: str) -> DataFrame:
    """Read ``../reports/<file_name>.log`` and tabulate the logged evaluations.

    Every line matching the module-level ``log_pattern`` contributes one row.
    Lines without a match are skipped.

    Args:
        file_name: Log file name without directory or ``.log`` extension.

    Returns:
        DataFrame with the columns ``target``, ``datetime`` (a string formatted
        as ``YYYY-MM-DD HH:MM:SS``), ``gamma``, ``nu`` and ``tol``.

    Raises:
        ValueError: if a matched payload cannot be parsed or its timestamp does
            not follow ``YYYY-MM-DD HH:MM:SS``.
        KeyError: if a payload lacks one of the expected keys.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    records = []
    with open(f"../reports/{file_name}.log", "r") as file:
        for line in file:
            match = log_pattern.search(line)
            if match is None:
                continue
            data = _parse_log_payload(match.group(1).strip())

            # The timestamp is either a plain string or nested one level down
            # under a second "datetime" key.
            stamp = data["datetime"]
            if not isinstance(stamp, str):
                stamp = stamp["datetime"]

            params = data["params"]
            records.append(
                {
                    "target": data["target"],
                    # Keep only the first 19 characters (whole seconds) and
                    # round-trip through strptime to validate the format.
                    "datetime": datetime.strptime(
                        stamp[:19], timestamp_format
                    ).strftime(timestamp_format),
                    "gamma": params["gamma"],
                    "nu": params["nu"],
                    "tol": params["tol"],
                }
            )
    return json_normalize(records)


def normalize_result(df: DataFrame, group_size: int = 4) -> DataFrame:
    """Collapse every ``group_size`` consecutive rows into a single row.

    Rows are grouped by position (rows 0..group_size-1, then the next
    ``group_size`` rows, and so on), so the result does not depend on the
    incoming index. A trailing partial group is kept.

    Args:
        df: Frame with ``target``, ``datetime``, ``gamma``, ``nu`` and ``tol``.
        group_size: Number of consecutive rows merged into one; defaults to 4.

    Returns:
        Frame with one row per group: ``target`` is the group mean and the
        remaining columns take the first value of the group.

    Raises:
        ValueError: if ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be a positive integer")

    # Reset first so that `index // group_size` is a positional bucket id even
    # when the caller passes a filtered or shifted frame.
    ordered = df.reset_index(drop=True)
    return (
        ordered.groupby(ordered.index // group_size)
        .agg(
            {
                "target": "mean",
                "datetime": "first",
                "gamma": "first",
                "nu": "first",
                "tol": "first",
            }
        )
        .reset_index(drop=True)
    )


def extract_datetime_part(dt_str):
    """Return the leading ``YYYY-MM-DD HH:MM`` of ``dt_str``, or ``None``.

    ``None`` is returned when the string does not start with that pattern.
    """
    found = re.match(r"^(\d{4}-\d{2}-\d{2} \d{2}:\d{2})", dt_str)
    if found is None:
        return None
    return found.group(1)


def split_dataframe(df, chunk_sizes):
    """Cut ``df`` into consecutive pieces of the requested row counts.

    The pieces are taken from the top of the frame in order, each with a fresh
    0-based index. Rows beyond ``sum(chunk_sizes)`` are left out.

    Raises:
        AssertionError: if the requested sizes add up to more rows than ``df``
            has.
    """
    assert sum(chunk_sizes) <= len(df), "Chunk sizes exceed DataFrame length"

    pieces = []
    offset = 0
    for length in chunk_sizes:
        piece = df.iloc[offset : offset + length]
        pieces.append(piece.reset_index(drop=True))
        offset += length
    return pieces


def pick_best_target(chunks):
    """Find the highest-``target`` row of every chunk.

    Returns:
        A pair ``(best_targets, labels)``: the winning ``target`` value of each
        chunk and a three-line text label with that row's gamma, nu and tol.
    """
    winners = [chunk.loc[chunk["target"].idxmax()] for chunk in chunks]
    scores = [row["target"] for row in winners]
    captions = [
        f"γ={row['gamma']}\nν={row['nu']}\ntol={row['tol']}" for row in winners
    ]
    return scores, captions


def get_top_params(df):
    """Return the gamma, nu and tol of the row with the highest ``target``.

    The result is a plain ``{"gamma": ..., "nu": ..., "tol": ...}`` dict.
    """
    best_row = df.sort_values("target", ascending=False).head(1)
    columns = best_row[["gamma", "nu", "tol"]].to_dict("list")
    # Each column list holds exactly one entry: the value of the best row.
    return {name: values[0] for name, values in columns.items()}

In [None]:
def _load_search_log(log_name, averaged):
    """Parse one search log and convert its timestamps to minute resolution.

    When ``averaged`` is true the parsed rows are first passed through
    ``normalize_result``.
    """
    frame = extract_result(log_name)
    if averaged:
        frame = normalize_result(frame)
    frame["datetime"] = to_datetime(
        frame["datetime"].apply(extract_datetime_part), format="%Y-%m-%d %H:%M"
    )
    return frame


def _elapsed(frame):
    """Span between the earliest and latest timestamp in ``frame``."""
    return frame["datetime"].max() - frame["datetime"].min()


# Grid and random search logs go through `normalize_result`; the Bayesian log
# is used as parsed. Each frame is then cut into one chunk of 100 rows followed
# by ten equally sized chunks.
df_grid = _load_search_log("logs_grid", averaged=True)
print(f"Total Time Grid Search: {_elapsed(df_grid)}")
grid_chunks = split_dataframe(df_grid, [100] + [30] * 10)

df_rand = _load_search_log("logs_rand", averaged=True)
print(f"Total Time Random Search: {_elapsed(df_rand)}")
rand_chunks = split_dataframe(df_rand, [100] + [20] * 10)

df_bayes = _load_search_log("logs_bayesian", averaged=False)
print(f"Total Time Bayesian Search: {_elapsed(df_bayes)}")
bayes_chunks = split_dataframe(df_bayes, [100] + [5] * 10)

# Persist the best hyper-parameters of each technique for the later cells.
with open("../conf/top_results.json", "w") as f:
    json_dump(
        {
            "Grid Search": get_top_params(df_grid),
            "Random Search": get_top_params(df_rand),
            "Bayesian Opt": get_top_params(df_bayes),
        },
        f,
        indent=4,
    )


In [None]:
# Best score reached inside every chunk, per search technique.
best_targets_grid, labels = pick_best_target(grid_chunks)
best_targets_rand, _ = pick_best_target(rand_chunks)
best_targets_bayes, _ = pick_best_target(bayes_chunks)

# (legend name, y values, line colour, point labels, label offset, label colour)
methods = [
    ("Grid Search", best_targets_grid, "b", labels, (0, 10), "black"),
    ("Random Search", best_targets_rand, "orange", labels, (0, -15), "orange"),
    ("Bayesian Opt.", best_targets_bayes, "green", labels, (0, -15), "green"),
]

fig, ax = plt.subplots(figsize=(12, 6))

# `lbls`, `offset` and `ann_color` were consumed by a per-point annotation step
# that is currently disabled, so only the line itself is drawn.
for name, targets, color, lbls, offset, ann_color in methods:
    ax.plot(x_label, targets, marker="o", linestyle="-", color=color, label=name)

ax.set_title("Comparação dos Resultados")
ax.set_xlabel("Classe de Teste")
ax.set_ylabel("Melhor Valor F1")
ax.set_xticks(x_label)
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
def filter_train_test(
    data: DataFrame, n_classes: int = 6, seed=None
) -> tuple[list, DataFrame, DataFrame]:
    """Split ``data`` by randomly choosing which activities count as known.

    The candidate classes are the distinct values of ``data["activity"]``, so
    the function depends only on its argument and not on notebook globals.

    Args:
        data: Frame containing an ``activity`` column.
        n_classes: How many distinct activities to select; defaults to 6.
        seed: Optional seed for a private random generator. When ``None`` the
            module-level ``random.sample`` is used, i.e. the global random
            state decides the draw.

    Returns:
        ``(classes, selected, remaining)``: the sorted list of chosen
        activities, the rows whose activity is in that list, and all other
        rows.

    Raises:
        ValueError: if ``n_classes`` exceeds the number of distinct activities
            (raised by ``sample``).
    """
    from random import Random  # local import: only needed for the seeded path

    available = data["activity"].value_counts().index.to_list()
    draw = sample if seed is None else Random(seed).sample
    classes: list[int] = sorted(draw(available, n_classes))
    mask = data["activity"].isin(classes)
    return (classes, data[mask], data[~mask])


def test_ocsvm_with_pollution(
    train: DataFrame, test: DataFrame, model: OneClassSVM, percent: float
) -> dict[str, float]:
    """Score ``model`` on ``test`` mixed with a sample of ``train`` rows.

    The evaluation set is every row of ``test`` plus ``int(percent *
    len(train))`` rows drawn from ``train`` with a fixed ``random_state``.
    The ``isNovelty`` column is the ground truth and is removed before the
    rows are handed to the model.

    Args:
        train: Frame with an ``isNovelty`` column; source of the mixed-in rows.
        test: Frame with an ``isNovelty`` column; always fully included.
        model: Fitted one-class SVM.
        percent: Fraction of ``train`` to mix in.

    Returns:
        Dict with ``f1``, ``mcc`` and ``acc`` computed from the hard
        predictions (``-1`` from the model means novelty), and ``pr_auc`` /
        ``roc_auc`` computed from the continuous novelty score.
    """
    novelty = concat([test, train.sample(n=int(percent * len(train)), random_state=42)])
    truth = novelty["isNovelty"]
    features = novelty.drop(columns=["isNovelty"])

    # Hard labels: the model answers -1 for samples it considers outliers.
    preds = model.predict(features) == -1

    # Ranking metrics need a continuous score, not thresholded labels (with
    # binary input the curves collapse to a single operating point). The
    # decision function is negative for outliers, so negate it to make larger
    # values mean "more novel". This costs a second pass over `features`.
    novelty_score = -model.decision_function(features)
    precision, recall, _ = precision_recall_curve(truth, novelty_score)

    return {
        "f1": float(f1_score(truth, preds)),
        "mcc": float(matthews_corrcoef(truth, preds)),
        "acc": float(accuracy_score(truth, preds)),
        "pr_auc": float(auc(recall, precision)),
        "roc_auc": float(roc_auc_score(truth, novelty_score)),
    }

In [None]:
# NOTE(review): the class draw inside `filter_train_test` is random, so the
# split can differ between runs -- confirm this is acceptable when previously
# fitted models are loaded from disk instead of being retrained.
classes, training, testing = filter_train_test(concat([X_train, X_test]))

# Ground-truth flag: rows of the selected classes are "known", the rest novel.
training.loc[:, "isNovelty"] = False
testing.loc[:, "isNovelty"] = True

print(f"Training with: {classes}")
with open("../conf/top_results.json", "r") as f:
    top_results = json_load(f)

# Extract training data once as numpy array (lighter to share)
# NOTE(review): only `isNovelty` is dropped here, so the `activity` label
# column stays in the feature matrix -- confirm that is intended.
X_training = training.drop(columns=["isNovelty"]).values


def create_ocsvm(params: dict, technique: str, train_data: ndarray = X_training):
    """Fit an RBF-kernel One-Class SVM with the given hyper-parameters.

    Args:
        params: Mapping providing ``gamma``, ``tol`` and ``nu``.
        technique: Name used only in the progress message.
        train_data: Samples to fit on; defaults to the notebook's
            ``X_training`` as it was when this function was defined.

    Returns:
        The fitted ``OneClassSVM``.
    """
    print(f"fitting model for {technique}")
    estimator = OneClassSVM(
        kernel="rbf", gamma=params["gamma"], tol=params["tol"], nu=params["nu"]
    )
    # `fit` returns the estimator itself, so chaining it is optional.
    return estimator.fit(train_data)


def load_or_train_models(techniques, top_results, X_training, model_dir="./models"):
    """Load models from disk if available, otherwise train and save them.

    Models are loaded only when ``MODELS_UPDATED`` is true and a file exists
    for every requested technique. Otherwise all of them are fitted in
    parallel and written to ``model_dir``.

    Args:
        techniques: Iterable of technique names to load or train.
        top_results: Mapping from technique name to its hyper-parameter dict.
        X_training: Training samples passed to ``create_ocsvm``.
        model_dir: Directory holding one ``<technique>.joblib`` file per
            technique (spaces replaced by underscores); created if missing.

    Returns:
        Dict mapping each technique name to its fitted model.
    """
    techniques = list(techniques)  # iterated several times below
    makedirs(model_dir, exist_ok=True)
    model_paths = {
        technique: join(model_dir, f"{technique.replace(' ', '_')}.joblib")
        for technique in techniques
    }

    if MODELS_UPDATED and all(isfile(path) for path in model_paths.values()):
        print("Loading models from disk...")
        models = {}
        for technique, path in model_paths.items():
            # NOTE: joblib files are pickles -- only load files from a trusted
            # source, such as those written by this notebook.
            models[technique] = load(path)
            print(f"Loaded model for {technique} from {path}")
        return models

    print("Training models...")
    fitted = Parallel(n_jobs=-1)(
        delayed(create_ocsvm)(top_results[technique], technique, X_training)
        for technique in techniques
    )
    models = dict(zip(techniques, fitted))
    # Save under the same paths that the loading branch reads from.
    for technique, model in models.items():
        dump(model, model_paths[technique])  # type: ignore

    return models

In [None]:
models = load_or_train_models(TECHNIQUES, top_results, X_training)

# One evaluation per (pollution, technique) pair, pollution being the outer
# loop; the results come back in that same order.
results_list = Parallel(n_jobs=-1)(
    delayed(test_ocsvm_with_pollution)(training, testing, models[name], pollution)
    for pollution in POLLUTIONS
    for name in models
)

# Assign results back to RESULTS, walking the flat list in submission order.
pending = iter(results_list)
for pollution in POLLUTIONS:
    print(f"Testing with {pollution * 100}% pollution")
    for name in models:
        RESULTS[name][pollution] = next(pending)

with open("../conf/result_polution.json", "w") as fp:
    json_dump(RESULTS, fp)

In [None]:
# One panel per technique, sharing the y axis so the metrics are comparable.
fig, axes = plt.subplots(1, 3, figsize=(20, 8), sharey=True)

for i, technique_name in enumerate(["Grid Search", "Random Search", "Bayesian Opt"]):
    panel = axes[i]
    # Rows: pollution levels; columns: metric names.
    df = DataFrame(RESULTS[technique_name]).transpose()
    levels = df.index.tolist()

    for metric in df.columns.tolist():
        panel.plot(levels, df[metric], marker="o", label=metric)

    panel.set_title(technique_name)
    panel.set_xlabel("% de novidade")
    panel.set_ylabel("Resultados" if i == 0 else "")
    panel.tick_params(axis="x", rotation=0)
    # Valid `loc` values: 'upper right', 'upper left', 'lower left',
    # 'lower right', 'right', 'center left', 'center right', 'lower center',
    # 'upper center', 'center'
    if i == 0:
        panel.legend(loc="lower left", fontsize=18)

fig.tight_layout(rect=(0.0, 0.03, 1.0, 0.95))
plt.show()