# Baseline Experiments

In [None]:
import os
from multiprocessing import Pool

import pandas as pd
import numpy as np
import h5py
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm

from utils import cole_kripke, sadeh, calc_metrics

## Set global variables

In [None]:
# Input HDF5 files; both are read with the subject name as key
_DATA_DIR = os.path.join(os.sep, "home", "data")
HDF5_FILE_PATH = os.path.join(_DATA_DIR, "ANNOTATED_BEDTIME_TU7.hdf5")
EPOCH_FILE_PATH = os.path.join(_DATA_DIR, "ACTIGRAPH_EPOCHS_TU7_CLEAN.hdf5")

# Column names in the order: time, x axis, y axis, z axis, target annotation
COLNAMES = ["Time", "X", "Y", "Z", "Annotated Time in Bed"]
SAMPLE_RATE = 100  # presumably samples per second (Hz) -- TODO confirm
LABEL_DICT = {False: 0, True: 1}  # boolean label -> integer label
EXCLUDED_DATASETS = []  # dataset keys that are skipped when the subjects are listed
EPOCH_LENGTH = "60s"  # bin width the epochs are resampled to

## Load subjects

In [None]:
# Every key of the annotated file is one subject; read them all, then drop
# the ones that are explicitly excluded.
with h5py.File(HDF5_FILE_PATH, "r") as hdf5_file:
    dataset_keys = list(hdf5_file.keys())

subjects = [key for key in dataset_keys if key not in EXCLUDED_DATASETS]

## Define processing pipeline

In [None]:
def process_subject(subject):
    """Score the Sadeh and Cole-Kripke baselines for a single subject.

    Reads the subject's annotated recording from ``HDF5_FILE_PATH`` and its
    epochs from ``EPOCH_FILE_PATH``, restricts the epochs to the annotated
    time span, re-bins them to ``EPOCH_LENGTH``, attaches the nearest
    annotation to every bin and evaluates both predictions with
    ``calc_metrics``.

    Parameters
    ----------
    subject : str
        Key of the subject's dataset in both HDF5 files.

    Returns
    -------
    pandas.DataFrame or None
        Two rows (one per method) with the columns "Subject", "Method",
        "Accuracy", "Precision", "Recall" and "F1 Score". ``None`` if
        anything went wrong; the problem is printed instead of raised.
    """
    try:
        # Only the time, y-axis and annotation columns are used below.
        time_col, _, y_col, _, target_col = COLNAMES

        data = pd.read_hdf(HDF5_FILE_PATH, key=subject)
        data = data[[time_col, target_col]]

        epochs = pd.read_hdf(EPOCH_FILE_PATH, key=subject)
        epochs = epochs[[time_col, y_col]]

        # Vectorised reductions, each computed once: the builtin min()/max()
        # would walk the whole column element by element in Python.
        data_start = data[time_col].min()
        epochs_start = epochs[time_col].min()

        if data_start.strftime('%Y-%d-%m') == epochs_start.strftime('%Y-%m-%d'):
            # Apparently d and m are switched in the data: swap them back for
            # the first timestamp and rebuild the time column from there.
            start_time = pd.Timestamp(data_start.strftime('%Y-%d-%m %H:%M:%S'))
            # 10 ms spacing, i.e. 100 samples per second -- presumably the
            # recording rate (cf. SAMPLE_RATE); confirm.
            data.loc[:, time_col] = pd.date_range(start_time, periods=data.shape[0], freq="10ms")
        else:
            start_time = data_start

        # Taken after the possible rebuild so it reflects the corrected times.
        end_time = data[time_col].max()

        # Keep only epochs strictly inside the annotated span, then re-bin.
        epochs = epochs.loc[(epochs[time_col] > start_time) & (epochs[time_col] < end_time), :]
        epochs = epochs.set_index(time_col).resample(EPOCH_LENGTH).sum()

        # Attach to every bin the annotation whose timestamp is closest.
        epochs = pd.merge_asof(epochs, data, on=time_col, direction="nearest")
        # Limit the summed y-axis values to the range [0, 300].
        epochs.loc[:, "Counts"] = np.clip(epochs[y_col], 0, 300)

        # Predict with Sadeh algorithm
        epochs.loc[:, "Sadeh"] = sadeh(epochs["Counts"])

        # Predict with Cole_Kripke algorithm / Divide by 100 for Cole Kripke algorithm
        epochs.loc[:, "Cole-Kripke"] = cole_kripke(epochs["Counts"] / 100)

        results = pd.DataFrame(columns=["Subject", "Method", "Accuracy", "Precision", "Recall", "F1 Score"])
        results.loc[0, :] = [subject, "Sadeh"] + list(calc_metrics(epochs[target_col], epochs["Sadeh"]))
        results.loc[1, :] = [subject, "Cole-Kripke"] + list(calc_metrics(epochs[target_col], epochs["Cole-Kripke"]))

        return results

    except Exception as msg:
        # Best effort: one broken subject must not abort the whole run.
        # pd.concat drops None entries when the per-subject results are
        # combined.
        print(f"Problem processing {subject}: {msg}")
        return None

## Calculate Sadeh and Cole-Kripke results for each subject

In [None]:
# Process the subjects in a pool of 200 worker processes. imap yields the
# per-subject results in the order of `subjects`; tqdm only adds the progress bar.
with Pool(200) as p:
    per_subject_results = tqdm(p.imap(process_subject, subjects), total=len(subjects))
    results = pd.concat(per_subject_results, axis=0, ignore_index=True)

## Simulate cross-validation to get an idea about the between fold variation

In [None]:
# -1 marks rows that have not been assigned to a fold
results.loc[:, "Fold"] = -1

# Do 10-fold cross-validation: only the test split of every fold is used, so
# each subject ends up in exactly one fold.
kf = KFold(n_splits=10)
for fold, (_, test_idx) in enumerate(kf.split(np.arange(len(subjects)))):
    subjects_in_fold = [subjects[idx] for idx in test_idx]
    # Vectorised membership test instead of a Python-level list per row
    results.loc[results["Subject"].isin(subjects_in_fold), "Fold"] = fold

# Make sure the output directory exists so the export cannot fail on a
# missing folder after all subjects have been processed.
os.makedirs("results", exist_ok=True)
results.to_csv("results/exp01_baseline.csv", index=False)