In [1]:
import os
import sys
sys.path.append("..")

In [2]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import collections as matcoll
%matplotlib inline

In [3]:
from data_utils.dataloader import Dataloader
from data_utils.preprocessor import Preprocessor
from data_utils import X_HEADER

In [4]:
# Training data lives one directory above this notebook, under data/TrainingData.
data_base_path = os.path.join(os.pardir, "data", "TrainingData")
# Loader rooted at that folder; used below to load each session's data.
dataloader = Dataloader(data_base_path)

In [6]:
# Read the split-id file and keep the ids stored under its "train" key.
metadata_path = "metadata"
split_ids_file = os.path.join(metadata_path, "split_ids.json")
# JSON is UTF-8 by specification; state the encoding explicitly so the read
# does not depend on the platform's locale encoding.
with open(split_ids_file, "r", encoding="utf-8") as f:
    split_ids = json.load(f)
train_ids = split_ids["train"]

In [8]:
# Collect, for every training id, the per-column min and max of its `x` frame.
samplewise_stats = []
for uid in train_ids:
    # Each id is split on "_" into a subject part and a session part.
    subject_id, session_number = uid.split("_")
    try:
        x, y = dataloader.load_and_join_data(subject_id, session_number)
        stats = (
            x.describe()
            .reset_index(drop=False)
            .rename(columns={"index": "stat"})
            # describe() yields several summary rows; only the extremes are kept.
            .loc[lambda summary: summary["stat"].isin(["min", "max"])]
            # The "time" column is excluded from the statistics.
            .drop(columns="time")
        )
        samplewise_stats.append(stats)
        print(f"Subject {subject_id} session {session_number} done.")
    except FileNotFoundError:
        # Missing sessions are reported and skipped rather than aborting the run.
        print(f"Subject {subject_id} session {session_number} not found.")

Subject 005 session 02 done.
Subject 001 session 06 done.
Subject 003 session 02 done.
Subject 001 session 05 done.
Subject 002 session 02 done.
Subject 003 session 01 done.
Subject 003 session 03 done.
Subject 005 session 01 done.
Subject 001 session 07 done.
Subject 002 session 05 done.
Subject 004 session 02 done.
Subject 002 session 03 done.
Subject 001 session 02 done.
Subject 002 session 04 done.
Subject 001 session 03 done.
Subject 004 session 01 done.


In [9]:
# The loading loop only prints when a session file is missing, so it can finish
# without collecting anything. Fail here with an actionable message instead of
# pandas' generic "No objects to concatenate" (same exception type).
if not samplewise_stats:
    raise ValueError(
        "No per-session statistics were collected; "
        "check the data path and the ids in the split file."
    )
# Stack the per-session min/max rows into one frame (one "stat" column plus features).
stat_df = pd.concat(samplewise_stats, axis=0)

In [10]:
# Names of the statistics to aggregate across sessions; each name is used below
# both as the groupby reduction to apply and as the "stat" row to keep.
stats_to_find = ["min", "max"]

In [11]:
# For every requested statistic, reduce the per-session rows with that same
# statistic (min of the "min" rows, max of the "max" rows) and keep only the
# row whose label matches the statistic.
aggregate_stats = []
for stat in stats_to_find:
    agg = (
        stat_df.groupby("stat")
        .agg(stat)
        .reset_index(drop=False)
        .loc[lambda reduced: reduced["stat"] == stat]
    )
    aggregate_stats.append(agg)

In [12]:
# Stack the per-statistic rows into one table with a fresh 0..n-1 index,
# then display it as the cell output.
final_stats = pd.concat(aggregate_stats, axis=0, ignore_index=True)
final_stats

Unnamed: 0,stat,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z
0,min,-39.13261,-38.92137,-31.50025,-11.62605,-12.19817,-6.345545
1,max,39.26,39.49,38.1886,10.72668,10.93212,8.093803


In [13]:
# Write the aggregated table next to the split file. `metadata_path` (defined
# in the split-loading cell) is reused instead of repeating the "metadata"
# literal, so the folder is configured in a single place.
final_stats.to_csv(os.path.join(metadata_path, "statistics.csv"), header=True, index=False)

In [14]:
# Reshape the table into {feature: {stat_name: value}} for every feature listed
# in X_HEADER; the inner keys follow the row order of final_stats.
stats_dict = {
    feature: {row["stat"]: row[feature] for _, row in final_stats.iterrows()}
    for feature in X_HEADER
}

In [15]:
# Display the nested {feature: {stat: value}} mapping before it is written to disk.
stats_dict

{'acc_x': {'min': -39.13261, 'max': 39.26},
 'acc_y': {'min': -38.92137, 'max': 39.49},
 'acc_z': {'min': -31.50025, 'max': 38.1886},
 'gyro_x': {'min': -11.62605, 'max': 10.72668},
 'gyro_y': {'min': -12.19817, 'max': 10.93212},
 'gyro_z': {'min': -6.345545, 'max': 8.093803}}

In [16]:
# Save the nested statistics mapping as JSON. `metadata_path` (defined in the
# split-loading cell) is reused instead of repeating the "metadata" literal,
# and the encoding is explicit so the file does not depend on the platform locale.
with open(os.path.join(metadata_path, "statistics.json"), "w", encoding="utf-8") as f:
    json.dump(stats_dict, f, indent=2)