In [None]:
!pip install pandas

In [1]:
import os
import glob
import pandas as pd
from typing import Dict, List, Optional, Tuple, Counter
from pathlib import Path


In [None]:
LABEL_MAP: Dict[str, int] = {
    'null': 0, 'jogging': 1, 'jogging (rotating arms)': 2, 'jogging (skipping)': 3,
    'jogging (sidesteps)': 4, 'jogging (butt-kicks)': 5,
    'stretching (triceps)': 6, 'stretching (lunging)': 7, 'stretching (shoulders)': 8,
    'stretching (hamstrings)': 9, 'stretching (lumbar rotation)': 10,
    'push-ups': 11, 'push-ups (complex)': 12, 'sit-ups': 13, 'sit-ups (complex)': 14,
    'burpees': 15, 'lunges': 16, 'lunges (complex)': 17, 'bench-dips': 18
}
LABEL_MAP_LOWER = {k.lower(): v for k, v in LABEL_MAP.items()}
LOCATIONS: List[str] = ["right_arm", "left_arm", "right_leg", "left_leg"]
RAW_DIR   = Path('data')
OUTPUT_DIR = RAW_DIR/'dataset_without_null'
TRAIN_DIR = RAW_DIR / 'train'
WINDOW_SIZE    = 50
OVERLAP_FRAC   = 0.5

In [None]:
def is_reasonable(coord: float) -> bool:
    return pd.notna(coord) and -20.0 <= coord <= 20.0

def most_common_label(window_labels: List[int]) -> int:
    cnt = Counter(window_labels)
    return sorted(cnt.items(), key=lambda x: (-x[1], x[0]))[0][0]

def build_windowed_datasets(
    train_folder: str,
    locations: List[str] = LOCATIONS,
    window_size: int = WINDOW_SIZE,
    overlap_frac: float = OVERLAP_FRAC
) -> Dict[str, pd.DataFrame]:
    step = int(window_size * (1 - overlap_frac))
    buffers: Dict[str, List[Dict]] = {loc: [] for loc in locations}
    global_id = 0

    for csv_path in glob.glob(os.path.join(train_folder, "*.csv")):
        df = pd.read_csv(csv_path)
        if "sbj_id" not in df.columns or "label" not in df.columns:
            raise ValueError(f"Basis-Spalte fehlt in {csv_path}")

        df = df.dropna(subset=["label"])
        df["label"] = df["label"].map(LABEL_MAP)
        for sbj_id, df_sb in df.groupby("sbj_id", sort=False):
            n = len(df_sb)
            for loc in locations:
                cols_xyz = [f"{loc}_acc_{axis}" for axis in ("x", "y", "z")]
                if not set(cols_xyz).issubset(df_sb.columns):
                    continue
                arr = df_sb[cols_xyz].to_numpy()
                labels = df_sb["label"].to_numpy()

                for start in range(0, n - window_size + 1, step):
                    window = arr[start : start + window_size]
                    lbl = most_common_label(labels[start : start + window_size])

                    buffers[loc].append({
                        "id":        global_id,
                        "sbj_id":    int(sbj_id),
                        "label":     int(lbl),
                        "x_axis":    window[:, 0].tolist(),
                        "y_axis":    window[:, 1].tolist(),
                        "z_axis":    window[:, 2].tolist(),
                    })
                    global_id += 1

    datasets = {
        loc: pd.DataFrame(buffers[loc], columns=["id", "sbj_id", "label", "x_axis", "y_axis", "z_axis"])
        for loc in locations if buffers[loc]
    }
    return datasets

def save_windowed_datasets(
    datasets: Dict[str, pd.DataFrame],
    out_folder: str
) -> None:
    os.makedirs(out_folder, exist_ok=True)
    for loc, df in datasets.items():
        path = os.path.join(out_folder, f"{loc}_windows.csv")
        df.to_csv(path, index=False)
        print(f"→ {loc}: {len(df)} to {path}")

In [None]:
sets = build_windowed_datasets(TRAIN_DIR)
save_windowed_datasets(sets, OUTPUT_DIR)


In [3]:
df = pd.read_csv("data/dataset_without_null/left_arm_windows.csv")

In [4]:
df

Unnamed: 0,id,sbj_id,label,x_axis,y_axis,z_axis
0,3667,6,1,"[-0.9640337142857144, -0.923407619047619, -0.9...","[0.5363462857142857, 0.5922266666666667, 0.717...","[-0.3019099999999999, -0.3076314285714285, -0...."
1,3668,6,1,"[0.3273762857142857, 0.2956210476190476, 0.275...","[0.6088466666666666, 0.5863520476190476, 0.458...","[-0.2077032380952381, -0.1601060952380952, -0...."
2,3669,6,1,"[0.5694409523809524, 0.6152040952380953, 0.630...","[-0.1511814285714285, -0.2214044761904762, -0....","[-0.0180695238095238, -0.0593889523809523, -0...."
3,3670,6,1,"[-1.1438057142857143, -1.0607022857142856, -0....","[-0.5799330476190475, -0.4933303809523809, -0....","[0.0398948571428571, 0.032551619047619, -0.063..."
4,3671,6,1,"[0.972435238095238, 1.3619304761904762, 1.4231...","[-0.2945504761904762, 0.7389723809523807, 3.40...","[0.0324823809523809, 0.7876134285714285, 1.720..."
...,...,...,...,...,...,...
83520,327533,18,14,"[0.0166660952380952, 0.0179904285714285, 0.027...","[0.9632597142857144, 0.9632148571428572, 0.962...","[-0.2828838095238095, -0.3048982857142857, -0...."
83521,327534,18,14,"[0.0361668571428571, 0.0343228571428571, 0.032...","[0.9803518095238096, 0.9764653333333332, 0.976...","[-0.2946061904761904, -0.294466, -0.28995505, ..."
83522,327535,18,14,"[0.1306798095238095, 0.0334744166666666, -0.05...","[1.4025302380952382, 1.4302977916666666, 1.209...","[-0.1656064761904762, -0.2415722583333333, -0...."
83523,327536,18,14,"[0.3413825, 0.1800518214285714, 0.069964916666...","[1.7818789285714285, 1.5744374, 1.4250406, 1.2...","[0.3389154761904762, 0.3532424857142857, 0.064..."


In [6]:
from numpy import nan

In [7]:
df["x_axis"] = df["x_axis"].apply(lambda row: eval(row))

In [8]:
df["y_axis"] = df["y_axis"].apply(lambda row: eval(row))


In [9]:
df["z_axis"] = df["z_axis"].apply(lambda row: eval(row))


In [10]:
df["x_axis"] = df["x_axis"].apply(lambda row: [x if x != nan else 0.0 for x in row])


In [11]:
df["y_axis"] = df["y_axis"].apply(lambda row: [x if x != nan else 0.0 for x in row])


In [12]:
df["z_axis"] = df["z_axis"].apply(lambda row: [x if x != nan else 0.0 for x in row])


In [13]:
df.to_csv("data/dataset_without_null/left_arm_windows_wonan.csv", index=False)

In [14]:
df["x_axis"][0]

[-0.9640337142857144,
 -0.923407619047619,
 -0.9109012380952382,
 -0.9313628571428572,
 -0.944458,
 -0.9346265714285714,
 -0.9280142857142858,
 -0.9168411333333334,
 -0.8980509142857143,
 -0.8564607769423559,
 -0.7599015238095239,
 -0.6290490714285715,
 -0.526194761904762,
 -0.47111,
 -0.3985752857142857,
 -0.3384387142857143,
 -0.2929505,
 -0.1432769523809524,
 0.0946333333333333,
 0.2372001904761904,
 0.330331,
 0.4111863809523809,
 0.4555959047619047,
 0.4389366666666667,
 0.3818421904761905,
 0.3273762857142857,
 0.2956210476190476,
 0.2755233333333333,
 0.2371179761904762,
 0.1580119047619047,
 0.0484513571428571,
 -0.0550533333333333,
 -0.1193257380952381,
 -0.2042604761904762,
 -0.3708845,
 -0.46444,
 -0.3969200476190476,
 -0.3858691904761905,
 -0.4199212857142857,
 -0.3393458095238095,
 -0.2041188095238095,
 -0.0787106666666666,
 0.0662993333333333,
 0.2059894285714285,
 0.3040825476190476,
 0.3862823809523809,
 0.4648089047619047,
 0.5132498095238096,
 0.5244608333333334,
 0.5

In [2]:
df2 = pd.read_csv("data/dataset_without_null/left_arm_windows.csv")

In [3]:
import json

In [18]:
json.loads(df2["x_axis"][0])

[-0.9640337142857144,
 -0.923407619047619,
 -0.9109012380952382,
 -0.9313628571428572,
 -0.944458,
 -0.9346265714285714,
 -0.9280142857142858,
 -0.9168411333333334,
 -0.8980509142857143,
 -0.8564607769423559,
 -0.7599015238095239,
 -0.6290490714285715,
 -0.526194761904762,
 -0.47111,
 -0.3985752857142857,
 -0.3384387142857143,
 -0.2929505,
 -0.1432769523809524,
 0.0946333333333333,
 0.2372001904761904,
 0.330331,
 0.4111863809523809,
 0.4555959047619047,
 0.4389366666666667,
 0.3818421904761905,
 0.3273762857142857,
 0.2956210476190476,
 0.2755233333333333,
 0.2371179761904762,
 0.1580119047619047,
 0.0484513571428571,
 -0.0550533333333333,
 -0.1193257380952381,
 -0.2042604761904762,
 -0.3708845,
 -0.46444,
 -0.3969200476190476,
 -0.3858691904761905,
 -0.4199212857142857,
 -0.3393458095238095,
 -0.2041188095238095,
 -0.0787106666666666,
 0.0662993333333333,
 0.2059894285714285,
 0.3040825476190476,
 0.3862823809523809,
 0.4648089047619047,
 0.5132498095238096,
 0.5244608333333334,
 0.5

In [24]:
for i, entry in enumerate((df2[:49240]+df2[50607:])
["x_axis"]):
    print(i)
    json.loads(entry)

0


TypeError: the JSON object must be str, bytes or bytearray, not float

Unnamed: 0,id,sbj_id,label,x_axis,y_axis,z_axis
0,3667,6,1,"[-0.9640337142857144, -0.923407619047619, -0.9...","[0.5363462857142857, 0.5922266666666667, 0.717...","[-0.3019099999999999, -0.3076314285714285, -0...."
1,3668,6,1,"[0.3273762857142857, 0.2956210476190476, 0.275...","[0.6088466666666666, 0.5863520476190476, 0.458...","[-0.2077032380952381, -0.1601060952380952, -0...."
2,3669,6,1,"[0.5694409523809524, 0.6152040952380953, 0.630...","[-0.1511814285714285, -0.2214044761904762, -0....","[-0.0180695238095238, -0.0593889523809523, -0...."
3,3670,6,1,"[-1.1438057142857143, -1.0607022857142856, -0....","[-0.5799330476190475, -0.4933303809523809, -0....","[0.0398948571428571, 0.032551619047619, -0.063..."
4,3671,6,1,"[0.972435238095238, 1.3619304761904762, 1.4231...","[-0.2945504761904762, 0.7389723809523807, 3.40...","[0.0324823809523809, 0.7876134285714285, 1.720..."
...,...,...,...,...,...,...
83520,327533,18,14,"[0.0166660952380952, 0.0179904285714285, 0.027...","[0.9632597142857144, 0.9632148571428572, 0.962...","[-0.2828838095238095, -0.3048982857142857, -0...."
83521,327534,18,14,"[0.0361668571428571, 0.0343228571428571, 0.032...","[0.9803518095238096, 0.9764653333333332, 0.976...","[-0.2946061904761904, -0.294466, -0.28995505, ..."
83522,327535,18,14,"[0.1306798095238095, 0.0334744166666666, -0.05...","[1.4025302380952382, 1.4302977916666666, 1.209...","[-0.1656064761904762, -0.2415722583333333, -0...."
83523,327536,18,14,"[0.3413825, 0.1800518214285714, 0.069964916666...","[1.7818789285714285, 1.5744374, 1.4250406, 1.2...","[0.3389154761904762, 0.3532424857142857, 0.064..."


Unnamed: 0,id,sbj_id,label,x_axis,y_axis,z_axis
0,,,,,,
1,,,,,,
2,,,,,,
3,,,,,,
4,,,,,,
...,...,...,...,...,...,...
83520,,,,,,
83521,,,,,,
83522,,,,,,
83523,,,,,,


In [23]:
df2

Unnamed: 0,id,sbj_id,label,x_axis,y_axis,z_axis
0,3667,6,1,"[-0.9640337142857144, -0.923407619047619, -0.9...","[0.5363462857142857, 0.5922266666666667, 0.717...","[-0.3019099999999999, -0.3076314285714285, -0...."
1,3668,6,1,"[0.3273762857142857, 0.2956210476190476, 0.275...","[0.6088466666666666, 0.5863520476190476, 0.458...","[-0.2077032380952381, -0.1601060952380952, -0...."
2,3669,6,1,"[0.5694409523809524, 0.6152040952380953, 0.630...","[-0.1511814285714285, -0.2214044761904762, -0....","[-0.0180695238095238, -0.0593889523809523, -0...."
3,3670,6,1,"[-1.1438057142857143, -1.0607022857142856, -0....","[-0.5799330476190475, -0.4933303809523809, -0....","[0.0398948571428571, 0.032551619047619, -0.063..."
4,3671,6,1,"[0.972435238095238, 1.3619304761904762, 1.4231...","[-0.2945504761904762, 0.7389723809523807, 3.40...","[0.0324823809523809, 0.7876134285714285, 1.720..."
...,...,...,...,...,...,...
83520,327533,18,14,"[0.0166660952380952, 0.0179904285714285, 0.027...","[0.9632597142857144, 0.9632148571428572, 0.962...","[-0.2828838095238095, -0.3048982857142857, -0...."
83521,327534,18,14,"[0.0361668571428571, 0.0343228571428571, 0.032...","[0.9803518095238096, 0.9764653333333332, 0.976...","[-0.2946061904761904, -0.294466, -0.28995505, ..."
83522,327535,18,14,"[0.1306798095238095, 0.0334744166666666, -0.05...","[1.4025302380952382, 1.4302977916666666, 1.209...","[-0.1656064761904762, -0.2415722583333333, -0...."
83523,327536,18,14,"[0.3413825, 0.1800518214285714, 0.069964916666...","[1.7818789285714285, 1.5744374, 1.4250406, 1.2...","[0.3389154761904762, 0.3532424857142857, 0.064..."


In [12]:
df3 = pd.concat([df2[:49200], df2[50800:]])


In [13]:
df3

Unnamed: 0,id,sbj_id,label,x_axis,y_axis,z_axis
0,3667,6,1,"[-0.9640337142857144, -0.923407619047619, -0.9...","[0.5363462857142857, 0.5922266666666667, 0.717...","[-0.3019099999999999, -0.3076314285714285, -0...."
1,3668,6,1,"[0.3273762857142857, 0.2956210476190476, 0.275...","[0.6088466666666666, 0.5863520476190476, 0.458...","[-0.2077032380952381, -0.1601060952380952, -0...."
2,3669,6,1,"[0.5694409523809524, 0.6152040952380953, 0.630...","[-0.1511814285714285, -0.2214044761904762, -0....","[-0.0180695238095238, -0.0593889523809523, -0...."
3,3670,6,1,"[-1.1438057142857143, -1.0607022857142856, -0....","[-0.5799330476190475, -0.4933303809523809, -0....","[0.0398948571428571, 0.032551619047619, -0.063..."
4,3671,6,1,"[0.972435238095238, 1.3619304761904762, 1.4231...","[-0.2945504761904762, 0.7389723809523807, 3.40...","[0.0324823809523809, 0.7876134285714285, 1.720..."
...,...,...,...,...,...,...
83520,327533,18,14,"[0.0166660952380952, 0.0179904285714285, 0.027...","[0.9632597142857144, 0.9632148571428572, 0.962...","[-0.2828838095238095, -0.3048982857142857, -0...."
83521,327534,18,14,"[0.0361668571428571, 0.0343228571428571, 0.032...","[0.9803518095238096, 0.9764653333333332, 0.976...","[-0.2946061904761904, -0.294466, -0.28995505, ..."
83522,327535,18,14,"[0.1306798095238095, 0.0334744166666666, -0.05...","[1.4025302380952382, 1.4302977916666666, 1.209...","[-0.1656064761904762, -0.2415722583333333, -0...."
83523,327536,18,14,"[0.3413825, 0.1800518214285714, 0.069964916666...","[1.7818789285714285, 1.5744374, 1.4250406, 1.2...","[0.3389154761904762, 0.3532424857142857, 0.064..."


In [14]:
for i, entry in enumerate(df3["x_axis"]):
    try:
        json.loads(entry)
    except:
        print(i, entry)