In [None]:
# CELL 0 — IMPORTS, LOCATION
import os, time, json
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, recall_score
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras import layers, models, optimizers, Sequential

from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

import time, io
from datetime import datetime

from google.colab import drive

# Mount Google Drive so the project folders below are reachable from Colab.
drive.mount('/content/drive', force_remount=True)

# Step-1 folder: scanned below for one sub-folder per subject.
data_base = Path(
    "/content/drive/MyDrive/Colab Notebooks/classifier_2026_locomotion_mode/step1_labelled")
data_base.mkdir(parents=True, exist_ok=True)

# Step-3 output root (created if missing).
OUTPUT_ROOT = Path(
    "/content/drive/MyDrive/Colab Notebooks/classifier_2026_locomotion_mode/step3_models")
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

# Step-2 root under which the combined / windowed datasets are written.
DATASET_ROOT = Path(
    "/content/drive/MyDrive/Colab Notebooks/classifier_2026_locomotion_mode/step2_combined")
DATASET_ROOT.mkdir(parents=True, exist_ok=True)

# A subject is any *directory* whose name starts with "sub" (case-insensitive).
# Plain files with a matching name (e.g. "subjects.txt") are ignored: the later
# cells treat every entry as a folder and glob inside it.
subject_folders = sorted(
    entry.name
    for entry in data_base.iterdir()
    if entry.is_dir() and entry.name.lower().startswith("sub")
)
assert len(subject_folders) > 1, "Need at least 2 subjects for LOSO"
print("Subjects found:", subject_folders)

# Timestamp helps to see which cell was executed last.
print('\nCell 0 done at', datetime.now())

Min-Max Scaling

In [None]:
# CELL 1.1 — Normalize all CSVs with a per-subject, per-sensor min-max scaling
# derived from the pre-trial calibration recording (*_normal.csv).
#
# WARNING: the data CSVs are overwritten in place, so this cell is NOT
# idempotent: running it twice on the same subject rescales data that is
# already scaled. To keep a failure from leaving the folder half-converted,
# everything is validated first (phase 1) and files are only written once all
# subjects passed (phase 2).

# ---- Phase 1: read calibration data and validate, without writing anything.
normalization_plan = []  # (subject, col_min, col_range, data CSV paths)

for sub in subject_folders:
    sub_path = data_base / sub

    # Find normalization CSV. It is absent once Cell 1.2 has removed it.
    normal_files = sorted(sub_path.glob("*_normal.csv"))
    if not normal_files:
        print(f"{sub}: no *_normal.csv found - skipping (already normalized?)")
        continue

    normal_path = normal_files[0]
    normal_df = pd.read_csv(normal_path)

    assert normal_df.shape[1] == 6, f"{normal_path.name}: expected 6 columns" # 6 sensors

    # Reference min / range (column-wise)
    col_min = normal_df.min()
    col_range = normal_df.max() - col_min

    # A constant calibration signal would divide by zero and turn the whole
    # sensor column into NaN/Inf, which later cells silently drop.
    flat_sensors = col_range[col_range == 0].index.tolist()
    if flat_sensors:
        raise ValueError(
            f"{normal_path.name}: calibration range is zero for {flat_sensors}; "
            "cannot min-max scale (no file has been modified)"
        )

    data_paths = [p for p in sorted(sub_path.glob("*.csv")) if p != normal_path]
    for csv_path in data_paths:
        header = pd.read_csv(csv_path, nrows=0)  # header only: cheap validation
        assert header.shape[1] == 6, f"{csv_path.name}: expected 6 columns" # 6 sensors

        # The scaling aligns on column labels; differing labels would yield
        # all-NaN columns that then overwrite the original file.
        if set(header.columns) != set(normal_df.columns):
            raise ValueError(
                f"{csv_path.name}: column names differ from {normal_path.name} "
                "(no file has been modified)"
            )

    normalization_plan.append((sub, col_min, col_range, data_paths))

# ---- Phase 2: min-max scale and overwrite the validated CSVs.
for sub, col_min, col_range, data_paths in normalization_plan:
    print(f"\nProcessing {sub} ...")

    for csv_path in data_paths:
        df = pd.read_csv(csv_path)

        df_norm = (df - col_min) / col_range
        df_norm = df_norm.clip(0.0, 1.0) # safety clipping

        # Overwrite original file
        df_norm.to_csv(csv_path, index=False)

    print(f"{sub}: normalization complete")


print("\nCell 1.1 done at", datetime.now())

In [None]:
# CELL 1.2 — Check that the normalized CSVs are fine. If yes, delete the
# normalization CSV.
#
# Deleting the calibration file is irreversible, so it is only removed when
# every other CSV of the subject lies inside [0, 1] (what Cell 1.1 produces
# after clipping). This is a necessary condition only: raw data that happens to
# lie in [0, 1] would pass as well.

for sub in subject_folders:
    sub_path = data_base / sub
    print(f"\nChecking {sub} ...")

    # Find normalization CSV; it is already gone when this cell is re-run.
    normal_files = sorted(sub_path.glob("*_normal.csv"))
    if not normal_files:
        print(f"{sub}: no normalization CSV found (already removed?)")
        continue
    normal_path = normal_files[0]

    # Collect data files that still hold values outside the unit interval.
    # NaN compares False on both sides and is therefore not reported here.
    unscaled = []
    for csv_path in sorted(sub_path.glob("*.csv")):
        if csv_path == normal_path:
            continue
        numeric = pd.read_csv(csv_path).select_dtypes(include=[np.number])
        if ((numeric < 0.0) | (numeric > 1.0)).any().any():
            unscaled.append(csv_path.name)

    if unscaled:
        print(
            f"{sub}: WARNING - values outside [0, 1] in {unscaled}; "
            f"keeping {normal_path.name}"
        )
        continue

    # remove normalization CSV, not reqd. anymore
    normal_path.unlink()
    print(f"{sub}: removed {normal_path.name}")


print("\nCell 1.2 done at", datetime.now())

During manual preprocessing, each subject's data were manually split into activity-specific segmented CSVs using lap-counting stopwatch timestamps.

Combine scaled CSVs of same class from selected subjects.

Each combined CSV also records the source subject of every row, so the merge can be verified manually. After all, correct data -> proper training!

In [None]:
# Cell 2 - Combine selected scaled datasets

def cell2_combine_per_class(
    subjects_to_use,
    data_base,
    combined_data_base,
    tag
):
    """Merge the per-subject CSVs of every class into one CSV per class.

    Args:
        subjects_to_use: subject folder names whose files are merged.
        data_base: directory containing one sub-folder per subject, each with
            files named ``<subject>_<classID>.csv``.
        combined_data_base: existing directory receiving
            ``combined_class_<classID>_<tag>.csv``.
        tag: suffix appended to every output file name.

    Raises:
        RuntimeError: if no ``<subject>_*.csv`` file exists for any subject.

    Rows containing NaN or +/-Inf are dropped before merging. Every kept row
    gets a ``source_subject`` column naming the subject it came from.
    """
    print("Subjects used for combination:")
    for subject in subjects_to_use:
        print("  -", subject)

    # The class ID is whatever follows the last underscore of the file stem.
    class_ids = sorted({
        csv_file.stem.split("_")[-1]
        for subject in subjects_to_use
        for csv_file in (data_base / subject).glob(f"{subject}_*.csv")
    })

    print("\nDetected class IDs:")
    for detected_id in class_ids:
        print("  -", detected_id)

    if len(class_ids) == 0:
        raise RuntimeError("No class CSV files found.")

    for class_id in class_ids:
        per_subject_frames = []

        for subject in subjects_to_use:
            file_path = data_base / subject / f"{subject}_{class_id}.csv"
            if file_path.exists():
                raw = pd.read_csv(file_path)

                # Treat +/-Inf like missing values and remove affected rows.
                cleaned = raw.replace([np.inf, -np.inf], np.nan).dropna()
                n_dropped = len(raw) - len(cleaned)

                if len(cleaned) == 0:
                    print(f"WARNING: {file_path} empty after cleanup")
                else:
                    cleaned["source_subject"] = subject
                    per_subject_frames.append(cleaned)

                    if n_dropped != 0:
                        print(f"Cleaned {file_path.name}: dropped {n_dropped} rows")

        if per_subject_frames:
            combined_df = pd.concat(per_subject_frames, ignore_index=True)

            out_path = combined_data_base / f"combined_class_{class_id}_{tag}.csv"
            combined_df.to_csv(out_path, index=False)

            print(f"Saved: {out_path} | rows = {len(combined_df)}")
        else:
            print(f"Skipping class {class_id} (no valid data)")

    print("\nCell 2 done at", datetime.now())

In [None]:
# Cell 3 - User may confirm combined individual classes. Once confirmed, drop source_subject column

def cell3_confirm_and_clean(combined_data_base, tag):
    """Strip the ``source_subject`` helper column from the combined class CSVs.

    Args:
        combined_data_base: directory holding ``combined_class_*_<tag>.csv``.
        tag: dataset tag used in the file names.

    Every matching file is rewritten in place, whether or not it still
    contained the column.
    """
    pattern = f"combined_class_*_{tag}.csv"
    combined_files = sorted(combined_data_base.glob(pattern))

    print("Combined class files detected:")
    for detected in combined_files:
        print("  -", detected.name)

    for csv_file in combined_files:
        # errors="ignore" makes the drop a no-op when the column is absent.
        frame = pd.read_csv(csv_file).drop(
            columns=["source_subject"], errors="ignore"
        )
        frame.to_csv(csv_file, index=False)

        n_rows, n_cols = len(frame), frame.shape[1]
        print(f"Processed {csv_file.name}: rows={n_rows}, cols={n_cols}")

    print("\nCell 3 done at", datetime.now())

Build segment wise combined dataset

In [None]:
# Cell 4 - Build the First combined dataset

def cell4_build_all_combined(
    combined_data_base,
    window_size,
    step_distance,
    tag
):
    """Cut every combined class CSV into sliding windows and save them as one long CSV.

    Args:
        combined_data_base: directory holding ``combined_class_<classID>_<tag>.csv``;
            the output ``all_combined_<tag>.csv`` is written there as well.
        window_size: number of consecutive rows per window (segment).
        step_distance: row offset between the starts of two successive windows.
        tag: dataset tag used in the file names.

    Output columns: ``segment_id`` (unique across all classes), ``t`` (row
    position inside the window), ``f0..f{n-1}`` (features, stored with float32
    precision) and ``label`` (integer class ID parsed from the file name).

    Raises:
        RuntimeError: if no combined class CSV matches, a class label cannot be
            parsed from a file name, or the files disagree on the column count.

    NOTE(review): windows are cut over the whole concatenated class file, so a
    window can span the boundary between two subjects' recordings once the
    ``source_subject`` column has been removed - confirm this is acceptable.
    """
    def build_sequences_generic(df, time_steps, step, class_label):
        """Return (X, y): X is (n_windows, time_steps, n_features) float32, y the labels."""
        values = df.values
        num_features = values.shape[1]

        # "+ 1": the last admissible start index is len(values) - time_steps.
        # Without it the final full window is lost whenever it ends exactly on
        # the last row (e.g. a file with exactly `time_steps` rows gave none).
        starts = range(0, len(values) - time_steps + 1, step)

        X = np.empty((len(starts), time_steps, num_features), dtype=np.float32)
        for k, start in enumerate(starts):
            X[k] = values[start:start + time_steps]
        y = np.full(len(starts), class_label)

        return X, y

    combined_files = sorted(combined_data_base.glob(f"combined_class_*_{tag}.csv"))
    if not combined_files:
        raise RuntimeError(
            f"No combined class CSVs found in {combined_data_base} for tag {tag}"
        )

    frames = []
    segment_offset = 0
    num_features = None
    feature_cols = None

    print("Processing files and shapes:\n")

    for file_path in combined_files:
        stem_parts = file_path.stem.split("_")

        # Expected pattern:
        # combined_class_<classID>_W.._S.._LOSO_subX
        try:
            class_idx = stem_parts.index("class")
            class_label = int(stem_parts[class_idx + 1])
        except (ValueError, IndexError):
            raise RuntimeError(
                f"Failed to parse class label from filename: {file_path.name}"
            )

        df = pd.read_csv(file_path)

        print(file_path.name)
        print("  Input CSV shape:", df.shape)

        # All class files must share one feature layout, otherwise the rows
        # could not be stacked under common f0..fN columns.
        if num_features is None:
            num_features = df.shape[1]
            feature_cols = [f"f{i}" for i in range(num_features)]
        elif df.shape[1] != num_features:
            raise RuntimeError(
                f"{file_path.name}: expected {num_features} columns, found {df.shape[1]}"
            )

        X, y = build_sequences_generic(
            df,
            window_size,
            step_distance,
            class_label
        )

        print("  Segments shape (X):", X.shape)
        print("  Labels shape (y):", y.shape)

        n_segments, n_steps = X.shape[0], X.shape[1]
        if n_segments == 0:
            print(f"  WARNING: fewer than {window_size} rows, no segment built")
            continue

        # Flatten (segment, t, feature) -> one row per (segment, t). The cast
        # back to float64 keeps the float32-rounded values in the CSV.
        block = pd.DataFrame(
            X.reshape(-1, num_features).astype(np.float64),
            columns=feature_cols
        )
        block.insert(0, "t", np.tile(np.arange(n_steps), n_segments))
        block.insert(
            0,
            "segment_id",
            np.repeat(np.arange(segment_offset, segment_offset + n_segments), n_steps)
        )
        block["label"] = np.repeat(y, n_steps)
        frames.append(block)

        segment_offset += n_segments

    columns = ["segment_id", "t"] + feature_cols + ["label"]
    if frames:
        final_df = pd.concat(frames, ignore_index=True)
    else:
        final_df = pd.DataFrame(columns=columns)

    output_path = combined_data_base / f"all_combined_{tag}.csv"
    final_df.to_csv(output_path, index=False)

    # Re-read the written file so the check covers what is actually on disk.
    df_check = pd.read_csv(output_path)

    nan_count = df_check.isna().sum().sum()
    inf_count = np.isinf(df_check.select_dtypes(include=[np.number])).sum().sum()

    print("\n=== NaN / Inf CHECK: all_combined.csv ===")
    print("Shape:", df_check.shape)
    print("Total NaN values:", nan_count)
    print("Total Inf values:", inf_count)

    if nan_count == 0 and inf_count == 0:
        print("OK: all_combined.csv has no NaN/Inf")
    else:
        print("WARNING: NaN/Inf present")

    print("\nFinal output:")
    print("  all_combined.csv shape:", final_df.shape)
    print("  Saved to:", output_path)

    print("\nCell 4 done at", datetime.now())

Break into validation set

In [None]:
# Cell 5 - Build class-balanced validation dataset (segment-wise breaking)

def cell5_build_validation(combined_data_base, tag, val_ratio=0.20, seed=None):
    """Split ``all_combined_<tag>.csv`` segment-wise into validation and training CSVs.

    For every label, ``ceil(n_segments * val_ratio)`` randomly chosen segments
    are moved to the validation file; all remaining rows form the training file.

    Args:
        combined_data_base: directory holding the input and receiving the outputs.
        tag: dataset tag used in the file names.
        val_ratio: fraction of segments per class that goes to validation.
        seed: optional integer making the split reproducible. ``None`` (default)
            keeps using NumPy's global random state.

    Writes:
        ``all_combined_val_<tag>.csv`` and
        ``all_combined_train_before_shuffle_<tag>.csv``.

    Raises:
        ValueError: if the input contains no segments.

    NOTE(review): the split is by segment_id only. When windows overlap
    (stride < window size) a validation window shares raw samples with
    neighbouring training windows - confirm this is intended.
    """
    input_path = combined_data_base / f"all_combined_{tag}.csv"
    val_output_path = combined_data_base / f"all_combined_val_{tag}.csv"
    train_output_path = combined_data_base / f"all_combined_train_before_shuffle_{tag}.csv"

    df = pd.read_csv(input_path)

    print("Initial dataset shape:", df.shape)

    # Both objects expose .shuffle(); a private RandomState avoids touching the
    # global stream when a seed is requested.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # One pass over the frame: segment_id -> positional row indices. Replaces a
    # full-frame boolean scan per selected segment.
    rows_by_segment = df.groupby("segment_id", sort=False).indices

    val_positions = []
    segments_to_drop = set()

    for class_label, class_df in df.groupby("label"):
        print(f"\nProcessing class {class_label}")

        segment_ids = class_df["segment_id"].unique()
        num_segments = len(segment_ids)
        num_val_segments = int(np.ceil(num_segments * val_ratio))

        print("  Total segments:", num_segments)
        print("  Validation segments:", num_val_segments)

        segment_ids = segment_ids.copy()
        rng.shuffle(segment_ids)

        selected_segments = segment_ids[:num_val_segments]

        for seg_id in selected_segments:
            val_positions.append(rows_by_segment[seg_id])
            segments_to_drop.add(seg_id)

    if not val_positions:
        raise ValueError(f"No segments found in {input_path}; cannot build validation set")

    # Rows are emitted segment by segment in selection order.
    val_df = df.iloc[np.concatenate(val_positions)].reset_index(drop=True)
    val_df.to_csv(val_output_path, index=False)

    print("\nValidation set shape:", val_df.shape)

    train_df = df[~df["segment_id"].isin(segments_to_drop)]
    train_df.to_csv(train_output_path, index=False)

    print("Remaining training set shape:", train_df.shape)
    print("\nCell 5 done at", datetime.now())

Optional shuffling

In [None]:
# Cell 6 - Window-wise shuffle of training data

def cell6_shuffle_train(combined_data_base, tag):
    """Randomly reorder the training segments while keeping each segment intact.

    Reads ``all_combined_train_before_shuffle_<tag>.csv`` from
    ``combined_data_base``, permutes whole ``segment_id`` groups using NumPy's
    global random state, and writes the result to
    ``all_combined_train_after_shuffle_<tag>.csv``.
    """
    source_csv = combined_data_base / f"all_combined_train_before_shuffle_{tag}.csv"
    target_csv = combined_data_base / f"all_combined_train_after_shuffle_{tag}.csv"

    train_df = pd.read_csv(source_csv)

    # One frame per segment; row order inside a segment is left untouched.
    segment_frames = []
    for _segment_id, segment_rows in train_df.groupby("segment_id"):
        segment_frames.append(segment_rows)

    np.random.shuffle(segment_frames)

    shuffled_df = pd.concat(segment_frames, ignore_index=True)
    shuffled_df.to_csv(target_csv, index=False)

    print("Training data shuffled window-wise")
    print("Final training shape:", shuffled_df.shape)
    print("\nCell 6 done at", datetime.now())

Check if training data is correct in all aspects.

In [None]:
# Cell 7 - Sanity checks for train/validation split

def cell7_sanity_checks(combined_data_base, WINDOW_SIZE, tag, check_counter):
    """Run integrity checks on the train / validation CSVs of one dataset.

    Eight checks are evaluated: NaN/Inf (train, val), segment size (train, val),
    segment leakage between the two files, ``t`` ordering (train, val) and
    column consistency. Class distributions are printed for information only.

    Args:
        combined_data_base: directory holding
            ``all_combined_train_after_shuffle_<tag>.csv`` and
            ``all_combined_val_<tag>.csv``.
        WINDOW_SIZE: expected number of rows per segment.
        tag: dataset tag used in the file names.
        check_counter: starting value of the pass counter (callers pass 0).

    Returns:
        int: ``check_counter`` plus the number of passed checks (8 when all pass).
    """
    train_path = combined_data_base / f"all_combined_train_after_shuffle_{tag}.csv"
    val_path   = combined_data_base / f"all_combined_val_{tag}.csv"

    train_df = pd.read_csv(train_path)
    val_df   = pd.read_csv(val_path)

    print("=== BASIC SHAPES ===")
    print("Train shape:", train_df.shape)
    print("Val shape  :", val_df.shape)
    print("Total rows:", train_df.shape[0] + val_df.shape[0])

    # The helpers below return True/False instead of incrementing a counter
    # argument: ints are immutable, so an increment inside a helper never
    # reaches the caller.

    # NaN / Inf checks
    print("\n=== NaN / Inf CHECKS ===")

    def nan_inf_check(df, name):
        """Return True when df holds neither NaN nor +/-Inf; print details otherwise."""
        nan_cols = df.isna().sum()
        inf_cols = np.isinf(df.select_dtypes(include=[np.number])).sum()
        nan_count = nan_cols.sum()
        inf_count = inf_cols.sum()

        print(f"{name}:")
        print(f"  Total NaN values : {nan_count}")
        print(f"  Total Inf values : {inf_count}")

        if nan_count == 0 and inf_count == 0:
            print(f"  {name}: OK (no NaN / Inf detected)")
            return True

        print(f"  {name}: WARNING — NaN / Inf present")

        # Per-column breakdown (only columns with issues)
        nan_cols = nan_cols[nan_cols > 0]
        inf_cols = inf_cols[inf_cols > 0]

        if not nan_cols.empty:
            print("  Columns with NaNs:")
            print(nan_cols)

        if not inf_cols.empty:
            print("  Columns with Infs:")
            print(inf_cols)

        return False

    check_counter += int(nan_inf_check(train_df, "Train"))
    check_counter += int(nan_inf_check(val_df, "Val"))

    # Segment integrity check
    print("\n=== SEGMENT INTEGRITY CHECK ===")

    def check_segment_sizes(df, name):
        """Return True when every segment has exactly WINDOW_SIZE rows."""
        counts = df.groupby("segment_id").size()
        bad = counts[counts != WINDOW_SIZE]
        if len(bad) == 0:
            print(f"{name}: OK (all segments size = {WINDOW_SIZE})")
            return True
        print(f"{name}: FAILED")
        print(bad.head())
        return False

    check_counter += int(check_segment_sizes(train_df, "Train"))
    check_counter += int(check_segment_sizes(val_df, "Val"))

    # Segment data leakage check
    print("\n=== LEAKAGE CHECK (segment overlap) ===")

    train_segments = set(train_df["segment_id"].unique())
    val_segments   = set(val_df["segment_id"].unique())

    overlap = train_segments.intersection(val_segments)

    print("Train segments:", len(train_segments))
    print("Val segments  :", len(val_segments))
    print("Segment Overlap       :", len(overlap))

    if len(overlap) == 0:
        print("OK: No segment leakage")
        check_counter += 1
    else:
        print("FAILED: Overlapping segment_ids detected")

    # Class distribution (rows)
    print("\n=== CLASS DISTRIBUTION (rows) ===")

    train_class_counts = train_df["label"].value_counts().sort_index()
    val_class_counts   = val_df["label"].value_counts().sort_index()

    class_dist = pd.DataFrame({
        "train_rows": train_class_counts,
        "val_rows": val_class_counts,
        "val_ratio": val_class_counts / (train_class_counts + val_class_counts)
    })

    print(class_dist)

    # Class distribution (segments)
    print("\n=== CLASS DISTRIBUTION (segments) ===")

    train_seg_counts = train_df.groupby("label")["segment_id"].nunique()
    val_seg_counts   = val_df.groupby("label")["segment_id"].nunique()

    seg_dist = pd.DataFrame({
        "train_segments": train_seg_counts,
        "val_segments": val_seg_counts
    })

    print(seg_dist)

    # Window order check
    print("\n=== WINDOW ORDER CHECK ===")

    expected_t = np.arange(WINDOW_SIZE)

    def check_order(df, name):
        """Return True when every segment's t column reads 0..WINDOW_SIZE-1 in order."""
        for seg_id, seg_df in df.groupby("segment_id"):
            # array_equal is False (instead of raising) when the lengths differ.
            if not np.array_equal(seg_df["t"].values, expected_t):
                print(f"{name}: ordering issue in segment {seg_id}")
                return False
        print(f"{name}: OK (t ordering preserved)")
        return True

    check_counter += int(check_order(train_df, "Train"))
    check_counter += int(check_order(val_df, "Val"))

    # Column consistency
    print("\n=== COLUMN CONSISTENCY ===")

    if list(train_df.columns) == list(val_df.columns):
        print("OK: Train and Val columns identical")
        check_counter += 1
    else:
        print("FAILED: Column mismatch")

    print("\n=== FINAL SUMMARY ===")

    # 2 NaN/Inf + 2 segment size + 1 leakage + 2 order + 1 columns = 8 checks
    print("\ncheck count: ", check_counter)
    if check_counter == 8:
        print("\nOK: All checks passed :-)")
    elif check_counter < 8 and check_counter > 0:
        print("! Failed ! : Some checks failed")
    else:
        print("!!! FAILED !!!: No checks passed")

    total_rows = len(train_df) + len(val_df)
    print("\nTrain rows:", len(train_df))
    print("Val rows  :", len(val_df))
    # Guard the ratio so two empty files do not raise ZeroDivisionError.
    print("Val ratio :", len(val_df) / total_rows if total_rows else float("nan"))

    print("\nCell 7 done at", datetime.now())

    return check_counter

Loop over all LOSO subjects and all window–stride combinations

In [None]:
# ============================================================
# CELL 8 — DATASET MATERIALIZATION ORCHESTRATOR
#
# Build and freeze datasets ONCE:
#  - For each (WINDOW_SIZE, STEP_DISTANCE)
#  - For each LOSO subject
#  - Calls Cell 2 to Cell 7
# ============================================================

# ----------------------------
# CONFIGURATION
# ----------------------------

# (window size, stride) pairs, both counted in rows of the source CSVs.
WINDOW_STRIDE_CONFIGS = [
    (100, 20),
    (100, 50),
    (50, 10),
    (50, 25)
]

# The subject list is fixed by hand so that known-bad recordings stay excluded.
SUBJECTS = [
    "sub1",
    "sub2",
    # "sub3", # bad data
    "sub4",
    "sub5",
    "sub6",
    "sub7",
    "sub8",
    "sub9",
    "sub10",
    "sub11",
]

# Cross-check the fixed list against the folders on disk. A configured subject
# without a folder would otherwise be skipped silently when the per-class files
# are combined.
available_subjects = {
    d for d in os.listdir(data_base) if d.lower().startswith("sub")
}
missing_subjects = [s for s in SUBJECTS if s not in available_subjects]
if missing_subjects:
    print("WARNING: configured subjects without a folder in", data_base)
    for s in missing_subjects:
        print("  -", s)

print("\n========== DATASET MATERIALIZATION ==========")
print("Subjects detected:")
for s in SUBJECTS:
    print("  -", s)

print("\nWindow–Stride combinations:")
for W, S in WINDOW_STRIDE_CONFIGS:
    print(f"  - W{W}, S{S}")

print("\nDataset root:")
print(" ", DATASET_ROOT)

print("\nStarting dataset creation...\n")

# Build one dataset per (window, stride) configuration and per held-out
# (LOSO) subject by running Cell 2 .. Cell 7 in order, then verify the expected
# files exist and delete the intermediate ones.
for WINDOW_SIZE, STEP_DISTANCE in WINDOW_STRIDE_CONFIGS:

    print("\n" + "=" * 80)
    print(f"WINDOW = {WINDOW_SIZE}, STRIDE = {STEP_DISTANCE}")
    print("=" * 80)

    for TEST_SUBJECT in SUBJECTS:
        # Wall-clock start, reported at the end of this iteration.
        start_time = time.time()

        print("\n" + "-" * 70)
        print(f"LOSO SUBJECT: {TEST_SUBJECT}")
        print("-" * 70)

        # TAG (used in ALL filenames)
        tag = f"W{WINDOW_SIZE}_S{STEP_DISTANCE}_LOSO_{TEST_SUBJECT}"
        # print("Tag:", tag)

        # OUTPUT DIRECTORY FOR THIS DATASET
        # Layout: DATASET_ROOT / W<window>_S<stride> / LOSO_<subject>
        combined_data_base = (
            DATASET_ROOT
            / f"W{WINDOW_SIZE}_S{STEP_DISTANCE}"
            / f"LOSO_{TEST_SUBJECT}"
        )

        combined_data_base.mkdir(parents=True, exist_ok=True)
        # Explicit re-check after mkdir; fails loudly if the folder is still absent.
        if not combined_data_base.is_dir():
            raise RuntimeError(f"Failed to create directory: {combined_data_base}. Please check Google Drive connection and permissions.")

        print("\n[Step 1] Dataset directory found:")
        print(" ", combined_data_base)

        # SUBJECTS USED FOR TRAINING
        # Everyone except the held-out subject; the held-out subject's data is
        # not written into this dataset folder by this cell.
        subjects_to_use = [s for s in SUBJECTS if s != TEST_SUBJECT]

        print("Training subjects:")
        for s in subjects_to_use:
            print("  -", s)

        # =====================================================
        # CELL 2 — COMBINE PER-CLASS DATA
        # =====================================================
        print("\n[Step 2] Combining per-class data")
        cell2_combine_per_class(
            subjects_to_use=subjects_to_use,
            data_base=data_base,
            combined_data_base=combined_data_base,
            tag=tag
        )

        # =====================================================
        # CELL 3 — CONFIRM & CLEAN COMBINED CSVs
        # =====================================================
        print("\n[Step 3] Confirming and cleaning combined CSVs")
        cell3_confirm_and_clean(
            combined_data_base=combined_data_base,
            tag=tag
        )

        # =====================================================
        # CELL 4 — BUILD all_combined CSV
        # =====================================================
        print("\n[Step 4] Building all_combined CSV")
        cell4_build_all_combined(
            combined_data_base=combined_data_base,
            window_size=WINDOW_SIZE,
            step_distance=STEP_DISTANCE,
            tag=tag
        )

        # =====================================================
        # CELL 5 — BUILD TRAIN / VAL SPLIT
        # =====================================================
        # NOTE(review): no random seed is set in this cell, so the split (and
        # the shuffle in step 6) differs between runs — confirm that is intended.
        print("\n[Step 5] Building train / validation split")
        cell5_build_validation(
            combined_data_base=combined_data_base,
            # window_size=WINDOW_SIZE,
            tag=tag,
            val_ratio=0.20  # 20% validation split
        )

        # =====================================================
        # CELL 6 — SHUFFLE TRAIN DATA (SEGMENT-WISE)
        # =====================================================
        print("\n[Step 6] Shuffling training data (segment-wise)")
        cell6_shuffle_train(
            combined_data_base=combined_data_base,
            tag=tag
        )

        # =====================================================
        # CELL 7 — SANITY CHECKS
        # =====================================================
        # NOTE(review): the result of the sanity checks is not used here;
        # `check_counter` is an int passed by value and the call's return value
        # is discarded, so the cleanup below runs even when checks fail.
        print("\n[Step 7] Running sanity checks")
        check_counter = 0
        cell7_sanity_checks(
            combined_data_base=combined_data_base,
            WINDOW_SIZE=WINDOW_SIZE,
            tag=tag,
            check_counter=check_counter
        )

        print("\n[Cleanup] Removing intermediate CSV files")

        ### Check if correct data exists:
        required_files = []

        # Per-class combined CSVs
        # NOTE(review): assumes exactly 7 classes whose IDs appear in the file
        # names as zero-padded "00".."06" — verify against the step-1 file names.
        for cls in range(7):
            required_files.append(
                combined_data_base / f"combined_class_{cls:02d}_{tag}.csv"
            )

        # Final frozen datasets
        required_files.extend([
            combined_data_base / f"all_combined_val_{tag}.csv",
            combined_data_base / f"all_combined_train_after_shuffle_{tag}.csv",
        ])

        # Collect the names of all required files that are absent.
        missing = []
        for f in required_files:
            if not f.exists():
                missing.append(f.name)

        # Abort the whole run before deleting anything if a file is missing.
        if missing:
            print("\n!!! DATASET VERIFICATION FAILED !!!")
            print("Missing required files:")
            for m in missing:
                print("  -", m)
            raise RuntimeError(
                f"Dataset integrity check failed for {tag}. "
                f"Missing {len(missing)} required CSV(s)."
            )
        else:
            print("\n** Dataset verification successful **")


        ### Delete redundant data
        # The unsplit all_combined CSV and the per-class CSVs are removed; the
        # pre-shuffle training CSV is kept (its entry is commented out).
        files_to_delete = [
            combined_data_base / f"all_combined_{tag}.csv",
            # combined_data_base / f"all_combined_train_before_shuffle_{tag}.csv",
        ]
        for cls in range(7):
            files_to_delete.append(
                combined_data_base / f"combined_class_{cls:02d}_{tag}.csv"
            )

        for f in files_to_delete:
            if f.exists():
                f.unlink()
                print(f"  Deleted: {f.name}")
            else:
                print(f"  Skipped (not found): {f.name}")



        print("\n Dataset generation COMPLETE for:", tag)
        print("\n Time taken:", time.time() - start_time, "seconds")

print("\n******** ALL DATASETS BUILT SUCCESSFULLY ********")
print("\nCell 8 done at", datetime.now())