In [1]:
import glob
import itertools
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# filepath = "/Users/trevoryu/Code/data/FOG_data/002/task_1.txt"

In [3]:
# Order of the five EMG/EOG/ECG channels in each subject's recordings.
# Keys are subject directory fragments; the loading loop looks them up by
# checking which key occurs in the file path.
# Most subjects use one of two layouts that differ only in which tibialis
# anterior channel comes first.
right_ta_first = ["emg_right_ta", "emg_left_ta", "eog", "ecg", "emg_right_gs"]
left_ta_first = ["emg_left_ta", "emg_right_ta", "eog", "ecg", "emg_right_gs"]

patient_emg_columns = {
    **{subject: list(right_ta_first) for subject in ("001", "002")},
    **{subject: list(left_ta_first) for subject in ("003", "004", "005")},
    **{subject: list(right_ta_first) for subject in ("006", "007", "008/OFF_1")},
    # These two recordings deviate from both common layouts.
    "008/OFF_2": ["emg_right_ta", "emg_right_gs", "eog", "ecg", "emg_left_ta"],
    "009": ["emg_left_ta", "emg_right_ta", "eog", "emg_right_gs", "ecg"],
    **{subject: list(left_ta_first) for subject in ("010", "011", "012")},
}

In [4]:
# Names for the IMU columns: every sensor location crossed with its seven
# channels, location-major (all left-shank channels first, arm channels last).
imu_locations = ("left_shank", "right_shank", "waist", "arm")
imu_channels = ("accel_x", "accel_y", "accel_z", "gyro_x", "gyro_y", "gyro_z", "NC")
imu_options = [
    f"{location}_{channel}"
    for location in imu_locations
    for channel in imu_channels
]
# Only the arm unit's seventh channel carries data: skin conductance (SC).
imu_options[-1] = "arm_skin_conductance"

# Names for the 25 EEG columns, in file order.
# NOTE(review): "C4", "C5" may have been meant as "C3", "C4" - confirm against
# the dataset readme. The EEG columns are not part of the selected output
# columns, so the labels do not affect the saved files.
eeg_columns = (
    "FP1 FP2 F3 F4 "
    "C4 C5 P3 P4 "
    "O1 O2 F7 F8 P7 P8 FZ "
    "CZ PZ FC1 FC2 CP1 CP2 FC5 FC6 CP5 CP6"
).split()

In [5]:
# Columns kept from each recording, in the order they are written out:
# timestamp, labels, the EMG/ECG/EOG channels, six channels for each of the
# four IMUs (accel x/y/z then gyro x/y/z), and finally arm skin conductance.
biosignal_columns = ["emg_right_ta", "emg_left_ta", "emg_right_gs", "ecg", "eog"]
imu_signal_columns = [
    f"{location}_{sensor}_{axis}"
    for location in ("left_shank", "right_shank", "waist", "arm")
    for sensor in ("accel", "gyro")
    for axis in "xyz"
]
cols = [
    "timestamp",
    "labels",
    *biosignal_columns,
    *imu_signal_columns,
    "arm_skin_conductance",
]

In [6]:
# Read every recording, keep the selected columns, and collect per-column
# statistics. Results end up in `res` (filepath -> (dataframe, stats)) and
# `missing_imus` (files where at least one IMU recorded nothing).
DATA_DIR = "/Users/trevoryu/Code/data/FOG_data"
EXPECTED_IMUS = 4  # left shank, right shank, waist, arm


def match_emg_columns(filepath, columns_by_subject):
    """Return the EMG channel order whose subject key occurs in ``filepath``.

    Args:
        filepath: Path of the recording being loaded.
        columns_by_subject: Mapping of subject key (e.g. "001", "008/OFF_2")
            to that subject's list of EMG column names.

    Returns:
        The column-name list of the single matching subject.

    Raises:
        ValueError: If no key, or more than one key, occurs in the path.
            Silently picking a layout would mislabel the channels.
    """
    matches = [names for key, names in columns_by_subject.items() if key in filepath]
    if len(matches) != 1:
        raise ValueError(
            f"Expected exactly one subject key to match {filepath!r}, found {len(matches)}"
        )
    return matches[0]


def elapsed_milliseconds(timestamps):
    """Convert a Series of timestamps to float milliseconds since its first entry."""
    parsed = pd.to_datetime(timestamps)
    # Dividing by a 1 ms Timedelta is the vectorized form of `.value / 1e6`.
    return (parsed - parsed.iloc[0]) / pd.Timedelta(milliseconds=1)


def column_stats(df, columns):
    """Summarise each column as mean/std/min/max/N, leaving out all-zero columns.

    Args:
        df: Dataframe holding the columns to summarise.
        columns: Column names to visit, in order.

    Returns:
        Dict of column name -> {"mean", "std", "min", "max", "N"}; columns
        whose mean, std, min and max are all zero (no signal) are omitted.
    """
    n_rows = len(df)
    summary = {}
    for col in columns:
        series = df[col]
        mean, std, lb, ub = series.mean(), series.std(), series.min(), series.max()
        # Skip columns with no signal
        if mean == std == ub == lb == 0:
            continue
        summary[col] = {"mean": mean, "std": std, "min": lb, "max": ub, "N": n_rows}
    return summary


data_files = sorted(
    glob.glob(f"{DATA_DIR}/*/*.txt")
    + glob.glob(f"{DATA_DIR}/*/*/*.txt")  # subject 008 has extra directory
)

missing_imus = []
res = {}

for filepath in data_files:
    raw = pd.read_csv(filepath, header=None)
    # Match the relevant EMG columns to the patient name in the filepath
    emg_columns = match_emg_columns(filepath, patient_emg_columns)
    header = ["index", "timestamp"] + eeg_columns + emg_columns + imu_options + ["labels"]
    raw.columns = header
    # Explicit copy: the selection is modified below, and assigning into a
    # bare column subset triggers pandas' SettingWithCopyWarning.
    df = raw[cols].copy()

    # Convert timestamps to offsets in milliseconds
    df["timestamp"] = elapsed_milliseconds(df["timestamp"])

    stats = column_stats(df, cols)

    # Some patients might not have all the IMU data points: count the IMUs
    # whose accel_x column carried a signal.
    num_imus = sum("accel_x" in name for name in stats)
    if num_imus < EXPECTED_IMUS:
        missing_imus.append(filepath)

    res[filepath] = (df, stats)

In [7]:
# Recordings in which at least one of the four IMUs had an all-zero accel_x
# column (fewer than four accel_x entries made it into the stats).
# Subjects 1, 2, 3, 4, 5, 6, 7, 11, 12 are missing some IMU data
missing_imus

['/Users/trevoryu/Code/data/FOG_data/001/task_1.txt',
 '/Users/trevoryu/Code/data/FOG_data/001/task_2.txt',
 '/Users/trevoryu/Code/data/FOG_data/001/task_3.txt',
 '/Users/trevoryu/Code/data/FOG_data/001/task_4.txt',
 '/Users/trevoryu/Code/data/FOG_data/002/task_1.txt',
 '/Users/trevoryu/Code/data/FOG_data/002/task_2.txt',
 '/Users/trevoryu/Code/data/FOG_data/002/task_3.txt',
 '/Users/trevoryu/Code/data/FOG_data/002/task_4.txt',
 '/Users/trevoryu/Code/data/FOG_data/003/task_1.txt',
 '/Users/trevoryu/Code/data/FOG_data/003/task_2.txt',
 '/Users/trevoryu/Code/data/FOG_data/003/task_3.txt',
 '/Users/trevoryu/Code/data/FOG_data/003/task_4.txt',
 '/Users/trevoryu/Code/data/FOG_data/004/task_1.txt',
 '/Users/trevoryu/Code/data/FOG_data/004/task_2.txt',
 '/Users/trevoryu/Code/data/FOG_data/004/task_3.txt',
 '/Users/trevoryu/Code/data/FOG_data/004/task_4.txt',
 '/Users/trevoryu/Code/data/FOG_data/004/task_5.txt',
 '/Users/trevoryu/Code/data/FOG_data/005/task_1.txt',
 '/Users/trevoryu/Code/data/

In [8]:
# Task 1 recording of each subject that is missing some IMU data.
subjects_missing_imus = (1, 2, 3, 4, 5, 6, 7, 11, 12)
test_files = [
    f"/Users/trevoryu/Code/data/FOG_data/{subject:03d}/task_1.txt"
    for subject in subjects_missing_imus
]

In [9]:
# According to the readme, recordings with only 2 IMUs should have them on the
# left tibia and the wrist. In practice the subjects show other combinations:
# (arm + L-shank), (arm + R-shank), (arm + R-shank + L-shank), and more.
# Show the per-column stats of the first test file as an example.
example_path = test_files[0]
example_stats = res[example_path][1]
example_stats

{'timestamp': {'mean': 180500.0,
  'std': 104212.58961373141,
  'min': 0.0,
  'max': 361000.0,
  'N': 180501},
 'labels': {'mean': 0.3628788760173074,
  'std': 0.48083165269348327,
  'min': 0,
  'max': 1,
  'N': 180501},
 'emg_right_ta': {'mean': -0.22234779862715442,
  'std': 2969.014416102385,
  'min': -5869.0,
  'max': 6117.0,
  'N': 180501},
 'emg_left_ta': {'mean': 0.27152204142913333,
  'std': 934.6166008136009,
  'min': -4060.5,
  'max': 5477.5,
  'N': 180501},
 'emg_right_gs': {'mean': -1.6449437953252337,
  'std': 3099.037376267388,
  'min': -5613.5,
  'max': 5744.5,
  'N': 180501},
 'ecg': {'mean': -1.1736749380889857,
  'std': 2571.6215763091022,
  'min': -5126.0,
  'max': 6339.5,
  'N': 180501},
 'eog': {'mean': 0.17161123761087196,
  'std': 44.845060451814426,
  'min': -391.5,
  'max': 302.5,
  'N': 180501},
 'left_shank_accel_x': {'mean': 8377.38501041035,
  'std': 4407.496655823841,
  'min': -36396.87926295932,
  'max': 38340.70295044864,
  'N': 180501},
 'left_shank_acc

In [10]:
# Number of rows summed over every loaded recording.
total_data_points = sum(len(frame) for frame, _stats in res.values())

In [11]:
# Total number of rows (samples) across all loaded recordings
total_data_points

6211056

In [12]:
# Recordings of subject 008, whose files sit one directory deeper (OFF_1 / OFF_2).
subject_8_files = [path for path in res if "008" in path]
subject_8_files

['/Users/trevoryu/Code/data/FOG_data/008/OFF_1/task_1.txt',
 '/Users/trevoryu/Code/data/FOG_data/008/OFF_1/task_2.txt',
 '/Users/trevoryu/Code/data/FOG_data/008/OFF_1/task_3.txt',
 '/Users/trevoryu/Code/data/FOG_data/008/OFF_1/task_4.txt',
 '/Users/trevoryu/Code/data/FOG_data/008/OFF_1/task_5.txt',
 '/Users/trevoryu/Code/data/FOG_data/008/OFF_2/task_1.txt',
 '/Users/trevoryu/Code/data/FOG_data/008/OFF_2/task_2.txt',
 '/Users/trevoryu/Code/data/FOG_data/008/OFF_2/task_3.txt',
 '/Users/trevoryu/Code/data/FOG_data/008/OFF_2/task_4.txt']

In [13]:
# Save the filtered, mostly raw dataframes as csv files, one per recording,
# and collect their stats in `raw_stats` keyed by the new file name.
save_path = "/Users/trevoryu/Code/syde_599/data/raw_fog_data/"
os.makedirs(save_path, exist_ok=True)  # to_csv does not create missing directories
OFF_2_TASK_OFFSET = 5  # OFF_1 holds tasks 1-5, so OFF_2 tasks continue from 6

raw_stats = {}
for filepath, (df, stats) in res.items():
    directory, _, task_file = filepath.rpartition("/")
    parent, _, session = directory.rpartition("/")
    if session in ("OFF_1", "OFF_2"):
        # Consolidate all the subject 8 data: drop the session level so both
        # sessions land under the same subject prefix.
        if session == "OFF_2":
            # Shift the OFF_2 task numbers past the OFF_1 ones. The number is
            # parsed from the file name itself, so multi-digit tasks work and
            # nothing else in the path is rewritten.
            stem, dot, extension = task_file.rpartition(".")
            prefix, _, number = stem.rpartition("_")
            task_file = f"{prefix}_{int(number) + OFF_2_TASK_OFFSET}{dot}{extension}"
        subject = parent.rpartition("/")[2]
    else:
        subject = session
    # Make new filename, e.g. "001_task_1.csv"
    new_filename = f"{subject}_{task_file}".replace(".txt", ".csv")

    # Store the raw stats to save for later
    raw_stats[new_filename] = stats

    df.to_csv(os.path.join(save_path, new_filename), index=False)

In [14]:
# Persist the per-file raw statistics next to the raw csv files.
# `save_path` must still point at the raw output directory here; the
# normalization cell further down rebinds it.
with open(save_path + "raw_stats.pkl", "wb") as f:
    pickle.dump(raw_stats, f)

In [15]:
# Normalize by subtracting the mean and dividing by the standard deviation
# All-zero columns stay the same, they weren't recorded in stats anyways
# Save as csv files
save_path = "/Users/trevoryu/Code/syde_599/data/norm_fog_data/"
os.makedirs(save_path, exist_ok=True)  # to_csv does not create missing directories
OFF_2_TASK_OFFSET = 5  # OFF_1 holds tasks 1-5, so OFF_2 tasks continue from 6
UNSCALED_COLUMNS = ("timestamp", "labels")  # Don't normalize timestamp or labels

for filepath, (df, stats) in res.items():
    # Output naming follows the same scheme as the raw export.
    directory, _, task_file = filepath.rpartition("/")
    parent, _, session = directory.rpartition("/")
    if session in ("OFF_1", "OFF_2"):
        # Consolidate all the subject 8 data: drop the session level so both
        # sessions land under the same subject prefix.
        if session == "OFF_2":
            # Shift the OFF_2 task numbers past the OFF_1 ones, parsing the
            # number from the file name so multi-digit tasks work.
            stem, dot, extension = task_file.rpartition(".")
            prefix, _, number = stem.rpartition("_")
            task_file = f"{prefix}_{int(number) + OFF_2_TASK_OFFSET}{dot}{extension}"
        subject = parent.rpartition("/")[2]
    else:
        subject = session
    # Make new filename, e.g. "001_task_1.csv"
    new_filename = f"{subject}_{task_file}".replace(".txt", ".csv")

    # Normalize a copy so the frames held in `res` stay raw. Mutating them in
    # place would make a re-run of this cell normalize twice, and a re-run of
    # the raw export write normalized data.
    normalized = df.copy()
    for col, col_stats in stats.items():
        if col in UNSCALED_COLUMNS:
            continue
        std = col_stats["std"]
        # A constant column has std 0 (or NaN for a single row); dividing by
        # it would fill the column with NaN/inf, so only centre it instead.
        scale = std if std > 0 else 1.0
        normalized[col] = (df[col] - col_stats["mean"]) / scale

    normalized.to_csv(os.path.join(save_path, new_filename), index=False)