## Polarity issues fixed for task data

In [2]:
# Standalone processing for sub-032 only
from pathlib import Path
import mne
import pandas as pd
import numpy as np

# --- Paths ---------------------------------------------------------------
# The notebook is expected to run from the scripts folder; the dataset lives
# two levels up under data/ds003838.
scripts_folder = Path.cwd()
root_folder = scripts_folder.parent.parent
dataset_path = root_folder / "data" / "ds003838"
output_folder = scripts_folder / "task_data"
output_folder.mkdir(exist_ok=True)

subject_id = "sub-032"
task = "memory"  # BIDS task label
ecg_folder = dataset_path / subject_id / "ecg"
set_path = ecg_folder / f"{subject_id}_task-{task}_ecg.set"
events_tsv = ecg_folder / f"{subject_id}_task-{task}_events.tsv"

if not set_path.exists():
    raise FileNotFoundError(f"Missing file: {set_path}")

# --- Load the recording and extract the ECG trace --------------------------
raw = mne.io.read_raw_eeglab(set_path, preload=True, verbose=False)
sfreq = raw.info["sfreq"]
ecg_ch = "ECG"

# Verify the channel exists BEFORE retyping it: set_channel_types() itself
# rejects unknown channel names, so checking afterwards would never produce
# the intended error message.
if ecg_ch not in raw.ch_names:
    raise RuntimeError("No ECG channel found in Raw object")
raw.set_channel_types({ecg_ch: "ecg"})
ecg_picks = [raw.ch_names.index(ecg_ch)]

ecg_data, times = raw.get_data(picks=ecg_picks, return_times=True)
ecg = ecg_data[0]  # single picked channel -> 1-D trace

# --- Decide polarity -------------------------------------------------------
# Detrend and z-score, then compare the mean squared amplitude of the
# positive and negative samples; the signal is flipped when the negative
# side carries more energy.
ecg_detrended = mne.filter.detrend(ecg, order=1)
ecg_std = np.std(ecg_detrended)
if not np.isfinite(ecg_std) or ecg_std == 0:
    # A flat or NaN-contaminated trace would otherwise yield NaN/inf z-scores
    # that get written to disk without any error.
    raise ValueError(
        f"{subject_id}: ECG signal is flat or non-finite; cannot z-score"
    )
ecg_z = (ecg_detrended - np.mean(ecg_detrended)) / ecg_std

positive_part = ecg_z[ecg_z > 0]
negative_part = ecg_z[ecg_z < 0]
pos_energy = np.mean(positive_part ** 2) if positive_part.size else 0
neg_energy = np.mean(negative_part ** 2) if negative_part.size else 0
flip = neg_energy > pos_energy

ecg_mod = ecg_z if not flip else -ecg_z  # detrended + z-scored, polarity-fixed
ecg_nk = ecg if not flip else -ecg       # original amplitudes, polarity-fixed

# --- Export ----------------------------------------------------------------
df = pd.DataFrame({
    "time": times,
    "ecg_raw": ecg,
    "ecg_mod": ecg_mod,
    "ecg_nk": ecg_nk,
})

parquet_filename = f"{subject_id}_task-{task}_ecg_task.parquet"
output_path = output_folder / parquet_filename
df.to_parquet(output_path, index=False)

# The events table is optional; export it alongside the ECG when present.
if events_tsv.exists():
    events_df = pd.read_csv(events_tsv, sep="\t")
    events_out = output_folder / f"{subject_id}_task-{task}_events.parquet"
    events_df.to_parquet(events_out, index=False)
    print(f"Events saved: {events_out}")
else:
    print("Events file not found; skipping events export")

print(f"Done {subject_id}: {df.shape[0]} samples @ {sfreq} Hz")
print(f"ECG saved to: {output_path}")

  warn(
  raw = mne.io.read_raw_eeglab(set_path, preload=True, verbose=False)


Events saved: /home/martin/RESEARCH/thesis/brain_heart_psv_sdg/code/digitspan/task_data/sub-032_task-memory_events.parquet
Done sub-032: 7395320 samples @ 1000.0 Hz
ECG saved to: /home/martin/RESEARCH/thesis/brain_heart_psv_sdg/code/digitspan/task_data/sub-032_task-memory_ecg_task.parquet


In [None]:
import mne
import pandas as pd
import numpy as np
from pathlib import Path

# --- 1. Setup Paths ---
# The notebook is expected to run from the scripts folder.
scripts_folder = Path.cwd()

# Go up two levels to find the root, then down to data
root_folder = scripts_folder.parent.parent
dataset_path = root_folder / 'data' / 'ds003838'

# Create output folder (next to the scripts; no-op if it already exists)
output_folder = scripts_folder / 'task_data'
output_folder.mkdir(exist_ok=True)

# Status icons below were stored as mojibake (UTF-8 emoji decoded with the
# wrong codec); the intended characters are restored here.
print(f'📂 Reading data from: {dataset_path}')
print(f'💾 Saving output to:  {output_folder}')
print()

# Exclude subjects with missing physiological data
excluded_subjects = {
    'sub-013', 'sub-014', 'sub-015', 'sub-016', 'sub-017', 'sub-018',
    'sub-019', 'sub-020', 'sub-021', 'sub-022', 'sub-023', 'sub-024',
    'sub-025', 'sub-026', 'sub-027', 'sub-028', 'sub-029', 'sub-030',
    'sub-031', 'sub-037', 'sub-066', 'sub-094',
}

task = 'memory'  # BIDS task label used in file names

# --- 2. Find All Subjects ---
# Fail fast when the dataset folder is missing (e.g. the notebook was started
# from a different working directory); otherwise the run would quietly report
# zero subjects and "complete" without doing anything.
if not dataset_path.is_dir():
    raise FileNotFoundError(f'Dataset folder not found: {dataset_path}')

all_subject_folders = sorted(p for p in dataset_path.glob('sub-*') if p.is_dir())
subject_folders = [p for p in all_subject_folders if p.name not in excluded_subjects]

# Report how many folders were actually filtered out, so the three counts add
# up even when some excluded IDs are not present in this copy of the dataset.
n_excluded = len(all_subject_folders) - len(subject_folders)
print(f'🔍 Found {len(all_subject_folders)} subjects total')
print(f'❌ Excluding {n_excluded} subjects with missing data')
print(f'✅ Processing {len(subject_folders)} subjects with complete data')
print()

# --- 3. Process Each Subject ---
# Outcome counters, reported once the loop has finished.
successful = 0
failed = 0
skipped = 0

for subject_path in subject_folders:
    subject_id = subject_path.name
    ecg_folder = subject_path / 'ecg'

    # Check if ECG folder exists
    if not ecg_folder.exists():
        print(f'⏭️  {subject_id}: No ECG folder, skipping')
        skipped += 1
        continue

    # Build file paths
    set_filename = f"{subject_id}_task-{task}_ecg.set"
    set_path = ecg_folder / set_filename
    events_tsv = ecg_folder / f"{subject_id}_task-{task}_events.tsv"

    if not set_path.exists():
        print(f'⏭️  {subject_id}: No .set file, skipping')
        skipped += 1
        continue

    print(f'⏳ {subject_id}: Loading...')
    try:
        # Load the binary file with MNE
        raw = mne.io.read_raw_eeglab(set_path, preload=True, verbose=False)

        # Sampling frequency (should be 1000 Hz for this dataset)
        sfreq = raw.info['sfreq']

        # Verify the ECG channel exists BEFORE retyping it: set_channel_types()
        # itself rejects unknown names, so checking afterwards would never
        # produce the intended error message.
        ecg_ch = 'ECG'
        if ecg_ch not in raw.ch_names:
            raise RuntimeError('No ECG channel found in Raw object')
        raw.set_channel_types({ecg_ch: 'ecg'})
        ecg_picks = [raw.ch_names.index(ecg_ch)]

        # Get data in Volts (MNE's internal unit)
        ecg_data, times = raw.get_data(picks=ecg_picks, return_times=True)
        ecg = ecg_data[0]  # shape: (n_samples,)

        # --- Decide polarity using detrend + z-score ---
        ecg_detrended = mne.filter.detrend(ecg, order=1)
        ecg_std = np.std(ecg_detrended)
        if not np.isfinite(ecg_std) or ecg_std == 0:
            # A flat or NaN-contaminated trace would otherwise yield NaN/inf
            # z-scores that get written to disk and counted as a success.
            raise ValueError('ECG signal is flat or non-finite; cannot z-score')
        ecg_z = (ecg_detrended - np.mean(ecg_detrended)) / ecg_std

        # Flip when the negative samples carry more mean squared amplitude
        # than the positive ones.
        positive_part = ecg_z[ecg_z > 0]
        negative_part = ecg_z[ecg_z < 0]
        pos_energy = np.mean(positive_part ** 2) if positive_part.size else 0
        neg_energy = np.mean(negative_part ** 2) if negative_part.size else 0
        flip = neg_energy > pos_energy

        # Polarity-corrected versions
        ecg_mod = ecg_z if not flip else -ecg_z  # detrended + z-scored
        ecg_nk = ecg if not flip else -ecg       # raw volts, flipped if needed

        # Build DataFrame
        df = pd.DataFrame({
            'time': times,       # seconds
            'ecg_raw': ecg,      # original volts
            'ecg_mod': ecg_mod,  # detrended, z-scored, polarity-fixed
            'ecg_nk': ecg_nk,    # raw volts, polarity-fixed
        })

        # Save as Parquet
        parquet_filename = f"{subject_id}_task-{task}_ecg_task.parquet"
        output_path = output_folder / parquet_filename
        df.to_parquet(output_path, index=False)

        # Save events if available
        if events_tsv.exists():
            # Explicit escape rather than a literal tab character, which is
            # invisible and easily lost to editors or whitespace cleanup.
            events_df = pd.read_csv(events_tsv, sep='\t')
            events_out = output_folder / f"{subject_id}_task-{task}_events.parquet"
            events_df.to_parquet(events_out, index=False)
            print(f'   Events saved: {events_out.name} ({len(events_df)} rows)')
        else:
            print('   Events file not found; skipping events export')

        print(f'✅ {subject_id}: Success! ({df.shape[0]} samples)')
        print(f'   Sampling rate: {sfreq} Hz (use this in NeuroKit2)')
        successful += 1

    except Exception as e:
        # Deliberately broad: one bad recording must not abort the whole batch.
        # The failure is reported and counted, then the loop moves on.
        print(f'❌ {subject_id}: Error - {e}')
        failed += 1

# --- 4. Summary ---
# Final tally of the batch run. The status icons were stored as mojibake
# (UTF-8 emoji decoded with the wrong codec); the intended characters are
# restored here.
print()
print('=' * 60)
print('🎯 Processing Complete!')
print(f'   ✅ Successful: {successful}')
print(f'   ❌ Failed:     {failed}')
print(f'   ⏭️  Skipped:    {skipped}')
print(f'   📊 Total:      {len(subject_folders)}')
print('=' * 60)
