In [2]:
pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: wget
[33m  DEPRECATION: Building 'wget' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'wget'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
  Building wheel for wget (setup.py) ... [?25ldone
[?25h  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9657 sha256=f0c6e8006971c9d1aa694e4fcac0ebee2508929676f8459708ab3d2c9b9e8c0b
  Stored in directory: /Users/onogantsog/Library/Caches/pip/wheels/8a/b8/04/0c88fb22489b0c049bee4e977c5689c7fe597d6c4b0e7d0b6a
Successfully built wget
Installing collected packages: wg

In [None]:
import os
from pathlib import Path
from urllib.error import HTTPError

import matplotlib.pyplot as plt
import mne
import numpy as np
import pandas as pd
import seaborn as sns
import wget
from mne.datasets.sleep_physionet.age import fetch_data

In [None]:
from preprocessing import preprocess_record, preprocess_all
from features import extract_features_all_epochs

In [3]:

BASE_URL = "https://physionet.org/files/sleep-edfx/1.0.0/sleep-cassette/"
SAVE_DIR = "sleep_edf_raw"
os.makedirs(SAVE_DIR, exist_ok=True)

# Candidate recording IDs SC4001–SC4193. The IDs are not continuous, so many
# candidates do not exist on the server; an HTTP 404 is therefore expected
# and is reported as "Not found" rather than treated as a failure.
# NOTE(review): PhysioNet appears to name files SC4ssN<letter><code> (ss = subject,
# N = night) and to use hypnogram scorer codes other than C/J — confirm against
# the dataset's RECORDS listing before relying on this candidate list being complete.
subjects = [f"SC4{i:03d}" for i in range(1, 194)]  # SC4001–SC4193
FILE_SUFFIXES = ["E0-PSG.edf", "E1-PSG.edf", "EC-Hypnogram.edf", "EJ-Hypnogram.edf"]

for subj in subjects:
    for suffix in FILE_SUFFIXES:
        filename = f"{subj}{suffix}"
        file_url = BASE_URL + filename
        save_path = os.path.join(SAVE_DIR, filename)

        # Keep the cell idempotent: wget does not overwrite an existing file,
        # it saves a renamed copy ("name (1).edf"), so re-running would
        # re-download everything and litter SAVE_DIR with duplicates.
        if os.path.exists(save_path):
            print("Already downloaded:", filename)
            continue

        try:
            print("Downloading:", filename)
            wget.download(file_url, save_path)
            print()
        except HTTPError as err:
            if err.code == 404:
                # Candidate ID / suffix combination that is not on the server.
                print("Not found:", filename)
            else:
                print(f"Failed (HTTP {err.code} {err.reason}):", filename)
        except OSError as err:
            # Network-level problems (DNS, timeout, connection reset, disk).
            # Report and continue so one bad file does not abort the batch;
            # KeyboardInterrupt and programming errors still propagate.
            print(f"Failed ({err}):", filename)

Downloading: SC4001E0-PSG.edf

Downloading: SC4001E1-PSG.edf
Not found: SC4001E1-PSG.edf
Downloading: SC4001EC-Hypnogram.edf

Downloading: SC4001EJ-Hypnogram.edf
Not found: SC4001EJ-Hypnogram.edf
Downloading: SC4002E0-PSG.edf

Downloading: SC4002E1-PSG.edf
Not found: SC4002E1-PSG.edf
Downloading: SC4002EC-Hypnogram.edf

Downloading: SC4002EJ-Hypnogram.edf
Not found: SC4002EJ-Hypnogram.edf
Downloading: SC4003E0-PSG.edf
Not found: SC4003E0-PSG.edf
Downloading: SC4003E1-PSG.edf
Not found: SC4003E1-PSG.edf
Downloading: SC4003EC-Hypnogram.edf
Not found: SC4003EC-Hypnogram.edf
Downloading: SC4003EJ-Hypnogram.edf
Not found: SC4003EJ-Hypnogram.edf
Downloading: SC4004E0-PSG.edf
Not found: SC4004E0-PSG.edf
Downloading: SC4004E1-PSG.edf
Not found: SC4004E1-PSG.edf
Downloading: SC4004EC-Hypnogram.edf
Not found: SC4004EC-Hypnogram.edf
Downloading: SC4004EJ-Hypnogram.edf
Not found: SC4004EJ-Hypnogram.edf
Downloading: SC4005E0-PSG.edf
Not found: SC4005E0-PSG.edf
Downloading: SC4005E1-PSG.edf
Not foun

In [None]:
RAW_DIR = Path("sleep_edf_raw")

def get_record_pairs(raw_dir):
    """Return a list of (psg_file, hyp_file) path pairs found in ``raw_dir``.

    Files are matched on the part of the file name before the first ``-``.
    A PSG file is any ``*-PSG.edf``; a hypnogram is any ``*-Hypnogram.edf``.
    The hypnogram prefix is mapped onto the PSG prefix by replacing its last
    character with ``"0"`` (e.g. ``SC4001EC`` -> ``SC4001E0``), so every
    scorer letter is paired, not only ``C`` and ``J``.
    NOTE(review): this assumes the PSG prefix always ends in ``0`` and the
    hypnogram prefix differs only in that last character — confirm against
    the dataset's file listing.

    Parameters
    ----------
    raw_dir : str or pathlib.Path
        Directory containing the downloaded ``.edf`` files.

    Returns
    -------
    list[tuple[pathlib.Path, pathlib.Path]]
        Pairs ordered by PSG record key. PSG files without a hypnogram are
        reported on stdout and left out.
    """
    psg_files = {}
    hyp_files = {}

    # Sorted so the pairing (and everything derived from it) does not depend
    # on filesystem enumeration order.
    for f in sorted(Path(raw_dir).glob("*.edf")):
        name = f.name
        key = name.split("-")[0]   # e.g., "SC4001E0" or "SC4001EC"

        # Match on the exact suffix so renamed duplicates such as
        # "SC4001E0-PSG (1).edf" cannot overwrite the real recording.
        if name.endswith("-PSG.edf"):
            psg_files[key] = f
        elif name.endswith("-Hypnogram.edf"):
            # Normalize the scorer code to the PSG's trailing "0".
            hyp_files[key[:-1] + "0"] = f

    # Pair them
    pairs = []
    for key, psg_file in psg_files.items():
        hyp_file = hyp_files.get(key)
        if hyp_file is None:
            print(f"⚠️ Missing hypnogram for {key}")
            continue
        pairs.append((psg_file, hyp_file))

    return pairs

pairs = get_record_pairs(RAW_DIR)
print("Found pairs:", len(pairs))


In [None]:
# Run the preprocessing step on every (PSG, hypnogram) pair and collect the
# per-record outputs in two parallel lists.
X_all = []
y_all = []

for psg, hyp in pairs:
    print(f"Processing {psg.name}")
    # preprocess_record is called with plain string paths.
    X, y = preprocess_record(str(psg), str(hyp))
    X_all += [X]
    y_all += [y]

In [None]:
# Build one feature table per record: the extracted features plus a 'stage'
# column holding that record's labels.
all_features = []
for X, y in zip(X_all, y_all):
    feature_df = pd.DataFrame(extract_features_all_epochs(X, fs=100))
    feature_df['stage'] = y
    all_features.append(feature_df)

In [None]:
# Stack the per-record tables into a single frame with a fresh 0..n-1 index.
df_features = pd.concat(
    all_features,
    ignore_index=True,
)
print(df_features.shape)
df_features.head()

In [None]:
# Persist the combined feature table; the row index is not written.
df_features.to_csv(
    "features.csv",
    index=False,
)