In [1]:
import os
import re
import requests
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
from extract_mic_to_wav import extract_mic_to_wav_auto
from extract_stages_from_rml import rml_to_epoch_csv

In [2]:
# Text file of download URLs, one per line; build_link_maps scans it for both
# RML and EDF links.
RML_URLS_PATH = "./download_urls_list.txt"
# CSV task index loaded with pandas; the processing loop reads its 'id' and
# 'done' columns and rewrites the file after each successfully processed ID.
INDEX_PATH = "./task_index.csv"
# Root directory under which process_id places the edf/, wav/ and rml/ data.
BASE_DIR = "./data"

In [3]:
def download_file(url, output_path, timeout=60):
    """Download ``url`` to ``output_path`` unless that file already exists.

    The payload is streamed into a sibling ``<output_path>.part`` file and only
    renamed to ``output_path`` once the transfer has completed. A failed or
    interrupted download therefore never leaves a truncated file behind that a
    later run would mistake for a finished one.

    Args:
        url: Address to fetch.
        output_path: Destination file path; missing parent directories are
            created after the server has answered successfully.
        timeout: Value forwarded to ``requests.get(timeout=...)`` so a stalled
            server cannot block forever.

    Returns:
        True if the file is present afterwards (already there or freshly
        downloaded), False if the download failed. Failures are reported on
        stdout and not raised, so callers can carry on with other files.
    """
    if os.path.exists(output_path):
        print(f"‚úÖ Already exists: {output_path}")
        return True

    tmp_path = f"{output_path}.part"
    try:
        # The context manager releases the connection even when streaming fails.
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            parent = os.path.dirname(output_path)
            if parent:  # os.makedirs('') raises for a bare filename
                os.makedirs(parent, exist_ok=True)
            with open(tmp_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
        # Promote the temp file only after the whole body has been written.
        os.replace(tmp_path, output_path)
        print(f"‚úÖ Downloaded: {output_path}")
        return True
    except Exception as e:
        print(f"‚ùå Download failed: {url} - {e}")
        return False
    finally:
        # Also runs on KeyboardInterrupt: never leave a partial file around.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def build_link_maps(urls_path=None):
    """Index the download URL list by recording ID.

    Each stripped line of the URL list is matched against two patterns:

    * RML: ``/V3/APNEA_RML/<id>.rml`` or ``/V3/APNEA_RML_clean/<id>.rml``
    * EDF: ``/V3/APNEA_EDF/<id>/<id>[NNN].edf``, where the file name must
      repeat the folder ID and ``NNN`` is a three-digit chunk number.

    ``<id>`` is eight digits, a dash, then one or more digits.

    Args:
        urls_path: Path of the URL list file. Defaults to the module-level
            ``RML_URLS_PATH`` when omitted.

    Returns:
        ``(edf_map, rml_map)`` where ``edf_map`` is a ``defaultdict(list)``
        mapping id -> list of ``(chunk_number, url, filename)`` tuples in file
        order, and ``rml_map`` maps id -> url. If several RML lines share an
        id, the last one in the file wins.
    """
    if urls_path is None:
        urls_path = RML_URLS_PATH

    # Compile once instead of re-resolving the patterns for every line.
    rml_pattern = re.compile(r'/V3/APNEA_RML(?:_clean)?/(\d{8}-\d+)\.rml')
    edf_pattern = re.compile(r'/V3/APNEA_EDF/(\d{8}-\d+)/(\1\[(\d{3})\]\.edf)')

    rml_map = {}
    edf_map = defaultdict(list)
    with open(urls_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # blank lines cannot match either pattern

            rml_match = rml_pattern.search(line)
            if rml_match:
                rml_map[rml_match.group(1)] = line

            edf_match = edf_pattern.search(line)
            if edf_match:
                pid = edf_match.group(1)
                filename = edf_match.group(2)
                edf_map[pid].append((int(edf_match.group(3)), line, filename))
    return edf_map, rml_map

def process_id(pid, edf_links, rml_url):
    """Fetch and convert all data belonging to one recording ID.

    Steps, in order:
      1. Download every EDF chunk into ``BASE_DIR/edf/<pid>/``.
      2. Run ``extract_mic_to_wav_auto`` on each ``.edf`` file in that folder,
         writing into ``BASE_DIR/wav/<pid>/`` (best effort: a failing chunk is
         reported and skipped).
      3. Download the RML file to ``BASE_DIR/rml/<pid>.rml``.
      4. Call ``rml_to_epoch_csv`` twice, with ``from_machine=False`` and
         ``from_machine=True``.

    Args:
        pid: Recording ID used for folder and file names.
        edf_links: Iterable of ``(chunk_number, url, filename)`` tuples.
        rml_url: URL of the RML file for this ID.

    Raises:
        RuntimeError: If an EDF chunk or the RML file is still missing after
            its download attempt. ``download_file`` only prints failures, so
            without this check an incomplete ID would be treated as finished.
    """
    print(f"\n=== üöÄ Processing ID: {pid} ===")
    edf_dir = os.path.join(BASE_DIR, "edf", pid)
    wav_dir = os.path.join(BASE_DIR, "wav", pid)
    rml_path = os.path.join(BASE_DIR, "rml", f"{pid}.rml")
    os.makedirs(edf_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)

    # Download all EDF chunks, remembering any that did not arrive.
    missing = []
    for _, url, filename in sorted(edf_links):
        edf_path = os.path.join(edf_dir, filename)
        download_file(url, edf_path)
        if not os.path.exists(edf_path):
            missing.append(filename)
    if missing:
        raise RuntimeError(
            f"EDF download incomplete for {pid}; missing: {', '.join(missing)}"
        )

    # Extract mic -> wav; sorted so chunks are handled in a stable order.
    for file in sorted(os.listdir(edf_dir)):
        if file.endswith(".edf"):
            try:
                extract_mic_to_wav_auto(os.path.join(edf_dir, file), wav_dir)
            except Exception as e:
                print(f"‚ùå MicÊèêÂèñÂ§±Ë¥•: {file} - {e}")

    # Download the RML file.
    download_file(rml_url, rml_path)
    if not os.path.exists(rml_path):
        raise RuntimeError(f"RML download failed for {pid}: {rml_url}")

    # Generate the CSVs, once per from_machine setting.
    # NOTE(review): the captured output suggests these become *_user.csv and
    # *_machine.csv; confirm in extract_stages_from_rml.
    rml_to_epoch_csv(rml_path, from_machine=False)
    rml_to_epoch_csv(rml_path, from_machine=True)

In [4]:
# Load the task index and the URL lookup tables, then work through every ID
# that is not yet flagged as done. The index is written back after each
# success so progress survives an interrupted run.
df = pd.read_csv(INDEX_PATH)
edf_map, rml_map = build_link_maps()

for row_idx, task in df.iterrows():
    if not task['done']:
        pid = task['id']
        if pid in edf_map and pid in rml_map:
            try:
                process_id(pid, edf_map[pid], rml_map[pid])
                df.at[row_idx, 'done'] = True
                df.to_csv(INDEX_PATH, index=False)
            except Exception as e:
                # Report the failure and move on to the next ID.
                print(f'‚ùå Â§ÑÁêÜÂ§±Ë¥• {pid}: {e}')
        else:
            # The ID lacks EDF links, an RML link, or both.
            print(f'‚ö†Ô∏è Ë∑≥ËøáÊó†Êïà ID: {pid}')


=== üöÄ Processing ID: 00000995-100507 ===
‚úÖ Downloaded: ./data\edf\00000995-100507\00000995-100507[001].edf
‚úÖ Downloaded: ./data\edf\00000995-100507\00000995-100507[002].edf
‚úÖ Downloaded: ./data\edf\00000995-100507\00000995-100507[003].edf
‚úÖ Downloaded: ./data\edf\00000995-100507\00000995-100507[004].edf
‚úÖ Downloaded: ./data\edf\00000995-100507\00000995-100507[005].edf
Saved to ./data\wav\00000995-100507\00000995-100507[001].wav
Saved to ./data\wav\00000995-100507\00000995-100507[002].wav
Saved to ./data\wav\00000995-100507\00000995-100507[003].wav
Saved to ./data\wav\00000995-100507\00000995-100507[004].wav
Saved to ./data\wav\00000995-100507\00000995-100507[005].wav
‚úÖ Downloaded: ./data\rml\00000995-100507.rml
‚úÖ ÁîüÊàê 597 Êù°ËÆ∞ÂΩï -> .\data\csv\00000995-100507\00000995-100507_user.csvÔºåÊÄßÂà´: Female
‚úÖ ÁîüÊàê 597 Êù°ËÆ∞ÂΩï -> .\data\csv\00000995-100507\00000995-100507_machine.csvÔºåÊÄßÂà´: Female

=== üöÄ Processing ID: 00000999-100507 ===
‚úÖ Downloaded: ./dat

KeyboardInterrupt: 