In [1]:
import os
import re
import requests
import time
import random
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
from extract_mic_to_wav import extract_mic_to_wav_auto
from extract_stages_from_rml import rml_to_epoch_csv
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

In [2]:
# Text file read line by line by build_link_maps(); both RML and EDF URLs are
# extracted from it.
RML_URLS_PATH = "./download_urls_list.txt"
# CSV task index; the driver cell reads its 'id' and 'done' columns and writes
# the file back after each successfully processed ID.
INDEX_PATH = "./task_index.csv"
# Root output directory; process_id() uses the edf/, wav/ and rml/ sub-folders.
BASE_DIR = "./data"

In [3]:
def download_file(url, output_path):
    """Download ``url`` to ``output_path`` unless that file already exists.

    The response body is streamed into a temporary ``<output_path>.part`` file
    which is renamed onto ``output_path`` only after the whole transfer has
    succeeded. An interrupted download therefore never leaves a truncated
    file behind that a later run would mistake for a finished one.

    Errors are reported with ``print`` and are not re-raised (best-effort).

    Args:
        url: Address to fetch.
        output_path: Destination file path; missing parent directories are
            created.

    Returns:
        True if ``output_path`` is present when the call finishes (it already
        existed or was downloaded successfully), False if the download failed.
    """
    if os.path.exists(output_path):
        print(f"✅ Already exists: {output_path}")
        return True

    partial_path = f"{output_path}.part"
    try:
        # Send a browser-like User-Agent and wait a random interval before
        # each request to reduce the chance of being blocked by the server.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0"
        }
        time.sleep(random.uniform(1.5, 4.0))

        # timeout=(connect, read): without it a stalled server blocks forever.
        # The context manager releases the streamed connection when done.
        with requests.get(url, stream=True, headers=headers, timeout=(10, 60)) as r:
            r.raise_for_status()
            # dirname is '' for a bare filename; fall back to the current dir.
            os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

        # Publish the file only once it is complete.
        os.replace(partial_path, output_path)
        print(f"✅ Downloaded: {output_path}")
        return True
    except Exception as e:
        print(f"❌ Download failed: {url} - {e}")
        # Discard whatever was written so far.
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass
        return False

def build_link_maps():
    """Scan the URL list at ``RML_URLS_PATH`` and index its links by ID.

    Every line is stripped and tested against two patterns:

    * an RML link ``/V3/APNEA_RML[_clean]/<id>.rml`` -- stored as
      ``rml[id] = url`` (a later line with the same id replaces an earlier one);
    * an EDF link ``/V3/APNEA_EDF/<id>/<id>[NNN].edf`` -- appended to
      ``edf[id]`` as ``(int(NNN), url, filename)``.

    Returns:
        ``(edf_map, rml_map)`` where ``edf_map`` is a ``defaultdict(list)`` and
        ``rml_map`` is a plain ``dict``.
    """
    rml_pattern = re.compile(r'/V3/APNEA_RML(?:_clean)?/(\d{8}-\d+)\.rml')
    edf_pattern = re.compile(r'/V3/APNEA_EDF/(\d{8}-\d+)/(\1\[(\d{3})\]\.edf)')

    rml_by_id = {}
    edf_by_id = defaultdict(list)

    with open(RML_URLS_PATH, 'r') as url_file:
        for raw_line in url_file:
            url = raw_line.strip()

            found_rml = rml_pattern.search(url)
            if found_rml is not None:
                rml_by_id[found_rml.group(1)] = url

            found_edf = edf_pattern.search(url)
            if found_edf is not None:
                # Groups: (id, full file name, three-digit chunk number).
                record_id, edf_name, chunk_no = found_edf.groups()
                edf_by_id[record_id].append((int(chunk_no), url, edf_name))

    return edf_by_id, rml_by_id

def process_id(pid, edf_links, rml_url):
    """Download, convert and label all data belonging to one ID.

    Steps, in order:
      1. Download every EDF chunk in ``edf_links`` to ``<BASE_DIR>/edf/<pid>/``.
      2. Call ``extract_mic_to_wav_auto`` on each ``.edf`` file found in that
         directory, with ``<BASE_DIR>/wav/<pid>/`` as the output directory.
         A failure on one file is printed and the remaining files continue.
      3. Download the RML file to ``<BASE_DIR>/rml/<pid>.rml``.
      4. Call ``rml_to_epoch_csv`` on it twice (``from_machine=False``, then
         ``from_machine=True``).

    Args:
        pid: ID string used for directory and file names.
        edf_links: Iterable of ``(chunk_number, url, filename)`` tuples.
        rml_url: URL of the RML file.

    Raises:
        RuntimeError: If an EDF chunk or the RML file is not on disk after its
            download attempt. ``download_file`` only prints its errors, so
            without this check the caller would record the ID as done even
            though data is missing.
    """
    print(f"\n=== 🚀 Processing ID: {pid} ===")
    edf_dir = os.path.join(BASE_DIR, "edf", pid)
    wav_dir = os.path.join(BASE_DIR, "wav", pid)
    rml_path = os.path.join(BASE_DIR, "rml", f"{pid}.rml")
    os.makedirs(edf_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)

    # Download all EDF chunks (tuple sort puts them in chunk-number order).
    missing = []
    for _, url, filename in sorted(edf_links):
        edf_path = os.path.join(edf_dir, filename)
        download_file(url, edf_path)
        if not os.path.exists(edf_path):
            missing.append(filename)
    if missing:
        raise RuntimeError(f"EDF download incomplete for {pid}: {', '.join(missing)}")

    # Extract the microphone channel of each chunk to WAV. Sorting makes the
    # processing order deterministic (os.listdir order is arbitrary).
    for file in sorted(os.listdir(edf_dir)):
        if file.endswith(".edf"):
            try:
                extract_mic_to_wav_auto(os.path.join(edf_dir, file), wav_dir)
            except Exception as e:
                print(f"❌ Mic提取失败: {file} - {e}")

    # Download the RML file.
    download_file(rml_url, rml_path)
    if not os.path.exists(rml_path):
        raise RuntimeError(f"RML download failed for {pid}: {rml_url}")

    # Generate the CSV files.
    rml_to_epoch_csv(rml_path, from_machine=False)
    rml_to_epoch_csv(rml_path, from_machine=True)

In [5]:
# Driver: walk the task index and process every ID not yet marked as done.
df = pd.read_csv(INDEX_PATH)
edf_map, rml_map = build_link_maps()

# For now, only handle the first 10 rows of the index.
for row_idx, task in df.head(10).iterrows():
    if task['done']:
        continue

    pid = task['id']
    has_all_links = pid in edf_map and pid in rml_map
    if not has_all_links:
        print(f'⚠️ 跳过无效 ID: {pid}')
        continue

    try:
        process_id(pid, edf_map[pid], rml_map[pid])
        # Persist progress right away so an interrupted run can resume.
        df.at[row_idx, 'done'] = True
        df.to_csv(INDEX_PATH, index=False)
    except Exception as err:
        print(f'❌ 处理失败 {pid}: {err}')


=== 🚀 Processing ID: 00001000-100507 ===
✅ Downloaded: ./data\edf\00001000-100507\00001000-100507[001].edf
✅ Downloaded: ./data\edf\00001000-100507\00001000-100507[002].edf
✅ Downloaded: ./data\edf\00001000-100507\00001000-100507[003].edf
✅ Downloaded: ./data\edf\00001000-100507\00001000-100507[004].edf
✅ Downloaded: ./data\edf\00001000-100507\00001000-100507[005].edf
Saved to ./data\wav\00001000-100507\00001000-100507[001].wav
Saved to ./data\wav\00001000-100507\00001000-100507[002].wav
Saved to ./data\wav\00001000-100507\00001000-100507[003].wav
Saved to ./data\wav\00001000-100507\00001000-100507[004].wav
Saved to ./data\wav\00001000-100507\00001000-100507[005].wav
✅ Downloaded: ./data\rml\00001000-100507.rml
✅ 生成 484 条记录 -> .\data\csv\00001000-100507\00001000-100507_user.csv，性别: Male
✅ 生成 497 条记录 -> .\data\csv\00001000-100507\00001000-100507_machine.csv，性别: Male

=== 🚀 Processing ID: 00001006-100507 ===
✅ Downloaded: ./data\edf\00001006-100507\00001006-100507[001].edf
✅ Downloaded: