In [18]:
# Cell 1: Imports
import os
import re
import requests
import shutil # For potential cleanup
from tqdm.notebook import tqdm

print("All libraries imported successfully.")

All libraries imported successfully.


In [19]:
# Cell 2: Configuration
# Central place for every value a reader may want to tune before running the
# download cells below.
print("--- Configuring the download process... ---")

# --- FILE & PATH SETTINGS ---
LINK_FILE = 'download_links.txt' # Your file containing all the download URLs

# Main directory where patient folders will be created.
# The default is the original machine-specific location; set the
# SCIDB_DOWNLOAD_DIR environment variable to run the notebook elsewhere
# without editing this cell. An empty variable falls back to the default.
BASE_DOWNLOAD_DIR = os.environ.get('SCIDB_DOWNLOAD_DIR') or 'F:/Solo All In One Docs/Scidb Sleep Data'

# --- PATIENT SELECTION SETTINGS ---
TARGET_PATIENT_COUNT = 25 # Number of valid patients to download

# Create the base download directory if it doesn't exist
os.makedirs(BASE_DOWNLOAD_DIR, exist_ok=True)

print(f"Configuration set. Data will be downloaded to '{BASE_DOWNLOAD_DIR}'.")
print(f"Targeting {TARGET_PATIENT_COUNT} valid patients.")

--- Configuring the download process... ---
Configuration set. Data will be downloaded to 'F:/Solo All In One Docs/Scidb Sleep Data'.
Targeting 25 valid patients.


In [None]:
# Cell 3: Enhanced Helper Functions (Download & Link Grouping with Retry Logic)
print("--- Defining enhanced helper functions with retry logic... ---")

# `time` supplies the sleep() calls used between retry attempts in this cell.
# NOTE(review): this import lives outside the Cell 1 import block; consider
# moving it there so all dependencies are declared in one place.
import time

def group_links_by_patient(filepath):
    """
    Build a per-patient index of download links from a text file of URLs.

    Each non-blank line is treated as one URL. Lines that contain no patient ID
    (eight digits, a dash, six digits) are ignored. For every ID found, the
    result holds ``{'rml': <last .rml URL or None>, 'edf': [<.edf URLs in file order>]}``.

    Returns an empty dict (after printing an error) when the file is missing.
    """
    if not os.path.exists(filepath):
        print(f"ERROR: Link file not found at '{filepath}'. Please create it.")
        return {}

    id_pattern = re.compile(r'(\d{8}-\d{6})')
    links_by_patient = {}

    with open(filepath, 'r') as link_file:
        for raw_line in link_file:
            link = raw_line.strip()
            found = id_pattern.search(link) if link else None
            if found is None:
                continue

            # An entry is created for every recognised ID, even when the link
            # itself turns out to be neither an RML nor an EDF file.
            entry = links_by_patient.setdefault(found.group(1), {'rml': None, 'edf': []})
            if link.endswith('.rml'):
                entry['rml'] = link
            elif link.endswith('.edf'):
                entry['edf'].append(link)

    return links_by_patient

def _declared_size(headers):
    """Return the body size announced by Content-Length, or 0 when unusable.

    The size is only comparable with bytes on disk when the response is not
    content-encoded: with e.g. gzip, Content-Length counts compressed bytes
    while the saved file holds the decoded payload.
    """
    encoding = (headers.get('content-encoding') or '').strip().lower()
    if encoding not in ('', 'identity'):
        return 0
    try:
        return max(int(headers.get('content-length', 0)), 0)
    except (TypeError, ValueError):
        return 0


def _probe_remote_size(url, attempts=3, pause=1):
    """Ask the server (HEAD) for the file size; return 0 if it cannot be determined."""
    for attempt in range(attempts):
        try:
            # requests.head() does not follow redirects unless asked to, which
            # would report the size of the redirect response, not of the file.
            response = requests.head(url, timeout=10, allow_redirects=True)
            response.raise_for_status()
            return _declared_size(response.headers)
        except requests.exceptions.RequestException:
            if attempt < attempts - 1:
                time.sleep(pause)
    return 0


def _existing_file_is_complete(url, local_path):
    """Decide whether an already-present local file can be kept as-is."""
    if not os.path.exists(local_path):
        return False

    file_label = os.path.basename(local_path)
    try:
        local_size = os.path.getsize(local_path)
    except OSError:
        print(f"  > Error checking existing file: {file_label}")
        return False
    if local_size <= 0:
        return False

    server_size = _probe_remote_size(url)
    if server_size > 0:
        if local_size == server_size:
            print(f"  > File already exists and complete: {file_label}")
            return True
        # The server positively disagrees with what is on disk: never fall
        # back to the size heuristic in this case, fetch the file again.
        print(f"  > File exists but size differs from server ({local_size} vs {server_size} bytes): {file_label}")
        return False

    # If we can't verify but file exists and has reasonable size, assume it's good
    if local_size > 1000:  # Assume files > 1KB are likely complete
        print(f"  > File exists (couldn't verify size): {file_label}")
        return True

    print(f"  > File exists but seems incomplete: {file_label}")
    return False


def download_file_with_retry(url, local_path, max_retries=5, base_delay=2):
    """
    Download ``url`` to ``local_path``, retrying on network errors.

    An existing local file is kept when its size matches the size reported by
    the server, or when the server size cannot be determined and the file is
    larger than 1 KB. A file whose size is known to differ is downloaded again.

    Data is streamed to ``local_path + '.tmp'`` and only moved into place once
    the transfer finished and, when the server announced a comparable
    Content-Length, the byte count matches.

    Args:
        url: Address of the file to fetch.
        local_path: Destination path; parent directories are created as needed.
        max_retries: Maximum number of download attempts.
        base_delay: Seconds to wait after the first failure; doubled after
            each further failure (exponential backoff).

    Returns:
        True if the file is present and considered complete, False otherwise.
        Errors are reported via print() rather than raised.
    """
    file_label = os.path.basename(local_path)
    temp_path = local_path + '.tmp'

    if _existing_file_is_complete(url, local_path):
        return True

    # A bare file name has an empty dirname, and os.makedirs('') would raise.
    target_dir = os.path.dirname(local_path)

    for attempt in range(max_retries):
        try:
            print(f"  > Downloading: {file_label} (attempt {attempt + 1}/{max_retries})")

            if target_dir:
                os.makedirs(target_dir, exist_ok=True)

            bytes_written = 0
            with requests.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()
                expected_size = _declared_size(r.headers)

                # Download to temporary file first
                with open(temp_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
                        bytes_written += len(chunk)

            # A connection that closes early without raising would otherwise
            # leave a truncated file that later runs treat as complete.
            if expected_size and bytes_written != expected_size:
                raise requests.exceptions.RequestException(
                    f"Incomplete download: received {bytes_written} of {expected_size} bytes"
                )

            # os.replace overwrites an existing destination atomically on every
            # platform; the temp file sits next to the target, so no
            # cross-device move is involved.
            os.replace(temp_path, local_path)
            print(f"  > Successfully downloaded: {file_label}")
            return True

        except requests.exceptions.RequestException as e:
            print(f"  > Download failed (attempt {attempt + 1}): {e}")

            if attempt < max_retries - 1:
                delay = base_delay * (2 ** attempt)  # Exponential backoff
                print(f"  > Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print(f"  > Max retries reached for {file_label}")

        except Exception as e:
            # Non-network problems (disk full, permission denied, ...) will
            # not be cured by retrying.
            print(f"  > Unexpected error downloading {file_label}: {e}")
            break

    # Clean up temp file if it exists
    if os.path.exists(temp_path):
        os.remove(temp_path)

    return False

def save_progress(completed_patients, progress_file='download_progress.txt'):
    """
    Persist the completed patients so a later run can resume.

    One entry is written per line, in sorted order so the file content is
    stable from run to run. The list is first written to
    ``progress_file + '.tmp'`` and then swapped into place, so an interruption
    during the write cannot leave a truncated progress file behind.

    Args:
        completed_patients: Iterable of patient identifiers.
        progress_file: Path of the progress file to (over)write.
    """
    temp_file = f"{progress_file}.tmp"
    with open(temp_file, 'w') as f:
        f.writelines(f"{patient}\n" for patient in sorted(map(str, completed_patients)))
    # os.replace overwrites an existing file in a single step on all platforms.
    os.replace(temp_file, progress_file)

def load_progress(progress_file='download_progress.txt'):
    """Return the set of patients recorded in ``progress_file`` (empty if the file is absent).

    Surrounding whitespace is stripped from each line and blank lines are ignored.
    """
    finished = set()
    if os.path.exists(progress_file):
        with open(progress_file, 'r') as handle:
            for raw_line in handle:
                entry = raw_line.strip()
                if entry:
                    finished.add(entry)
    return finished

print("Enhanced helper functions with retry logic defined.")

--- Defining helper functions... ---
Helper functions defined.


In [None]:
# Cell 4: Enhanced Main Download Loop with Resume Capability
# Selects the first TARGET_PATIENT_COUNT patients that have both an RML link
# and at least one EDF link, then downloads every patient that is not yet
# recorded in the progress file.
print("\n--- Starting Enhanced Download Loop with Resume Capability ---")


def _filename_from_url(url):
    """Return the local file name for a download URL.

    Uses the ``fileName`` query parameter when present, otherwise the last
    path segment without its query string; the result is percent-decoded.
    """
    name_match = re.search(r'fileName=([^&]+)', url)
    raw_name = name_match.group(1) if name_match else os.path.basename(url).split('?')[0]
    return requests.utils.unquote(raw_name)


# Load previous progress
PROGRESS_FILE = os.path.join(BASE_DOWNLOAD_DIR, 'download_progress.txt')
completed_patients = load_progress(PROGRESS_FILE)
print(f"Found {len(completed_patients)} previously completed patients.")

# 1. Group all available links by patient
grouped_links = group_links_by_patient(LINK_FILE)
print(f"Found {len(grouped_links)} unique patient IDs in the link file.")

# 2. Select the target number of valid patients.
# A patient is valid when both an RML and at least one EDF link are available.
valid_patient_ids = [pid for pid, files in grouped_links.items() if files['rml'] and files['edf']]
selected_patient_ids = valid_patient_ids[:TARGET_PATIENT_COUNT]

if len(selected_patient_ids) < TARGET_PATIENT_COUNT:
    print(f"Warning: Only found {len(selected_patient_ids)} valid patients, less than the target of {TARGET_PATIENT_COUNT}.")

# The folder number is the patient's position among the valid patients in the
# link file. It must NOT depend on how many patients have been skipped or
# completed so far; otherwise a resumed run maps patients to different
# folders than the run that wrote the progress file.
patient_folder_by_id = {}
patient_ids_to_download = []
for position, pid in enumerate(selected_patient_ids, start=1):
    folder_name = f"patient_{position:02d}"
    patient_folder_by_id[pid] = folder_name
    if folder_name in completed_patients:
        print(f"Skipping already completed patient: {folder_name}")
    else:
        patient_ids_to_download.append(pid)

print(f"Proceeding to download data for {len(patient_ids_to_download)} remaining patients.")

# 3. Loop over each selected patient for download
for patient_original_id in tqdm(patient_ids_to_download, desc="Total Download Progress"):

    patient_folder_name = patient_folder_by_id[patient_original_id]
    patient_dir = os.path.join(BASE_DOWNLOAD_DIR, patient_folder_name)
    os.makedirs(patient_dir, exist_ok=True)

    print(f"\n--- Processing patient: {patient_folder_name} (Original ID: {patient_original_id}) ---")

    patient_info = grouped_links[patient_original_id]
    patient_success = True

    # Download RML file
    rml_url = patient_info['rml']
    rml_filename = _filename_from_url(rml_url)
    rml_path = os.path.join(patient_dir, rml_filename)

    print(f"  > Processing RML: {rml_filename}...")
    if not download_file_with_retry(rml_url, rml_path):
        patient_success = False
        print(f"  > Failed to download RML for patient {patient_folder_name}")

    # Download each EDF file for this patient
    edf_success_count = 0
    total_edf_count = len(patient_info['edf'])

    for edf_url in tqdm(patient_info['edf'], desc=f"  EDFs for {patient_folder_name}", leave=False):
        edf_filename = _filename_from_url(edf_url)
        edf_path = os.path.join(patient_dir, edf_filename)

        # WFDB also needs the .hea file
        # NOTE(review): the .hea URL is derived by textual substitution on the
        # EDF URL; confirm the server really serves a header file there.
        hea_url = edf_url.replace('.edf', '.hea')
        hea_filename = edf_filename.replace('.edf', '.hea')
        hea_path = os.path.join(patient_dir, hea_filename)

        # Try to download both EDF and HEA files
        edf_downloaded = download_file_with_retry(edf_url, edf_path)
        hea_downloaded = download_file_with_retry(hea_url, hea_path)

        if edf_downloaded and hea_downloaded:
            edf_success_count += 1
        else:
            print(f"  > Failed to download EDF/HEA pair: {edf_filename}")

    # Consider patient successful if RML and at least one EDF pair downloaded
    if patient_success and edf_success_count > 0:
        completed_patients.add(patient_folder_name)
        # Saved after every patient so an interrupted session loses at most
        # the patient that was in flight.
        save_progress(completed_patients, PROGRESS_FILE)
        print(f"  ✓ Patient {patient_folder_name} completed successfully ({edf_success_count}/{total_edf_count} EDF pairs)")
    else:
        print(f"  ✗ Patient {patient_folder_name} incomplete (RML: {'✓' if patient_success else '✗'}, EDFs: {edf_success_count}/{total_edf_count})")

print(f"\n--- Download session complete. Total completed patients: {len(completed_patients)} ---")
print("You can re-run this notebook to resume from where it left off.")


--- Starting Main Download Loop ---
Found 305 unique patient IDs in the link file.
Proceeding to download data for 25 patients.


Total Download Progress:   0%|          | 0/25 [00:00<?, ?it/s]


--- Processing patient: patient_01 (Original ID: 00001631-100507) ---
  > Downloading RML: 00001631-100507.rml...


  EDFs for patient_01:   0%|          | 0/7 [00:00<?, ?it/s]

  All files downloaded for patient_01.

--- Processing patient: patient_02 (Original ID: 00001547-100507) ---
  > Downloading RML: 00001547-100507.rml...


  EDFs for patient_02:   0%|          | 0/6 [00:00<?, ?it/s]

  > Download failed for 00001547-100507[002].edf: ("Connection broken: ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)", ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
  > Error downloading EDF: 00001547-100507[002].edf. Proceeding to next EDF.
  > Download failed for 00001547-100507[005].hea: ("Connection broken: ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)", ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
  > Error downloading HEA: 00001547-100507[005].hea. Proceeding to next EDF.
  > Download failed for 00001547-100507[006].edf: HTTPSConnectionPool(host='download.scidb.cn', port=443): Max retries exceeded with url: /download?fileId=62295e1ed7561b594fb68127&path=/V3/APNEA_EDF/00001547-100507/00001547-100507%5B006%5D.edf&fileNa