## Download MIMIC-CXR

In [None]:
# Copyright 2025 Parsa Mohammadi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


In [None]:
# Mount Google Drive at /content/drive; the dataset paths used in the cells
# below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%pip -q install requests bs4 tqdm # (BeautifulSoup not actually needed anymore)

In [None]:
%cd /content/drive/MyDrive/RadLLaMA_Thinking/Dataset/MIMIC-CXR

In [None]:
# ==================== IMPORTS ====================
import os                # filesystem walking for staging flush
import sys               # exiting with a non-zero code on auth failure
import time              # (imported by original script; not used here but kept for parity)
import re                # (imported by original script; not used here but kept for parity)
import shutil            # moving files from staging to final destination
from pathlib import Path # convenient, OS-agnostic path handling
from tqdm.notebook import tqdm

import requests                                           # HTTP client
from concurrent.futures import ThreadPoolExecutor, as_completed  # simple multi-threading for downloads
from requests.adapters import HTTPAdapter                 # plug retry/connection pooling into requests
from urllib3.util.retry import Retry                      # retry policy (429/5xx + backoff)

# ==================== CONFIG ====================
# PhysioNet auth: your logged-in session cookie value. Required to access MIMIC-CXR.
# Taken from the PN_SESSIONID environment variable so the secret does not have to be
# saved inside the notebook; defaults to "" when the variable is not set.
PN_SESSIONID = os.environ.get("PN_SESSIONID", "")

# Base URLs/paths on PhysioNet for MIMIC-CXR-JPG v2.1.0
BASE = "https://physionet.org/files/mimic-cxr-jpg/2.1.0/"
IMAGE_LIST_URL = BASE + "IMAGE_FILENAMES"  # text file listing all relative image paths

# Where to put the final files. Change to your desired directory.
SAVE_DIR = Path("/content/drive/MyDrive/RadLLaMA_Thinking/Dataset/MIMIC-CXR/")

# Optional: write downloads to a fast local disk instead of SAVE_DIR (useful on Colab).
# NOTE: nothing in this cell moves staged files into SAVE_DIR afterwards, so when
# STAGE_DIR is set the files stay in STAGE_DIR until you move them yourself.
STAGE_DIR = None
# Example:
# SAVE_DIR = Path("/content/drive/MyDrive/RadLLaMA_Thinking/Dataset/MIMIC-CXR")
# STAGE_DIR = Path("/content/mimic_stage")

# Number of parallel downloads (threads). Increase cautiously to avoid rate limits.
CONCURRENCY = 15

# Chunk size in bytes when streaming file content: 4 * 1024 * 1024 == 4,194,304 bytes (4 MiB).
CHUNK = 4 * 1024 * 1024  # 4 MB

# Limit how many files to download (useful for testing). None means "all".
MAX_FILES = None  # e.g. 5000

# Restrict to certain main subject directories (pXX). None means "all pXX".
# Example: ["p10", "p15"] to download only those subsets.
MAIN_DIRS = ["p19"]

# Paths to simple line-based logs (created if missing).
# The file names carry the selected subject directories (e.g. "p19", or "p10_p15"),
# or "all" when MAIN_DIRS is empty/None, so each subset keeps its own resume log.
_LOG_TAG = "_".join(MAIN_DIRS) if MAIN_DIRS else "all"
LOG_OK = SAVE_DIR / f"downloaded_files_{_LOG_TAG}.list"   # every successfully downloaded relative path is appended
LOG_FAIL = SAVE_DIR / f"failed_files_{_LOG_TAG}.list"     # any failures are appended

# Threshold intended for flushing staged files to SAVE_DIR (~5GB); currently unused.
# FLUSH_BYTES = 5 * 1024**3

# ==================== SESSION SETUP ====================
def build_session(sessionid: str) -> requests.Session:
    """
    Create the HTTP session used for every PhysioNet request.

    The returned session carries:
      - the PhysioNet 'sessionid' cookie (authentication),
      - one pooled HTTPAdapter shared by the https:// and http:// prefixes,
      - a retry policy for 429/5xx responses on GET/HEAD,
      - a "Connection: keep-alive" default header.

    Args:
        sessionid: value of the logged-in PhysioNet 'sessionid' cookie.

    Returns:
        The configured requests.Session.
    """
    # Retry policy: at most 8 retries overall (4 for connect errors, 6 for read
    # errors), only for the listed status codes and only for GET/HEAD.
    # backoff_factor=1.5 scales urllib3's exponential backoff, i.e. the wait
    # grows by doubling between consecutive retries rather than linearly.
    retry_policy = Retry(
        total=8,
        connect=4,
        read=6,
        backoff_factor=1.5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(["GET", "HEAD"]),
    )

    # A single adapter supplies both connection pooling and the retry policy;
    # pool_maxsize is the number of connections kept per pool.
    pooled_adapter = HTTPAdapter(
        pool_connections=64,
        pool_maxsize=64,
        max_retries=retry_policy,
    )

    http = requests.Session()
    http.cookies.update({"sessionid": sessionid})  # sent with every request
    for scheme in ("https://", "http://"):
        http.mount(scheme, pooled_adapter)
    http.headers.update({"Connection": "keep-alive"})  # reuse TCP connections
    return http

# Create one global session for all downloads. fetch_image_list(), download_one()
# and the auth check at the bottom of this cell all use this module-level name.
session = build_session(PN_SESSIONID)

# ==================== IMAGE LIST ====================
def fetch_image_list() -> list[str]:
    """
    Fetch IMAGE_FILENAMES from PhysioNet and turn it into the list of work items.

    Processing steps, in order:
      - drop blank lines and surrounding whitespace,
      - when MAIN_DIRS is set, keep only paths containing "/<dir>/" for one of them,
      - when MAX_FILES is set, keep only the first MAX_FILES entries.

    Returns:
        Relative paths (e.g., "files/p12/p123.../s.../xxx.jpg").

    Raises:
        requests.HTTPError: if the list request returns a non-2xx status.
    """
    print("The IMAGE_FILENAMES downloading...")
    response = session.get(IMAGE_LIST_URL, timeout=60)
    response.raise_for_status()  # stop here on any non-2xx answer
    print("The IMAGE_FILENAMES downloaded succsessfully")

    # One relative path per line; ignore lines that are empty after stripping.
    stripped = (raw.strip() for raw in response.text.splitlines())
    paths = [entry for entry in stripped if entry]
    print("Strip is done. Clean list of URLs are in the lins list.")

    if MAIN_DIRS:
        # e.g. MAIN_DIRS = ["p12", "p15"] -> markers = ("/p12/", "/p15/")
        markers = tuple(f"/{d}/" for d in MAIN_DIRS)
        paths = [entry for entry in paths if any(marker in entry for marker in markers)]
    print("Filter is done. A list of URLS in desired subfolder is ready.")

    # Optional cap on the amount of work (handy for test runs).
    return paths[:MAX_FILES] if MAX_FILES else paths

def rel_to_dest(rel: str) -> Path:
    """
    Map a relative path from IMAGE_FILENAMES to its local, resolved target path.

    Files go under STAGE_DIR when that is set, and under SAVE_DIR otherwise.
    """
    base_dir = STAGE_DIR or SAVE_DIR
    target = base_dir / rel
    return target.resolve()

# ==================== IO / LOG HELPERS ====================
def ensure_dirs(path: Path) -> None:
    """
    Create the directory that will contain 'path' (and any missing ancestors).

    Does nothing if that directory already exists; 'path' itself is not created.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)

def log_line(path: Path, text: str) -> None:
    """
    Append 'text' plus a trailing newline to the UTF-8 file at 'path'.

    The file's parent directories are created first when they are missing.
    """
    log_dir = path.parent
    log_dir.mkdir(parents=True, exist_ok=True)
    record = text + "\n"
    with path.open(mode="a", encoding="utf-8") as handle:
        handle.write(record)

def human(n: int | float) -> str:
    """
    Turn a byte count into a human-readable string (B, KB, MB, GB, TB).
    """
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        if n < 1024 or unit == "TB":
            return f"{n:.1f} {unit}"
        n /= 1024

def load_downloaded(log_path: Path) -> set[str]:
    """
    Load the set of relative paths recorded in a line-based log file.

    Each line is stripped and blank lines are ignored. A log file that does
    not exist yet yields an empty set.
    """
    seen: set[str] = set()
    if log_path.exists():
        with log_path.open("r", encoding="utf-8") as handle:
            for raw in handle:
                entry = raw.strip()
                if entry:
                    seen.add(entry)
    return seen

# ==================== DOWNLOADER (WITH RESUME) ====================
def download_one(rel: str) -> tuple[str, bool]:
    """
    Download a single file and report whether it succeeded.

    The skip check is handled in main(), so this always (re)writes the file.
    The response body is streamed into a sibling "<name>.part" file, which is
    renamed onto the final path only after the whole body has been written.
    A failed or interrupted transfer therefore does not leave a truncated
    file at the final path.

    Args:
        rel: path relative to BASE, as listed in IMAGE_FILENAMES.

    Returns:
        (rel, ok). ok is True only when the file was written completely.
        No exception is raised to the caller; every failure yields ok=False.
    """
    url = BASE + rel
    tmp_path = None  # set once the destination is known; used for cleanup on failure

    try:
        # Path handling and directory creation sit inside the try so that a
        # filesystem error is reported as a failed download instead of escaping
        # from the worker thread.
        out_path = rel_to_dest(rel)
        tmp_path = out_path.with_name(out_path.name + ".part")
        ensure_dirs(out_path)

        with session.get(url, stream=True, timeout=(30, 300)) as r:
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=CHUNK):
                    if chunk:
                        f.write(chunk)

        # Publish the finished file; replace() overwrites an existing target.
        tmp_path.replace(out_path)
        return rel, True

    except Exception:
        # Best-effort removal of the partial file; the failure itself is
        # reported through the return value and logged by the caller.
        if tmp_path is not None:
            try:
                tmp_path.unlink(missing_ok=True)
            except OSError:
                pass
        return rel, False


# ==================== MAIN ====================
def main() -> None:
    """
    Orchestrate the download:
      1) Fetch + filter the list of relative image paths.
      2) Skip every path already recorded in LOG_OK.
      3) Download the remaining files in parallel threads, appending each
         result to LOG_OK (success) or LOG_FAIL (failure).
      4) Print summary counts.

    Note: no staging flush happens here; files stay wherever rel_to_dest()
    places them.
    """
    # Build the plan of work.
    all_files = fetch_image_list()
    print(f"Planned downloads before skipping: {len(all_files)} files")

    # Load already-downloaded files and drop them from the plan.
    downloaded = load_downloaded(LOG_OK)
    relative_paths = [ln for ln in all_files if ln not in downloaded]
    print(f"Planned downloads after skipping: {len(relative_paths)} files")

    # Start the bar at the number of *planned* files that are already done.
    # Using len(downloaded) would over-count whenever the log holds entries
    # that are not part of the current plan (e.g. MAX_FILES was set).
    already_done = len(all_files) - len(relative_paths)

    ok_count = 0
    fail_count = 0

    # The context manager closes the bar even if the loop is interrupted.
    with tqdm(
        total=len(all_files),
        initial=already_done,
        desc="Downloading",
        unit="file"
    ) as progress:
        # Thread pool to download multiple files concurrently.
        with ThreadPoolExecutor(max_workers=CONCURRENCY) as pool:
            # Map each future to its relative path so a result can always be
            # attributed, even when the worker raised.
            future_map = {pool.submit(download_one, rel): rel for rel in relative_paths}

            # as_completed yields futures as soon as each finishes (success or failure).
            for fut in as_completed(future_map):
                rel = future_map[fut]
                try:
                    _, ok = fut.result()
                except Exception:
                    # An unexpected error in one worker counts as one failed
                    # file instead of aborting the whole run.
                    ok = False

                if ok:
                    ok_count += 1
                    log_line(LOG_OK, rel)  # append to the success log
                else:
                    fail_count += 1
                    log_line(LOG_FAIL, f"FAILED {rel}")  # append to the failure log

                # Tick the progress bar after handling each file.
                progress.update(1)

    print(f"Done. OK={ok_count}, FAILED={fail_count}")

# ==================== SCRIPT ENTRY ====================
if __name__ == "__main__":
    # Fail fast with a specific message when no cookie was configured at all;
    # the sessionid cookie is required to access the dataset.
    if not PN_SESSIONID:
        print("❌ PN_SESSIONID is empty (set your PhysioNet sessionid cookie).", file=sys.stderr)
        sys.exit(1)

    # Quick auth sanity check: can we access the BASE URL with the provided cookie?
    try:
        test = session.get(BASE, timeout=30)
    except requests.RequestException as exc:
        # Network-level problems (DNS, timeout, exhausted retries) are reported
        # as a short message instead of a raw traceback.
        print(f"❌ Could not reach PhysioNet: {exc}", file=sys.stderr)
        sys.exit(1)

    if test.status_code != 200:
        print("❌ Auth failed (check sessionid).", file=sys.stderr)
        sys.exit(1)

    # Run the full pipeline.
    main()


The IMAGE_FILENAMES downloading...
The IMAGE_FILENAMES downloaded succsessfully
Strip is done. Clean list of URLs are in the lins list.
Filter is done. A list of URLS in desired subfolder is ready.
Planned downloads before skipping: 37648 files
Planned downloads after skipping: 23816 files


Downloading:  37%|###6      | 13832/37648 [00:00<?, ?file/s]

## Download CheXpert Plus

In [None]:
!apt -y install aria2

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libaria2-0 libc-ares2
The following NEW packages will be installed:
  aria2 libaria2-0 libc-ares2
0 upgraded, 3 newly installed, 0 to remove and 35 not upgraded.
Need to get 1,513 kB of archives.
After this operation, 5,441 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libc-ares2 amd64 1.18.1-1ubuntu0.22.04.3 [45.1 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libaria2-0 amd64 1.36.0-1 [1,086 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 aria2 amd64 1.36.0-1 [381 kB]
Fetched 1,513 kB in 0s (5,438 kB/s)
Selecting previously unselected package libc-ares2:amd64.
(Reading database ... 126380 files and directories currently installed.)
Preparing to unpack .../libc-ares2_1.18.1-1ubuntu0.22.04.3_amd64.deb ...
Unpacking libc-ares2:amd64 (1.18.1-1ubun

In [None]:
!aria2c -x 16 -s 16 -k 1M -d "/content/drive/MyDrive/RadLLaMA_Thinking/Dataset/Chextpert_Plus/Zip_files" "the link"


08/17 21:36:33 [[1;32mNOTICE[0m] Downloading 1 item(s)

08/17 21:36:33 [[1;32mNOTICE[0m] Allocating disk space. Use --file-allocation=none to disable it. See --file-allocation option in man page for more details.
 *** Download Progress Summary as of Sun Aug 17 21:37:33 2025 *** 
=
[#61b5dc 0B/145GiB(0%) CN:1 DL:0B]
FILE: /content/drive/MyDrive/RadLLaMA_Thinking/Dataset/Chextpert_Plus/Zip_files/png_chexpert_plus_chunk_1.zip
-

 *** Download Progress Summary as of Sun Aug 17 21:38:34 2025 *** 
=
[#61b5dc 0B/145GiB(0%) CN:1 DL:0B]
FILE: /content/drive/MyDrive/RadLLaMA_Thinking/Dataset/Chextpert_Plus/Zip_files/png_chexpert_plus_chunk_1.zip
-

 *** Download Progress Summary as of Sun Aug 17 21:39:34 2025 *** 
=
[#61b5dc 0B/145GiB(0%) CN:1 DL:0B]
FILE: /content/drive/MyDrive/RadLLaMA_Thinking/Dataset/Chextpert_Plus/Zip_files/png_chexpert_plus_chunk_1.zip
-

 *** Download Progress Summary as of Sun Aug 17 21:40:35 2025 *** 
=
[#61b5dc 0B/145GiB(0%) CN:1 DL:0B]
FILE: /content/drive/MyDriv

### Unzip the Data

In [None]:
!7z x -mmt=on "/content/drive/MyDrive/RadLLaMA_Thinking/Dataset/Chextpert_Plus/Zip_files/png_chexpert_plus_chunk_1.zip" -o"/content/drive/MyDrive/RadLLaMA_Thinking/Dataset/Chextpert_Plus/Extracted_data"


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan /content/drive/MyDrive/RadLLaMA_ . Dataset/Chextpert_Plus/Zip_files/                                                                              1 file, 155699656670 bytes (146 GiB)

Extracting archive: /content/drive/MyDrive/RadLLaMA_Thinking/Dataset/Chextpert_Plus/Zip_files/png_chexpert_plus_chunk_1.zip
  8% 4096 Open              --
Path = /content/drive/MyDrive/RadLLaMA_Thinking/Dataset/Chextpert_Plus/Zip_files/png_chexpert_plus_chunk_1.zip
Type = zip
Physical Size = 155699656670
64-bit = +

  0%    
Would you like to replace the existing file:
  Path:    

In [None]:
# Alternative extraction of the same archive with unzip (-q: quiet, -d: destination directory).
# NOTE(review): the recorded output of this cell reports a "bad CRC" for one extracted PNG —
# presumably the archive or the extraction was incomplete; verify the archive before relying
# on the extracted data.
!unzip -q "/content/drive/MyDrive/RadLLaMA_Thinking/Dataset/Chextpert_Plus/Zip_files/png_chexpert_plus_chunk_1.zip" -d "/content/drive/MyDrive/RadLLaMA_Thinking/Dataset/Chextpert_Plus/Extracted_data"

/content/drive/MyDrive/RadLLaMA_Thinking/Dataset/Chextpert_Plus/Extracted_data/PNG/train/patient32368/study1/view1_frontal.png  bad CRC 511c9dff  (should be 29424069)
