In [1]:
import requests

def walk(obj):
    """Yield every dict and list nested inside ``obj``, children before parents.

    Values that are neither a dict nor a list produce nothing, so the
    generator only ever emits containers. When ``obj`` itself is a container
    it is emitted last, after everything nested inside it.
    """
    # Guard clause: scalars (str, int, None, ...) are never yielded.
    if not isinstance(obj, (dict, list)):
        return

    # A dict contributes its values (keys are ignored); a list its items.
    children = obj.values() if isinstance(obj, dict) else obj
    for child in children:
        yield from walk(child)

    # Post-order: the container comes after all of its descendants.
    yield obj

def get_all_urls(project_id: str, timeout: float = 30):
    """Return every file URL listed in the GTEx open-access metadata of a project.

    Args:
        project_id: Value sent as the ``project_id`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        A list with the truthy ``"url"`` value of every dict found anywhere in
        the JSON response, in the order ``walk`` visits them.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(
        "https://gtexportal.org/api/v2/dataset/openAccessFilesMetadata",
        # Let requests build (and percent-encode) the query string.
        params={"project_id": project_id},
        timeout=timeout,
    )
    # Fail loudly on an error status instead of parsing an error payload.
    response.raise_for_status()
    data = response.json()

    return [
        node["url"]
        for node in walk(data)
        if isinstance(node, dict) and node.get("url")
    ]

# Collect every URL that get_all_urls finds for the "adult-gtex" project.
urls = get_all_urls("adult-gtex")

# Print one URL per line.
for u in urls:
    print(u)


https://storage.googleapis.com/adult-gtex/additional_GTEx_datasets/v6p/coexpression_networks/coexpression_networks_v6p.zip
https://storage.googleapis.com/adult-gtex/additional_GTEx_datasets/v8/outlier_calls/GTEx_v8_outlier_calls.zip
https://storage.googleapis.com/adult-gtex/annotations/v10/metadata-files/GTEx_Analysis_v10_Annotations_SampleAttributesDD.xlsx
https://storage.googleapis.com/adult-gtex/annotations/v10/metadata-files/GTEx_Analysis_v10_Annotations_SampleAttributesDS.txt
https://storage.googleapis.com/adult-gtex/annotations/v10/metadata-files/GTEx_Analysis_v10_Annotations_SubjectPhenotypesDD.xlsx
https://storage.googleapis.com/adult-gtex/annotations/v10/metadata-files/GTEx_Analysis_v10_Annotations_SubjectPhenotypesDS.txt
https://storage.googleapis.com/adult-gtex/annotations/v10/metadata-files/GTEx_Analysis_v10_Sample_Tissue_Changes_From_v8.txt
https://storage.googleapis.com/adult-gtex/annotations/v10/small-RNA/smallRNA.filtered_annotated_031725.txt
https://storage.googleapis.

In [2]:
import os
import requests
import pandas as pd

def walk(obj):
    """Yield ``obj`` and everything nested inside it, children before parents.

    Dicts are descended through their values (keys are ignored) and lists
    through their items. Unlike a containers-only traversal, leaf values are
    yielded too, and ``obj`` itself is always the last item produced.
    """
    if isinstance(obj, dict):
        children = obj.values()
    elif isinstance(obj, list):
        children = obj
    else:
        # Leaves have nothing to descend into; they are still yielded below.
        children = ()

    for child in children:
        yield from walk(child)

    yield obj

def get_urls(project_id="adult-gtex", keyword="bulk-gex/v10", timeout=30):
    """Return the open-access file URLs of a project that contain ``keyword``.

    Args:
        project_id: Value sent as the ``project_id`` query parameter.
        keyword: Substring a URL must contain to be kept.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        A list with the ``"url"`` value of every dict in the JSON response
        whose URL is truthy and contains ``keyword``, in the order ``walk``
        visits them.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(
        "https://gtexportal.org/api/v2/dataset/openAccessFilesMetadata",
        # Let requests build (and percent-encode) the query string.
        params={"project_id": project_id},
        timeout=timeout,
    )
    # Fail loudly on an error status instead of parsing an error payload.
    response.raise_for_status()
    data = response.json()

    return [
        node["url"]
        for node in walk(data)
        if isinstance(node, dict) and node.get("url") and keyword in node["url"]
    ]

# Fetch every URL containing the default keyword "bulk-gex/v10". This is more
# than the 54 per-tissue TPM files: the result also includes other files such
# as gene_reads_* and the GTEx_Analysis_v10_* matrices. The 54 gene_tpm_v10_*
# files are selected from this frame in a later cell.
urls = get_urls()

# Build DataFrame
# Lift pandas' display limits so long file names and URLs are not truncated.
# These options are global and stay in effect for the rest of the session.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
# One row per URL: "name" is the last path component, "url" the full link.
df = pd.DataFrame({
    "name": [os.path.basename(u) for u in urls],
    "url": urls
})

print(df)


                                                               name  \
0                GTEx_Analysis_v10_RNASeQCv2.4.2_exon_reads.parquet   
1            GTEx_Analysis_v10_RNASeQCv2.4.2_gene_median_tpm.gct.gz   
2                 GTEx_Analysis_v10_RNASeQCv2.4.2_gene_reads.gct.gz   
3                   GTEx_Analysis_v10_RNASeQCv2.4.2_gene_tpm.gct.gz   
4               GTEx_Analysis_v10_RNASeQCv2.4.2_gene_tpm_lcm.gct.gz   
5    GTEx_Analysis_v10_RSEMv1.3.3_transcripts_expected_count.txt.gz   
6               GTEx_Analysis_v10_RSEMv1.3.3_transcripts_tpm.txt.gz   
7                   GTEx_Analysis_v10_STARv2.7.10a_junctions.gct.gz   
8                        gene_reads_v10_adipose_subcutaneous.gct.gz   
9                    gene_reads_v10_adipose_visceral_omentum.gct.gz   
10                              gene_reads_v10_adrenal_gland.gct.gz   
11                               gene_reads_v10_artery_aorta.gct.gz   
12                            gene_reads_v10_artery_coronary.gct.gz   
13    

In [3]:
# df

In [4]:
# Show only the rows whose file name starts with "gene_tpm_v10".
df[df['name'].str.startswith("gene_tpm_v10")]

Unnamed: 0,name,url
62,gene_tpm_v10_adipose_subcutaneous.gct.gz,https://storage.googleapis.com/adult-gtex/bulk-gex/v10/rna-seq/tpms-by-tissue/gene_tpm_v10_adipose_subcutaneous.gct.gz
63,gene_tpm_v10_adipose_visceral_omentum.gct.gz,https://storage.googleapis.com/adult-gtex/bulk-gex/v10/rna-seq/tpms-by-tissue/gene_tpm_v10_adipose_visceral_omentum.gct.gz
64,gene_tpm_v10_adrenal_gland.gct.gz,https://storage.googleapis.com/adult-gtex/bulk-gex/v10/rna-seq/tpms-by-tissue/gene_tpm_v10_adrenal_gland.gct.gz
65,gene_tpm_v10_artery_aorta.gct.gz,https://storage.googleapis.com/adult-gtex/bulk-gex/v10/rna-seq/tpms-by-tissue/gene_tpm_v10_artery_aorta.gct.gz
66,gene_tpm_v10_artery_coronary.gct.gz,https://storage.googleapis.com/adult-gtex/bulk-gex/v10/rna-seq/tpms-by-tissue/gene_tpm_v10_artery_coronary.gct.gz
67,gene_tpm_v10_artery_tibial.gct.gz,https://storage.googleapis.com/adult-gtex/bulk-gex/v10/rna-seq/tpms-by-tissue/gene_tpm_v10_artery_tibial.gct.gz
68,gene_tpm_v10_bladder.gct.gz,https://storage.googleapis.com/adult-gtex/bulk-gex/v10/rna-seq/tpms-by-tissue/gene_tpm_v10_bladder.gct.gz
69,gene_tpm_v10_brain_amygdala.gct.gz,https://storage.googleapis.com/adult-gtex/bulk-gex/v10/rna-seq/tpms-by-tissue/gene_tpm_v10_brain_amygdala.gct.gz
70,gene_tpm_v10_brain_anterior_cingulate_cortex_ba24.gct.gz,https://storage.googleapis.com/adult-gtex/bulk-gex/v10/rna-seq/tpms-by-tissue/gene_tpm_v10_brain_anterior_cingulate_cortex_ba24.gct.gz
71,gene_tpm_v10_brain_caudate_basal_ganglia.gct.gz,https://storage.googleapis.com/adult-gtex/bulk-gex/v10/rna-seq/tpms-by-tissue/gene_tpm_v10_brain_caudate_basal_ganglia.gct.gz


In [5]:
# Sanity check: count the rows whose file name starts with "gene_tpm_v10".
len(df[df['name'].str.startswith("gene_tpm_v10")]) # 54 entries as shown on the website

54

In [6]:
# TODO: make a user input interface

In [7]:
# Keep only the per-tissue TPM files; .copy() so adding a column below does
# not write into a view of df.
tpm_df = df[df['name'].str.startswith("gene_tpm_v10")].copy()

# Derive the tissue name by stripping the fixed prefix and extension.
# regex=False is passed explicitly: the default for `regex` differs between
# pandas versions, and as a regular expression the dots in '.gct.gz' would
# match any character rather than a literal '.'.
tpm_df['tissue'] = (
    tpm_df['name']
    .str.replace('gene_tpm_v10_', '', regex=False)
    .str.replace('.gct.gz', '', regex=False)
)

tissues = tpm_df['tissue'].tolist()
print(f"Available tissues ({len(tissues)}):")
print(tissues)

Available tissues (54):
['adipose_subcutaneous', 'adipose_visceral_omentum', 'adrenal_gland', 'artery_aorta', 'artery_coronary', 'artery_tibial', 'bladder', 'brain_amygdala', 'brain_anterior_cingulate_cortex_ba24', 'brain_caudate_basal_ganglia', 'brain_cerebellar_hemisphere', 'brain_cerebellum', 'brain_cortex', 'brain_frontal_cortex_ba9', 'brain_hippocampus', 'brain_hypothalamus', 'brain_nucleus_accumbens_basal_ganglia', 'brain_putamen_basal_ganglia', 'brain_spinal_cord_cervical_c-1', 'brain_substantia_nigra', 'breast_mammary_tissue', 'cells_cultured_fibroblasts', 'cells_ebv-transformed_lymphocytes', 'cervix_ectocervix', 'cervix_endocervix', 'colon_sigmoid', 'colon_transverse', 'esophagus_gastroesophageal_junction', 'esophagus_mucosa', 'esophagus_muscularis', 'fallopian_tube', 'heart_atrial_appendage', 'heart_left_ventricle', 'kidney_cortex', 'kidney_medulla', 'liver', 'lung', 'minor_salivary_gland', 'muscle_skeletal', 'nerve_tibial', 'ovary', 'pancreas', 'pituitary', 'prostate', 'skin

In [20]:
# tpm_df.loc[tpm_df["tissue"] == "bladder", "url"]

In [8]:
import aiohttp
import asyncio
from aiohttp import ClientTimeout
import pandas as pd
import os
from rich import print
from rich.console import Console
from rich.table import Table
import yaml
from rich.progress import (
    Progress, BarColumn, TimeElapsedColumn,
    TimeRemainingColumn, DownloadColumn, TransferSpeedColumn
)
import sys
import logging
from pathlib import Path
from pythonjsonlogger import jsonlogger
import time

In [25]:
# log template ## betterstack.com
logger = logging.getLogger(__name__)

# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise stack another pair of handlers on it and every
# record would be emitted (and written to the file) multiple times. Drop and
# close whatever a previous run attached before adding fresh handlers.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

# addition ## ensure log dir exists # TODO: Switch to a single logging file instead
# NOTE(review): the file is named "hca_api_logs" although this notebook talks
# to the GTEx portal; presumably carried over from another notebook. Confirm
# before renaming, since other tooling may read this path.
log_path = Path("data/data_raw/hca_api_logs.txt")
log_path.parent.mkdir(parents=True, exist_ok=True)

# Every record goes both to the notebook output and to the log file.
stdoutHandler = logging.StreamHandler(stream=sys.stdout)
fileHandler = logging.FileHandler(log_path)

# One JSON object per record; "levelname"/"asctime" are emitted under the keys
# "severity"/"timestamp".
jsonFmt = jsonlogger.JsonFormatter(
    "%(name)s %(asctime)s %(levelname)s %(filename)s %(lineno)s %(process)d %(message)s",
    rename_fields={"levelname": "severity", "asctime": "timestamp"},
    datefmt="%Y-%m-%dT%H:%M:%SZ",
)
# The date format ends in a literal "Z" (the UTC designator), but logging
# formats timestamps with time.localtime by default. Use UTC so the suffix is
# truthful.
jsonFmt.converter = time.gmtime

stdoutHandler.setFormatter(jsonFmt)
fileHandler.setFormatter(jsonFmt)

logger.addHandler(stdoutHandler)
logger.addHandler(fileHandler)

logger.setLevel(logging.DEBUG)

# Shared aiohttp settings read by the download coroutine.
from aiohttp import ClientTimeout
# total=None disables aiohttp's overall request deadline, so a large download
# is not cut off part-way.
timeout = ClientTimeout(total=None)
headers = {"User-Agent": "Mozilla/5.0"}

In [10]:
# download func; streams the body in 1 MiB chunks by default
async def download_file(url, filename, chunk_size=1024*1024, retries=3):
    """Stream ``url`` to ``filename`` with a progress bar, retrying on failure.

    The body is written to ``<filename>.part`` and only moved to ``filename``
    once the whole response has been received, so a failed or interrupted
    attempt never leaves a truncated file under the final name.

    Args:
        url: Address to download.
        filename: Destination path of the finished file.
        chunk_size: Number of bytes requested per read from the response.
        retries: Maximum number of attempts.

    Returns:
        True when the file was downloaded completely, False when every attempt
        failed (or ``retries`` is 0). Errors are logged and printed rather
        than raised, so a caller that ignores the result keeps running.
    """
    part_file = f"{filename}.part"

    for i in range(retries):
        try:
            connector = aiohttp.TCPConnector(limit=10)
            async with aiohttp.ClientSession(timeout=timeout, headers=headers, connector=connector) as session:
                async with session.get(url) as response:

                    if response.status != 200:
                        logger.error("HTTP error", extra={"status": response.status, "url": url})
                        raise Exception(f"Failed: {response.status}")

                    # May be None when the server sends no Content-Length.
                    total = response.content_length

                    with Progress(
                        "[progress.description]{task.description}",
                        DownloadColumn(),
                        BarColumn(),
                        TransferSpeedColumn(),
                        TimeRemainingColumn(),
                        TimeElapsedColumn(),
                    ) as progress:

                        task = progress.add_task(
                            f"Downloading {os.path.basename(filename)}",
                            total=total
                        )

                        with open(part_file, "wb") as f:
                            async for chunk in response.content.iter_chunked(chunk_size):
                                try:
                                    f.write(chunk)
                                    progress.update(task, advance=len(chunk))
                                except Exception:
                                    logger.critical("failed to write chunks while downloading", exc_info=True)
                                    raise

            # Only a fully received body is promoted to the final name.
            os.replace(part_file, filename)
            print("Download complete:", filename)
            return True

        except Exception as e:
            print(f"Retry {i+1} failed:", e)
            # Discard whatever was written by the failed attempt.
            try:
                os.remove(part_file)
            except FileNotFoundError:
                pass
            # Exponential back-off between attempts, skipped after the last
            # one. asyncio.sleep yields to the event loop; time.sleep would
            # freeze every other task for the duration.
            if i + 1 < retries:
                await asyncio.sleep(2 ** i)

    # All attempts failed: report it explicitly instead of returning as if the
    # download had succeeded. The function still does not raise.
    logger.error("download failed after all retries", extra={"url": url, "retries": retries})
    return False

In [27]:
from pathlib import Path

async def download_tissue(tissue_name, output_path="."):
    """Download the TPM file of one tissue into ``output_path``.

    The tissue is looked up in the module-level ``tpm_df`` frame (columns
    ``tissue``, ``url`` and ``name``); when several rows match, the first one
    is used.

    Args:
        tissue_name: Value to match against ``tpm_df["tissue"]``.
        output_path: Directory to save into; created if it does not exist.

    Returns:
        The path of the downloaded file as a string, or None when the tissue
        is unknown or the download did not produce the file.
    """
    row = tpm_df.loc[tpm_df["tissue"] == tissue_name]

    if row.empty:
        print(f"Tissue '{tissue_name}' not found")
        return None

    url = row["url"].iloc[0]
    filename = row["name"].iloc[0]

    output_path = Path(output_path)
    output_path.mkdir(parents=True, exist_ok=True)

    out_file = output_path / filename

    result = await download_file(url, str(out_file))

    # download_file catches its own errors, so returning normally does not
    # prove the download worked. Treat an explicit False result or a missing
    # output file as failure rather than handing back a path to nothing.
    if result is False or not out_file.is_file():
        print(f"Download of '{tissue_name}' failed")
        return None

    return str(out_file)

In [28]:
# Download the "liver" file into the default output directory (".") and
# display the value download_tissue returns.
await download_tissue("liver")

Output()

'gene_tpm_v10_liver.gct.gz'