In [1]:
from pydantic import BaseModel, Field
from typing import List, Optional

In [21]:
import requests
from requests.exceptions import HTTPError
import json, time

import aiohttp
import asyncio
from aiohttp import ClientTimeout

In [3]:
import pandas as pd

In [4]:
class File(BaseModel):
    name: str
    format: Optional[str] = None # Maybe not optional??
    url: Optional[str] = None

In [5]:
class Project(BaseModel):
    projectTitle: Optional[List[Optional[str]]] = None
    laboratory: Optional[List[Optional[str]]] = None

In [6]:
class Sample(BaseModel):
    organ: Optional[List[Optional[str]]] = None
    disease: Optional[List[Optional[str]]] = None

In [7]:
class Hit(BaseModel):
    projects: List[Project]
    samples: Optional[List[Sample]] = None
    files: List[File]

In [38]:
filters = {
    "genusSpecies": {"is": ["Homo sapiens"]},
    "fileSource": {"is": ["DCP/2 Analysis"]},
    "fileFormat": {"is": ["loom"]},
    "isIntermediate": {"is": [False]} # hvy filter
}

In [39]:
size=100
catalog = "dcp54"

base_url = "https://service.azul.data.humancellatlas.org/index/files"

def fetch_all_pages(catalog=catalog, filters=filters, size=50):
    params = {"catalog": catalog, "filters": json.dumps(filters), "size": size}
    hits, url, page = [], base_url, 1 # empty list to store evrytg

    while url:
        print(f"page {page}")
        r = requests.get(url, params=params if url == base_url else None)
        # stat at this point
        if r.status_code != 200:
            print("error", r.status_code, r.text[:200])
            break

        data = r.json()
        if page == 1:
            facets = data.get("termFacets")
        hits += data.get("hits", [])
        url = data.get("pagination", {}).get("next")
        params = None
        page += 1

        time.sleep(0.2)

    return hits, facets

raw_hits, facets = fetch_all_pages()
hits = [Hit(**h) for h in raw_hits]
print("validated:", len(hits))

page 1
page 2
page 3
validated: 101


In [33]:
formats = facets["fileFormat"]["terms"]
pd.DataFrame(formats)

Unnamed: 0,term,count
0,fastq.gz,155749
1,bam,36553
2,bai,35573
3,fastq,16637
4,tsv.gz,4044
...,...,...
78,h5ad.zip,1
79,md5,1
80,son.gz,1
81,tab.gz,1


In [None]:
# df = pd.DataFrame(formats)
# fastq_df = df[df["term"].isin(["fastq.gz", "fastq"])]

In [None]:
# think of use for termfacets

In [53]:
rows = []
for hit in hits:
    for project in hit.projects or []:
        project_title = ", ".join(project.projectTitle or [])
        labs = project.laboratory or []
        lab_names = "; ".join(str(lab) for lab in labs if lab) or "N/A"

        # samples = hit.samples or []

        # if samples:
        #         organ = ", ".join(samples[0].organ or ["N/A"])
        #         disease = ", ".join(samples[0].disease or ["N/A"])
        # else:
        #         organ = "N/A"
        #         disease = "N/A"

for file in hit.files or []:
        # print(
        #     f"Project: {project_title}\n"
        #     f"Project lab: {lab_names}\n"
        #     #f"Organ: {organ}\n"
        #     #f"Disease: {disease}\n"
        #     f"File: {file.name}\n"
        #     f"URL: {file.url}\n"
        # )
        rows.append({
                "Project": project_title,
                "Project lab": lab_names,
                #"Organ": organ,
               #"Disease": disease,
                "File": file.name,
                "URL": file.url})
        
df =pd.DataFrame(rows)
df

Unnamed: 0,Project,Project lab,File,URL
0,Single-cell transcriptomics reveals unique features of human pancreatic islet cell subtypes,Human Cell Atlas Data Coordinaton Platform,unique-pancreatic-islets-human-pancreas-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/733c22aa-6f5a-5055-a147-000e4474b5ab?catalog=dcp54&version=2022-01-19T16%3A16%3A27.000000Z


In [54]:
rows = []

for hit in hits:
    for project in hit.projects or []:
        project_title = ", ".join(project.projectTitle or [])
        lab_names = "; ".join([lab for lab in (project.laboratory or []) if lab]) or "N/A"
        
        for file in hit.files or []:
            rows.append({
                "Project": project_title,
                "Project lab": lab_names,
                "File": file.name,
                "Url": file.url
            })

df = pd.DataFrame(rows)
df


Unnamed: 0,Project,Project lab,File,Url
0,Re-evaluation of human BDCA-2+ DC during acute sterile skin inflammation.,"Centre for Computational Biology; Department of Plastic and Reconstructive Surgery; Medical Research Council Human Immunology Unit, Radcliffe Department of Medicine",AcuteSkinInflammation-human-blood-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/feb5f419-da5c-58ac-b866-d5f1fa9e0548?catalog=dcp54&version=2022-01-20T20%3A13%3A43.000000Z
1,Re-evaluation of human BDCA-2+ DC during acute sterile skin inflammation.,"Centre for Computational Biology; Department of Plastic and Reconstructive Surgery; Medical Research Council Human Immunology Unit, Radcliffe Department of Medicine",AcuteSkinInflammation-human-skin-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/e17ad464-787e-5526-8b5d-f2fcea28cb06?catalog=dcp54&version=2022-01-19T14%3A08%3A08.000000Z
2,Dissecting the clonal nature of allelic expression in somatic cells by single-cell RNA-seq,Human Cell Atlas Data Coordination Platform; Sandberg,AllelicExpressionPatterns-human-hematopoeitic-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/d0e303fb-b679-5968-b814-1ad6c1e6db28?catalog=dcp54&version=2022-01-08T18%3A21%3A57.000000Z
3,Single cell RNA sequencing of multiple myeloma II,Human Cell Atlas Data Coordination Platform; Samsung Medical Center/Sungkyunkwan University School of Medicine,Bone-Marrow-Myeloma-human-hematopoeitic-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/3014ec47-1399-57ca-ab74-c8c296232b9e?catalog=dcp54&version=2021-10-14T20%3A19%3A06.000000Z
4,Precursors of human CD4+ cytotoxic T lymphocytes identified by single-cell transcriptome analysis,Division of Vaccine Discovery; Human Cell Atlas Data Coordination Platform; Molecular Atlas,CD4-cytotoxic-human-blood-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/ec0ad12e-9828-54a5-86bf-58a9dbb1e679?catalog=dcp54&version=2021-10-13T03%3A43%3A25.000000Z
5,Cryopreservation and post-thaw characterization of dissociated human islet cells,Human Cell Atlas Data Coordination Platform,CryoPancreatic-human-pancreas-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/2dc04008-2d62-5c3f-a6f1-c7cae7ac1009?catalog=dcp54&version=2022-01-08T16%3A18%3A42.000000Z
6,Single-cell RNA-seq reveals heterogeneity within human pre-cDCs,"Human Cell Atlas Data Coordination Platform; Immunology, Virology and Microbiology; Laboratory of Molecular Immunology",HumanDCsFromPre-cDCs-human-blood-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/43104b26-6f1f-5b14-8f6e-458d5ab05ea4?catalog=dcp54&version=2022-01-19T06%3A05%3A54.000000Z
7,Single cell analysis of human fetal liver captures the transcriptional profile of hepatobiliary hybrid progenitors,Centre of Stem Cell and Regenerative Medicine; Human Cell Atlas Data Coordination Platform,HumanDevoLiverSegalRashid-human-liver-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/52608937-5b23-5037-8d0e-8f00696e8bcb?catalog=dcp54&version=2022-01-11T18%3A39%3A29.000000Z
8,Single-Cell Transcriptomics of the Human Endocrine Pancreas,Human Cell Atlas Data Coordination Platform; School of Medicine,HumanEndocrinePancreas-human-pancreas-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/f50fbeda-ebb1-59db-ab01-5009313bd1cc?catalog=dcp54&version=2022-01-10T15%3A54%3A23.000000Z
9,Single cell RNA-sequencing of human tonsil Innate lymphoid cells (ILCs),Cell and Molecular Biology; Human Cell Atlas Data Coordination Platform,HumanInnateLymphoidCells-human-pancreas-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/6dcc3ab8-784e-5650-b500-16cc12fed239?catalog=dcp54&version=2022-01-11T14%3A45%3A26.000000Z


In [55]:
# accounting for multiple samples in hits

rows = []
for hit in hits:
    for project in hit.projects or []:
        project_title = ", ".join(project.projectTitle or [])
        labs = project.laboratory or []
        lab_names = "; ".join(str(lab) for lab in labs if lab) or "N/A"

        # collect all organs/diseases across samples
        samples = hit.samples or []
        if samples:
            organs = set()
            diseases = set()
            for s in samples:
                if getattr(s, "organ", None):
                    organs.update([str(o) for o in s.organ if o])
                if getattr(s, "disease", None):
                    diseases.update([str(d) for d in s.disease if d])
            organ = ", ".join(organs) if organs else "N/A"
            disease = ", ".join(diseases) if diseases else "N/A"
        else:
            organ = "N/A"
            disease = "N/A"

# Sasb4; in future add other metadata here
        for file in hit.files or []:
            rows.append({
                "Project": project_title,
                "Project lab": lab_names,
                "Organ": organ,
                "Disease": disease,
                "File": file.name,
                "Url": file.url
            })

df = pd.DataFrame(rows)
df

Unnamed: 0,Project,Project lab,Organ,Disease,File,Url
0,Re-evaluation of human BDCA-2+ DC during acute sterile skin inflammation.,"Centre for Computational Biology; Department of Plastic and Reconstructive Surgery; Medical Research Council Human Immunology Unit, Radcliffe Department of Medicine",blood,normal,AcuteSkinInflammation-human-blood-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/feb5f419-da5c-58ac-b866-d5f1fa9e0548?catalog=dcp54&version=2022-01-20T20%3A13%3A43.000000Z
1,Re-evaluation of human BDCA-2+ DC during acute sterile skin inflammation.,"Centre for Computational Biology; Department of Plastic and Reconstructive Surgery; Medical Research Council Human Immunology Unit, Radcliffe Department of Medicine",skin of body,normal,AcuteSkinInflammation-human-skin-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/e17ad464-787e-5526-8b5d-f2fcea28cb06?catalog=dcp54&version=2022-01-19T14%3A08%3A08.000000Z
2,Dissecting the clonal nature of allelic expression in somatic cells by single-cell RNA-seq,Human Cell Atlas Data Coordination Platform; Sandberg,hematopoietic system,normal,AllelicExpressionPatterns-human-hematopoeitic-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/d0e303fb-b679-5968-b814-1ad6c1e6db28?catalog=dcp54&version=2022-01-08T18%3A21%3A57.000000Z
3,Single cell RNA sequencing of multiple myeloma II,Human Cell Atlas Data Coordination Platform; Samsung Medical Center/Sungkyunkwan University School of Medicine,hematopoietic system,plasma cell myeloma,Bone-Marrow-Myeloma-human-hematopoeitic-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/3014ec47-1399-57ca-ab74-c8c296232b9e?catalog=dcp54&version=2021-10-14T20%3A19%3A06.000000Z
4,Precursors of human CD4+ cytotoxic T lymphocytes identified by single-cell transcriptome analysis,Division of Vaccine Discovery; Human Cell Atlas Data Coordination Platform; Molecular Atlas,blood,,CD4-cytotoxic-human-blood-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/ec0ad12e-9828-54a5-86bf-58a9dbb1e679?catalog=dcp54&version=2021-10-13T03%3A43%3A25.000000Z
5,Cryopreservation and post-thaw characterization of dissociated human islet cells,Human Cell Atlas Data Coordination Platform,pancreas,normal,CryoPancreatic-human-pancreas-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/2dc04008-2d62-5c3f-a6f1-c7cae7ac1009?catalog=dcp54&version=2022-01-08T16%3A18%3A42.000000Z
6,Single-cell RNA-seq reveals heterogeneity within human pre-cDCs,"Human Cell Atlas Data Coordination Platform; Immunology, Virology and Microbiology; Laboratory of Molecular Immunology",blood,,HumanDCsFromPre-cDCs-human-blood-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/43104b26-6f1f-5b14-8f6e-458d5ab05ea4?catalog=dcp54&version=2022-01-19T06%3A05%3A54.000000Z
7,Single cell analysis of human fetal liver captures the transcriptional profile of hepatobiliary hybrid progenitors,Centre of Stem Cell and Regenerative Medicine; Human Cell Atlas Data Coordination Platform,liver,normal,HumanDevoLiverSegalRashid-human-liver-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/52608937-5b23-5037-8d0e-8f00696e8bcb?catalog=dcp54&version=2022-01-11T18%3A39%3A29.000000Z
8,Single-Cell Transcriptomics of the Human Endocrine Pancreas,Human Cell Atlas Data Coordination Platform; School of Medicine,pancreas,"normal, type 2 diabetes mellitus, type 1 diabetes mellitus",HumanEndocrinePancreas-human-pancreas-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/f50fbeda-ebb1-59db-ab01-5009313bd1cc?catalog=dcp54&version=2022-01-10T15%3A54%3A23.000000Z
9,Single cell RNA-sequencing of human tonsil Innate lymphoid cells (ILCs),Cell and Molecular Biology; Human Cell Atlas Data Coordination Platform,tonsil,normal,HumanInnateLymphoidCells-human-pancreas-SS2.loom,https://service.azul.data.humancellatlas.org/repository/files/6dcc3ab8-784e-5650-b500-16cc12fed239?catalog=dcp54&version=2022-01-11T14%3A45%3A26.000000Z


In [56]:
df["Url"].head()

0    https://service.azul.data.humancellatlas.org/repository/files/feb5f419-da5c-58ac-b866-d5f1fa9e0548?catalog=dcp54&version=2022-01-20T20%3A13%3A43.000000Z
1    https://service.azul.data.humancellatlas.org/repository/files/e17ad464-787e-5526-8b5d-f2fcea28cb06?catalog=dcp54&version=2022-01-19T14%3A08%3A08.000000Z
2    https://service.azul.data.humancellatlas.org/repository/files/d0e303fb-b679-5968-b814-1ad6c1e6db28?catalog=dcp54&version=2022-01-08T18%3A21%3A57.000000Z
3    https://service.azul.data.humancellatlas.org/repository/files/3014ec47-1399-57ca-ab74-c8c296232b9e?catalog=dcp54&version=2021-10-14T20%3A19%3A06.000000Z
4    https://service.azul.data.humancellatlas.org/repository/files/ec0ad12e-9828-54a5-86bf-58a9dbb1e679?catalog=dcp54&version=2021-10-13T03%3A43%3A25.000000Z
Name: Url, dtype: object

In [57]:
# test for (user) specific filtering from the df
#df[df["Organ"] == "liver"].head()
## todo- create col. for project page.html
pd.set_option("display.max_colwidth", None)
df[df["Organ"] == "blood"]["Url"]

0     https://service.azul.data.humancellatlas.org/repository/files/feb5f419-da5c-58ac-b866-d5f1fa9e0548?catalog=dcp54&version=2022-01-20T20%3A13%3A43.000000Z
4     https://service.azul.data.humancellatlas.org/repository/files/ec0ad12e-9828-54a5-86bf-58a9dbb1e679?catalog=dcp54&version=2021-10-13T03%3A43%3A25.000000Z
6     https://service.azul.data.humancellatlas.org/repository/files/43104b26-6f1f-5b14-8f6e-458d5ab05ea4?catalog=dcp54&version=2022-01-19T06%3A05%3A54.000000Z
11    https://service.azul.data.humancellatlas.org/repository/files/f02fb30a-1f32-50d0-a3d5-95051ffa5899?catalog=dcp54&version=2022-01-19T14%3A15%3A32.000000Z
15    https://service.azul.data.humancellatlas.org/repository/files/95f5f493-304f-5cec-a776-b9e72b97d6c2?catalog=dcp54&version=2021-10-14T16%3A54%3A34.000000Z
26    https://service.azul.data.humancellatlas.org/repository/files/8464fdc4-9927-56d7-a244-08a4a0de16c9?catalog=dcp54&version=2021-11-09T21%3A28%3A56.000000Z
Name: Url, dtype: object

In [58]:
## Extract Url
urls = df["Url"].tolist()

In [37]:
## Parallel download
### Use pooch
### async+aiohttp

In [38]:
import aiohttp
import asyncio

# asynchronous func.
async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

connector = aiohttp.TCPConnector(limit=10) # no overloading req.

# Here plug the urls from above
async def main():

    # instantiating a shared session
    async with aiohttp.ClientSession(connector=connector) as session:
        tasks = [fetch(session, url) for url in urls]
        responses = await asyncio.gather(*tasks)
        
        # validate or do smtg else
        for url, response in zip(urls, responses):
            print(f"URL: {url}\nResponse length: {len(response)}\n")

#asyncio.run(main()) ## commented due to weird jupyter behaviour
await main()

CancelledError: 

In [59]:
async def fetch(session, url):
    async with session.head(url) as response:
        return url, response.status

connector = aiohttp.TCPConnector(limit=10) # no overloading req.

async def main():

    async with aiohttp.ClientSession(connector=connector) as session:
        tasks = [fetch(session, url) for url in urls]
        responses = await asyncio.gather(*tasks)

        for url, status in responses:
            print(f"{url} = {status}")

    
await main()

https://service.azul.data.humancellatlas.org/repository/files/feb5f419-da5c-58ac-b866-d5f1fa9e0548?catalog=dcp54&version=2022-01-20T20%3A13%3A43.000000Z = 403
https://service.azul.data.humancellatlas.org/repository/files/e17ad464-787e-5526-8b5d-f2fcea28cb06?catalog=dcp54&version=2022-01-19T14%3A08%3A08.000000Z = 403
https://service.azul.data.humancellatlas.org/repository/files/d0e303fb-b679-5968-b814-1ad6c1e6db28?catalog=dcp54&version=2022-01-08T18%3A21%3A57.000000Z = 403
https://service.azul.data.humancellatlas.org/repository/files/3014ec47-1399-57ca-ab74-c8c296232b9e?catalog=dcp54&version=2021-10-14T20%3A19%3A06.000000Z = 403
https://service.azul.data.humancellatlas.org/repository/files/ec0ad12e-9828-54a5-86bf-58a9dbb1e679?catalog=dcp54&version=2021-10-13T03%3A43%3A25.000000Z = 403
https://service.azul.data.humancellatlas.org/repository/files/2dc04008-2d62-5c3f-a6f1-c7cae7ac1009?catalog=dcp54&version=2022-01-08T16%3A18%3A42.000000Z = 403
https://service.azul.data.humancellatlas.org/r

In [None]:
# authentication needed # dwld later

In [23]:
import aiohttp
import asyncio

async def download_file(url, filename):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            data = await response.read()
            with open(filename, "wb") as f:
                chunk_size = 4096 #adjust?
                async for data in response.content.iter_chunked(chunk_size):
                    f.write(data)

await(download_file("https://unsplash.com/photos/SLgCDEqHav4/download?ixid=M3wxMjA3fDF8MXxhbGx8MXx8fHx8fHx8MTc2MjMzODcwOXw&force=true", "/Users/aman/Desktop/unsplash_image.jpg"))

In [25]:
import aiohttp
import asyncio
from aiohttp import ClientTimeout

timeout = ClientTimeout(total=None)
headers = {"User-Agent": "Mozilla/5.0"}
connector = aiohttp.TCPConnector(limit=10)

#1 mb chunks
async def download_file(url, filename, chunk_size=1024*1024):
    async with aiohttp.ClientSession(timeout=timeout, headers=headers, connector=connector) as session:
        async with session.get(url) as response:
            if response.status != 200:
                raise Exception(f"Failed: {response.status}")
            
            with open(filename, "wb") as f:
                async for chunk in response.content.iter_chunked(chunk_size):
                    f.write(chunk)

await(download_file("https://unsplash.com/photos/zJD6UVVW_Ck/download?ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzYyMzQxODUxfA&force=true", "/Users/aman/Desktop/unsplash_image.jpg"))


In [46]:
# test with metadata loom file
from aiohttp import ClientTimeout
timeout = ClientTimeout(total=None)
headers = {"User-Agent": "Mozilla/5.0"}

#1 mb chunks
async def download_file(url, filename, chunk_size=1024*1024, retries=3):
    for i in range(retries):
        try:
            connector = aiohttp.TCPConnector(limit=10)
            async with aiohttp.ClientSession(timeout=timeout, headers=headers, connector=connector) as session:
                async with session.get(url) as response:
                    if response.status != 200:
                        raise Exception(f"Failed: {response.status}")
                    
                    with open(filename, "wb") as f:
                        async for chunk in response.content.iter_chunked(chunk_size):
                            f.write(chunk)
            print("Download complete:", filename)
            return
        except Exception as e:
            print(f"Retry {i+1} failed:", e)
            time.sleep(2 ** i) # adjust

await(download_file("https://unsplash.com/photos/I33IeaCFPRY/download?ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzYyMzUyNDMyfA&force=true", "/Users/aman/Desktop/unsplash2.jpg"))

Download complete: /Users/aman/Desktop/unsplash2.jpg
