4 changes: 2 additions & 2 deletions coderdata/dataset.yml
@@ -1,5 +1,5 @@
figshare: https://api.figshare.com/v2/articles/29923646
version: 2.2.0
figshare: https://api.figshare.com/v2/articles/29923646/files?page=1&page_size=500
version: 2.2.1
datasets:
beataml:
description: Beat acute myeloid leukemia (BeatAML) focuses on acute myeloid leukemia
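For context on this URL change: the plain article endpoint returns a JSON object with a 'files' key, while the new files endpoint returns a bare list of file records that may be paginated via Link headers — which is exactly what the new helpers in downloader.py below normalize. A minimal sketch of probing both shapes, assuming only those documented response structures:

import requests

ARTICLE_URL = "https://api.figshare.com/v2/articles/29923646"
FILES_URL = "https://api.figshare.com/v2/articles/29923646/files?page=1&page_size=500"

def peek(url: str) -> None:
    # Fetch the endpoint and report which of the two known response shapes came back.
    data = requests.get(url, timeout=30).json()
    if isinstance(data, dict):
        print(f"{url} -> article object with {len(data.get('files', []))} file entries")
    else:
        print(f"{url} -> list of {len(data)} file records")

peek(ARTICLE_URL)  # object with a 'files' list
peek(FILES_URL)    # bare list; Link-header pagination may apply beyond one page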
149 changes: 96 additions & 53 deletions coderdata/download/downloader.py
@@ -7,8 +7,59 @@
import os
import requests
import warnings

import yaml
from typing import Iterable, List, Dict, Any, Optional



def _gather_files_from_response(resp: requests.Response) -> List[Dict[str, Any]]:
"""
Normalize Figshare API responses into a list of file dicts.

Supports:
1) Article endpoint: https://api.figshare.com/v2/articles/{id}
-> JSON object with key 'files' (list)

2) Files endpoint: https://api.figshare.com/v2/articles/{id}/files[?...]
-> JSON list of file objects (possibly paginated with Link headers)
"""
data = resp.json()
if isinstance(data, dict) and "files" in data and isinstance(data["files"], list):
return data["files"]
if isinstance(data, list):
return data
raise ValueError("Unexpected Figshare API response structure; expected dict with 'files' "
"or a list of file objects.")


def _iter_paginated_files(url: str, session: Optional[requests.Session] = None) -> Iterable[Dict[str, Any]]:
"""
Iterate over all files, following 'Link: <...>; rel=\"next\"' pagination if present.
Works for both the article endpoint (no pagination) and the files endpoint (may paginate).
"""
sess = session or requests.Session()
next_url = url

while next_url:
resp = sess.get(next_url)
if resp.status_code != 200:
raise Exception(f"Failed to get dataset details from Figshare: {resp.text}")

for f in _gather_files_from_response(resp):
yield f

# RFC5988-style 'Link' header pagination
link = resp.headers.get("Link") or resp.headers.get("link")
next_url = None
if link:
parts = [p.strip() for p in link.split(",")]
for part in parts:
if 'rel="next"' in part:
start = part.find("<") + 1
end = part.find(">", start)
if start > 0 and end > start:
next_url = part[start:end]
break

def download(
name: str='all',
@@ -46,81 +97,73 @@ def download(
local_path = Path(local_path)

if not local_path.exists():
Path.mkdir(local_path)
local_path.mkdir(parents=True, exist_ok=True)
# Get the dataset details
with resources.open_text('coderdata', 'dataset.yml') as f:
data_information = yaml.load(f, Loader=yaml.FullLoader)
url = data_information['figshare']

response = requests.get(url)
if response.status_code != 200:
raise Exception(
f"Failed to get dataset details from Figshare: {response.text}"
)

data = response.json()

# making sure that we are case insensitive
name = name.casefold()
name = (name or "all").casefold()
session = requests.Session()
all_files = list(_iter_paginated_files(url, session=session))

# Filter files by the specified prefix
if name != "all":
filtered_files = [
file
for file
in data['files']
if file['name'].startswith(name) or 'genes' in file['name']
]
f for f in all_files
if (f.get('name', '').casefold().startswith(name)) or ('genes' in f.get('name', '').casefold())
]
else:
filtered_files = data['files']
filtered_files = all_files

# Group files by name and select the one with the highest ID
unique_files = {}
for file in filtered_files:
file_name = local_path.joinpath(file['name'])
file_id = file['id']
if (
file_name not in unique_files
or file_id > unique_files[file_name]['id']
):
unique_files[file_name] = {'file_info': file, 'id': file_id}
fname = file.get('name')
fid = file.get('id')
if fname is None or fid is None:
continue
file_name = local_path.joinpath(fname)
if (file_name not in unique_files) or (fid > unique_files[file_name]['id']):
unique_files[file_name] = {'file_info': file, 'id': fid}

for file_name, file_data in unique_files.items():
file_info = file_data['file_info']
file_id = str(file_info['id'])
file_url = "https://api.figshare.com/v2/file/download/" + file_id
file_md5sum = file_info['supplied_md5']
file_url = f"https://api.figshare.com/v2/file/download/{file_id}"
file_md5sum = file_info.get('supplied_md5')

if file_name.exists() and not exist_ok:
warnings.warn(
f"{file_name} already exists. Use argument 'exist_ok=True' to overwrite the existing file."
)

retry_count = 10
# Download the file
while retry_count > 0:
with requests.get(file_url, stream=True) as r:
with session.get(file_url, stream=True) as r:
r.raise_for_status()
if file_name.exists() and not exist_ok:
warnings.warn(
f"{file_name} already exists. Use argument 'exist_ok=True'"
"to overwrite existing file."
)
with open(file_name, 'wb') as f:
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)

if file_md5sum:
with open(file_name, 'rb') as f:
check_md5sum = md5(f.read()).hexdigest()
if file_md5sum == check_md5sum:
break
else:
with open(file_name, 'wb') as f:
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)
with open(file_name, 'rb') as f:
check_md5sum = md5(f.read()).hexdigest()
if file_md5sum == check_md5sum:
retry_count -= 1
if retry_count > 0:
warnings.warn(
f"{file_name} failed MD5 verification "
f"(expected: {file_md5sum}, got: {check_md5sum}). Retrying..."
)
else:
break
elif retry_count > 0:
warnings.warn(
f"{file_name} could not be downloaded successfully. "
f"(expected md5sum: {file_md5sum} - "
f"calculated md5sum: {check_md5sum})... retrying..."
)
retry_count = retry_count - 1
if retry_count == 0:

if retry_count == 0 and file_md5sum:
warnings.warn(
f"{file_name} could not be downloaded. Try again."
)
f"{file_name} could not be downloaded with a matching MD5 after retries."
)
else:
print(f"Downloaded '{file_url}' to '{file_name}'")

return

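With the pagination-aware helpers in place, the public download() entry point is called the same way as before. A minimal usage sketch, assuming the signature shown in the hunk above (name, local_path, exist_ok) and that the function is importable from this module in the installed package:

from coderdata.download.downloader import download

# Fetch only the BeatAML files (plus the shared gene tables, per the name filter)
# into ./data, overwriting any files that are already present.
download(name="beataml", local_path="./data", exist_ok=True)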
9 changes: 7 additions & 2 deletions scripts/push_to_figshare.py
@@ -197,7 +197,7 @@ def write_figshare_details_to_yaml(article_id, project_id, title, version):
# update dataset.yml
with open("coderdata/dataset.yml", "r") as f:
data = yaml.safe_load(f)
data["figshare"] = f"https://api.figshare.com/v2/articles/{article_id}"
data["figshare"] = f"https://api.figshare.com/v2/articles/{article_id}/files?page=1&page_size=500"
data["version"] = version
with open('/tmp/dataset.yml', 'w') as f:
yaml.safe_dump(data, f, sort_keys=False)
@@ -232,7 +232,12 @@ def write_figshare_details_to_yaml(article_id, project_id, title, version):
remote_file_info = get_remote_file_info(article_id, file_name)
if remote_file_info:
local_md5, local_size = get_file_check_data(file_path)
if remote_file_info['size'] != local_size or remote_file_info['computed_md5'] != local_md5:
remote_md5 = (
remote_file_info.get('computed_md5')
or remote_file_info.get('md5')
or remote_file_info.get('supplied_md5')
)
if remote_file_info.get('size') != local_size or remote_md5 != local_md5:
print(f"Updating file {file_name} in Figshare...")
delete_existing_file(article_id, remote_file_info['id'])
file_info = initiate_new_upload(article_id, file_path)
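The comparison above relies on get_file_check_data() returning the local file's MD5 and size; its body is not shown in this diff. A hypothetical sketch of what such a helper could look like (not necessarily the script's actual implementation):

from hashlib import md5
from pathlib import Path

def get_file_check_data(file_path: str) -> tuple[str, int]:
    # Hypothetical sketch: hash the file in chunks so large artifacts do not
    # need to be held in memory, and return (md5_hexdigest, size_in_bytes).
    path = Path(file_path)
    digest = md5()
    with path.open("rb") as fh:
        for chunk in iter(lambda: fh.read(8192), b""):
            digest.update(chunk)
    return digest.hexdigest(), path.stat().st_size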