diff --git a/coderdata/dataset.yml b/coderdata/dataset.yml
index 62da5f42..a7bf8457 100644
--- a/coderdata/dataset.yml
+++ b/coderdata/dataset.yml
@@ -1,5 +1,5 @@
-figshare: https://api.figshare.com/v2/articles/29923646
-version: 2.2.0
+figshare: https://api.figshare.com/v2/articles/29923646/files?page=1&page_size=500
+version: 2.2.1
 datasets:
   beataml:
     description: Beat acute myeloid leukemia (BeatAML) focuses on acute myeloid leukemia
diff --git a/coderdata/download/downloader.py b/coderdata/download/downloader.py
index 1facc076..552cd9ba 100644
--- a/coderdata/download/downloader.py
+++ b/coderdata/download/downloader.py
@@ -7,8 +7,59 @@ import os
 import requests
 import warnings
-
 import yaml
+from typing import Iterable, List, Dict, Any, Optional
+
+
+
+def _gather_files_from_response(resp: requests.Response) -> List[Dict[str, Any]]:
+    """
+    Normalize Figshare API responses into a list of file dicts.
+
+    Supports:
+    1) Article endpoint: https://api.figshare.com/v2/articles/{id}
+       -> JSON object with key 'files' (list)
+
+    2) Files endpoint: https://api.figshare.com/v2/articles/{id}/files[?...]
+       -> JSON list of file objects (possibly paginated with Link headers)
+    """
+    data = resp.json()
+    if isinstance(data, dict) and "files" in data and isinstance(data["files"], list):
+        return data["files"]
+    if isinstance(data, list):
+        return data
+    raise ValueError("Unexpected Figshare API response structure; expected dict with 'files' "
+                     "or a list of file objects.")
+
+
+def _iter_paginated_files(url: str, session: Optional[requests.Session] = None) -> Iterable[Dict[str, Any]]:
+    """
+    Iterate over all files, following 'Link: <...>; rel="next"' pagination if present.
+    Works for both the article endpoint (no pagination) and the files endpoint (may paginate).
+    """
+    sess = session or requests.Session()
+    next_url = url
+
+    while next_url:
+        resp = sess.get(next_url)
+        if resp.status_code != 200:
+            raise Exception(f"Failed to get dataset details from Figshare: {resp.text}")
+
+        for f in _gather_files_from_response(resp):
+            yield f
+
+        # RFC5988-style 'Link' header pagination
+        link = resp.headers.get("Link") or resp.headers.get("link")
+        next_url = None
+        if link:
+            parts = [p.strip() for p in link.split(",")]
+            for part in parts:
+                if 'rel="next"' in part:
+                    start = part.find("<") + 1
+                    end = part.find(">", start)
+                    if start > 0 and end > start:
+                        next_url = part[start:end]
+                    break
 
 
 def download(
     name: str='all',
@@ -46,81 +97,73 @@ def download(
     local_path = Path(local_path)
     if not local_path.exists():
-        Path.mkdir(local_path)
+        local_path.mkdir(parents=True, exist_ok=True)
 
     # Get the dataset details
     with resources.open_text('coderdata', 'dataset.yml') as f:
         data_information = yaml.load(f, Loader=yaml.FullLoader)
     url = data_information['figshare']
-
-    response = requests.get(url)
-    if response.status_code != 200:
-        raise Exception(
-            f"Failed to get dataset details from Figshare: {response.text}"
-        )
-
-    data = response.json()
     # making sure that we are case insensitive
-    name = name.casefold()
+    name = (name or "all").casefold()
+    session = requests.Session()
+    all_files = list(_iter_paginated_files(url, session=session))
 
-    # Filter files by the specified prefix
     if name != "all":
         filtered_files = [
-            file
-            for file
-            in data['files']
-            if file['name'].startswith(name) or 'genes' in file['name']
-            ]
+            f for f in all_files
+            if (f.get('name', '').casefold().startswith(name)) or ('genes' in f.get('name', '').casefold())
+        ]
     else:
-        filtered_files = data['files']
+        filtered_files = all_files
 
-    # Group files by name and select the one with the highest ID
     unique_files = {}
     for file in filtered_files:
-        file_name = local_path.joinpath(file['name'])
-        file_id = file['id']
-        if (
-            file_name not in unique_files
-            or file_id > unique_files[file_name]['id']
-        ):
-            unique_files[file_name] = {'file_info': file, 'id': file_id}
+        fname = file.get('name')
+        fid = file.get('id')
+        if fname is None or fid is None:
+            continue
+        file_name = local_path.joinpath(fname)
+        if (file_name not in unique_files) or (fid > unique_files[file_name]['id']):
+            unique_files[file_name] = {'file_info': file, 'id': fid}
 
     for file_name, file_data in unique_files.items():
         file_info = file_data['file_info']
         file_id = str(file_info['id'])
-        file_url = "https://api.figshare.com/v2/file/download/" + file_id
-        file_md5sum = file_info['supplied_md5']
+        file_url = f"https://api.figshare.com/v2/file/download/{file_id}"
+        file_md5sum = file_info.get('supplied_md5')
+
+        if file_name.exists() and not exist_ok:
+            warnings.warn(
+                f"{file_name} already exists. Use argument 'exist_ok=True' to overwrite the existing file."
+            )
+
         retry_count = 10
-        # Download the file
         while retry_count > 0:
-            with requests.get(file_url, stream=True) as r:
+            with session.get(file_url, stream=True) as r:
                 r.raise_for_status()
-                if file_name.exists() and not exist_ok:
-                    warnings.warn(
-                        f"{file_name} already exists. Use argument 'exist_ok=True'"
-                        "to overwrite existing file."
-                    )
+                with open(file_name, 'wb') as f:
+                    for chunk in r.iter_content(chunk_size=8192):
+                        f.write(chunk)
+
+            if file_md5sum:
+                with open(file_name, 'rb') as f:
+                    check_md5sum = md5(f.read()).hexdigest()
+                if file_md5sum == check_md5sum:
+                    break
                 else:
-                    with open(file_name, 'wb') as f:
-                        for chunk in r.iter_content(chunk_size=8192):
-                            f.write(chunk)
-                    with open(file_name, 'rb') as f:
-                        check_md5sum = md5(f.read()).hexdigest()
-                    if file_md5sum == check_md5sum:
+                    retry_count -= 1
+                    if retry_count > 0:
+                        warnings.warn(
+                            f"{file_name} failed MD5 verification "
+                            f"(expected: {file_md5sum}, got: {check_md5sum}). Retrying..."
+                        )
+            else:
                 break
-            elif retry_count > 0:
-                warnings.warn(
-                    f"{file_name} could not be downloaded successfully. "
-                    f"(expected md5sum: {file_md5sum} - "
-                    f"calculated md5sum: {check_md5sum})... retrying..."
-                )
-                retry_count = retry_count - 1
-        if retry_count == 0:
+
+        if retry_count == 0 and file_md5sum:
             warnings.warn(
-                f"{file_name} could not be downloaded. Try again."
-            )
+                f"{file_name} could not be downloaded with a matching MD5 after retries."
+            )
         else:
             print(f"Downloaded '{file_url}' to '{file_name}'")
-    return
diff --git a/scripts/push_to_figshare.py b/scripts/push_to_figshare.py
index e17735c3..4ee464b3 100644
--- a/scripts/push_to_figshare.py
+++ b/scripts/push_to_figshare.py
@@ -197,7 +197,7 @@ def write_figshare_details_to_yaml(article_id, project_id, title, version):
     # update dataset.yml
     with open("coderdata/dataset.yml", "r") as f:
         data = yaml.safe_load(f)
-    data["figshare"] = f"https://api.figshare.com/v2/articles/{article_id}"
+    data["figshare"] = f"https://api.figshare.com/v2/articles/{article_id}/files?page=1&page_size=500"
     data["version"] = version
     with open('/tmp/dataset.yml', 'w') as f:
         yaml.safe_dump(data, f, sort_keys=False)
@@ -232,7 +232,12 @@ def write_figshare_details_to_yaml(article_id, project_id, title, version):
             remote_file_info = get_remote_file_info(article_id, file_name)
             if remote_file_info:
                 local_md5, local_size = get_file_check_data(file_path)
-                if remote_file_info['size'] != local_size or remote_file_info['computed_md5'] != local_md5:
+                remote_md5 = (
+                    remote_file_info.get('computed_md5')
+                    or remote_file_info.get('md5')
+                    or remote_file_info.get('supplied_md5')
+                )
+                if remote_file_info.get('size') != local_size or remote_md5 != local_md5:
                     print(f"Updating file {file_name} in Figshare...")
                     delete_existing_file(article_id, remote_file_info['id'])
                     file_info = initiate_new_upload(article_id, file_path)
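A minimal usage sketch of the patched downloader, not part of the patch itself. It assumes the changes above are applied and installed with the coderdata package, that download still accepts local_path and exist_ok keyword arguments (both are referenced in the function body), and that 'beataml' remains one of the dataset prefixes declared in coderdata/dataset.yml; the import path mirrors the module touched in this diff.

    from pathlib import Path

    from coderdata.download.downloader import download

    # Fetches every file whose name starts with 'beataml' (plus the shared 'genes'
    # files) from the Figshare files endpoint configured in dataset.yml, following
    # Link-header pagination when the listing spans multiple pages, and verifying
    # each download against its supplied MD5 before accepting it.
    download(name="beataml", local_path=Path("./data"), exist_ok=True)

Passing exist_ok=True suppresses the already-exists warning on repeated runs; with the default behavior a warning is emitted for any file that is already present in local_path.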