diff --git a/.github/workflows/update-model-info.yml b/.github/update-model-info.yml similarity index 85% rename from .github/workflows/update-model-info.yml rename to .github/update-model-info.yml index 13639ec2..1eda2a98 100644 --- a/.github/workflows/update-model-info.yml +++ b/.github/update-model-info.yml @@ -2,7 +2,7 @@ name: update-model-info on: schedule: - - cron: "0 10 * * *" # 10:00, everyday + # - cron: "0 10 * * *" # 10:00, everyday # Allows you to run this workflow manually from the Actions tab workflow_dispatch: @@ -20,7 +20,7 @@ jobs: python -m pip install -r requirements-dev.txt changes=$(git diff --name-only HEAD^..HEAD -- models) if [ ! -z "$changes" ]; then - python $(pwd)/ci/update_model_info.py --f "$changes" + python $(pwd)/ci/update_model_info_deparate.py --f "$changes" fi env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/ci/update_model_info.py b/ci/update_model_info.py index 9ba1baff..decf6c66 100644 --- a/ci/update_model_info.py +++ b/ci/update_model_info.py @@ -16,14 +16,14 @@ from utils import ( compress_bundle, - create_pull_request, download_large_files, get_changed_bundle_list, get_checksum, + get_existing_bundle_list, get_hash_func, get_json_dict, - push_new_model_info_branch, save_model_info, + submit_pull_request, upload_bundle, ) @@ -74,15 +74,25 @@ def update_model_info( checksum = get_checksum(dst_path=zipfile_path, hash_func=hash_func) # step 3 + # check if uploading a new bundle + model_info_path = os.path.join(models_path, model_info_file) + model_info = get_json_dict(model_info_path) + existing_bundle_list = get_existing_bundle_list(model_info) + exist_flag = False + if bundle_name in existing_bundle_list: + exist_flag = True try: - source = upload_bundle(bundle_zip_file_path=zipfile_path, bundle_zip_filename=bundle_zip_name) + source = upload_bundle( + bundle_name=bundle_name, + version=latest_version, + root_path=temp_dir, + bundle_zip_name=bundle_zip_name, + exist_flag=exist_flag, + ) except Exception as e: return (False, f"Upload bundle error: {e}") # step 4 - model_info_path = os.path.join(models_path, model_info_file) - model_info = get_json_dict(model_info_path) - if bundle_name_with_version not in model_info.keys(): model_info[bundle_name_with_version] = {"checksum": "", "source": ""} @@ -105,6 +115,7 @@ def main(changed_dirs): bundle_list = get_changed_bundle_list(changed_dirs) models_path = "models" model_info_file = "model_info.json" + if len(bundle_list) > 0: for bundle in bundle_list: # create a temporary copy of the bundle for further processing @@ -120,8 +131,8 @@ def main(changed_dirs): raise AssertionError(f"update bundle: {bundle} failed. {msg}") # push a new branch that contains the updated model_info.json - branch_name = push_new_model_info_branch(model_info_path=os.path.join(models_path, model_info_file)) - create_pull_request(branch_name) + submit_pull_request(model_info_path=os.path.join(models_path, model_info_file)) + print("a pull request with updated model info is submitted.") else: print(f"all changed files: {changed_dirs} are not related to any existing bundles, skip updating.") diff --git a/ci/update_model_info_deparate.py b/ci/update_model_info_deparate.py new file mode 100644 index 00000000..5f4756f9 --- /dev/null +++ b/ci/update_model_info_deparate.py @@ -0,0 +1,134 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import shutil +import tempfile + +from utils_deparate import ( + compress_bundle, + create_pull_request, + download_large_files, + get_changed_bundle_list, + get_checksum, + get_hash_func, + get_json_dict, + push_new_model_info_branch, + save_model_info, + upload_bundle, +) + + +def update_model_info( + bundle_name: str, temp_dir: str, models_path: str = "models", model_info_file: str = "model_info.json" +): + """ + For a changed model (bundle), this function is used to do the following steps in order to update it: + + 1. download large files (if having the corresponding config file) into the copy. + 2. compress the copy. + 3. upload a compressed copy. + 4. update `model_info_file`. + + Returns: + a 2-tuple. + If update successful, the form is (True,""). + If update failed, the form is (False, "error reason") + """ + temp_path = os.path.join(temp_dir, bundle_name) + shutil.copytree(os.path.join(models_path, bundle_name), temp_path) + # step 1 + try: + for large_file_type in [".yml", ".yaml", ".json"]: + large_file_name = "large_files" + large_file_type + large_file_path = os.path.join(temp_path, large_file_name) + if os.path.exists(large_file_path): + download_large_files(bundle_path=temp_path, large_file_name=large_file_name) + # remove the large file config + os.remove(large_file_path) + except Exception as e: + return (False, f"Download large files error: {e}") + + # step 2 + bundle_metadata_path = os.path.join(temp_path, "configs/metadata.json") + metadata = get_json_dict(bundle_metadata_path) + latest_version = metadata["version"] + bundle_zip_name = f"{bundle_name}_v{latest_version}.zip" + bundle_name_with_version = f"{bundle_name}_v{latest_version}" + zipfile_path = os.path.join(temp_dir, bundle_zip_name) + try: + compress_bundle(root_path=temp_dir, bundle_name=bundle_name, bundle_zip_name=bundle_zip_name) + except Exception as e: + return (False, f"Compress bundle error: {e}") + + hash_func = get_hash_func(hash_type="sha1") + checksum = get_checksum(dst_path=zipfile_path, hash_func=hash_func) + + # step 3 + try: + source = upload_bundle(bundle_zip_file_path=zipfile_path, bundle_zip_filename=bundle_zip_name) + except Exception as e: + return (False, f"Upload bundle error: {e}") + + # step 4 + model_info_path = os.path.join(models_path, model_info_file) + model_info = get_json_dict(model_info_path) + + if bundle_name_with_version not in model_info.keys(): + model_info[bundle_name_with_version] = {"checksum": "", "source": ""} + + model_info[bundle_name_with_version]["checksum"] = checksum + model_info[bundle_name_with_version]["source"] = source + + save_model_info(model_info, model_info_path) + return (True, "update successful") + + +def main(changed_dirs): + """ + main function to process all changed files. It will do the following steps: + + 1. according to changed directories, get changed bundles. + 2. update each bundle. + 3. according to the update results, push changed model_info_file if needed. + + """ + bundle_list = get_changed_bundle_list(changed_dirs) + models_path = "models" + model_info_file = "model_info.json" + if len(bundle_list) > 0: + for bundle in bundle_list: + # create a temporary copy of the bundle for further processing + temp_dir = tempfile.mkdtemp() + update_state, msg = update_model_info( + bundle_name=bundle, temp_dir=temp_dir, models_path=models_path, model_info_file=model_info_file + ) + shutil.rmtree(temp_dir) + + if update_state is True: + print(f"update bundle: {bundle} successful.") + else: + raise AssertionError(f"update bundle: {bundle} failed. {msg}") + + # push a new branch that contains the updated model_info.json + branch_name = push_new_model_info_branch(model_info_path=os.path.join(models_path, model_info_file)) + create_pull_request(branch_name) + else: + print(f"all changed files: {changed_dirs} are not related to any existing bundles, skip updating.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="") + parser.add_argument("-f", "--f", type=str, help="changed files.") + args = parser.parse_args() + changed_dirs = args.f.splitlines() + main(changed_dirs) diff --git a/ci/utils.py b/ci/utils.py index 22182192..ef4806e9 100644 --- a/ci/utils.py +++ b/ci/utils.py @@ -13,10 +13,12 @@ import hashlib import json import os +import re import shutil import subprocess from typing import List +from github import Github from monai.apps.utils import download_url from monai.bundle.config_parser import ConfigParser from monai.utils import look_up_option @@ -120,27 +122,32 @@ def get_latest_version(bundle_name: str, model_info_path: str): return sorted(versions)[-1] -def push_new_model_info_branch(model_info_path: str): - email = os.environ["email"] - username = os.environ["username"] - +def submit_pull_request(model_info_path: str): + # set required info for a pull request branch_name = "auto-update-model-info" - create_push_cmd = f"git checkout -b {branch_name}; git push --set-upstream origin {branch_name}" - - git_config = f"git config user.email {email}; git config user.name {username}" - commit_message = "git commit -m 'auto update model_info'" - full_cmd = f"{git_config}; git add {model_info_path}; {commit_message}; {create_push_cmd}" - - call_status = subprocess.run(full_cmd, shell=True) - call_status.check_returncode() - - return branch_name - - -def create_pull_request(branch_name: str, pr_title: str = "'auto update model_info [skip ci]'"): - create_command = f"gh pr create --fill --title {pr_title} --base dev --head {branch_name}" - call_status = subprocess.run(create_command, shell=True) - call_status.check_returncode() + pr_title = "auto update model_info [skip ci]" + pr_description = "This PR is automatically created to update model_info.json" + commit_message = "auto update model_info" + repo_file_path = "models/model_info.json" + # authenticate with Github CLI + github_token = os.environ["GITHUB_TOKEN"] + repo_name = "Project-MONAI/model-zoo" + g = Github(github_token) + # create new branch + repo = g.get_repo(repo_name) + default_branch = repo.default_branch + new_branch = repo.create_git_ref(ref=f"refs/heads/{branch_name}", sha=repo.get_branch(default_branch).commit.sha) + # push changes + model_info = get_json_dict(model_info_path) + repo.update_file( + path=repo_file_path, + message=commit_message, + content=json.dumps(model_info), + sha=repo.get_contents(repo_file_path, ref=default_branch).sha, + branch=new_branch.ref, + ) + # create PR + repo.create_pull(title=pr_title, body=pr_description, head=new_branch.ref, base=default_branch) def compress_bundle(root_path: str, bundle_name: str, bundle_zip_name: str): @@ -156,15 +163,71 @@ def get_checksum(dst_path: str, hash_func): return hash_func.hexdigest() +def split_bundle_name_version(bundle_name: str): + pattern_version = re.compile(r"^(.+)\_v(\d.*)$") + matched_result = pattern_version.match(bundle_name) + if matched_result is not None: + b_name, b_version = matched_result.groups() + return b_name, b_version + raise ValueError(f"{bundle_name} does not meet the naming format.") + + +def get_existing_bundle_list(model_info): + all_bundle_names = [] + for k in model_info.keys(): + bundle_name, _ = split_bundle_name_version(k) + if bundle_name not in all_bundle_names: + all_bundle_names.append(bundle_name) + return all_bundle_names + + +def create_bundle_to_ngc(bundle_name: str, org_name: str): + options = "--short-desc '' --application '' --format '' --framework MONAI --precision ''" + # models in NGC need to be lowercase + ngc_create_cmd = f"ngc registry model create {org_name}/{bundle_name.lower()} {options}" + try: + _ = subprocess.run(ngc_create_cmd, shell=True, check=True, stderr=subprocess.PIPE) + except subprocess.CalledProcessError as e: + msg = e.stderr.decode("utf-8") + if "already exists" in msg: + print(f"{bundle_name} already exists, skip creating.") + pass + else: + raise e + + +def upload_version_to_ngc(bundle_name: str, version: str, root_path: str, org_name: str): + upload_file = f"{bundle_name}_v{version}.zip" + ngc_upload_cmd = ( + f"ngc registry model upload-version --source {upload_file} {org_name}/{bundle_name.lower()}:{version}" + ) + + try: + _ = subprocess.run(ngc_upload_cmd, shell=True, cwd=root_path, check=True, stderr=subprocess.PIPE) + except subprocess.CalledProcessError as e: + msg = e.stderr.decode("utf-8") + if "already exists" in msg: + print(f"{bundle_name} with version {version} already exists, skip uploading.") + pass + else: + raise e + + def upload_bundle( - bundle_zip_file_path: str, - bundle_zip_filename: str, - release_tag: str = "hosting_storage_v1", - repo_name: str = "Project-MONAI/model-zoo", + bundle_name: str, + version: str, + root_path: str, + bundle_zip_name: str, + exist_flag: bool, + org_name: str = "nvidia/monaihosting", ): - upload_command = f"gh release upload {release_tag} {bundle_zip_file_path} -R {repo_name}" - call_status = subprocess.run(upload_command, shell=True) - call_status.check_returncode() - source = f"https://github.com/{repo_name}/releases/download/{release_tag}/{bundle_zip_filename}" - - return source + if exist_flag is False: + # need to create bundle first + create_bundle_to_ngc(bundle_name=bundle_name, org_name=org_name) + # upload version + upload_version_to_ngc(bundle_name=bundle_name, version=version, root_path=root_path, org_name=org_name) + # access link + site = "https://api.ngc.nvidia.com/v2/models/" + access_link = f"{site}{org_name}/{bundle_name.lower()}/versions/{version}/files/{bundle_zip_name}" + + return access_link diff --git a/ci/utils_deparate.py b/ci/utils_deparate.py new file mode 100644 index 00000000..22182192 --- /dev/null +++ b/ci/utils_deparate.py @@ -0,0 +1,170 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import hashlib +import json +import os +import shutil +import subprocess +from typing import List + +from monai.apps.utils import download_url +from monai.bundle.config_parser import ConfigParser +from monai.utils import look_up_option + +SUPPORTED_HASH_TYPES = {"md5": hashlib.md5, "sha1": hashlib.sha1, "sha256": hashlib.sha256, "sha512": hashlib.sha512} + + +def get_sub_folders(root_dir: str): + """ + This function is used to get all sub folders (as a list) within the root_dir. + """ + sub_folder_list = [f.name for f in os.scandir(root_dir) if f.is_dir()] + + return sub_folder_list + + +def get_json_dict(json_dict_path: str): + with open(json_dict_path, "r") as f: + json_dict = json.load(f) + + return json_dict + + +def get_hash_func(hash_type: str = "sha1"): + actual_hash_func = look_up_option(hash_type.lower(), SUPPORTED_HASH_TYPES) + + return actual_hash_func() + + +def get_changed_bundle_list(changed_dirs: List[str], root_path: str = "models"): + """ + This function is used to return all bundle names that have changed files. + If a bundle is totally removed, it will be ignored (since it not exists). + + """ + bundles = get_sub_folders(root_path) + + changed_bundle_list = [] + for sub_dir in changed_dirs: + for bundle in bundles: + bundle_path = os.path.join(root_path, bundle) + if os.path.commonpath([bundle_path]) == os.path.commonpath([bundle_path, sub_dir]): + changed_bundle_list.append(bundle) + + return list(set(changed_bundle_list)) + + +def prepare_schema(bundle_list: List[str], root_path: str = "models"): + """ + This function is used to prepare schema for changed bundles. + Due to Github's limitation (see: https://github.com/Project-MONAI/model-zoo/issues/111), + to avoid repeated downloading, all distinct schemas will be downloaded first, and copy + to changed bundle directories. + + """ + schema_dict = {} + for bundle_name in bundle_list: + bundle_path = os.path.join(root_path, bundle_name) + if os.path.exists(bundle_path): + meta_file_path = os.path.join(bundle_path, "configs/metadata.json") + metadata = get_json_dict(meta_file_path) + schema_url = metadata["schema"] + schema_name = schema_url.split("/")[-1] + + if schema_url not in schema_dict.keys(): + schema_path = os.path.join(root_path, schema_name) + download_url(url=schema_url, filepath=schema_path) + schema_dict[schema_url] = schema_path + os.makedirs(os.path.join(bundle_path, "eval"), exist_ok=True) + shutil.copyfile(schema_dict[schema_url], os.path.join(bundle_path, "eval/schema.json")) + print("prepared schema for: ", bundle_name) + + +def download_large_files(bundle_path: str, large_file_name: str = "large_file.yml"): + parser = ConfigParser() + parser.read_config(os.path.join(bundle_path, large_file_name)) + large_files_list = parser.get()["large_files"] + for lf_data in large_files_list: + lf_data["fuzzy"] = True + if "hash_val" in lf_data and lf_data.get("hash_val", "") == "": + lf_data.pop("hash_val") + if "hash_type" in lf_data and lf_data.get("hash_type", "") == "": + lf_data.pop("hash_type") + lf_data["filepath"] = os.path.join(bundle_path, lf_data["path"]) + lf_data.pop("path") + download_url(**lf_data) + + +def save_model_info(model_info_dict, model_info_path: str): + with open(model_info_path, "w") as f: + json.dump(model_info_dict, f) + + +def get_latest_version(bundle_name: str, model_info_path: str): + model_info_dict = get_json_dict(model_info_path) + versions = [] + for k in model_info_dict.keys(): + if bundle_name in k: + versions.append(k.split(f"{bundle_name}_v")[1]) + + return sorted(versions)[-1] + + +def push_new_model_info_branch(model_info_path: str): + email = os.environ["email"] + username = os.environ["username"] + + branch_name = "auto-update-model-info" + create_push_cmd = f"git checkout -b {branch_name}; git push --set-upstream origin {branch_name}" + + git_config = f"git config user.email {email}; git config user.name {username}" + commit_message = "git commit -m 'auto update model_info'" + full_cmd = f"{git_config}; git add {model_info_path}; {commit_message}; {create_push_cmd}" + + call_status = subprocess.run(full_cmd, shell=True) + call_status.check_returncode() + + return branch_name + + +def create_pull_request(branch_name: str, pr_title: str = "'auto update model_info [skip ci]'"): + create_command = f"gh pr create --fill --title {pr_title} --base dev --head {branch_name}" + call_status = subprocess.run(create_command, shell=True) + call_status.check_returncode() + + +def compress_bundle(root_path: str, bundle_name: str, bundle_zip_name: str): + touch_cmd = f"find {bundle_name} -exec touch -t 202205300000 " + "{} +" + zip_cmd = f"zip -rq -D -X -9 -A --compression-method deflate {bundle_zip_name} {bundle_name}" + subprocess.check_call(f"{touch_cmd}; {zip_cmd}", shell=True, cwd=root_path) + + +def get_checksum(dst_path: str, hash_func): + with open(dst_path, "rb") as f: + for chunk in iter(lambda: f.read(1024 * 1024), b""): + hash_func.update(chunk) + return hash_func.hexdigest() + + +def upload_bundle( + bundle_zip_file_path: str, + bundle_zip_filename: str, + release_tag: str = "hosting_storage_v1", + repo_name: str = "Project-MONAI/model-zoo", +): + upload_command = f"gh release upload {release_tag} {bundle_zip_file_path} -R {repo_name}" + call_status = subprocess.run(upload_command, shell=True) + call_status.check_returncode() + source = f"https://github.com/{repo_name}/releases/download/{release_tag}/{bundle_zip_filename}" + + return source diff --git a/requirements-dev.txt b/requirements-dev.txt index 98d35c70..77bda4c6 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -23,7 +23,7 @@ jsonschema gdown>=4.5.4 tensorboard parameterized -monai>=1.2.0rc7 +monai>=1.2.0 pillow!=8.3.0 # https://github.com/python-pillow/Pillow/issues/5571 itk>=5.2 scikit-learn diff --git a/requirements-update-model.txt b/requirements-update-model.txt new file mode 100644 index 00000000..e25ba74c --- /dev/null +++ b/requirements-update-model.txt @@ -0,0 +1,4 @@ +monai>=1.0.1 +gdown>=4.5.4 +PyGithub +pyyaml