In [1]:
import tempfile
import os
import shutil
import hashlib
import base64

from transformers import LlamaForCausalLM
from transformers import AutoTokenizer

In [4]:
def get_hf_download_path(local_path: str, account_name, model_name, commit) -> str:
    """Build the path of a downloaded snapshot inside a Hugging Face style cache.

    The result is ``<local_path>/models--<account_name>--<model_name>/snapshots/<commit>``,
    joined with the platform's path separator.

    Args:
        local_path: Root of the cache directory.
        account_name: Owner part of the repository id.
        model_name: Repository name part of the repository id.
        commit: Name of the snapshot sub-directory (the revision).

    Returns:
        The joined path as a string. Nothing is checked on disk.
    """
    repo_folder = "--".join(("models", account_name, model_name))
    return os.path.join(local_path, repo_folder, "snapshots", commit)

def realize_symlinks_in_directory(path: str) -> int:
    """Replace every file symlink under ``path`` with the real file it points to.

    The directory tree is walked recursively. Each entry that is itself a symbolic
    link is replaced by moving the fully resolved target file onto the link's
    location. When several links resolve to the same target, the first one receives
    the moved file and the later ones receive a copy of it, so no link is left
    dangling.

    Args:
        path: Directory to process.

    Returns:
        The number of symlinks that were replaced by real files.

    Raises:
        OSError: If a link's target cannot be moved or copied (for example a
            dangling link).
    """
    realized_symlinks = 0
    # Maps a target's original location to the place it was moved to, so that a
    # later link to the same target can be satisfied with a copy.
    relocated = {}

    for cur_path, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            link_path = os.path.abspath(os.path.join(cur_path, filename))
            # Check the entry itself rather than comparing abspath with realpath:
            # the comparison also differs for regular files whenever a *parent*
            # directory is a symlink, which would miscount them as realized.
            if not os.path.islink(link_path):
                continue

            target_path = os.path.realpath(link_path)
            moved_to = relocated.get(target_path)
            if moved_to is None:
                shutil.move(target_path, link_path)
                relocated[target_path] = link_path
            else:
                # The target was already moved for an earlier link; duplicate it.
                os.unlink(link_path)
                shutil.copy2(moved_to, link_path)
            realized_symlinks += 1

    return realized_symlinks

def get_hash_of_file(path: str) -> str:
    """Return the base64-encoded SHA-256 digest of the file at ``path``.

    The file is opened in binary mode and fed to the hash in 64 KiB chunks, so
    the whole file is never held in memory at once.

    Args:
        path: Path of the file to hash.

    Returns:
        The digest, base64-encoded and decoded to a UTF-8 string.
    """
    chunk_size = 64 * 1024
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # The sentinel form of iter() stops at the first empty read, i.e. at EOF.
        for chunk in iter(lambda: stream.read(chunk_size), b""):
            digest.update(chunk)
    return base64.b64encode(digest.digest()).decode("utf-8")


def get_hash_of_directory(path: str) -> str:
    """Return a base64-encoded SHA-256 digest covering every file under ``path``.

    The tree is walked recursively with directories and files visited in sorted
    order so the result is deterministic. For each file, the string returned by
    ``get_hash_of_file`` is fed into the combined hash; file and directory names
    themselves are not hashed. A tree without files yields the digest of empty
    input.

    Args:
        path: Root directory to hash.

    Returns:
        The combined digest, base64-encoded and decoded to a UTF-8 string.
    """
    combined = hashlib.sha256()

    for root, subdirs, files in os.walk(path):
        # Sorting in place fixes the order in which os.walk descends next.
        subdirs.sort()
        # Files of this directory are likewise hashed in a stable order.
        files.sort()
        for name in files:
            per_file = get_hash_of_file(os.path.join(root, name))
            combined.update(per_file.encode())

    return base64.b64encode(combined.digest()).decode("utf-8")

# Hugging Face access token, read from the HF_TOKEN environment variable so that no
# credential is stored in the notebook. It is None when the variable is unset; the
# value is passed as `token=` to the from_pretrained calls in download_model.
# NOTE(review): the token that used to be hard-coded on this line has been exposed
# and should be revoked in the Hugging Face account settings.
HF_TOKEN = os.environ.get("HF_TOKEN")

In [6]:
async def download_model(account_name, model_name, commit, local_path: str):
    """Download a model and its tokenizer from Hugging Face and hash the snapshot.

    The model and tokenizer are fetched into ``local_path`` (used as the cache
    directory) at revision ``commit``. Symlinks in the resulting snapshot
    directory are then replaced by real files and the directory is hashed.

    Although declared ``async``, the body contains no ``await``: the download and
    hashing run synchronously once the coroutine is awaited.

    Args:
        account_name: Owner part of the repository id.
        model_name: Repository name part of the repository id.
        commit: Revision to download; must be truthy.
        local_path: Directory used as the download cache.

    Returns:
        The base64-encoded directory hash of the downloaded snapshot.

    Raises:
        ValueError: If ``commit`` is empty.
        FileNotFoundError: If the expected snapshot directory does not exist
            after the download.
    """
    if not commit:
        raise ValueError("No Hugging Face commit id found to read from the hub.")

    repo_id = account_name + "/" + model_name

    # Transformers can pick up a model from the Hugging Face path (username/model) + revision.
    # The loaders are called only for their download side effect; the results are
    # deliberately not bound so the loaded weights can be released before hashing.
    LlamaForCausalLM.from_pretrained(
        pretrained_model_name_or_path=repo_id,
        revision=commit,
        cache_dir=local_path,
        use_safetensors=True,
        token=HF_TOKEN,
    )
    AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path=repo_id,
        revision=commit,
        cache_dir=local_path,
        token=HF_TOKEN,
    )

    # Get the directory the model was stored to.
    model_dir = get_hf_download_path(local_path, account_name, model_name, commit)

    # Without this check a missing directory would be walked as empty and the
    # function would return the hash of "no files" as if it were a real model hash.
    # NOTE(review): presumably this happens when `commit` is a branch or tag name
    # rather than the full commit hash used for the snapshot folder - confirm.
    if not os.path.isdir(model_dir):
        raise FileNotFoundError(
            "Downloaded snapshot directory not found: " + model_dir
        )

    # Realize all symlinks in that directory so the hash covers real files.
    realize_symlinks_in_directory(model_dir)

    # Compute the hash of the downloaded model.
    model_hash = get_hash_of_directory(model_dir)

    print("Model hash: ", model_hash)
    return model_hash

In [None]:
# Repository to verify: the repo id is "<account>/llam-<lora_name>".
lora_name = "Proterozoic"
upload_name = "MesozoicMetallurgist" + "/llam-" + lora_name
# Split the repo id back into the two parts that download_model takes separately.
account_name, model_name = upload_name.split("/")
# Revision handed to download_model (used there as `revision=` and as the snapshot folder name).
commit = "a3c15d03543c33d06622ec63edce5a2fd4348493"

# Download into a throwaway directory that is removed when the `with` block exits.
with tempfile.TemporaryDirectory() as temp_dir:
    # Top-level `await` is not valid in a plain Python script; this cell relies on
    # the notebook running it inside an event loop.
    model_hash = await download_model(account_name, model_name, commit, temp_dir)
    # download_model already prints "Model hash:  <hash>"; this prints the bare value again.
    print(model_hash)