 MIT License



 Copyright (c) 2024-present K. S. Ernest (iFire) Lee



 Copyright (c) 2024 Marcus Loren



 Permission is hereby granted, free of charge, to any person obtaining a copy

 of this software and associated documentation files (the "Software"), to deal

 in the Software without restriction, including without limitation the rights

 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell

 copies of the Software, and to permit persons to whom the Software is

 furnished to do so, subject to the following conditions:



 The above copyright notice and this permission notice shall be included in all

 copies or substantial portions of the Software.



 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

 SOFTWARE.



 # Install the dependencies



 python3 -m pip install --break-system-packages --user requests tqdm trimesh pandas

 ### Step 1 - Get model sizes & path



 Option 1 - Extract manually:

 1. Run "git clone https://huggingface.co/datasets/allenai/objaverse" and then abort the command when it starts to download the models.

 2. This will create a git repo folder, you then can run "python dump_gitcommits.py > out.txt" to dump the entire commit history

 3. Then you call extract_models_from_dump("out.txt") to parse and get all the model paths and their sizes.



 Option 2 - Use the pre-extracted json (model_sizes.json.gz)

In [1]:
# %%
import json 
import gzip

def extract_models_from_dump(file_path):
    model_sizes = {}
    current_model = None
    with open(file_path, 'r') as file:
        for line in file:
            # Get model path
            if ".glb" in line:
                # Extract model path
                model_path = line.split()[-1].strip()
                model_path = model_path.replace("b/", "")
                current_model = model_path
            # Get current_model size
            elif current_model and "size" in line: 
                
                size = int(line.split()[-1].strip()) 
                model_sizes[current_model] = size 
                current_model = None
    return model_sizes
 
 
 ## Option 1
#model_sizes = extract_models_from_dump("out.txt")  


## Option 2
with gzip.open("model_sizes.json.gz", 'rb') as gzip_file: 
    model_sizes = json.loads(gzip_file.read().decode('utf-8'))
    
print(len(model_sizes))


798759


 ### Download the meshes as per specified size limit

In [2]:
import os
import requests
from tqdm import tqdm  
from concurrent.futures import ThreadPoolExecutor 

 ### Download metadata

In [3]:
# %%
import os
import requests
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def download_file(url, folder_path, filename):
    url = url + "?download=true"
    print(url)
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()  # If the response was successful, no Exception will be raised
        with open(os.path.join(folder_path, filename), 'wb') as f:
            f.write(response.content) 
        return True
    except Exception as err:
        print(f"Failed to download {filename}. Error: {err}")
        return False

def download_metadata(base_url, save_dir, num_threads=6):
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = []
        for i in range(1, 161):
            filename = f"000-{i:03d}.json.gz"
            file_url = base_url + filename
            futures.append(executor.submit(download_file, file_url, save_dir, filename))

        for future in tqdm(futures, total=len(futures)):
            result = future.result()
            if not result:
                continue
            
base_url = "https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/" 
save_dir = './objaverse/metadata'
os.makedirs(save_dir, exist_ok=True)   

download_metadata(base_url, save_dir)

https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-001.json.gz?download=truehttps://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-002.json.gz?download=true

https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-003.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-004.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-005.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-006.json.gz?download=true


  4%|▍         | 7/160 [00:00<00:06, 22.54it/s]

https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-007.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-008.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-009.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-010.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-011.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-012.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-013.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-014.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-015.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-016.js

  8%|▊         | 13/160 [00:00<00:05, 26.58it/s]

https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-018.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-019.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-020.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-021.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-022.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-023.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-024.json.gz?download=true


 14%|█▍        | 23/160 [00:00<00:04, 30.06it/s]

https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-025.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-026.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-027.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-028.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-029.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-030.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-031.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-032.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-033.json.gz?download=true


 17%|█▋        | 27/160 [00:00<00:04, 30.11it/s]

https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-034.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-035.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-036.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-037.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-038.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-039.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-040.json.gz?download=true


 24%|██▍       | 39/160 [00:01<00:03, 33.80it/s]

https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-041.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-042.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-043.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-044.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-045.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-046.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-047.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-048.json.gz?download=true


 29%|██▉       | 46/160 [00:01<00:03, 37.58it/s]

https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-049.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-050.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-051.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-052.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-053.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-054.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-055.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-056.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-057.json.gz?download=true


 32%|███▎      | 52/160 [00:01<00:03, 30.56it/s]

https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-058.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-059.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-060.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-061.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-062.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-063.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-064.json.gz?download=true


 36%|███▋      | 58/160 [00:01<00:03, 29.25it/s]

https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-065.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-066.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-067.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-068.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-069.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-070.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-071.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-072.json.gz?download=true


 46%|████▌     | 73/160 [00:02<00:02, 41.17it/s]

https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-073.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-074.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-075.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-076.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-077.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-078.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-079.json.gz?download=true


 49%|████▉     | 78/160 [00:02<00:02, 38.61it/s]

https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-080.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-081.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-082.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-083.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-084.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-085.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-086.json.gz?download=true


 54%|█████▍    | 87/160 [00:02<00:01, 36.75it/s]

https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-087.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-088.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-089.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-090.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-091.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-092.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-093.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-094.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-095.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-096.js

 61%|██████    | 97/160 [00:02<00:01, 38.06it/s]

https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-097.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-098.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-099.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-100.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-101.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-102.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-103.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-104.json.gz?download=true


 63%|██████▎   | 101/160 [00:03<00:01, 29.57it/s]

https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-105.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-106.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-107.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-108.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-109.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-110.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-111.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-112.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-113.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-114.js

 66%|██████▋   | 106/160 [00:03<00:02, 19.09it/s]

https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-118.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-119.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-120.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-121.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-122.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-123.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-124.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-125.json.gz?download=true


 77%|███████▋  | 123/160 [00:03<00:00, 39.57it/s]

https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-126.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-127.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-128.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-129.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-130.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-131.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-132.json.gz?download=true


 81%|████████  | 129/160 [00:03<00:00, 34.68it/s]

https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-133.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-134.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-135.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-136.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-137.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-138.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-139.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-140.json.gz?download=true


 84%|████████▍ | 135/160 [00:04<00:00, 37.37it/s]

https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-141.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-142.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-143.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-144.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-145.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-146.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-147.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-148.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-149.json.gz?download=true
https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/000-150.js

100%|██████████| 160/160 [00:05<00:00, 30.23it/s]


 ### Extract the metadata to a JSON with only the relevant information, e.g the models you downloaded

In [4]:
# %%
import os
import glob
import gzip
import json
import pandas as pd
import time
import numpy as np
from pygltflib import GLTF2, BufferFormat
from tqdm import tqdm

captions_df = pd.read_csv('./objaverse_annotations/pali_captions.csv', sep=';')
material_annotations_df = pd.read_csv('./objaverse_annotations/pali_material_annotations.csv', sep=';')
type_annotations_df = pd.read_csv('./objaverse_annotations/pali_type_annotations.csv', sep=';')
captions_dict = captions_df.set_index('object_uid').T.to_dict('list')
material_annotations_dict = material_annotations_df.set_index('object_uid').T.to_dict('list')
type_annotations_dict = type_annotations_df.set_index('object_uid').T.to_dict('list')

existing_models = {}
metadata = {}
filtered_metadata = {}
metadata_path = './objaverse/metadata'
for file_name in os.listdir(metadata_path):
    if file_name.endswith(".gz"):
        file_path = os.path.join(metadata_path, file_name)
        with gzip.open(file_path, 'rt', encoding='utf-8') as f:
            file_metadata = json.load(f)
            metadata.update(file_metadata)

input_directory = './objaverse/glbs'
output_gltf_directory = './objaverse/gltf_xmp_json_ld'
scaling_factor_constant = 0.95

os.makedirs(output_gltf_directory, exist_ok=True)

def convert_lists_to_ordered_xmp_format(data):
    for key, value in data.items():
        if isinstance(value, list):
            # Always use '@list' to represent an ordered list.
            data[key] = {'@list': value}
        elif isinstance(value, dict):
            convert_lists_to_ordered_xmp_format(value)

def add_to_filtered_metadata(key, value):
    if value is not None:
        filtered_metadata[f"vsekai:{key}"] = value

def download_model_convert_and_delete(model_url, glb_path, gltf_path):
    try:
        response = requests.get(model_url)
        if response.status_code == 200:
            # Save the GLB file temporarily
            with open(glb_path, 'wb') as f:
                f.write(response.content)

            # Convert the GLB file to GLTF format with XMP metadata
            gltf = GLTF2().load(glb_path)
            gltf.convert_buffers(BufferFormat.DATAURI)
            if not os.path.isfile(file_path):
                return
            start_time = time.time()
            file_name, file_extension = os.path.splitext(file_path)
            existing_models[os.path.basename(file_name)] = file_path
            if not file_extension.lower() == ".glb" and os.path.basename(file_name) in metadata:
                return
            gltf_file_path = os.path.join(output_gltf_directory, os.path.basename(file_name) + ".gltf")
            if os.path.exists(gltf_file_path):
                return
            gltf = GLTF2().load(file_path)
            gltf.convert_buffers(BufferFormat.DATAURI)
            gltf.save(gltf_file_path)
            data = metadata[os.path.basename(file_name)]
            if data["license"] != "by":
                os.remove(gltf_file_path)
                return
            
            if not (100 <= data["faceCount"] <= 2000):
                os.remove(gltf_file_path)
                return
                
            convert_lists_to_ordered_xmp_format(data)
            filtered_metadata = {
                "@context": {
                    "dc": "http://purl.org/dc/elements/1.1/",
                    "vsekai": "http://v-sekai.org/vsekai/elements/0.4/"
                },
                "@id": data["uid"],
                "dc:title": data["name"],
                "dc:creator": {
                    "@id": data["user"]["uid"],
                    "dc:name": data["user"]["username"]
                },
                "dc:description": data["description"],
                "dc:date": data["createdAt"],
                "dc:identifier": data["uri"],
                "dc:source": data["viewerUrl"],
                "dc:rights": data["license"],
                "dc:subject": data["tags"],
                "dc:type": "3D Model",
                "dc:relation": data["user"]["profileUrl"],
                "vsekai:viewCount": data["viewCount"],
                "vsekai:likeCount": data["likeCount"],
                "vsekai:commentCount": data["commentCount"],
                "vsekai:isDownloadable": data["isDownloadable"],
                "vsekai:publishedAt": data["publishedAt"],
                "vsekai:faceCount": data["faceCount"],
                "vsekai:vertexCount": data["vertexCount"],
                "vsekai:isAgeRestricted": data["isAgeRestricted"],
            }

            if data["uid"] in captions_dict:
                caption_annotation, caption_annotation_probability = captions_dict[data["uid"]]
                add_to_filtered_metadata("captionAnnotation", caption_annotation)
                add_to_filtered_metadata("captionAnnotationProbability", caption_annotation_probability)

            if data["uid"] in material_annotations_dict:
                material_annotation, material_annotation_probability = material_annotations_dict[data["uid"]]
                add_to_filtered_metadata("materialAnnotation", material_annotation)
                add_to_filtered_metadata("materialAnnotationProbability", material_annotation_probability)

            if data["uid"] in type_annotations_dict:
                type_annotation, type_annotation_probability = type_annotations_dict[data["uid"]]
                add_to_filtered_metadata("typeAnnotation", type_annotation)
                add_to_filtered_metadata("typeAnnotationProbability", type_annotation_probability)

            optional_tags = ["animationCount", "staffpickedAt", "archives", "categories"]
            for tag in optional_tags:
                if tag in data:
                    add_to_filtered_metadata(tag, data[tag])

            with open(gltf_file_path, 'r') as f:
                gltf_json = json.load(f)

            xmp_extension = {
                "KHR_xmp_json_ld": {
                    "packets": [filtered_metadata]
                }
            }

            if 'extensions' in gltf_json['asset']:
                if 'KHR_xmp_json_ld' in gltf_json['asset']['extensions']:
                    gltf_json['asset']['extensions']['KHR_xmp_json_ld']['packets'].append(filtered_metadata)
                else:
                    gltf_json['asset']['extensions'].update(xmp_extension)
            else:
                gltf_json['asset']['extensions'] = xmp_extension

            gltf_json['asset']['extensions']['KHR_xmp_json_ld']['packet'] = len(gltf_json['asset']['extensions']['KHR_xmp_json_ld']['packets']) - 1

            if 'extensionsUsed' in gltf_json:
                if "KHR_xmp_json_ld" not in gltf_json['extensionsUsed']:
                    gltf_json['extensionsUsed'].append("KHR_xmp_json_ld")
            else:
                gltf_json['extensionsUsed'] = ["KHR_xmp_json_ld"]

            with open(gltf_file_path, 'w') as f:
                json.dump(gltf_json, f, indent=4)



            # Save the converted GLTF file
            gltf.save(gltf_path)

            # Delete the GLB file
            os.remove(glb_path)
        else:
            print(f"Failed to download: {model_url}")
    except Exception as e:
        print(f"Error downloading: {model_url}, {e}")

def download_filtered_models(model_sizes, filtered_json, base_url, save_dir, minKb, maxKb, num_threads = 6, maxDownloadedMeshes = 1):
    filtered_models = {model_path: size for model_path, size in model_sizes.items() if minKb < size < maxKb * 1024}

    downloaded_meshes = 0

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = []
        for model_path, size in filtered_models.items():
            if downloaded_meshes >= maxDownloadedMeshes:
                break

            if model_path in filtered_json:
                continue
            
            folder_name = os.path.dirname(model_path)
            sub_folder = os.path.join(save_dir, folder_name)
            os.makedirs(sub_folder, exist_ok=True)
            
            file_name = os.path.basename(model_path)
            save_path = os.path.join(sub_folder, file_name)

            if file_name in model_sizes:
                print("The file is filtered from this dataset.")
                continue
            
            if not os.path.exists(save_path):
                model_url = f"{base_url}/{model_path}?download=true"
                base_save_path, _ = os.path.splitext(save_path)
                glb_path = base_save_path + '.glb'                
                futures.append(executor.submit(download_model_convert_and_delete, model_url, glb_path, save_path))
                
                downloaded_meshes += 1
                
        for future in tqdm(futures, total=len(futures)):
            future.result()

base_url = "https://huggingface.co/datasets/allenai/objaverse/resolve/main"  
save_dir = f'./objaverse' 

json_file_path = "filtered_face_count.json"
filtered_json = {}

if os.path.exists(json_file_path):
    with open(json_file_path, 'r') as f:
        filtered_json = json.load(f)
else:
    print(f'File {json_file_path} does not exist.')

os.makedirs(save_dir, exist_ok=True)   


In [5]:
download_filtered_models(model_sizes, filtered_json, base_url, save_dir, minKb = 301, maxKb = 40960, num_threads = 24, maxDownloadedMeshes = 300000) 

  0%|          | 0/10 [00:00<?, ?it/s]

Error downloading: https://huggingface.co/datasets/allenai/objaverse/resolve/main/glbs/000-159/003e2f4de1fb4d3fadc18655a5d8966e.glb?download=true, 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
Error downloading: https://huggingface.co/datasets/allenai/objaverse/resolve/main/glbs/000-159/0020199e55034ddba35a8daf4670253b.glb?download=true, 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
Error downloading: https://huggingface.co/datasets/allenai/objaverse/resolve/main/glbs/000-159/005a8f2d0e734c338445d93ed4b8f53f.glb?download=true, 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
Error downloading: https://huggingface.co/datasets/allenai/objaverse/resolve/main/glbs/000-159/001df836dd9e46edb196a975e59bb63a.glb?download=true, 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
Error downloading: https://huggingface.co/datasets/allenai/objaverse/resolve/main/glbs/000-159/007b95d23a834fb5bad6a09daf27126d.

 10%|█         | 1/10 [00:00<00:06,  1.42it/s]

Error downloading: https://huggingface.co/datasets/allenai/objaverse/resolve/main/glbs/000-159/0017dc02a1e74e029ecf929fd9d94172.glb?download=true, 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte


100%|██████████| 10/10 [00:18<00:00,  1.84s/it]

Error downloading: https://huggingface.co/datasets/allenai/objaverse/resolve/main/glbs/000-159/007a654b3ac742bc8fa3073fb055ccb2.glb?download=true, 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte



