In [1]:
import tqdm
import fs.zipfs
import fs.multifs
import os
import fs.copy
import sys
sys.path.append("/scr/ig_pipeline")
import b1k_pipeline.utils

# Path of the merged archive that main() writes.
OUT_FILENAME = "/scr/rc14.zip"

# Source archives, listed in ascending priority: main() uses each entry's list
# index as its MultiFS priority, so when the same path exists in several
# archives the later entry takes precedence (per PyFilesystem2 MultiFS lookup
# order, which searches higher priorities first).
PARALLELS = [
    "/scr/rc13.zip",
    "/scr/rc14_patch.zip",
]

# Names of object directories to delete from the merged output. main() matches
# each name with the glob "*/<name>/" inside the "objects" directory and
# requires exactly one match per name.
OBJECTS_TO_REMOVE = []

def main():
    """Merge the PARALLELS archives into a single archive at OUT_FILENAME.

    Every archive in PARALLELS is mounted into one MultiFS, using its list
    index as the priority, and the combined view is copied into a
    WriteOnly7ZipFS at OUT_FILENAME. Afterwards, each name in
    OBJECTS_TO_REMOVE is looked up as "*/<name>/" inside the output's
    "objects" directory and the matching directory tree is removed.

    Raises:
        AssertionError: if a name in OBJECTS_TO_REMOVE does not match exactly
            one directory in the output.
    """
    # Closing the MultiFS also closes every filesystem added to it (auto_close
    # defaults to True), so the source zip handles are released even if the
    # copy below fails part-way through.
    with fs.multifs.MultiFS() as multi_fs:
        # Get a multi-FS view over all of the parallel filesystems.
        for priority, parallel_zip_name in enumerate(PARALLELS):
            print("Adding", parallel_zip_name)
            multi_fs.add_fs(os.path.basename(parallel_zip_name), fs.zipfs.ZipFS(parallel_zip_name), priority=priority)

        # Copy all the files to the output zip filesystem.
        print("Copying files")
        # Count the files up front so the progress bar has a meaningful total.
        total_files = sum(1 for _ in multi_fs.walk.files())
        with b1k_pipeline.utils.WriteOnly7ZipFS(OUT_FILENAME) as out_fs:
            with tqdm.tqdm(total=total_files) as pbar:
                # on_copy fires once per copied file; its arguments are not needed.
                fs.copy.copy_fs(multi_fs, out_fs, on_copy=lambda *args: pbar.update(1))

            print("Removing some objects")
            objects_dir = out_fs.opendir("objects")
            for obj_to_remove in OBJECTS_TO_REMOVE:
                # Glob paths are relative to objects_dir, which is also what
                # removetree() below expects.
                objdir_glob = [x.path for x in objects_dir.glob(f"*/{obj_to_remove}/")]
                assert len(objdir_glob) == 1, f"Needed exactly one dir for {obj_to_remove}, got {objdir_glob}"
                objdir = objdir_glob[0]
                print("Removing", objdir)
                objects_dir.removetree(objdir)

# Run the merge as soon as this cell executes.
main()

Adding /scr/rc13.zip
Adding /scr/rc14_patch.zip
Copying files


100%|██████████| 202128/202128 [03:13<00:00, 1045.03it/s]


Removing some objects


In [2]:
# Upload the merged archive. IPython expands {OUT_FILENAME}, so the uploaded
# path always matches the file written by main().
! gsutil cp "{OUT_FILENAME}" "gs://gibson_scenes/og_dataset_1_2_0rc14.zip"

Copying file:///scr/rc14.zip [Content-Type=application/zip]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

/ [1 files][ 29.0 GiB/ 29.0 GiB]  102.0 MiB/s                                   
Operation completed over 1 objects/29.0 GiB.                                     
