In [None]:
import h5py

def select_random_rows_from_npz(directory_path, k):
    """
    Randomly sample up to k rows from the first array of every .npz file in a directory.

    For each file, only the first array listed in the archive is used. If that
    array has k rows or fewer, all of its rows are taken in their original
    order; otherwise k distinct rows are drawn without replacement using
    NumPy's global random state.

    Args:
        directory_path (str): Path to the directory containing .npz files.
        k (int): Number of rows to randomly select from each file.

    Returns:
        list: A single flat list holding the sampled rows of all files, file
        after file (files are visited in sorted filename order). The rows are
        not grouped by filename.
    """
    import os
    import numpy as np

    selected_rows = []

    # Sort so the output order does not depend on the filesystem's listing order.
    npz_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.npz'))

    for filename in npz_files:
        file_path = os.path.join(directory_path, filename)

        # np.load returns an NpzFile that keeps the archive open; the context
        # manager closes it as soon as the array has been read into memory.
        with np.load(file_path) as data:
            array_keys = data.files
            if not array_keys:
                # Archive holds no arrays, so there is nothing to sample from.
                continue
            # Only the first array stored in the archive is sampled.
            array = data[array_keys[0]]

        num_rows = array.shape[0]

        if k >= num_rows:
            # Not enough rows to sample from: take every row, in order.
            selected_indices = np.arange(num_rows)
        else:
            # Draw k distinct row indices.
            selected_indices = np.random.choice(num_rows, size=k, replace=False)

        # Append the sampled rows one by one to the flat result list.
        selected_rows.extend(array[selected_indices])

    return selected_rows

# Sample 2 rows from each .npz file under the metadata directory and show the
# first collected row. NOTE: `out[0]` raises IndexError if nothing was
# collected (for example when the directory contains no .npz files).
out = select_random_rows_from_npz("/viscam/u/iamisaac/datacomp/small/metadata", 2)
print(out[0])


In [None]:
import h5py
import numpy as np
import json

def select_random_rows(file_path, k, dataset_name='dataset', column='url',
                       output_path='random_urls.json'):
    """
    Select up to k random string entries from one column of an HDF5 file.

    The entries are drawn without replacement using NumPy's global random
    state. They are returned in increasing row-index order (not in the order
    they were drawn), because the indices are sorted before reading.

    Parameters:
    file_path (str): Path to the HDF5 file
    k (int): Number of random rows to select; capped at the number of rows available
    dataset_name (str): Name of the group/dataset in the HDF5 file (default: 'dataset')
    column (str): Name of the field read from that dataset (default: 'url')
    output_path (str or None): JSON file the selection is written to
        (default: 'random_urls.json'); pass None to skip writing

    Returns:
    list of str: The selected entries, decoded as UTF-8

    Raises:
    ValueError: If k is negative
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name][column]

        # Never ask for more rows than the dataset holds.
        total_rows = dataset.shape[0]
        k = min(k, total_rows)

        if k == 0:
            # Nothing to sample; avoid indexing with an empty selection.
            raw_rows = []
        else:
            random_indices = np.random.choice(total_rows, k, replace=False)
            # h5py requires index lists to be in increasing order.
            sorted_indices = np.sort(random_indices)
            raw_rows = dataset[sorted_indices]

        # Entries that come back as bytes are decoded; anything else is
        # converted with str() so the result is always JSON-serialisable text.
        random_rows = [
            row.decode('utf-8') if isinstance(row, bytes) else str(row)
            for row in raw_rows
        ]

    if output_path is not None:
        with open(output_path, 'w') as json_file:
            json.dump(random_rows, json_file)
    return random_rows

# Sample up to 1000 URLs from the merged metadata file. As a side effect the
# selection is also written to random_urls.json in the working directory.
select_random_rows("/viscam/u/iamisaac/datacomp/small_merged/metadata.hdf5", 1000)

In [1]:
from sfs_util import check_and_download_image
import json

download_path = "/viscam/projects/sfs/mast3r/mast3r_outputs/random_imgs"
with open("random_urls.json", "r") as file:
    urls = json.load(file)

# Best-effort download: a failing URL is counted and skipped so one bad link
# does not abort the whole run.
num_failed = 0
for url in urls:
    try:
        # NOTE(review): a fresh set() is passed on every call; if the third
        # argument is meant to track state across URLs, share one set instead
        # (confirm against sfs_util.check_and_download_image).
        check_and_download_image(url, download_path, set())
    except Exception:
        # Catch Exception rather than using a bare except so KeyboardInterrupt
        # and SystemExit still stop the loop.
        num_failed += 1
print(f"num failed = {num_failed}")
    


num failed = 271


In [None]:
def move_reconstruction_files(json_file_path, destination_dir):
    """
    Move every file referenced by a "reconstructionFile" key in a JSON document.

    The JSON is searched recursively through nested objects and arrays. Each
    referenced path has one leading "../" removed (if present) and is then
    resolved as given, so relative paths are relative to the current working
    directory. Files keep their base name in the destination; paths that do
    not point to an existing file are reported and skipped.

    Parameters:
        json_file_path (str): Path to the JSON file.
        destination_dir (str): Path to the destination directory (created if missing).

    Returns:
        None
    """
    import json
    import os
    import shutil

    # Ensure the destination directory exists
    os.makedirs(destination_dir, exist_ok=True)

    def find_reconstruction_files(obj, collected_paths):
        """Recursively collect all string 'reconstructionFile' entries in the JSON object."""
        if isinstance(obj, dict):
            for key, value in obj.items():
                if key == "reconstructionFile" and isinstance(value, str):
                    collected_paths.append(value)
                else:
                    # Non-string values (e.g. null) are not paths; they are
                    # only searched for nested entries instead of crashing
                    # later on a string operation.
                    find_reconstruction_files(value, collected_paths)
        elif isinstance(obj, list):
            for item in obj:
                find_reconstruction_files(item, collected_paths)

    # Load JSON data
    with open(json_file_path, 'r') as file:
        data = json.load(file)

    # Collect all paths
    reconstruction_files = []
    find_reconstruction_files(data, reconstruction_files)

    # Move each file to the destination directory
    for file_path in reconstruction_files:
        # Drop a single leading "../" from the recorded path.
        if file_path.startswith("../"):
            file_path = file_path[3:]
        if os.path.isfile(file_path):
            # Files are flattened into destination_dir under their base name.
            file_name = os.path.basename(file_path)
            dest_path = os.path.join(destination_dir, file_name)

            shutil.move(file_path, dest_path)
            print(f"Moved: {file_path} -> {dest_path}")
        else:
            print(f"File not found: {file_path}")
# Move the files listed under "reconstructionFile" in this JSON into
# mast3r_outputs/random_high_quality_pairs. Both arguments are relative paths,
# so the cell must be run from the directory that contains htmls/ and mast3r_outputs/.
move_reconstruction_files("htmls/data_20241009_055216.json", "mast3r_outputs/random_high_quality_pairs")

In [None]:
def delete_unused_glb_files(json_file_paths, remove_dir, dry_run=False):
    """
    Delete every .glb file under a directory that no JSON file references.

    Each JSON document is searched recursively for "reconstructionFile"
    entries. Only the base name of each referenced path is kept, so a .glb
    file anywhere under remove_dir survives if its file name matches any
    referenced file name. All other .glb files under remove_dir (including
    subdirectories) are deleted. If no entries are found, every .glb file
    is deleted.

    Parameters:
        json_file_paths (iterable of str, or str): Paths to the JSON files; a
            single path may also be given directly.
        remove_dir (str): Root directory that is walked for .glb files.
        dry_run (bool): If True, only report what would be deleted without
            removing anything (default: False).

    Returns:
        set of str: Base names of all referenced reconstruction files.
    """
    import json
    import os

    def find_reconstruction_files(obj, collected_names):
        """Recursively collect base names of all string 'reconstructionFile' entries."""
        if isinstance(obj, dict):
            for key, value in obj.items():
                if key == "reconstructionFile" and isinstance(value, str):
                    collected_names.add(os.path.basename(value))
                else:
                    # Non-string values (e.g. null) carry no file name; they
                    # are only searched for nested entries.
                    find_reconstruction_files(value, collected_names)
        elif isinstance(obj, list):
            for item in obj:
                find_reconstruction_files(item, collected_names)

    # A lone path would otherwise be iterated character by character.
    if isinstance(json_file_paths, (str, os.PathLike)):
        json_file_paths = [json_file_paths]

    # Collect the file names that must be kept
    reconstruction_files = set()
    for json_file_path in json_file_paths:
        with open(json_file_path, 'r') as file:
            data = json.load(file)
        find_reconstruction_files(data, reconstruction_files)

    # Iterate through all .glb files under remove_dir
    for root, _dirs, files in os.walk(remove_dir):
        for file in files:
            if file.endswith('.glb') and file not in reconstruction_files:
                file_path = os.path.join(root, file)
                if dry_run:
                    print(f"Would delete: {file_path}")
                else:
                    os.remove(file_path)
                    print(f"Deleted: {file_path}")

    return reconstruction_files
# DESTRUCTIVE: deletes every .glb file under mast3r_outputs/ whose file name is
# not referenced by a "reconstructionFile" entry in either of these JSON files.
delete_unused_glb_files(["htmls/data_20241009_055216.json", "htmls/data_20241004_142239.json"], "mast3r_outputs/")