In [None]:
import scipy.io
import numpy as np
from pathlib import Path
import os
from PIL import Image

def extract_from_mat_no_normalization(mat_file, output_dir, bad_files_log):
    """
    Dump the 'img', 'depth' and 'norm' fields of one .mat file to disk, values untouched.

    The image is cast to uint8 and written as
    <output_dir>/images/<stem>_image.png; depth and normals are written verbatim as
    <output_dir>/depths/<stem>_depth.npy and <output_dir>/normals/<stem>_norm.npy.

    Args:
        mat_file: Path of the .mat file to read.
        output_dir: Root folder that receives the three sub-folders.
        bad_files_log: Text file to which a line is appended when processing fails.

    Returns:
        A dict with keys "image", "depth" and "norm" mapping to the written paths,
        or None when all three outputs already existed or when an error occurred.
    """
    try:
        root = Path(output_dir)

        # One sub-folder per modality, created up front.
        folders = {
            "image": root / "images",
            "depth": root / "depths",
            "norm": root / "normals",
        }
        for folder in folders.values():
            folder.mkdir(parents=True, exist_ok=True)

        # Output names are derived from the input file name without its extension.
        stem = Path(mat_file).stem
        targets = {
            "image": folders["image"] / f"{stem}_image.png",
            "depth": folders["depth"] / f"{stem}_depth.npy",
            "norm": folders["norm"] / f"{stem}_norm.npy",
        }

        # Nothing to do when every output is already on disk.
        if all(target.exists() for target in targets.values()):
            print(f"Files for {stem} already exist, skipping processing.")
            return None

        contents = scipy.io.loadmat(mat_file)

        # Pull out all three fields first so a missing key fails before anything is written.
        rgb = contents['img']
        depth_map = contents['depth']
        normals = contents['norm']

        Image.fromarray(rgb.astype(np.uint8)).save(targets["image"])
        np.save(targets["depth"], depth_map)
        np.save(targets["norm"], normals)

        return targets

    except Exception as e:
        # Record the failure and keep going; the caller treats None as "nothing extracted".
        message = f"Error processing {mat_file}: {e}"
        with open(bad_files_log, 'a') as log_file:
            log_file.write(f"{message}\n")
        print(message)
        return None

def process_directory_no_normalization(mat_dir, output_dir, bad_files_log="bad_mat_files.log"):
    """
    Runs extract_from_mat_no_normalization on every .mat file found directly in `mat_dir`.

    Files are visited in sorted name order so that repeated runs process (and return)
    entries in the same order; os.listdir alone gives no ordering guarantee.

    Args:
        mat_dir: Directory that is scanned (non-recursively) for names ending in ".mat".
        output_dir: Root output directory forwarded to the extractor.
        bad_files_log: Log file forwarded to the extractor for failed inputs.

    Returns:
        List of the truthy results returned by the extractor, in sorted file-name order.
        Files for which the extractor returned None (skipped or failed) are not included.
    """
    mat_files = sorted(f for f in os.listdir(mat_dir) if f.endswith(".mat"))
    extracted_data = []

    for mat_file in mat_files:
        mat_path = os.path.join(mat_dir, mat_file)
        extracted = extract_from_mat_no_normalization(mat_path, output_dir, bad_files_log)
        if extracted:  # Only keep entries for which new outputs were written
            extracted_data.append(extracted)

    # The count is the number of .mat files found, including skipped and failed ones.
    print(f"Processed {len(mat_files)} files.")
    return extracted_data

In [None]:
# Serial extraction run: read every .mat file under `mat_directory` and write the
# images/depths/normals sub-folders under `output_directory`.
mat_directory = "/p/openvocabdustr/probing_midlevel_vision/code/probing-mid-level-vision/data/nyu_geonet"  # Directory containing .mat files
output_directory = "/p/openvocabdustr/probing_midlevel_vision/code/probing-mid-level-vision/data/nyu_geonet_hf_datasets"  # Directory for saving extracted files

# Uses the default bad-files log ("bad_mat_files.log"); the returned list is the cell output.
process_directory_no_normalization(mat_directory, output_directory)

In [None]:
import scipy.io
import numpy as np
from pathlib import Path
import os
from PIL import Image
from concurrent.futures import ProcessPoolExecutor, as_completed

def _restore_rgb_uint8(image, channel_offsets=(2 * 122.175, 2 * 116.169, 2 * 103.508)):
    """
    Returns a uint8 copy of `image` with a constant added to channels 0, 1 and 2.

    The default offsets are the constants this notebook adds to the stored 'img' field
    before writing the PNG (presumably undoing a per-channel mean subtraction done
    upstream - TODO confirm against the dataset's preprocessing).

    The arithmetic is done on a float64 copy so the loaded array is never modified and
    integer inputs are not truncated channel by channel. The result is clipped to
    [0, 255] before the cast, because casting out-of-range values to uint8 wraps
    around instead of saturating.
    """
    restored = np.array(image, dtype=np.float64)
    for channel, offset in enumerate(channel_offsets):
        restored[:, :, channel] += offset
    return np.clip(restored, 0, 255).astype(np.uint8)


def extract_from_mat_no_normalization(mat_file, output_dir, bad_files_log):
    """
    Extracts image, depth, and surface normals from a .mat file.

    The 'img' field gets a fixed per-channel offset added (see _restore_rgb_uint8) and is
    saved as an 8-bit PNG; 'depth' and 'norm' are saved unchanged as .npy files so their
    raw values are preserved exactly.

    Args:
        mat_file: Path of the .mat file to read.
        output_dir: Root folder; "images", "depths" and "normals" are created inside it.
        bad_files_log: Text file to which a line is appended when processing fails.

    Returns:
        Dict with keys "image", "depth", "norm" holding the written paths, or None when
        all three outputs already exist or when any error occurred (the error is logged
        and printed, never raised).
    """
    try:
        # Create necessary output directories if they don't exist
        output_dir = Path(output_dir)
        img_dir = output_dir / "images"
        depth_dir = output_dir / "depths"
        norm_dir = output_dir / "normals"

        img_dir.mkdir(parents=True, exist_ok=True)
        depth_dir.mkdir(parents=True, exist_ok=True)
        norm_dir.mkdir(parents=True, exist_ok=True)

        # Use the filename without extension for saving
        base_name = Path(mat_file).stem

        img_path = img_dir / f"{base_name}_image.png"
        depth_path = depth_dir / f"{base_name}_depth.npy"
        norm_path = norm_dir / f"{base_name}_norm.npy"

        # Skip processing only if all three outputs exist; a partial set is regenerated
        if img_path.exists() and depth_path.exists() and norm_path.exists():
            print(f"Files for {base_name} already exist, skipping processing.")
            return None

        # Load the .mat file
        instance = scipy.io.loadmat(mat_file)

        # Read every field before writing anything so a missing key leaves no partial output
        image = _restore_rgb_uint8(instance['img'])
        depth = instance['depth']  # Depth map, no normalization
        snorm = instance['norm']  # Surface normals, no normalization

        # Save the image as PNG (already uint8, no second cast needed)
        Image.fromarray(image).save(img_path)

        # Save the depth as .npy to preserve raw values
        np.save(depth_path, depth)

        # Save the surface normals as .npy to preserve raw values
        np.save(norm_path, snorm)

        return {
            "image": img_path,
            "depth": depth_path,
            "norm": norm_path,
        }

    except Exception as e:
        # Log the file as a bad one and report "nothing extracted" to the caller
        with open(bad_files_log, 'a') as log_file:
            log_file.write(f"Error processing {mat_file}: {e}\n")
        print(f"Error processing {mat_file}: {e}")
        return None

def process_directory_no_normalization(mat_dir, output_dir, bad_files_log="bad_mat_files.log", num_workers=4):
    """
    Processes all .mat files in a directory in parallel with a ProcessPoolExecutor.

    Each file is handed to extract_from_mat_no_normalization in a worker process.
    Files are submitted in sorted name order and the results are returned in that same
    order, independent of which worker finishes first, so repeated runs are reproducible.

    Args:
        mat_dir: Directory scanned (non-recursively) for names ending in ".mat".
        output_dir: Root output directory forwarded to the extractor.
        bad_files_log: Log file forwarded to the extractor for failed inputs.
        num_workers: Maximum number of worker processes.

    Returns:
        List of the truthy extractor results in sorted file-name order. Files that were
        skipped, failed inside the extractor, or raised in the worker are left out.
    """
    mat_files = sorted(f for f in os.listdir(mat_dir) if f.endswith(".mat"))

    # Nothing to do: avoid starting a process pool for an empty directory.
    if not mat_files:
        print(f"Processed {len(mat_files)} files.")
        return []

    results_by_file = {}

    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = {
            executor.submit(
                extract_from_mat_no_normalization,
                os.path.join(mat_dir, mat_file),
                output_dir,
                bad_files_log,
            ): mat_file
            for mat_file in mat_files
        }

        # Collect results as they complete; ordering is restored afterwards
        for future in as_completed(futures):
            mat_file = futures[future]
            try:
                result = future.result()
            except Exception as e:
                # Raised by the worker itself (the extractor normally returns None instead)
                print(f"Error processing {mat_file}: {e}")
                continue
            if result:
                results_by_file[mat_file] = result

    extracted_data = [results_by_file[name] for name in mat_files if name in results_by_file]

    # The count is the number of .mat files found, including skipped and failed ones.
    print(f"Processed {len(mat_files)} files.")
    return extracted_data

In [None]:
# Parallel extraction run; outputs go to a separate "_v2" folder.
mat_directory = "/p/openvocabdustr/probing_midlevel_vision/code/probing-mid-level-vision/data/nyu_geonet"  # Directory containing .mat files
output_directory = "/p/openvocabdustr/probing_midlevel_vision/code/probing-mid-level-vision/data/nyu_geonet_hf_datasets_v2"  # Directory for saving extracted files
bad_files_log = "bad_mat_files.log"  # Failed inputs are appended to this file, one line each

# Process files in parallel using 15 worker processes
process_directory_no_normalization(mat_directory, output_directory, bad_files_log, num_workers=15)

In [None]:
import pickle
import numpy as np
from pathlib import Path
from PIL import Image
import os
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed

def extract_nyuv2_test_data(test_set, output_dir, bad_files_log):
    """
    Extracts image, depth, surface normals, room type, and NYU index from the NYUv2 test set without normalization.
    Saves the image as PNG and depth, normals and metadata as .npy files.

    Each sample is processed independently: a sample that fails is logged and counted,
    and the remaining samples are still written. Samples whose four output files already
    exist are skipped.

    Args:
        test_set: Sized sequence of (image, depth, snorm, room, nyu_index) tuples. The image
            has its first axis moved last before saving (assumes channel-first layout -
            TODO confirm against the pickle contents).
        output_dir: Root folder; "images", "depths", "normals" and "metadata" are created in it.
        bad_files_log: Text file to which one line per failure is appended.

    Returns:
        {"status": "success"} when every sample was written or skipped,
        {"status": "failure"} when setup failed or at least one sample failed.
    """
    def _report(what, error):
        # Same message goes to the log file and to stdout
        message = f"Error processing {what}: {error}"
        with open(bad_files_log, 'a') as log_file:
            log_file.write(f"{message}\n")
        print(message)

    any_failed = False

    try:
        # Create necessary output directories if they don't exist
        output_dir = Path(output_dir)
        img_dir = output_dir / "images"
        depth_dir = output_dir / "depths"
        norm_dir = output_dir / "normals"
        meta_dir = output_dir / "metadata"

        img_dir.mkdir(parents=True, exist_ok=True)
        depth_dir.mkdir(parents=True, exist_ok=True)
        norm_dir.mkdir(parents=True, exist_ok=True)
        meta_dir.mkdir(parents=True, exist_ok=True)

        # Process each sample in the test set with tqdm
        for index, sample in tqdm(enumerate(test_set), total=len(test_set), desc="Processing test set"):
            # Output names use the position within test_set, not nyu_index
            base_name = f"nyuv2_test_{index}"

            try:
                image, depth, snorm, room, nyu_index = sample

                img_path = img_dir / f"{base_name}_image.png"
                depth_path = depth_dir / f"{base_name}_depth.npy"
                norm_path = norm_dir / f"{base_name}_norm.npy"
                meta_path = meta_dir / f"{base_name}_metadata.npy"

                if img_path.exists() and depth_path.exists() and norm_path.exists() and meta_path.exists():
                    print(f"Files for {base_name} already exist, skipping processing.")
                    continue

                # Save the image as PNG: move axis 0 to the end and cast to 8-bit
                image = np.transpose(image, (1, 2, 0)).astype(np.uint8)
                Image.fromarray(image).save(img_path)

                # Save the depth and surface normals as .npy to preserve raw values
                np.save(depth_path, depth)
                np.save(norm_path, snorm)

                # The metadata dict is stored as a pickled object array;
                # read it back with np.load(..., allow_pickle=True).item()
                metadata = {"room": room, "nyu_index": nyu_index}
                np.save(meta_path, metadata)

            except Exception as e:
                # One bad sample must not abort the rest of the test set
                _report(base_name, e)
                any_failed = True

    except Exception as e:
        # Setup or iteration failure that is not tied to a single sample
        _report("test set", e)
        return {"status": "failure"}

    return {"status": "failure" if any_failed else "success"}

def process_nyuv2_test(path, output_dir, bad_files_log="bad_test_files.log"):
    """
    Read the NYUv2 pickle at `path`, gather its test split and export it.

    The pickle is expected to be a dict with the keys "test_indices", "depths", "images",
    "snorms" and "scene_types". For every test index the matching image, depth map,
    normal map and first element of the scene-type entry are collected and passed, as a
    list of (image, depth, snorm, scene, index) tuples, to extract_nyuv2_test_data.

    Returns whatever extract_nyuv2_test_data returns.
    """
    # SECURITY: pickle.load executes arbitrary code; only open files from a trusted source.
    with open(path, "rb") as handle:
        payload = pickle.load(handle)

    test_ids = payload["test_indices"]

    def _pick(field, label):
        # Collect payload[field][i] for every test index, with a progress bar
        column = []
        for sample_id in tqdm(test_ids, desc=label):
            column.append(payload[field][sample_id])
        return column

    depth_maps = _pick("depths", "Loading depths")
    rgb_images = _pick("images", "Loading images")
    normal_maps = _pick("snorms", "Loading surface normals")
    scene_names = [entry[0] for entry in _pick("scene_types", "Loading scene types")]

    samples = list(zip(rgb_images, depth_maps, normal_maps, scene_names, test_ids))

    return extract_nyuv2_test_data(samples, output_dir, bad_files_log)

In [None]:
# Define the path to the NYUv2 .pkl file and the output directory
# Input pickle with the NYUv2 data and the folder that receives the exported test split
nyuv2_pkl_path = "/p/openvocabdustr/probing_midlevel_vision/code/probing-mid-level-vision/data/nyuv2/nyuv2_snorm_all.pkl"
output_dir = "/p/openvocabdustr/probing_midlevel_vision/code/probing-mid-level-vision/data/nyuv2_test_processed"

# Export the test split; failures are appended to the default log "bad_test_files.log".
# The returned status dict is the cell output.
process_nyuv2_test(nyuv2_pkl_path, output_dir)

In [None]:
from datasets import load_dataset

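# Load train and test datasets.
# NOTE: both loads below are commented out, yet later cells read `train_dataset` and
# `test_dataset`; re-enable them (or define those variables another way) before running those cells.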
# train_dataset = load_dataset('parquet', data_files='/p/openvocabdustr/probing_midlevel_vision/code/probing-mid-level-vision/data/nyu_geomet_parquet_seg/train_nyu_dataset.parquet')
# test_dataset = load_dataset('parquet', data_files='/p/openvocabdustr/probing_midlevel_vision/code/probing-mid-level-vision/data/nyu_geomet_parquet_seg/test_nyu_dataset.parquet')

In [None]:
import pandas as pd
import os
from tqdm import tqdm  # Importing tqdm for progress bar


def save_dataset_in_segments(dataset, segment_size, save_dir, split_name):
    """
    Write `dataset` to disk as consecutive Parquet chunks of at most `segment_size` rows.

    Chunk i covers rows [i * segment_size, (i + 1) * segment_size) and is written to
    <save_dir>/<split_name>_segment_<i>.parquet; the last chunk may be shorter.

    Args:
        dataset: Object supporting len(), .select(range) and, on the selection, .to_parquet(path)
            (e.g. a Hugging Face `datasets.Dataset`).
        segment_size: Maximum number of samples per chunk.
        save_dir: Output directory, created if missing.
        split_name: Prefix used in the chunk file names (train/val/test).
    """
    os.makedirs(save_dir, exist_ok=True)

    # Ceiling division: one extra chunk when the rows do not divide evenly
    total_rows = len(dataset)
    whole_chunks, leftover = divmod(total_rows, segment_size)
    chunk_count = whole_chunks + (1 if leftover else 0)

    for chunk_no in tqdm(range(chunk_count), desc=f"Saving {split_name} segments"):
        first_row = chunk_no * segment_size
        stop_row = min(first_row + segment_size, total_rows)

        chunk_path = os.path.join(save_dir, f"{split_name}_segment_{chunk_no}.parquet")
        dataset.select(range(first_row, stop_row)).to_parquet(chunk_path)

        print(f"Saved segment {chunk_no} with {stop_row - first_row} samples to {chunk_path}")

In [None]:
# Parameters for saving
segment_size = 100  # Define how many samples per segment
save_dir = "/p/openvocabdustr/probing_midlevel_vision/data/nyu_geonet_segmentation_parquet_v2"  # Directory to save the Parquet files
split_name = "train"  # Name of the split

# Save the "train" split of `train_dataset` in segments named train_segment_<i>.parquet.
# NOTE(review): `train_dataset` is not assigned by any live code in the cells shown here
# (the load_dataset call that would create it is commented out) - define it before running.
save_dataset_in_segments(train_dataset["train"], segment_size, save_dir, split_name)

In [None]:
# Parameters for saving
segment_size = 100  # Define how many samples per segment
save_dir = "/p/openvocabdustr/probing_midlevel_vision/data/nyu_geonet_segmentation_parquet_v2"  # Directory to save the Parquet files
split_name = "val"  # Name of the split

# Save the "train" split of `test_dataset` in segments named val_segment_<i>.parquet
# (same output directory as the train segments).
# NOTE(review): `test_dataset` is not assigned by any live code in the cells shown here
# (the load_dataset call that would create it is commented out) - define it before running.
save_dataset_in_segments(test_dataset["train"], segment_size, save_dir, split_name)

In [None]:
import os
import concurrent.futures
from huggingface_hub import HfApi, HfFolder

# Shared Hub client; the upload helpers below use this module-level `api` object
api = HfApi()

# Fetch the access token via HfFolder and stop early if there is none, so that the
# upload functions below (which read this module-level `token`) are never called unauthenticated
token = HfFolder.get_token()
if token is None:
    raise ValueError("You need to log in to Hugging Face or provide an access token!")

def get_uploaded_files(repo_id, token):
    """
    Return the set of file paths currently present in the dataset repo `repo_id`.

    Uses the module-level `api` client. Any failure while listing is printed and an
    empty set is returned instead of raising, so callers then treat every file as
    not yet uploaded.
    """
    remote_paths = set()
    try:
        remote_paths.update(api.list_repo_files(repo_id, repo_type="dataset", token=token))
    except Exception as e:
        print(f"Failed to retrieve uploaded files: {e}")
        return set()
    return remote_paths

# Define a function to push a dataset segment to the Hub
def push_segment_to_hub_hfapi(segment_file, repo_id, token, commit_message, progress_counter, total_files):
    """
    Upload one local file to the root of the dataset repo `repo_id` via the module-level `api`.

    On success progress_counter['uploaded'] is incremented and a progress line is printed.
    On any exception the failure is printed and swallowed; nothing is returned either way.

    Args:
        segment_file: Local path of the file; its base name becomes the path in the repo.
        repo_id: Target dataset repository.
        token: Access token passed to the upload call.
        commit_message: Commit message for this upload.
        progress_counter: Mutable dict with an 'uploaded' integer shared between calls.
        total_files: Total number of files in this batch, used only in the progress line.
    """
    try:
        upload_args = {
            "path_or_fileobj": segment_file,  # Local file path
            "path_in_repo": os.path.basename(segment_file),  # Stored at the repo root
            "repo_id": repo_id,
            "repo_type": "dataset",
            "token": token,
            "commit_message": commit_message,
        }
        api.upload_file(**upload_args)

        progress_counter['uploaded'] = progress_counter['uploaded'] + 1
        done = progress_counter['uploaded']
        print(f"Successfully uploaded {segment_file} ({done}/{total_files})")
    except Exception as e:
        print(f"Failed to upload {segment_file} due to: {e}")

def push_dataset_segments_in_parallel(dataset_dir, repo_id, num_threads=4):
    """
    Upload every .parquet file in `dataset_dir` that is not yet in the dataset repo `repo_id`.

    The list of remote files comes from get_uploaded_files; files whose base name is
    already listed there are skipped. The remaining files are handed to
    push_segment_to_hub_hfapi on a thread pool of `num_threads` workers, each with its
    own commit message. Uses the module-level `token` for authentication.

    Returns None; progress and errors are printed.
    """
    already_remote = get_uploaded_files(repo_id, token)

    # Local .parquet files whose base name is not on the Hub yet
    pending = []
    for entry in os.listdir(dataset_dir):
        if not entry.endswith(".parquet"):
            continue
        local_path = os.path.join(dataset_dir, entry)
        if os.path.basename(local_path) not in already_remote:
            pending.append(local_path)

    total_files = len(pending)
    if total_files == 0:
        print("All files are already uploaded!")
        return

    # Shared between all upload calls to report "n/total" progress
    progress_counter = {'uploaded': 0}

    print(f"Starting upload of {total_files} files...")

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as pool:
        jobs = [
            pool.submit(
                push_segment_to_hub_hfapi,
                local_path,
                repo_id,
                token,
                f"Uploading {os.path.basename(local_path)}",
                progress_counter,
                total_files,
            )
            for local_path in pending
        ]

        # Wait for every upload and surface anything the worker raised
        for job in concurrent.futures.as_completed(jobs):
            try:
                job.result()
            except Exception as e:
                print(f"Error during uploading: {e}")

In [None]:
# Load the "test" split from the directory holding the saved Parquet segments.
# NOTE(review): the segment-saving cells in this notebook only write train_segment_*.parquet and
# val_segment_*.parquet into this directory - confirm that a "test" split actually exists there.
dataset = load_dataset("/p/openvocabdustr/probing_midlevel_vision/data/nyu_geonet_segmentation_parquet_v2", split="test")

In [None]:
# Target dataset repo on the Hub and the local directory holding the Parquet segments
repo_id = "uva-cv-lab/nyu_geonet_segmentation_parquet"
dataset_dir = "/p/openvocabdustr/probing_midlevel_vision/data/nyu_geonet_segmentation_parquet_v2"

# Upload every segment that is not yet in the repo, using 3 threads
push_dataset_segments_in_parallel(dataset_dir, repo_id, num_threads=3)