In [None]:
import os
import random
from tqdm import tqdm
from datasets import Dataset, Features, Image, Value
from joblib import Parallel, delayed

# Scene-level split: every scene name belongs to exactly one of the three
# lists below, so no building is shared between train, validation and test.

# Training scenes
train_scenes = [
    "hanson", "merom", "klickitat", "onaga", "leonardo",
    "marstons", "newfields", "pinesdale", "lakeville", "cosmos",
    "benevolence", "pomaria", "tolstoy", "shelbyville", "allensville",
    "wainscott", "beechwood", "coffeen", "stockman", "hiteman",
    "woodbine", "lindenwood", "forkland", "mifflinburg", "ranchester",
]

# Validation scenes
validation_scenes = [
    "wiconisco",
    "corozal",
    "collierville",
    "markleeville",
    "darden",
]

# Test scenes
test_scenes = [
    "ihlen",
    "muleshoe",
    "uvalda",
    "noxapater",
    "mcdade",
]

In [None]:
import os
import random
from tqdm import tqdm
from datasets import Dataset, Features, Image, Value

def load_and_sample_dataset(input_dir_rgb, output_dir_depth, output_dir_reshading, output_dir_edge_occlusion,
                            output_dir_edge_texture, output_dir_keypoints2d, output_dir_keypoints3d,
                            output_dir_curvature, output_dir_normal, output_dir_mask_valid,
                            train_sample_size, val_sample_size, test_sample_size, seed=None):
    """Collect aligned multi-task image paths and build train/val/test datasets.

    Walks ``output_dir_keypoints3d`` for ``.png`` files. For each file, the path of
    every other modality is derived by replacing the ``keypoints3d`` tag in the
    file's relative path and joining it onto that modality's root directory. A
    sample is kept only when all derived files exist and its scene (the first
    component of the relative path) is listed in ``train_scenes``,
    ``validation_scenes`` or ``test_scenes`` and is still below its per-scene cap.

    The per-scene cap is ``(split_size // number_of_scenes) * 2`` and there is no
    global cap, so a split may end up larger than the requested size (up to about
    twice as large). The progress-bar total is the sum of the requested sizes and
    is therefore only indicative.

    Args:
        input_dir_rgb: Root directory of the RGB images.
        output_dir_depth: Root directory of the ``depth_euclidean`` images.
        output_dir_reshading: Root directory of the ``reshading`` images.
        output_dir_edge_occlusion: Root directory of the ``edge_occlusion`` images.
        output_dir_edge_texture: Root directory of the ``edge_texture`` images.
        output_dir_keypoints2d: Root directory of the ``keypoints2d`` images.
        output_dir_keypoints3d: Root directory that is walked as the reference.
        output_dir_curvature: Root directory of the ``principal_curvature`` images.
        output_dir_normal: Unused; accepted so existing call sites keep working.
        output_dir_mask_valid: Root directory of the validity masks, whose files
            carry the ``depth_zbuffer`` tag.
        train_sample_size: Requested size of the training split.
        val_sample_size: Requested size of the validation split.
        test_sample_size: Requested size of the test split.
        seed: Optional seed. When given, directories and files are visited in
            sorted order and shuffled with a private ``random.Random(seed)`` so
            the selection is reproducible. When ``None`` (default) the global
            ``random`` state and the native ``os.walk`` order are used.

    Returns:
        Tuple ``(train_dataset, val_dataset, test_dataset)`` of ``datasets.Dataset``.
    """
    column_names = ('rgb', 'depth', 'reshading', 'edge_occlusion', 'edge_texture',
                    'keypoints2d', 'keypoints3d', 'principal_curvature', 'mask_valid', 'scene')

    # (column, root directory, tag that replaces 'keypoints3d' in the relative path)
    companions = (
        ('rgb', input_dir_rgb, 'rgb'),
        ('depth', output_dir_depth, 'depth_euclidean'),
        ('reshading', output_dir_reshading, 'reshading'),
        ('edge_occlusion', output_dir_edge_occlusion, 'edge_occlusion'),
        ('edge_texture', output_dir_edge_texture, 'edge_texture'),
        ('keypoints2d', output_dir_keypoints2d, 'keypoints2d'),
        ('principal_curvature', output_dir_curvature, 'principal_curvature'),
        ('mask_valid', output_dir_mask_valid, 'depth_zbuffer'),
    )

    def make_split(scenes, sample_size):
        # An empty scene list yields a cap of 0 instead of a ZeroDivisionError.
        per_scene_cap = sample_size // len(scenes) * 2 if scenes else 0
        return {
            'scenes': set(scenes),
            'cap': per_scene_cap,
            'data': {name: [] for name in column_names},
        }

    # Order matters: a scene is offered to train first, then validation, then test.
    splits = [
        make_split(train_scenes, train_sample_size),
        make_split(validation_scenes, val_sample_size),
        make_split(test_scenes, test_sample_size),
    ]

    scene_image_count = {}
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle
    total_images_to_collect = train_sample_size + val_sample_size + test_sample_size

    # The context manager closes the bar even if the walk raises.
    with tqdm(total=total_images_to_collect, desc="Collecting images", unit="img") as pbar:
        for root, dirs, files in os.walk(output_dir_keypoints3d):
            if seed is not None:
                # os.walk order is filesystem dependent; sort so the seed fully
                # determines which files are picked.
                dirs.sort()
                files.sort()
            shuffle(files)  # Randomize which files of a directory are sampled

            for file in files:
                if not file.endswith('.png'):
                    continue

                keypoints3d_path = os.path.join(root, file)
                relative_path = os.path.relpath(keypoints3d_path, output_dir_keypoints3d)
                scene_name = relative_path.split(os.sep)[0]  # First folder is the scene name

                # Decide the destination before touching the filesystem: files of
                # unknown or already-full scenes need no existence checks.
                already_taken = scene_image_count.get(scene_name, 0)
                target = next(
                    (split for split in splits
                     if scene_name in split['scenes'] and already_taken < split['cap']),
                    None,
                )
                if target is None:
                    continue

                sample = {
                    column: os.path.join(base_dir, relative_path.replace('keypoints3d', tag))
                    for column, base_dir, tag in companions
                }
                if not all(os.path.exists(path) for path in sample.values()):
                    continue

                sample['keypoints3d'] = keypoints3d_path
                sample['scene'] = scene_name
                for column, value in sample.items():
                    target['data'][column].append(value)

                scene_image_count[scene_name] = already_taken + 1
                pbar.update(1)

    # Every column except 'scene' is decoded lazily as an image from its path.
    features = Features({
        'rgb': Image(),
        'depth': Image(),
        'reshading': Image(),
        'edge_occlusion': Image(),
        'edge_texture': Image(),
        'keypoints2d': Image(),
        'keypoints3d': Image(),
        'principal_curvature': Image(),
        'mask_valid': Image(),
        'scene': Value("string"),
    })

    train_dataset, val_dataset, test_dataset = (
        Dataset.from_dict(split['data'], features=features) for split in splits
    )
    return train_dataset, val_dataset, test_dataset

In [None]:
# Root of the per-task image trees; each task lives in <DATA_ROOT>/<task>/taskonomy.
DATA_ROOT = '/p/openvocabdustr/probing_midlevel_vision/data'

input_dir_rgb = f'{DATA_ROOT}/rgb/taskonomy'
output_dir_depth = f'{DATA_ROOT}/depth_euclidean/taskonomy'
output_dir_reshading = f'{DATA_ROOT}/reshading/taskonomy'
output_dir_edge_occlusion = f'{DATA_ROOT}/edge_occlusion/taskonomy'
# Original (misspelled) name kept as an alias in case another cell still uses it.
output_dir_edge_occulusion = output_dir_edge_occlusion
output_dir_edge_texture = f'{DATA_ROOT}/edge_texture/taskonomy'
output_dir_keypoints2d = f'{DATA_ROOT}/keypoints2d/taskonomy'
output_dir_keypoints3d = f'{DATA_ROOT}/keypoints3d/taskonomy'
output_dir_curvature = f'{DATA_ROOT}/principal_curvature/taskonomy'
output_dir_normal = f'{DATA_ROOT}/normal/taskonomy'
output_dir_mask_valid = f'{DATA_ROOT}/mask_valid/taskonomy'

# Set the sample size for train, validation, and test sets
train_sample_size = 20000
val_sample_size = 2000
test_sample_size = 2000

# Load and sample the custom dataset. Keyword arguments are used because the
# function takes ten directory strings in a row; a positional mix-up would go
# unnoticed until the wrong images were loaded.
train_dataset, val_dataset, test_dataset = load_and_sample_dataset(
    input_dir_rgb=input_dir_rgb,
    output_dir_depth=output_dir_depth,
    output_dir_reshading=output_dir_reshading,
    output_dir_edge_occlusion=output_dir_edge_occlusion,
    output_dir_edge_texture=output_dir_edge_texture,
    output_dir_keypoints2d=output_dir_keypoints2d,
    output_dir_keypoints3d=output_dir_keypoints3d,
    output_dir_curvature=output_dir_curvature,
    output_dir_normal=output_dir_normal,
    output_dir_mask_valid=output_dir_mask_valid,
    train_sample_size=train_sample_size,
    val_sample_size=val_sample_size,
    test_sample_size=test_sample_size,
)

In [None]:
# Report how many samples ended up in each split
for split_label, split_dataset in (
    ("Train", train_dataset),
    ("Val", val_dataset),
    ("Test", test_dataset),
):
    print(f"{split_label} dataset size: {len(split_dataset)}")

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import collections

# Tally how many sampled images each scene contributes to every split
train_scene_distribution = collections.Counter(train_dataset['scene'])
val_scene_distribution = collections.Counter(val_dataset['scene'])
test_scene_distribution = collections.Counter(test_dataset['scene'])


def _scene_counts_frame(scene_counter, split_label):
    """Turn a scene Counter into a (Scene, Count, Type) DataFrame for one split."""
    frame = pd.DataFrame(scene_counter.items(), columns=['Scene', 'Count'])
    frame['Type'] = split_label
    return frame


train_df = _scene_counts_frame(train_scene_distribution, 'Train')
val_df = _scene_counts_frame(val_scene_distribution, 'Validation')
test_df = _scene_counts_frame(test_scene_distribution, 'Test')

# Single long-format table holding all three splits
combined_df = pd.concat([train_df, val_df, test_df])

# One bar per scene, coloured by the split the scene belongs to
plt.figure(figsize=(12, 6))
for split_frame, bar_color, legend_label in (
    (train_df, 'blue', "Train"),
    (val_df, 'orange', "Validation"),
    (test_df, 'green', "Test"),
):
    plt.bar(split_frame["Scene"], split_frame["Count"], color=bar_color, label=legend_label)

# Axis labels, title and legend
plt.xlabel("Scene")
plt.ylabel("Sampled Images Count")
plt.title("Sampled Image Distribution: Train, Validation, and Test Sets")
plt.xticks(rotation=45, ha="right")
plt.legend()

# Render the figure
plt.tight_layout()
plt.show()

# Dump the raw per-scene counts as text as well
for heading, scene_counter in (
    ("Train scene distribution:", train_scene_distribution),
    ("Validation scene distribution:", val_scene_distribution),
    ("Test scene distribution:", test_scene_distribution),
):
    print(heading, scene_counter)

In [None]:
import os
import random
from tqdm import tqdm
from datasets import Dataset, Features, Image, Value

def load_and_sample_rgb_normal_mask_valid(input_dir_rgb, output_dir_normal, output_dir_mask_valid,
                                          train_sample_size, val_sample_size, test_sample_size, seed=None):
    """Collect aligned (rgb, normal, mask_valid) image paths and build train/val/test datasets.

    Walks ``output_dir_normal`` for ``.png`` files. For each file, the RGB and
    validity-mask paths are derived by replacing the ``normal`` tag in the file's
    relative path (with ``rgb`` and ``depth_zbuffer`` respectively) and joining the
    result onto the corresponding root directory. A sample is kept only when both
    derived files exist and its scene (the first component of the relative path)
    is listed in ``train_scenes``, ``validation_scenes`` or ``test_scenes`` and is
    still below its per-scene cap.

    The per-scene cap is ``(split_size // number_of_scenes) * 4`` and there is no
    global cap, so a split may end up larger than the requested size. The
    progress-bar total is the sum of the requested sizes and is only indicative.

    Args:
        input_dir_rgb: Root directory of the RGB images.
        output_dir_normal: Root directory of the normal images; walked as reference.
        output_dir_mask_valid: Root directory of the validity masks, whose files
            carry the ``depth_zbuffer`` tag.
        train_sample_size: Requested size of the training split.
        val_sample_size: Requested size of the validation split.
        test_sample_size: Requested size of the test split.
        seed: Optional seed. When given, directories and files are visited in
            sorted order and shuffled with a private ``random.Random(seed)`` so
            the selection is reproducible. When ``None`` (default) the global
            ``random`` state and the native ``os.walk`` order are used.

    Returns:
        Tuple ``(train_dataset, val_dataset, test_dataset)`` of ``datasets.Dataset``.
    """
    column_names = ('rgb', 'normal', 'mask_valid', 'scene')

    # (column, root directory, tag that replaces 'normal' in the relative path)
    companions = (
        ('rgb', input_dir_rgb, 'rgb'),
        ('mask_valid', output_dir_mask_valid, 'depth_zbuffer'),
    )

    def make_split(scenes, sample_size):
        # An empty scene list yields a cap of 0 instead of a ZeroDivisionError.
        per_scene_cap = sample_size // len(scenes) * 4 if scenes else 0
        return {
            'scenes': set(scenes),
            'cap': per_scene_cap,
            'data': {name: [] for name in column_names},
        }

    # Order matters: a scene is offered to train first, then validation, then test.
    splits = [
        make_split(train_scenes, train_sample_size),
        make_split(validation_scenes, val_sample_size),
        make_split(test_scenes, test_sample_size),
    ]

    scene_image_count = {}
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle
    total_images_to_collect = train_sample_size + val_sample_size + test_sample_size

    # The context manager closes the bar even if the walk raises.
    with tqdm(total=total_images_to_collect, desc="Collecting images", unit="img") as pbar:
        for root, dirs, files in os.walk(output_dir_normal):
            if seed is not None:
                # os.walk order is filesystem dependent; sort so the seed fully
                # determines which files are picked.
                dirs.sort()
                files.sort()
            shuffle(files)  # Randomize which files of a directory are sampled

            for file in files:
                if not file.endswith('.png'):
                    continue

                normal_path = os.path.join(root, file)
                relative_path = os.path.relpath(normal_path, output_dir_normal)
                scene_name = relative_path.split(os.sep)[0]  # First folder is the scene name

                # Decide the destination before touching the filesystem: files of
                # unknown or already-full scenes need no existence checks.
                already_taken = scene_image_count.get(scene_name, 0)
                target = next(
                    (split for split in splits
                     if scene_name in split['scenes'] and already_taken < split['cap']),
                    None,
                )
                if target is None:
                    continue

                sample = {
                    column: os.path.join(base_dir, relative_path.replace('normal', tag))
                    for column, base_dir, tag in companions
                }
                if not all(os.path.exists(path) for path in sample.values()):
                    continue

                sample['normal'] = normal_path
                sample['scene'] = scene_name
                for column, value in sample.items():
                    target['data'][column].append(value)

                scene_image_count[scene_name] = already_taken + 1
                pbar.update(1)

    # Every column except 'scene' is decoded lazily as an image from its path.
    features = Features({
        'rgb': Image(),
        'normal': Image(),
        'mask_valid': Image(),
        'scene': Value("string"),
    })

    train_dataset, val_dataset, test_dataset = (
        Dataset.from_dict(split['data'], features=features) for split in splits
    )
    return train_dataset, val_dataset, test_dataset

In [None]:
# Root of the per-task image trees; each task lives in <DATA_ROOT>/<task>/taskonomy.
DATA_ROOT = '/p/openvocabdustr/probing_midlevel_vision/data'

input_dir_rgb = f'{DATA_ROOT}/rgb/taskonomy'
output_dir_normal = f'{DATA_ROOT}/normal/taskonomy'
output_dir_mask_valid = f'{DATA_ROOT}/mask_valid/taskonomy'

# Set the sample size for train, validation, and test sets
train_sample_size = 20000
val_sample_size = 2000
test_sample_size = 2000

# Load and sample the custom dataset with only RGB, Normal, and Mask_Valid.
# Keyword arguments guard against swapping the three directory strings.
train_normal_dataset, val_normal_dataset, test_normal_dataset = load_and_sample_rgb_normal_mask_valid(
    input_dir_rgb=input_dir_rgb,
    output_dir_normal=output_dir_normal,
    output_dir_mask_valid=output_dir_mask_valid,
    train_sample_size=train_sample_size,
    val_sample_size=val_sample_size,
    test_sample_size=test_sample_size,
)

# Check the sizes of the datasets
print(f"Train dataset size: {len(train_normal_dataset)}")
print(f"Validation dataset size: {len(val_normal_dataset)}")
print(f"Test dataset size: {len(test_normal_dataset)}")

In [None]:
# Write each multi-task split to a Parquet file in the current working directory
for split_dataset, parquet_path in (
    (train_dataset, "train_dataset.parquet"),
    (val_dataset, "val_dataset.parquet"),
    (test_dataset, "test_dataset.parquet"),
):
    split_dataset.to_parquet(parquet_path)

In [None]:
from huggingface_hub import HfApi

In [None]:
api = HfApi()

# Push the three Parquet files to the dataset repo, keeping the local file
# name as the name inside the repo.
for parquet_file in (
    "train_dataset.parquet",
    "val_dataset.parquet",
    "test_dataset.parquet",
):
    api.upload_file(
        path_or_fileobj=parquet_file,
        path_in_repo=parquet_file,  # The name of the file in the repo
        repo_id="Xuweiyi/ssl_probing_taskonomy",  # Replace with your repo
        repo_type="dataset",
    )

In [None]:
# Write each surface-normal split to a Parquet file in the current working directory
for split_dataset, parquet_path in (
    (train_normal_dataset, "train_normal_dataset.parquet"),
    (val_normal_dataset, "val_normal_dataset.parquet"),
    (test_normal_dataset, "test_normal_dataset.parquet"),
):
    split_dataset.to_parquet(parquet_path)

In [None]:
api = HfApi()

# Push the surface-normal Parquet files to their own dataset repo. The
# "_normal" part of the local file name is dropped in the repo.
for local_parquet, repo_parquet in (
    ("train_normal_dataset.parquet", "train_dataset.parquet"),
    ("val_normal_dataset.parquet", "val_dataset.parquet"),
    ("test_normal_dataset.parquet", "test_dataset.parquet"),
):
    api.upload_file(
        path_or_fileobj=local_parquet,
        path_in_repo=repo_parquet,  # The name of the file in the repo
        repo_id="Xuweiyi/ssl_probing_taskonomy_snorm",  # Replace with your repo
        repo_type="dataset",
    )

In [None]:
from datasets import load_dataset

# Directory passed to load_dataset for all three splits (a filesystem path,
# not a Hub repo id).
SNORM_DATASET_DIR = "/p/openvocabdustr/probing_midlevel_vision/data/probing_ssl_snorm"

# Load the train split
train_normal_dataset = load_dataset(SNORM_DATASET_DIR, split="train")

# Load the validation split
val_normal_dataset = load_dataset(SNORM_DATASET_DIR, split="validation")

# Load the test split
test_normal_dataset = load_dataset(SNORM_DATASET_DIR, split="test")

# Report the size of every split
for split_label, split_dataset in (
    ("Train", train_normal_dataset),
    ("Val", val_normal_dataset),
    ("Test", test_normal_dataset),
):
    print(f"{split_label} dataset size: {len(split_dataset)}")

In [None]:
import numpy as np
from tqdm import tqdm
from PIL import Image
import torch
from transformers import OneFormerProcessor, OneFormerForUniversalSegmentation

# Initialize the OneFormer model and processor from the same checkpoint;
# the model is placed on the GPU when one is available, otherwise on the CPU.
ONEFORMER_CHECKPOINT = "shi-labs/oneformer_ade20k_swin_large"

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

processor = OneFormerProcessor.from_pretrained(ONEFORMER_CHECKPOINT)
model = OneFormerForUniversalSegmentation.from_pretrained(ONEFORMER_CHECKPOINT)
model = model.to(device)

def add_panoptic_map_and_id2label(example):
    """Run panoptic segmentation on ``example['rgb']`` and attach the result.

    Uses the module-level ``processor``, ``model`` and ``device``.

    Args:
        example: Mapping with an ``'rgb'`` entry holding the image to segment.
            Assumed to be a PIL image (its ``.size`` is reversed to build the
            target size) - TODO confirm for other image types.

    Returns:
        The same ``example`` object with two keys added:

        * ``'panoptic_map'``: NumPy array of per-pixel segment ids.
        * ``'id2label'``: dict keyed by the segment id **as a string**, each
          value holding ``label_id``, ``was_fused``, ``score`` and ``area``
          (number of pixels carrying that segment id).
    """
    # Extract the RGB image directly from the example
    image = example['rgb']

    # Preprocess the image and run inference without tracking gradients
    inputs = processor(image, ["panoptic"], return_tensors="pt", do_reduce_labels=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    # Post-process back to the input resolution; image.size is reversed to
    # form the target size handed to the post-processor.
    panoptic_result = processor.post_process_panoptic_segmentation(
        outputs, target_sizes=[image.size[::-1]]
    )[0]

    # Move the segment-id map to host memory as a NumPy array
    panoptic_map = panoptic_result["segmentation"].cpu().numpy()

    id2label = {}
    for segment in panoptic_result["segments_info"]:
        segment_id = segment['id']

        # String keys: when this function runs under Dataset.map the dict is
        # converted to an Arrow struct, whose field names must be str/bytes;
        # integer keys make that conversion fail.
        # NOTE(review): different images have different segment ids, so the
        # set of keys varies per example - confirm the downstream schema copes.
        id2label[str(segment_id)] = {
            "label_id": segment['label_id'],
            "was_fused": segment.get('was_fused', False),
            "score": segment.get('score', 0),
            # Plain Python int rather than a NumPy scalar
            "area": int(np.count_nonzero(panoptic_map == segment_id)),
        }

    # Add the panoptic_map and id2label to the dataset example
    example['panoptic_map'] = panoptic_map
    example['id2label'] = id2label

    return example

In [None]:
# Attach panoptic maps to every split, in train -> validation -> test order
train_normal_dataset, val_normal_dataset, test_normal_dataset = (
    split_dataset.map(add_panoptic_map_and_id2label)
    for split_dataset in (train_normal_dataset, val_normal_dataset, test_normal_dataset)
)

In [None]:
import sys
import os

# Directory that contains the local `transforms` module
TASKONOMY_PREPROCESS_DIR = (
    "/p/openvocabdustr/probing_midlevel_vision/code/probing-mid-level-vision/data_processing/taskonomy_preprocess"
)

# Make that directory the working directory. Note that this also changes where
# any later relative path in the notebook resolves.
os.chdir(TASKONOMY_PREPROCESS_DIR)

# Put the (new) current directory on the import path
sys.path.append(".")

# Use absolute import
from transforms import task_transform

In [None]:
def task_transform_wrapper(example):
    """Add a ``<task>_processed`` entry for each of rgb / normal / mask_valid present.

    For every one of the keys ``'rgb'``, ``'normal'`` and ``'mask_valid'`` found in
    ``example``, calls ``task_transform(value, key)`` and stores the result under
    ``key + '_processed'``. Keys that are absent are skipped. The input mapping is
    modified in place and returned.
    """
    for task_name in ('rgb', 'normal', 'mask_valid'):
        if task_name not in example:
            continue
        example[task_name + '_processed'] = task_transform(example[task_name], task_name)
    return example

In [None]:
# Apply the per-task transforms to the training split, one example at a time.
# The former `batch_size=1000` argument was dropped: `batch_size` is only
# consulted when `batched=True`, and `task_transform_wrapper` expects a single
# example, so the argument had no effect and wrongly suggested batching.
train_normal_dataset = train_normal_dataset.map(
    task_transform_wrapper,
    num_proc=12,  # Use multiple processes for efficiency
    desc="Processing Train Set: Transforming and Saving New Keys",
)