# Global SKM Data Preparation
Take in the global gaussians CSV file and generate the old ID-less array expected by the algorithm explorer.

In [1]:
# Base name of the dataset: it selects the input CSV (`{DATA_NAME}.csv` under DATA_DIR)
# and prefixes the file name of the cached output array.
DATA_NAME = "playroom_23"

# Load the data from CSV

In [2]:
from os.path import join

from numpy import float32
from polars import Float32, Schema, UInt8, UInt32, scan_csv, all

from clustering_exploration.utils.constants import DATA_DIR, IMAGE_HEIGHT, IMAGE_WIDTH, NUMBER_OF_SPLATS_PER_PIXEL

# Columns that describe the sample as a whole rather than one of its splats.
metadata_columns = [
    "sample_index",
    "out_color_r",
    "out_color_g",
    "out_color_b",
    "background_r",
    "background_g",
    "background_b",
]

# Fields stored for every splat slot, in CSV column order.
splat_fields = ["id", "alpha", "depth", "color_r", "color_g", "color_b"]
splat_columns = [
    f"gaussian_{slot}_{field}"
    for slot in range(NUMBER_OF_SPLATS_PER_PIXEL)
    for field in splat_fields
]

# Full column layout: the metadata columns first, followed by every splat slot.
column_names = [*metadata_columns, *splat_columns]

# Schema: every column is Float32 except the integer-typed overrides below.
schema_dict = dict.fromkeys(column_names, Float32)
schema_dict.update(
    {
        "sample_index": UInt32,
        "background_r": UInt8,
        "background_g": UInt8,
        "background_b": UInt8,
    }
)
schema = Schema(schema_dict)

# Lazily scan the CSV; nothing is read until `.collect()` is called.
data_path = join(DATA_DIR, f"{DATA_NAME}.csv")
raw_data = scan_csv(data_path, schema=schema)

# Keep only the per-splat columns and reshape them to
# (IMAGE_HEIGHT * IMAGE_WIDTH, NUMBER_OF_SPLATS_PER_PIXEL, len(splat_fields)).
splat_query = raw_data.select(all().exclude(*metadata_columns))
id_splats = (
    splat_query.collect()
    .to_numpy()
    .astype(float32, copy=False)
    .reshape((IMAGE_HEIGHT * IMAGE_WIDTH, NUMBER_OF_SPLATS_PER_PIXEL, len(splat_fields)))
)

# Get Global Splat Data
- Global list of splat IDs
- Shuffle them

In [3]:
from numpy import unique, delete

# Collect every distinct splat ID that occurs anywhere in the image (sorted ascending).
sorted_unique_ids = unique(id_splats[:, :, 0].flatten().astype(int))

# Drop the first, i.e. smallest, unique value.
# NOTE(review): presumably this is the placeholder ID used for empty splat slots — confirm.
all_ids = delete(sorted_unique_ids, 0)
all_ids

array([      4,       5,       7, ..., 1847085, 1847087, 1847098],
      shape=(707154,))

In [4]:
from numpy.random import default_rng, shuffle

# Fixed seed so the shuffled global order (and the array cached from it) can be
# regenerated exactly. Change this value to produce a different ordering.
SHUFFLE_SEED = 0

# Shuffle the IDs in place. Sorting first makes this cell idempotent: re-running it
# always yields the same order instead of shuffling the previous shuffle again.
all_ids.sort()
default_rng(SHUFFLE_SEED).shuffle(all_ids)
all_ids

array([ 48329, 138548, 223153, ..., 837576, 375627, 501478],
      shape=(707154,))

# Create ID-less splats using global splat order.
- Create an (H×W, 500, 5) array
- For each pixel, copy the splat data in the order of the shuffled IDs

In [5]:
from numpy import isin, zeros, ndarray
from tqdm.auto import tqdm
from joblib import Parallel, delayed


def extract_splat_data_in_order(pixel_splats: ndarray) -> ndarray:
    """Reorder one pixel's splats into the global ID order and strip the ID column.

    Args:
        pixel_splats: 2-D array with one row per splat slot. Column 0 holds the
            splat ID; the remaining columns hold the splat data.

    Returns:
        A float32 array of shape (NUMBER_OF_SPLATS_PER_PIXEL, 5). Its leading rows
        hold the data columns (everything after the ID) of the pixel's splats,
        sorted by the position of their ID in the module-level ``all_ids``. Rows
        whose ID is not in ``all_ids`` are skipped, and when an ID occurs more than
        once only its first row is used. Unused trailing rows stay zero.
    """
    # Imported locally so the function does not rely on imports from other cells.
    from numpy import searchsorted, unique

    pixel_ids = pixel_splats[:, 0]

    # IDs present in this pixel, listed in the global order given by `all_ids`.
    # NOTE(review): this scans all of `all_ids` for every pixel; a rank lookup
    # computed once outside the function would likely be cheaper — confirm.
    pixel_splat_id_order = all_ids[isin(all_ids, pixel_ids)]

    # `unique(..., return_index=True)` yields the sorted distinct IDs together with
    # the row of each ID's first occurrence, so one `searchsorted` maps every
    # ordered ID to its source row without a per-splat scan over the pixel.
    distinct_ids, first_rows = unique(pixel_ids, return_index=True)
    source_rows = first_rows[searchsorted(distinct_ids, pixel_splat_id_order)]

    # Copy the data columns in one shot; rows past the matched splats remain zero.
    pixel_output = zeros((NUMBER_OF_SPLATS_PER_PIXEL, 5), dtype=float32)
    pixel_output[: len(source_rows)] = pixel_splats[source_rows, 1:]

    return pixel_output


# Build one delayed reordering task per pixel (tqdm reports dispatch progress),
# then run the tasks on all available cores. The result is a list with one
# array per pixel, in the same order as `id_splats`.
pixel_tasks = (
    delayed(extract_splat_data_in_order)(pixel_splats)
    for pixel_splats in tqdm(id_splats)
)
output_splats = Parallel(n_jobs=-1)(pixel_tasks)

  0%|          | 0/1051648 [00:00<?, ?it/s]

# Save the splats

In [6]:
from clustering_exploration.utils.constants import CACHE_DIR
from numpy import save

# The ID-carrying input array is no longer needed; drop the reference before saving.
del id_splats

# Write the reordered, ID-less splats to the cache directory as a .npy file.
cache_file_name = f"{DATA_NAME}_global_ordered_tenth.npy"
cache_path = join(CACHE_DIR, cache_file_name)
save(cache_path, output_splats)