Make point cloud samples for training treePoinTr models

### 1 make complete samples only for viewpoint method



In [None]:
import numpy as np
import os
import glob
import random
import pandas as pd
import sys

import make_samples_complete

In [None]:
# Path to the folder with single-tree point cloud files.
inpath = "path/to/singletrees/"
# Output path for the generated samples.
outpath = "path/for/output/"

# mksamples loops through all files (.npy, .xyz, .txt, .ply) in inpath and
# makes the specified number of samples from each file; samples are created
# while in range(stop_count).
# Only stop_count and method are passed here. The other options mentioned for
# this function (start_count, boxsize) are not set in this call.
# NOTE(review): method='grove' presumably names the data source -- confirm
# against make_samples_complete.mksamples.
make_samples_complete.mksamples(inpath, outpath, stop_count=100, method='grove')

In [None]:
# Folder holding the samples: the `outpath` given to mksamples in the previous cell.
path_samples = "path/to/samples/" # outpath from before
# Directory passed to train_test_txt as its `outpath`.
path = "path/to/parentdirectory"

# Divide the samples into train and test sets by writing a txt file.
# NOTE(review): the meaning of test_trees=False is defined in
# make_samples_complete.train_test_txt -- confirm there.
make_samples_complete.train_test_txt(path_samples, outpath=path, test_trees = False)

### 2 make complete and partial samples from simulated scans

Partial samples are based on the simulated scan positions (legs)

This version is an update based on treePoinTr

In [None]:
import numpy as np
import open3d as o3d
import random
import os
import json
import shutil
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D

import make_samples_from_sim


In [None]:
# Paths
# All paths below are built by plain string concatenation, so `root` and every
# folder path must end with a slash.

root = "/home/katnips/treePoinTr/"

# Path to the folder with the complete single-tree point cloud files (.npy).
fulltree_path = root + "data_brut/data_treepointr/TheGrove_pointclouds_npy/" #TheGrove_pointclouds_npy/
# Path to the folder with the individual legs of the Helios++ simulations in .xyz format.
sim_path = root + "data_brut/data_treepointr/HeliosSim/"

# Output path for the samples. Exactly one line is active at a time; the
# commented alternatives are the other dataset variants (points per sample _ cube volume).
#outpath = root + "data_training/tree_ada_v2_synthetic_8192_1/"
#outpath = root + "data_training/tree_ada_v2_synthetic_2048_1/"
#outpath = root + "data_training/tree_ada_v2_synthetic_2048_05/"
outpath = root + "data_training/tree_ada_v2_synthetic_8192_05/"

In [None]:
# Load or generate the sample centers of every tree so that all dataset
# variants can reuse the same centers (reproducibility).
# Centers are generated with cube_size=1 and min_volume=0.5, num_samples per tree,
# and are stored next to the first dataset variant as "<treename>_centers.npy".

num_samples = 200
fulltree_path_npy = root + "data_brut/data_treepointr/TheGrove_pointclouds_npy"
first_outpath = root + "data_training/tree_ada_v2_synthetic_8192_1/"

if not os.path.exists(first_outpath):
    os.makedirs(first_outpath)

for file in os.listdir(fulltree_path_npy):
    # Only .npy point clouds are trees; skip everything else.
    if not file.endswith(".npy"):
        continue

    treename = os.path.splitext(file)[0]
    pc_path = os.path.join(fulltree_path_npy, file)
    centers_path = os.path.join(first_outpath, f"{treename}_centers.npy")

    # Centers already on disk: just load them and move on to the next tree.
    if os.path.exists(centers_path):
        print(f"Loading existing centers for {treename}")
        centers = np.load(centers_path)
        continue

    print(f"Generating centers for {treename}")
    pc = np.load(pc_path)
    centers = make_samples_from_sim.generate_sample_centers(
        pc,
        num_samples=num_samples,
        cube_size=1,
        min_volume=0.5,
        save_path=centers_path,
    )
    print(f"Generated {len(centers)} valid centers for {treename}")

In [None]:
# Create the complete and partial samples of every tree from its sample centers.

num_samples = 200  # number of samples per tree
nb_points = 8192   # points kept by the downsampling: 8192 or 2048
# 0.5 for 1 m3 cubes, 0.3968 for 0.5 m3 cubes.
# NOTE(review): (2 * 0.3968) ** 3 is about 0.5, so boxsize looks like half the
# cube edge length -- confirm against mksamples_sim_for_one_tree.
boxsize = 0.3968

for file in os.listdir(fulltree_path):
    if not file.endswith(".npy"):
        continue

    treename = os.path.splitext(file)[0]
    pc_path = os.path.join(fulltree_path, file)
    centers_path = os.path.join(first_outpath, f"{treename}_centers.npy")

    if not os.path.exists(centers_path):
        # Fallback when the centers were not generated beforehand.
        # NOTE(review): this passes cube_size=boxsize whereas the dedicated
        # center-generation cell passes cube_size=1 -- confirm which is intended.
        pc = np.load(pc_path)
        centers = make_samples_from_sim.generate_sample_centers(
            pc,
            num_samples=num_samples,
            cube_size=boxsize,
            min_volume=0.5,
            save_path=centers_path,
        )
        print(f"Generated centers for {treename}")
    else:
        centers = np.load(centers_path)
        print(f"Loaded centers for {treename}")

    # The point cloud is (re)loaded here for the sampling step itself.
    pc = np.load(pc_path)

    print(f"Start processing tree: {treename} with {len(centers)} centers")

    make_samples_from_sim.mksamples_sim_for_one_tree(
        pc=pc,
        sim_path=sim_path,
        outpath=outpath,
        centers=centers,
        treename=treename,
        stop_count=num_samples,
        nb_points=nb_points,
        boxsize=boxsize,
    )

    print(f"Finished processing tree: {treename}")

In [None]:
# Reduce the number of partial samples  -> 15 max

def keep_balanced_random_partial_variants(root_dir, max_total=15):
    """
    Thin out the .npy files of every folder below root_dir to max_total files.

    WARNING: this deletes files on disk.

    A folder holding max_total or fewer .npy files is left untouched. Otherwise
    exactly max_total files are kept:
    - max_total // 2 with '_noise_downsample' in the name,
    - the remainder without '_noise_downsample';
    when one group is too small, the other group fills the gap.
    Files that do not end in '.npy' are never removed.

    The choice is random but reproducible: a private generator seeded with 42
    is used (the global `random` state is not touched), and folders and file
    names are visited in sorted order so the result does not depend on the
    order in which the file system lists them.

    Args:
        root_dir: directory that is walked recursively.
        max_total: maximum number of .npy files to keep per folder.
    """
    rng = random.Random(42)

    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Sorting in place makes os.walk descend in a deterministic order, so
        # the generator is consumed in the same sequence on every machine.
        dirnames.sort()

        npy_files = sorted(f for f in filenames if f.endswith('.npy'))
        if len(npy_files) <= max_total:
            continue  # Already within the limit

        # Split into noise and clean groups
        noise_files = [f for f in npy_files if '_noise_downsample' in f]
        clean_files = [f for f in npy_files if '_noise_downsample' not in f]

        # Noise gets (at most) half, clean gets the rest ...
        n_noise = min(max_total // 2, len(noise_files))
        n_clean = min(max_total - n_noise, len(clean_files))
        # ... and whatever clean could not provide is topped up from noise.
        # The folder holds more than max_total files, so this always reaches max_total.
        n_noise = min(max_total - n_clean, len(noise_files))

        keep_set = set(rng.sample(noise_files, n_noise))
        keep_set.update(rng.sample(clean_files, n_clean))

        for f in npy_files:
            if f not in keep_set:
                os.remove(os.path.join(dirpath, f))
                #print(f"Removed: {os.path.join(dirpath, f)}")

# Usage: thin out the partial samples of the training split (deletes files on disk).
# The path is built by string concatenation, so `outpath` must end with a slash here.
partial_outpath = outpath + "train/partial/"
keep_balanced_random_partial_variants(partial_outpath, max_total=15)


In [None]:
# Function to reduce the number of cubes for each dataset (5 000 cubes per dataset, with at least 1 partial sample per cube)

def reduce_partial_samples(complete_dir, partial_dirs, max_partial_samples=5000, min_per_cube=1, seed=42):
    """
    Delete partial samples until roughly max_partial_samples remain.

    WARNING: this deletes .npy files below every directory in partial_dirs.

    A cube is identified by the path of its complete file relative to
    complete_dir, without the extension (e.g. "tree/cube"). The partial
    samples of that cube are the .npy files found in the folder with the same
    relative path below one of the partial_dirs.

    Selection rules:
    - every cube keeps min_per_cube randomly chosen partial samples (or all of
      them when it has no more than that); this has priority over the limit,
      so more than max_partial_samples files can remain when there are many cubes;
    - the remaining quota is filled with a random choice among the other
      partial samples;
    - partial samples whose folder matches no complete cube are always deleted.

    The choice is reproducible for a given seed: a private generator is used
    (the global `random` state is not touched) and folders and files are
    visited in sorted order, independent of the file-system listing order.

    Args:
        complete_dir: root folder of the complete samples.
        partial_dirs: list of root folders holding the partial samples.
        max_partial_samples: target number of partial samples to keep.
        min_per_cube: number of partial samples guaranteed per cube.
        seed: seed of the random selection.
    """
    rng = random.Random(seed)

    def _cube_key(rel_path):
        # Same separator on every platform so that complete and partial ids
        # compare equal (os.path.relpath returns backslashes on Windows).
        return rel_path.replace(os.sep, "/")

    # Step 1: Get all complete cubes
    cubes = []
    for root, dirs, files in os.walk(complete_dir):
        dirs.sort()  # deterministic traversal
        for f in sorted(files):
            if f.endswith(".npy"):
                rel_path = os.path.relpath(os.path.join(root, f), complete_dir)
                cubes.append(_cube_key(os.path.splitext(rel_path)[0]))

    print(f"Total complete cubes: {len(cubes)}")

    # Step 2: Collect all partial samples for each cube
    cube_to_partials = defaultdict(list)
    for pdir in partial_dirs:
        for root, dirs, files in os.walk(pdir):
            dirs.sort()
            cube_id = _cube_key(os.path.relpath(root, pdir))
            for f in sorted(files):
                if f.endswith(".npy"):
                    cube_to_partials[cube_id].append(os.path.join(root, f))

    total_partials_before = sum(len(v) for v in cube_to_partials.values())
    print(f"Total partial samples before reduction: {total_partials_before}")

    # Step 3: Keep at least `min_per_cube` partial samples per cube (if available)
    guaranteed_partials = []
    leftovers = []

    for cube in cubes:
        partials = cube_to_partials.get(cube, [])
        if len(partials) <= min_per_cube:
            guaranteed_partials.extend(partials)  # keep all if not more than min
        else:
            chosen = set(rng.sample(partials, min_per_cube))
            guaranteed_partials.extend(chosen)
            # Everything else competes for the remaining quota
            leftovers.extend(p for p in partials if p not in chosen)

    print(f"Guaranteed partial samples (min {min_per_cube} per cube): {len(guaranteed_partials)}")

    # Step 4: Randomly select additional partials from leftovers to fill up to max_partial_samples
    remaining_quota = max_partial_samples - len(guaranteed_partials)
    if remaining_quota <= 0:
        extras = []
    elif len(leftovers) > remaining_quota:
        extras = rng.sample(leftovers, remaining_quota)
    else:
        extras = leftovers

    # Step 5: Delete all partial files that are not in the keep list
    to_keep = set(guaranteed_partials)
    to_keep.update(extras)
    removed_count = 0
    kept_count = 0

    for pdir in partial_dirs:
        for root, _, files in os.walk(pdir):
            for f in files:
                if f.endswith(".npy"):
                    full_path = os.path.join(root, f)
                    if full_path in to_keep:
                        kept_count += 1
                    else:
                        os.remove(full_path)
                        removed_count += 1

    print(f"Kept {kept_count} partial samples, removed {removed_count}")


In [None]:
# Reduce the partial samples of one dataset variant (deletes files on disk).

points = "8192"  # points per sample: "8192" or "2048"
size = "05"      # cube volume tag: "1" or "05" (m3)

# Both folders live in the training split of the selected dataset variant.
train_dir = os.path.join(root, f"data_training/tree_ada_v2_synthetic_{points}_{size}", "train")
complete_dir = os.path.join(train_dir, "complete")
partial_dirs = [os.path.join(train_dir, "partial")]

reduce_partial_samples(
    complete_dir=complete_dir,
    partial_dirs=partial_dirs,
    max_partial_samples=5000,
    min_per_cube=1,
    seed=42,
)


In [None]:
# Check if everything is ok with the partial and complete samples

def check_partial_and_complete_info(complete_dir, partial_dirs):
    """
    Print a quick sanity report for one dataset.

    Reports the number of partial .npy files found below partial_dirs and, for
    the first complete .npy file found below complete_dir, its number of
    points, the size of its bounding box along the first three columns and
    the resulting box volume.

    Args:
        complete_dir: root folder of the complete samples.
        partial_dirs: list of root folders of the partial samples.
    """
    print(f"\n--- Checking dataset in: {complete_dir} ---")

    # Every .npy file below any of the partial folders counts as one sample.
    total_partials = sum(
        1
        for pdir in partial_dirs
        for _, _, names in os.walk(pdir)
        for fname in names
        if fname.endswith(".npy")
    )
    print(f"Total partial .npy files: {total_partials}")

    # First complete sample in walk order, or None when there is none.
    example_file = next(
        (
            os.path.join(folder, fname)
            for folder, _, names in os.walk(complete_dir)
            for fname in names
            if fname.endswith(".npy")
        ),
        None,
    )

    if not example_file:
        print("No complete .npy files found")
        return

    pc = np.load(example_file)
    n_points = pc.shape[0]
    print(f"Example complete file: {example_file}")
    print(f"Number of points: {n_points}")

    # Axis-aligned bounding box of the sample
    bbox_sizes = pc.max(axis=0) - pc.min(axis=0)
    volume = bbox_sizes[0] * bbox_sizes[1] * bbox_sizes[2]

    print(f"Bounding box sizes (X, Y, Z): {bbox_sizes}")
    print(f"Calculated cube volume: {volume:.4f} m³")

# Root path
root = "/home/katnips/treePoinTr/"

# One 1-tuple per dataset variant, holding its folder name below data_training/
datasets = [
    ("tree_ada_v2_synthetic_8192_1",),
    ("tree_ada_v2_synthetic_2048_1",),
    ("tree_ada_v2_synthetic_2048_05",),
    ("tree_ada_v2_synthetic_8192_05",),
]

# Report on the training split of every variant
for name in datasets:
    train_dir = os.path.join(root, "data_training", name[0], "train")
    complete_dir = os.path.join(train_dir, "complete")
    partial_dirs = [os.path.join(train_dir, "partial")]
    check_partial_and_complete_info(complete_dir, partial_dirs)



In [None]:
# Collect and visualize cube volumes from the complete samples

# Function to compute cube volumes
def collect_cube_volumes(dataset_name, complete_dir):
    """
    Compute the bounding-box volume of every .npy sample below complete_dir.

    The volume is the product of the per-column extents (max - min) of the
    loaded array. A file that cannot be loaded or measured is reported on
    stdout and skipped.

    Args:
        dataset_name: label stored with every volume.
        complete_dir: folder that is walked recursively.

    Returns:
        List of (dataset_name, volume) tuples, one per measured file.
    """
    volumes = []
    for folder, _, names in os.walk(complete_dir):
        sample_paths = (os.path.join(folder, n) for n in names if n.endswith(".npy"))
        for path in sample_paths:
            try:
                pc = np.load(path)
                extent = pc.max(axis=0) - pc.min(axis=0)
                volumes.append((dataset_name, np.prod(extent)))
            except Exception as e:
                # Best effort: one broken file must not stop the whole scan
                print(f"Error loading {path}: {e}")
    return volumes

# Dataset configuration: (legend label, folder of the complete training samples)
root = "/home/katnips/treePoinTr"
dataset_configs = [
    (label, os.path.join(root, f"data_training/tree_ada_v2_synthetic_{label}/train/complete"))
    for label in ("2048_1", "2048_05", "8192_1", "8192_05")
]

# One (dataset, volume) row per complete sample, all datasets together
all_volumes = [
    row
    for name, cdir in dataset_configs
    for row in collect_cube_volumes(name, cdir)
]
df = pd.DataFrame(all_volumes, columns=["Dataset", "Volume"])

# Histogram of the cube volumes, one step curve per dataset
plt.figure(figsize=(10, 6))
sns.set_context("notebook", font_scale=1.5)

ax = sns.histplot(
    data=df,
    x="Volume",
    hue="Dataset",
    element="step",
    stat="count",
    common_norm=False,
    bins=50,
    legend=True,
)

# Axis labels (the figure deliberately has no title)
plt.xlabel("Cube volume (m³)", fontsize=16)
plt.ylabel("Number of cubes", fontsize=16)
ax.tick_params(axis='both', labelsize=14)

# Legend: restyle it and pin it to the upper-left corner
leg = ax.get_legend()
if leg is not None:
    leg.set_title("Dataset", prop={'size': 14})
    for text in leg.get_texts():
        text.set_fontsize(13)
    leg.set_bbox_to_anchor((0.02, 0.98))
    leg._loc = 2  # upper left (private matplotlib attribute)

# Layout and export
plt.grid(True)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.savefig(
    "/home/katnips/treePoinTr/figures/synthetic_dataset_cube_volumes_english.png",
    dpi=600,
    bbox_inches='tight',
    pad_inches=0.05,
)
plt.show()


In [None]:
# Split: 80% train, 10% test, 10% val,
# and write a .json file listing the samples of each split.
# NOTE(review): the split ratios and the JSON content are implemented in
# make_samples_from_sim.traintestval_json -- confirm there.

# Output folder of the dataset variant to split. Exactly one line is active.
# The leading "/" assumes `root` has no trailing slash at this point.
#outpath = root + "/data_training/tree_ada_v2_synthetic_8192_1"
#outpath = root + "/data_training/tree_ada_v2_synthetic_2048_1"
#outpath = root + "/data_training/tree_ada_v2_synthetic_2048_05"
outpath = root + "/data_training/tree_ada_v2_synthetic_8192_05"

# Directory containing the complete samples made before
complete_dir_train = outpath+"/train/complete/" 

# Name for the .json file; keep it in sync with the dataset folder chosen above
dataset_name = "tree_ada_v2_synthetic_8192_05"

# Run the split
make_samples_from_sim.traintestval_json(complete_dir_train, outpath, dataset_name)

In [None]:
# Visualize (complete and partial)

def plot_point_cloud(points, ax, title, color='b', s=0.4):
    """
    Draw a point cloud as a 3D scatter plot on the given axes.

    Args:
        points: array indexed as points[:, 0..2] for the x, y and z coordinates.
        ax: 3D axes to draw on.
        title: title shown above the axes.
        color: marker colour.
        s: marker size.
    """
    xs, ys, zs = points[:, 0], points[:, 1], points[:, 2]
    ax.scatter(xs, ys, zs, s=s, color=color, alpha=0.7)
    ax.set_title(title, fontsize=8)
    ax.axis('off')
    # Fixed camera so that all panels are seen from the same 3D view
    ax.view_init(elev=30, azim=135)

def extract_cube_prefix(name):
    """Return the first two '_'-separated tokens of name, joined by '_'.

    Example: "ash1_4_size1.0" -> "ash1_4". A name with fewer than two
    separators is returned unchanged.
    """
    # Only the first two tokens matter, so stop splitting after them
    tokens = name.split("_", 2)
    return "_".join(tokens[:2])

# Training split of every dataset variant -> label shown above its column
dataset_info = {
    "tree_ada_v2_synthetic_2048_1/train": "2048 pts – 1m³",
    "tree_ada_v2_synthetic_2048_05/train": "2048 pts – 0.5m³",
    "tree_ada_v2_synthetic_8192_1/train": "8192 pts – 1m³",
    "tree_ada_v2_synthetic_8192_05/train": "8192 pts – 0.5m³",
}

root = "/home/katnips/treePoinTr/data_training"
dataset_dirs = list(dataset_info.keys())

# Step 1: Find the (tree, cube prefix) pairs that exist in every dataset
dataset_complete_cubes = []
for dataset_path in dataset_dirs:
    complete_dir = os.path.join(root, dataset_path, "complete")
    valid_cubes = set()
    for tree_name in os.listdir(complete_dir):
        tree_dir = os.path.join(complete_dir, tree_name)
        if not os.path.isdir(tree_dir):
            continue
        for file in os.listdir(tree_dir):
            if file.endswith(".npy"):
                cube_prefix = extract_cube_prefix(os.path.splitext(file)[0])
                valid_cubes.add((tree_name, cube_prefix))
    dataset_complete_cubes.append(valid_cubes)

common_complete_cubes = set.intersection(*dataset_complete_cubes)
if not common_complete_cubes:
    print("❌ No common cube prefix found.")
    # Stop the cell/script here; unlike exit(), this does not rely on the
    # interactive helper and does not shut down a notebook kernel.
    raise SystemExit

# First common cube in sorted order, so the figure is reproducible
selected_tree, selected_prefix = sorted(common_complete_cubes)[0]
print(f"✔ Selected cube: {selected_tree}/{selected_prefix}")

# Step 2: Load one complete and one clean partial point cloud per dataset
pointcloud_pairs = []
for dataset_path in dataset_dirs:
    label = dataset_info[dataset_path]
    complete_dir = os.path.join(root, dataset_path, "complete", selected_tree)
    partial_dir = os.path.join(root, dataset_path, "partial", selected_tree)

    complete_file = None
    for f in os.listdir(complete_dir):
        # Compare on the name without extension, exactly as in step 1
        if f.endswith(".npy") and extract_cube_prefix(os.path.splitext(f)[0]) == selected_prefix:
            complete_file = os.path.join(complete_dir, f)
            break

    if not complete_file:
        continue

    # A dataset without partial samples for this tree is skipped, not fatal
    if not os.path.isdir(partial_dir):
        continue

    matching_cube_folder = None
    for folder in os.listdir(partial_dir):
        if extract_cube_prefix(folder) == selected_prefix:
            matching_cube_folder = folder
            break

    if not matching_cube_folder:
        continue

    # Clean variants only; sorted so the displayed file does not depend on
    # the listing order of the file system
    partial_files = sorted(
        f for f in os.listdir(os.path.join(partial_dir, matching_cube_folder))
        if f.endswith(".npy") and "_noise_downsample" not in f
    )

    if not partial_files:
        continue

    complete_pc = np.load(complete_file)
    partial_pc = np.load(os.path.join(partial_dir, matching_cube_folder, partial_files[0]))
    pointcloud_pairs.append((label, complete_pc, partial_pc))

# Step 3: Plot (top row: complete, bottom row: partial, one column per dataset)
n = len(pointcloud_pairs)
if n == 0:
    # A zero-width figure cannot be created
    print("No complete/partial pair could be loaded; nothing to plot.")
else:
    fig = plt.figure(figsize=(3.2 * n, 5), dpi=300)

    for idx, (label, complete_pc, partial_pc) in enumerate(pointcloud_pairs):
        ax1 = fig.add_subplot(2, n, idx + 1, projection='3d')
        plot_point_cloud(complete_pc, ax1, f"{label}\nComplete", color='c')

        ax2 = fig.add_subplot(2, n, n + idx + 1, projection='3d')
        plot_point_cloud(partial_pc, ax2, "Partial", color='#a855f7')  # light purple

    plt.tight_layout(pad=0.5)
    plt.subplots_adjust(top=0.98, wspace=0.05, hspace=0.05)

    plt.show()


In [None]:
# Creates variants of point cloud samples by adding noise and downsampling
# Useful for data augmentation, as a second part of the training (idea of curriculum learning)
# It is a parallel dataset to the original one, with the same structure

def add_noise(points, noise_level=0.01):
    """
    Return a copy of points with zero-mean Gaussian noise added to every value.

    Args:
        points: numpy array; every element receives its own noise term.
        noise_level: standard deviation of the noise.

    The noise is drawn from numpy's global random state.
    """
    jitter = np.random.normal(scale=noise_level, size=points.shape)
    noisy = points + jitter
    return noisy

def downsample(points, ratio=0.5):
    """
    Randomly keep int(len(points) * ratio) rows of points, without repetition.

    When that count is 0 (empty input or very small ratio) the input is
    returned unchanged. The rows are chosen with numpy's global random state.
    """
    total = points.shape[0]
    keep = int(total * ratio)
    if keep == 0:
        return points
    picked = np.random.choice(total, keep, replace=False)
    return points[picked]

def create_noisy_parallel_dataset(src_root, dst_root, noise_level=0.01, downsample_ratio=0.5):
    """
    Write a noisy, downsampled copy of every partial sample of a dataset.

    For each split ('train', 'val', 'test') the layout
    <src_root>/<split>/partial/<tree>/<cube>/<file>.npy is mirrored below
    dst_root with the same file names. Each copied array first receives
    Gaussian noise (add_noise) and is then randomly downsampled (downsample).
    Files whose name contains 'noise' or 'downsample' are skipped, and a
    split without a partial folder is reported and skipped.

    Args:
        src_root: root folder of the source dataset.
        dst_root: root folder of the dataset to create.
        noise_level: standard deviation handed to add_noise.
        downsample_ratio: fraction of points handed to downsample.
    """
    for split in ('train', 'val', 'test'):
        src_partial_dir = os.path.join(src_root, split, 'partial')
        dst_partial_dir = os.path.join(dst_root, split, 'partial')

        if not os.path.exists(src_partial_dir):
            print(f"[{split}] Source partial directory does not exist: {src_partial_dir}. Skipping.")
            continue

        print(f"[{split}] Processing partials from {src_partial_dir}...")

        # First level: one folder per tree
        for tree_name in os.listdir(src_partial_dir):
            tree_path = os.path.join(src_partial_dir, tree_name)
            if not os.path.isdir(tree_path):
                continue

            # Second level: one folder per cube (like ash1_4_size1.0)
            for cube_folder in os.listdir(tree_path):
                cube_path = os.path.join(tree_path, cube_folder)
                if not os.path.isdir(cube_path):
                    continue

                # Same <tree>/<cube> sub-path below the destination
                dst_dir = os.path.join(dst_partial_dir, os.path.relpath(cube_path, src_partial_dir))
                os.makedirs(dst_dir, exist_ok=True)

                for filename in os.listdir(cube_path):
                    already_variant = 'noise' in filename or 'downsample' in filename
                    if already_variant or not filename.endswith('.npy'):
                        continue

                    points = np.load(os.path.join(cube_path, filename))
                    # Noise first, then thinning
                    augmented = downsample(add_noise(points, noise_level), downsample_ratio)
                    np.save(os.path.join(dst_dir, filename), augmented)

        print(f"[{split}] Done.")

    print("Parallel noisy dataset creation completed.")

import shutil

def copy_complete_and_json(src_root, dst_root):
    """
    Copy the complete samples and the JSON files of a dataset to dst_root.

    For each split ('train', 'val', 'test') the folder
    <src_root>/<split>/complete is copied recursively to the same place below
    dst_root, merging into an existing folder; a missing source folder is only
    reported. Every '*.json' entry found directly in src_root is then copied
    into dst_root.

    Args:
        src_root: root folder of the source dataset.
        dst_root: root folder of the destination dataset; created if missing.
    """
    # Copy complete folders recursively
    for split in ("train", "val", "test"):
        src_complete = os.path.join(src_root, split, "complete")
        dst_complete = os.path.join(dst_root, split, "complete")
        if os.path.exists(src_complete):
            shutil.copytree(src_complete, dst_complete, dirs_exist_ok=True)
            print(f"Copied complete data: {src_complete} -> {dst_complete}")
        else:
            print(f"Complete folder not found: {src_complete}")

    # The JSON files go directly into dst_root, which does not exist yet when
    # no complete folder was copied above.
    os.makedirs(dst_root, exist_ok=True)

    # Copy JSON file(s) if they exist
    for f in os.listdir(src_root):
        if f.endswith(".json"):
            src_json = os.path.join(src_root, f)
            dst_json = os.path.join(dst_root, f)
            shutil.copy2(src_json, dst_json)
            print(f"Copied JSON file: {src_json} -> {dst_json}")


In [None]:
# Create the parallel noisy dataset for one dataset variant.
# Source: the clean dataset after the train/val/test split; destination: a
# sibling dataset whose folder name carries the "_noisy" suffix.
src_root = "/home/katnips/treePoinTr/data_training/after_split_without_noise/tree_ada_v2_synthetic_8192_1"
dst_root = "/home/katnips/treePoinTr/data_training/after_split_with_noise/tree_ada_v2_synthetic_8192_1_noisy"

# Partial samples: add noise, then downsample
create_noisy_parallel_dataset(
    src_root=src_root,
    dst_root=dst_root,
    noise_level=0.007,       # noise std dev
    downsample_ratio=0.7     # keep 70% points after downsampling
)

# Complete samples and JSON files are copied unchanged
copy_complete_and_json(src_root, dst_root)  

In [None]:
# Visualize the point clouds, including the noisy ones

def plot_point_cloud(points, ax, color='b', s=0.4):
    """
    Draw a point cloud as an untitled 3D scatter plot on the given axes.

    Args:
        points: array indexed as points[:, 0..2] for the x, y and z coordinates.
        ax: 3D axes to draw on.
        color: marker colour.
        s: marker size.
    """
    xs, ys, zs = points[:, 0], points[:, 1], points[:, 2]
    ax.scatter(xs, ys, zs, s=s, color=color, alpha=0.7)
    ax.axis('off')
    # Same camera for every panel
    ax.view_init(elev=30, azim=135)

def extract_cube_prefix(name):
    """Return the part of name up to (not including) its second '_'.

    Example: "ash1_4_size1.0.npy" -> "ash1_4". A name with fewer than two
    underscores is returned unchanged.
    """
    leading_tokens = name.split("_")[:2]
    return "_".join(leading_tokens)

# Datasets: clean datasets and their noisy counterparts ("<name>_noisy")
base_dir_clean = "/home/katnips/treePoinTr/data_training/after_split_without_noise"
base_dir_noisy = "/home/katnips/treePoinTr/data_training/after_split_with_noise"

# Dataset folder -> label shown above its column
dataset_info = {
    "tree_ada_v2_synthetic_2048_1": "2048 pts – 1m³",
    "tree_ada_v2_synthetic_2048_05": "2048 pts – 0.5m³",
    "tree_ada_v2_synthetic_8192_1": "8192 pts – 1m³",
    "tree_ada_v2_synthetic_8192_05": "8192 pts – 0.5m³",
}

dataset_dirs = list(dataset_info.keys())
dataset_complete_cubes = []

# Find a cube that exists in every dataset
for dataset_name in dataset_dirs:
    complete_dir = os.path.join(base_dir_clean, dataset_name, "train", "complete")
    valid_cubes = set()
    for tree_name in os.listdir(complete_dir):
        tree_path = os.path.join(complete_dir, tree_name)
        if not os.path.isdir(tree_path):
            continue
        for fname in os.listdir(tree_path):
            if fname.endswith(".npy"):
                prefix = extract_cube_prefix(fname)
                valid_cubes.add((tree_name, prefix))
    dataset_complete_cubes.append(valid_cubes)

common_complete_cubes = set.intersection(*dataset_complete_cubes)
if not common_complete_cubes:
    print("❌ No common cubes found.")
    # Stop the cell/script here; unlike exit(), this does not rely on the
    # interactive helper and does not shut down a notebook kernel.
    raise SystemExit

# First common cube in sorted order, so the figure is reproducible
selected_tree, selected_prefix = sorted(common_complete_cubes)[0]
print(f"✔ Selected cube: {selected_tree}/{selected_prefix}")

# Load the (complete, clean partial, noisy partial) triplets
pointcloud_triplets = []

for dataset_name in dataset_dirs:
    label = dataset_info[dataset_name]

    clean_complete_dir = os.path.join(base_dir_clean, dataset_name, "train", "complete", selected_tree)
    clean_partial_dir = os.path.join(base_dir_clean, dataset_name, "train", "partial", selected_tree)
    noisy_partial_dir = os.path.join(base_dir_noisy, dataset_name + "_noisy", "train", "partial", selected_tree)

    complete_file = None
    for f in os.listdir(clean_complete_dir):
        if f.endswith(".npy") and extract_cube_prefix(f) == selected_prefix:
            complete_file = os.path.join(clean_complete_dir, f)
            break
    if not complete_file:
        continue

    # A dataset without partial samples for this tree is skipped, not fatal
    if not os.path.isdir(clean_partial_dir):
        continue

    matching_folder = None
    for folder in os.listdir(clean_partial_dir):
        if extract_cube_prefix(folder) == selected_prefix:
            matching_folder = folder
            break
    if not matching_folder:
        continue

    clean_partial_folder = os.path.join(clean_partial_dir, matching_folder)
    noisy_partial_folder = os.path.join(noisy_partial_dir, matching_folder)

    # Only genuinely clean files qualify for the "without noise" row: the
    # noisy dataset is built from exactly these files (names containing
    # 'noise' or 'downsample' are skipped there). Sorted so the displayed
    # file does not depend on the listing order of the file system.
    partial_files = sorted(
        f for f in os.listdir(clean_partial_folder)
        if f.endswith(".npy") and "noise" not in f and "downsample" not in f
    )
    if not partial_files:
        continue

    partial_file = partial_files[0]
    clean_partial_file = os.path.join(clean_partial_folder, partial_file)
    noisy_partial_file = os.path.join(noisy_partial_folder, partial_file)
    has_noisy = os.path.exists(noisy_partial_file)

    complete_pc = np.load(complete_file)
    partial_pc = np.load(clean_partial_file)
    noisy_pc = np.load(noisy_partial_file) if has_noisy else None

    pointcloud_triplets.append((label, complete_pc, partial_pc, noisy_pc))

# Display: one column per dataset, rows = complete / clean partial / noisy partial
n = len(pointcloud_triplets)
if n == 0:
    # plt.subplots cannot build a grid with zero columns
    print("No point cloud triplet could be loaded; nothing to plot.")
else:
    # squeeze=False keeps `axes` two-dimensional even for a single column,
    # so axes[row, col] is always valid
    fig, axes = plt.subplots(
        nrows=3,
        ncols=n,
        figsize=(3.2 * n, 7),
        subplot_kw={"projection": "3d"},
        dpi=300,
        squeeze=False,
    )

    row_labels = [
        "Nuage de points complet", 
        "Partiel (sans bruit)", 
        "Partiel (avec bruit)"
    ]

    for col, (label, complete_pc, partial_pc, noisy_pc) in enumerate(pointcloud_triplets):
        plot_point_cloud(complete_pc, axes[0, col], color='c')
        plot_point_cloud(partial_pc, axes[1, col], color='#a855f7')
        if noisy_pc is not None:
            plot_point_cloud(noisy_pc, axes[2, col], color='#f472b6')
        else:
            axes[2, col].text(0, 0, 0, "Missing", ha='center')
            axes[2, col].axis('off')

        # Dataset label above the column
        axes[0, col].set_title(label, fontsize=9)

    # Row labels on the left of the first column
    for row in range(3):
        axes[row, 0].text2D(-0.15, 0.5, row_labels[row], transform=axes[row, 0].transAxes,
                            fontsize=10, ha='right', va='center', rotation=90)

    plt.tight_layout(pad=0.5)
    plt.subplots_adjust(top=0.92, left=0.07, wspace=0.05, hspace=0.05)
    plt.savefig("/home/katnips/treePoinTr/figures/synthetic_dataset.png", dpi=600, bbox_inches='tight', pad_inches=0.05)
    plt.show()


### 3 make complete and partial samples from real scans

Partial samples are based on individual scan positions (`PointSourceId`) or, alternatively, on a split by `GpsTime`.

In [None]:
import numpy as np
import pandas as pd
import os
import glob
import json
import random
import shutil

import make_samples_from_real

In [None]:
# Folder containing the single-tree files in .xyz format; the files must
# include the column 'PointSourceId' or 'GpsTime'.
fulltree_path = "path/to/singletrees/"
# Output path for the samples.
outpath = "path/for/output/"

# mksamples_real loops through all .xyz files in fulltree_path and makes the
# specified number of complete samples from each file, with the partial
# samples written to a separate directory; samples are created while in
# range(stop_count).
# Only start_count and stop_count are passed here; boxsize is not set in this call.
make_samples_from_real.mksamples_real(fulltree_path, outpath, start_count=0, stop_count=200)

In [None]:
# Directory containing the complete samples made before.
# `outpath` from the previous cell already ends with "/", so this path holds a
# double slash.
complete_dir_train = outpath+"/train/complete/" 
# Name for the .json file (should be the dataset name).
dataset_name = "real"


# Move a random 20% of the created data to a new directory called "test"
# and write a .json file listing all the train and test samples.
# NOTE(review): the 20% ratio is implemented in
# make_samples_from_real.traintest_json -- confirm there.
make_samples_from_real.traintest_json(complete_dir_train, outpath, dataset_name)