## IMPORTING LIBRARIES

In [1]:
import open3d as o3d
import numpy as np
import pandas as pd
import os
import sys
from plyfile import PlyData, PlyElement

ModuleNotFoundError: No module named 'open3d'

## CONFIGURATION

In [3]:

# --- Configuration ---
# Root folder of the downloaded EDF dataset.
# Set the EDF_DATASET_ROOT environment variable to point at your own copy;
# when it is not set, the original hard-coded location is used as a fallback.
dataset_root_path = os.environ.get(
    'EDF_DATASET_ROOT',
    'C:/RUTUL/GBC/WIP/Dataset/EDF Industrial Facility',
)

# File names, relative to dataset_root_path
global_labels_file = 'ytrain_i9bpfD4.csv' # The big CSV file with all ID-class mappings
train_map_file = 'suppfiles_HtLAvex/ytrain_map_ind_station.csv' # Mapping for training stations
test_map_file = 'suppfiles_HtLAvex/ytest_map_ind_station.csv'   # Mapping for testing stations

# Choose whether to load from 'train' or 'test' set
data_split = 'train' # or 'test'

# Choose which station ID to visualize (e.g., 0, 1, 31, 32 based on your map files)
# If data_split is 'train', choose a station_id from ytrain_map_ind_station.csv
# If data_split is 'test', choose a station_id from ytest_map_ind_station.csv
station_id_to_visualize = 0 # Example: visualizing SCAN_0.ply from train set


In [4]:

# --- Define Paths ---
# Reject an unsupported split up front so everything below can assume
# data_split is either 'train' or 'test'.
if data_split not in ('train', 'test'):
    raise ValueError("data_split must be 'train' or 'test'")

_is_train_split = data_split == 'train'

global_labels_path = os.path.join(dataset_root_path, global_labels_file)

# Station map CSV for the selected split.
map_file_path = os.path.join(
    dataset_root_path,
    train_map_file if _is_train_split else test_map_file,
)

# Folder holding the SCAN_<id>.ply files for the selected split.
# NOTE(review): 'test' is the folder name the original code assumed for the
# test split -- confirm it against the actual dataset layout.
ply_folder_path = os.path.join(
    dataset_root_path,
    'xtrain_kW4SLO1' if _is_train_split else 'test',
)

ply_file_name = f'SCAN_{station_id_to_visualize}.ply'
ply_file_path = os.path.join(ply_folder_path, ply_file_name)

# Label ID -> RGB colour (components in [0, 1]) used to paint each point by
# its semantic class. The keys are the label IDs 0-9; the names in the
# comments follow the class list used elsewhere in this notebook.
CLASS_COLORS = {
    0: [0.5, 0.5, 0.5],  # grey    - Background
    1: [1.0, 0.0, 0.0],  # red     - Beams
    2: [0.0, 1.0, 0.0],  # green   - Cabletrays
    3: [0.0, 0.0, 1.0],  # blue    - Civils (walls/floors)
    4: [1.0, 1.0, 0.0],  # yellow  - Gratings
    5: [1.0, 0.5, 0.0],  # orange  - Guardrails
    6: [0.0, 1.0, 1.0],  # cyan    - Hvac
    7: [0.5, 0.0, 0.5],  # purple  - Ladders
    8: [1.0, 0.0, 1.0],  # magenta - Pipping (pipes)
    9: [0.5, 0.5, 0.0],  # olive   - Supports
}


In [5]:

# --- 1. Load the Global Labels File ---
# Reads the global labels CSV and keeps its 'class' column as a flat array
# (all_labels). The station map files index into this array by position.
print(f"Loading global labels from: {global_labels_path}")
try:
    global_labels_df = pd.read_csv(global_labels_path)
    # Ensure 'class' column exists and get its values
    if 'class' not in global_labels_df.columns:
        raise ValueError("Global labels CSV must contain a 'class' column.")
    all_labels = global_labels_df['class'].values
    print(f"Successfully loaded {len(all_labels)} global labels.")
except Exception as e:
    print(f"Error loading global labels CSV: {e}")
    print("Please check 'global_labels_file' and its content.")
    # Re-raise instead of calling exit(): inside a notebook exit() can shut
    # the kernel down (losing all state) and hides the traceback, whereas a
    # raised exception stops "Run All" and shows where the failure happened.
    raise


Loading global labels from: C:/RUTUL/GBC/WIP/Dataset/EDF Industrial Facility\ytrain_i9bpfD4.csv
Successfully loaded 130661990 global labels.


In [6]:

# --- 2. Load the Map File for the chosen split ---
# Looks up the row of the station map for station_id_to_visualize and stores
# its first/last label positions in index_start / index_end.
print(f"Loading map file from: {map_file_path}")
try:
    # The map CSV is read without a header row; columns are named here.
    map_df = pd.read_csv(map_file_path, header=None, names=['Station_index', 'index_start', 'index_end'])
    # Find the row corresponding to our chosen station_id
    station_info = map_df[map_df['Station_index'] == station_id_to_visualize]

    if station_info.empty:
        raise ValueError(f"Station ID {station_id_to_visualize} not found in the map file: {map_file_path}")

    index_start = station_info['index_start'].iloc[0]
    index_end = station_info['index_end'].iloc[0]
    print(f"Found mapping for Station {station_id_to_visualize}: start={index_start}, end={index_end}")

except Exception as e:
    print(f"Error loading map file or finding station info: {e}")
    print("Please check 'map_file_path' and 'station_id_to_visualize'.")
    # Re-raise instead of calling exit(): exit() can kill the notebook kernel
    # and discards the traceback; a raised exception halts execution cleanly.
    raise


Loading map file from: C:/RUTUL/GBC/WIP/Dataset/EDF Industrial Facility\suppfiles_HtLAvex/ytrain_map_ind_station.csv
Found mapping for Station 0: start=0, end=2368289


In [7]:

# --- 3. Extract Labels for the Specific Station ---
# NumPy slicing never raises IndexError: an out-of-range slice is silently
# clipped. Validate the bounds explicitly so a bad map entry (or a labels
# file that is too short) is reported instead of yielding a truncated slice.
if not (0 <= index_start <= index_end < len(all_labels)):
    raise IndexError(
        f"Index error when slicing global labels: range {index_start}..{index_end} "
        f"is not within the {len(all_labels)} loaded labels. "
        "This might indicate an issue with index_start/index_end or global_labels_file size."
    )

labels_for_station = all_labels[index_start : index_end + 1] # +1 because end index is inclusive
print(f"Extracted {len(labels_for_station)} labels for Station {station_id_to_visualize}.")


Extracted 2368290 labels for Station 0.


0.19.0


In [8]:
# --- 4. Load the Point Cloud (.ply file) for the specific station ---
# Reads the station's PLY file with `plyfile`, pulls coordinates from the
# 'points' element (plus optional 'rgb' colours and 'intensity' values) and
# wraps them in an Open3D PointCloud named `pcd`.
print(f"\nAttempting to load PLY file: {ply_file_path}")

# --- DEBUGGING CHECKS ---
if not os.path.exists(ply_file_path):
    print(f"DEBUG: ERROR: PLY file DOES NOT EXIST at the specified path: {ply_file_path}")
    print("Please double-check your 'dataset_root_path', 'data_split' folder names (train/test), and 'station_id_to_visualize'.")
    # Raise a real exception (rather than sys.exit) so the notebook shows a
    # traceback and "Run All" stops at this cell.
    raise FileNotFoundError(ply_file_path)

file_size = os.path.getsize(ply_file_path) / (1024 * 1024) # Size in MB
print(f"DEBUG: PLY file exists. Size: {file_size:.2f} MB")
if file_size < 0.001: # Very small size, might be genuinely empty or just a header
    print("DEBUG: WARNING: PLY file size is extremely small. It might be empty or corrupted.")

# --- MODIFIED PLY LOADING SECTION (using plyfile with correct element names) ---
try:
    print(f"Attempting to load PLY with 'plyfile' library for '{ply_file_name}' using custom element names...")
    plydata = PlyData.read(ply_file_path)

    # Extract points from the 'points' element
    if 'points' in plydata:
        points_element = plydata['points']
        points = np.vstack([points_element['x'], points_element['y'], points_element['z']]).T
        print(f"Successfully extracted {len(points)} points from 'points' element.")
    else:
        raise ValueError("PLY file does not contain a 'points' element as expected from header.")

    # Extract colors from the 'rgb' element
    colors = None
    if 'rgb' in plydata:
        rgb_element = plydata['rgb']
        # `properties` holds PlyProperty objects, not strings, so membership
        # has to be tested against the property *names*.
        rgb_property_names = {prop.name for prop in rgb_element.properties}
        if {'r', 'g', 'b'} <= rgb_property_names:
            colors = np.vstack([rgb_element['r'], rgb_element['g'], rgb_element['b']]).T
            # Open3D expects float colours; cast before any scaling.
            colors = colors.astype(np.float64)
            # Normalize colors to [0, 1] if they are 0-255 (uchar).
            # The size check avoids calling max() on an empty array.
            if colors.size and colors.max() > 1.0:
                colors = colors / 255.0
            print("Successfully extracted RGB colors from 'rgb' element.")
        else:
            print("Warning: 'rgb' element found but missing 'r', 'g', or 'b' properties.")
    else:
        print("No 'rgb' element found in PLY file for colors.")

    # Optionally, extract intensity from the 'intensity' element
    intensity = None
    if 'intensity' in plydata:
        intensity_element = plydata['intensity']
        intensity_property_names = {prop.name for prop in intensity_element.properties}
        if 'i' in intensity_property_names:
            intensity = intensity_element['i']
            print(f"Successfully extracted {len(intensity)} intensity values from 'intensity' element.")
        else:
            print("Warning: 'intensity' element found but missing 'i' property.")
    else:
        print("No 'intensity' element found in PLY file.")


    # Create Open3D PointCloud object
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(points)
    if colors is not None:
        pcd.colors = o3d.utility.Vector3dVector(colors)
    # `intensity` is not attached to the point cloud here; it is only kept
    # as a separate array for later processing.

    if not pcd.has_points():
        raise ValueError("PointCloud created from plyfile has no points after conversion.")

except Exception as e:
    print(f"ERROR: Failed to load PLY file using 'plyfile' library with custom elements: {e}")
    print("This suggests a deeper issue with the PLY file's structure or corruption, or an unexpected header format.")
    # Propagate the original exception so the traceback is preserved.
    raise

print(f"Successfully created Open3D PointCloud object with {len(pcd.points)} points.")



Attempting to load PLY file: C:/RUTUL/GBC/WIP/Dataset/EDF Industrial Facility\xtrain_kW4SLO1\SCAN_0.ply
DEBUG: PLY file exists. Size: 36.14 MB
Attempting to load PLY with 'plyfile' library for 'SCAN_0.ply' using custom element names...
Successfully extracted 2368290 points from 'points' element.
Successfully created Open3D PointCloud object with 2368290 points.


In [9]:
# --- 5. Verify Lengths and Prepare Colors for Visualization ---
# Points and labels are paired by position, so their counts must agree.
# On a mismatch both are truncated to the shorter length so the cloud can
# still be drawn, but the mismatch itself needs investigating.
if len(pcd.points) != len(labels_for_station):
    print(f"\nCRITICAL WARNING: Number of points ({len(pcd.points)}) in '{ply_file_name}' "
          f"does NOT match number of labels ({len(labels_for_station)}) extracted via map file ({index_start}:{index_end+1}).")
    print("This is a common issue due to incorrect indexing, PLY file corruption, or mismatched data generation.")
    print("For visualization, truncating to the minimum length to avoid errors, but investigate this mismatch.")
    min_len = min(len(pcd.points), len(labels_for_station))
    pcd.points = o3d.utility.Vector3dVector(np.asarray(pcd.points)[:min_len])
    # No need to truncate pcd.colors here: the colours are unconditionally
    # replaced by the label-based colours below.
    labels_for_station = labels_for_station[:min_len]
    print(f"Truncated to {min_len} points/labels for visualization.")

# Build the per-point colour array with one vectorised mask assignment per
# class instead of a Python loop over every point.
label_ids = np.asarray(labels_for_station).astype(np.int64)
# Start from light grey so any label missing from CLASS_COLORS keeps the
# same "unknown" colour as before.
colors_from_labels = np.full((len(label_ids), 3), 0.8)
for class_id, class_rgb in CLASS_COLORS.items():
    colors_from_labels[label_ids == class_id] = class_rgb
pcd.colors = o3d.utility.Vector3dVector(colors_from_labels) # OVERWRITE with semantic colors
print("Point cloud colored by semantic labels.")

Point cloud colored by semantic labels.


In [11]:
# --- 6. Visualize the Point Cloud ---
# Opens the Open3D viewer on the label-coloured cloud; the call returns once
# the viewer window has been closed.
viewer_banner = f"\nVisualizing Station {station_id_to_visualize} ({data_split} split). Close the visualization window to continue."
print(viewer_banner)

# Optional alternative for very dense clouds: draw a voxel-downsampled copy.
# pcd_downsampled = pcd.voxel_down_sample(voxel_size=0.05) # Adjust voxel_size as needed
# o3d.visualization.draw_geometries([pcd_downsampled])

geometries_to_show = [pcd]
o3d.visualization.draw_geometries(geometries_to_show)
print("Visualization complete.")




Visualizing Station 0 (train split). Close the visualization window to continue.
Visualization complete.


In [20]:
# --- 7. Initial Exploration Insights ---
# Prints the point count and the per-class label histogram for the loaded
# station, followed by a checklist of things to look for in the viewer.
print("\n--- Initial Exploration Notes ---")
print(f"Number of points loaded for this station: {len(pcd.points)}")

# Label ID -> human-readable class name (IDs are the list positions 0-9).
label_names_map = dict(enumerate([
    "Background", "Beams", "Cabletrays", "Civils", "Gratings",
    "Guardrails", "Hvac", "Ladders", "Pipping", "Supports",
]))

unique_labels, counts = np.unique(labels_for_station, return_counts=True)
print("Label distribution for this station:")
# Only labels that actually occur in this station are listed.
for position in range(len(unique_labels)):
    label = unique_labels[position]
    count = counts[position]
    label_name = label_names_map.get(label, f"Unknown ({label})")
    print(f"  Class {label_name} (ID {label}): {count} points")

print("\n**Key things to observe during visualization (if successful):**")
observation_checklist = (
    "- **Geometric Integrity:** Does the point cloud look complete and well-aligned? Are there obvious holes or misalignments?",
    "- **Label Consistency:** Do the colored regions truly correspond to the object types? For example, are all pipes (magenta) connected correctly, and are walls (blue) forming continuous surfaces?",
    "- **Boundaries:** How clear are the boundaries between different segmented objects? Are they sharp or blurry?",
    "- **Object Sizes & Density:** Observe the varying sizes of objects (e.g., large walls vs. thin pipes) and the point density on different surfaces.",
    "- **Occlusions:** Identify areas where objects are hidden behind others. This affects how well algorithms can segment them.",
    "- **Noise & Outliers:** Look for isolated points or small clusters far from the main structures, which might be sensor noise.",
    "- **Overall Complexity:** Get a feel for the complexity of the industrial scene and the challenges it might pose for segmentation.",
)
for checklist_item in observation_checklist:
    print(checklist_item)


--- Initial Exploration Notes ---
Number of points loaded for this station: 2368290
Label distribution for this station:
  Class Background (ID 0): 61829 points
  Class Cabletrays (ID 2): 39641 points
  Class Civils (ID 3): 400446 points
  Class Guardrails (ID 5): 1403 points
  Class Hvac (ID 6): 2766 points
  Class Pipping (ID 8): 1705937 points
  Class Supports (ID 9): 156268 points

**Key things to observe during visualization (if successful):**
- **Geometric Integrity:** Does the point cloud look complete and well-aligned? Are there obvious holes or misalignments?
- **Label Consistency:** Do the colored regions truly correspond to the object types? For example, are all pipes (magenta) connected correctly, and are walls (blue) forming continuous surfaces?
- **Boundaries:** How clear are the boundaries between different segmented objects? Are they sharp or blurry?
- **Object Sizes & Density:** Observe the varying sizes of objects (e.g., large walls vs. thin pipes) and the point dens

### SCAN0


Number of points loaded for this station: 2368290


Class Background (ID 0): 61829 points :: GREY

Class Beam (ID 1): 0 points :: Red

Class Cabletrays (ID 2): 39641 points :: GREEN

Class Civils (ID 3): 400446 points :: Blue(walls and floors)

Class Gratings (ID 4): 0 points :: Yellow

Class Guardrails (ID 5): 1403 points :: Orange

Class Hvac (ID 6): 2766 points :: Cyan

Class ladders (ID 7): 0 points :: purple

Class Pipping (ID 8): 1705937 points :: Magenta for pipes

Class Supports (ID 9): 156268 points :: olive


    0: [0.5, 0.5, 0.5],  # Background (grey)
    1: [1.0, 0.0, 0.0],  # Beams (red)
    2: [0.0, 1.0, 0.0],  # Cabletrays (green)
    3: [0.0, 0.0, 1.0],  # Civils (blue - for walls/floors)
    4: [1.0, 1.0, 0.0],  # Gratings (yellow)
    5: [1.0, 0.5, 0.0],  # Guardrails (orange)
    6: [0.0, 1.0, 1.0],  # Hvac (cyan)
    7: [0.5, 0.0, 0.5],  # Ladders (purple)
    8: [1.0, 0.0, 1.0],  # Pipping (magenta - for pipes!)
    9: [0.5, 0.5, 0.0],  # Supports (olive)

Label distribution for this station:: This is critical!

Look at the Class <Name> (ID <ID>): <Count> points for each unique label.

Identify Class Imbalance: You will likely see that some classes (e.g., 'Civils', 'Pipping') have many more points than others (e.g., 'Ladders', 'Gratings'). This class imbalance is a common challenge in segmentation and will heavily influence your model training strategy (e.g., requiring weighted loss functions, oversampling rare classes).
--> There is a huge class imbalance: pipes account for about 72% of the total points (1,705,937 of 2,368,290), whereas the rarest class present, guardrails, has only about 1.4k points (1,403).

Confirm all expected classes are present: Are all 10 classes potentially represented in this scan, or just a subset?

--> Not all classes have points: gratings, beams and ladders are not present in this scan.


Identify Objects by Color: Based on your CLASS_COLORS map, try to visually confirm what each color represents. For SCAN_0.ply, can you clearly distinguish:

Magenta (Pipping): Do you see continuous pipe structures?
--> They are dense but not continuous; there are large gaps in between.

Blue (Civils): Are these mainly walls, floors, or large structural elements?
--> Yes. Similarly, some are dense whereas others are sparse.

Green (Cabletrays): Can you pick out the cable trays?
--> Yes, there are fewer of them but they can be identified; they are dense, but not continuous in places.
Red (Beams): Are there structural beams?
--> There are no points for that class in this scan.
Other Colors: What about Gratings, Guardrails, HVAC, Ladders, Supports, Background?


Boundary Observation: How clean are the boundaries between different colored segments? Are they sharp, or do they bleed into each other?
--> There are some overlaps; otherwise the boundaries are clean and sharp. Magenta and blue overlap in a few places, but only slightly. Olive and green overlap a bit, but I guess some overlap between them is to be expected.
Density & Completeness: Are there areas that are very dense or very sparse? Are there missing sections or large holes?
--> There are dense regions, but there are also missing sections.

Noise/Outliers: Do you see any isolated points floating away from the main structures that appear to be noise?
--> Yes, there are many sections that lie far away from the main structures and do not appear to be of any importance.

The scene looks as if it was captured from a single static LiDAR sensor or scanner, and the point cloud was then generated from that one viewpoint. Because of that, surfaces facing one specific direction have points, while the same objects viewed from the opposite direction have none, although it is difficult to tell exactly which direction the scanner was facing. For example, a horizontal pipe has a moderately dense set of points on one side but no points on the opposite side, so it looks like half a pipe.




## Checking for the points in each file

In [18]:
import pandas as pd
import os

# Summarise how many points each scan contains, using the station map file
# (each row gives a station index and its inclusive start/end label positions).
Suppfiles_path = 'C:/RUTUL/GBC/WIP/Dataset/EDF Industrial Facility/suppfiles_HtLAvex' # <<< VERIFY THIS PATH

# NOTE(review): these two assignments overwrite the values set in the
# configuration cell (which include the 'suppfiles_HtLAvex/' prefix).
# Re-run the configuration cell before re-running the path-definition cell.
train_map_file = 'ytrain_map_ind_station.csv'
test_map_file = 'ytest_map_ind_station.csv'

map_columns = ['Station_index', 'index_start', 'index_end']

# Load train map
train_map_path = os.path.join(Suppfiles_path, train_map_file)
print(train_map_path)
train_map_df = pd.read_csv(train_map_path, header=None, names=map_columns)
# index_end is inclusive, hence the +1
train_map_df['point_count'] = train_map_df['index_end'] - train_map_df['index_start'] + 1
train_map_df['split'] = 'train'

# Load test map (disabled; enable once the test files are available)
# test_map_path = os.path.join(Suppfiles_path, test_map_file)
# test_map_df = pd.read_csv(test_map_path, header=None, names=map_columns)
# test_map_df['point_count'] = test_map_df['index_end'] - test_map_df['index_start'] + 1
# test_map_df['split'] = 'test'

# Combine and sort for easy viewing
all_scans_df = train_map_df
all_scans_df_sorted = all_scans_df.sort_values(by='point_count').reset_index(drop=True)

median_count = all_scans_df_sorted['point_count'].median()

print("--- Scan Sizes Overview ---")
print(f"Total scans: {len(all_scans_df_sorted)}")
print(f"Minimum points in a scan: {all_scans_df_sorted['point_count'].min()}")
print(f"Maximum points in a scan: {all_scans_df_sorted['point_count'].max()}")
print(f"Average points per scan: {all_scans_df_sorted['point_count'].mean():.2f}")
print(f"Median points per scan: {median_count}")

print("\n--- Top 5 Smallest Scans ---")
print(all_scans_df_sorted.head(5))

print("\n--- Top 5 Largest Scans ---")
print(all_scans_df_sorted.tail(5))

print("\n--- Example of 'Average' Sized Scans (around median) ---")
# Scans whose size is within +/-10% of the median (bounds inclusive).
near_median_df = all_scans_df_sorted[
    all_scans_df_sorted['point_count'].between(median_count * 0.9, median_count * 1.1)
]
# The sample size must be capped by the *filtered* frame: asking for more
# rows than it holds raises ValueError. A fixed random_state keeps the
# printed example reproducible across runs.
print(near_median_df.sample(n=min(5, len(near_median_df)), random_state=42).sort_values(by='point_count'))

C:/RUTUL/GBC/WIP/Dataset/EDF Industrial Facility/suppfiles_HtLAvex\ytrain_map_ind_station.csv
--- Scan Sizes Overview ---
Total scans: 50
Minimum points in a scan: 1612043
Maximum points in a scan: 4132927
Average points per scan: 2613239.80
Median points per scan: 2656528.5

--- Top 5 Smallest Scans ---
   Station_index  index_start  index_end  point_count  split
0             45    109487941  111099983      1612043  train
1              4     10953766   12608556      1654791  train
2             37     93562963   95226972      1664010  train
3             42    104568399  106424116      1855718  train
4             58    143275610  145192546      1916937  train

--- Top 5 Largest Scans ---
    Station_index  index_start  index_end  point_count  split
45             64    160422335  163585348      3163014  train
46              8     21074972   24391614      3316643  train
47             12     32396605   35738625      3342021  train
48              2      4856610    8309083      3452