In [1]:
import sys
import os
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
  
# Put the parent directory on the module search path; the next cell imports
# `utilities` and `visualisation`, which presumably live there (confirm).
# NOTE: ".." is resolved against the kernel's current working directory,
# not against the location of this notebook file.
sys.path.append("..")

In [2]:
from utilities import FileUtils
from visualisation import ArrayVisualizer

Methods – TODO: these functions need to be moved into `data_processing.py`.


In [None]:
def read_metadata_to_dataframe(base_dir, folder_names):
    """Read per-folder metadata JSON files into a single pandas DataFrame.

    For every name in ``folder_names`` the file
    ``<base_dir>/<name>/<name>_metadata.json`` is loaded. The ``'timesteps'``
    key is removed from each record before it is collected. Folders whose
    metadata file does not exist are reported on stdout and skipped.

    Args:
        base_dir (str): The base directory containing the folders.
        folder_names (list): Folder names; each one is also used as the
            prefix of its metadata file name.

    Returns:
        pandas.DataFrame: One row per metadata file that was found, with one
        column per JSON key (excluding ``'timesteps'``). Empty when no file
        was found.
    """
    records = []

    for folder_name in folder_names:
        file_path = os.path.join(base_dir, folder_name, f"{folder_name}_metadata.json")

        # Missing metadata is not fatal: report it and move on.
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue

        # JSON text is UTF-8 by specification; without an explicit encoding
        # open() falls back to the platform default, which can corrupt or
        # reject non-ASCII content (e.g. on Windows code pages).
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # 'timesteps' is deliberately dropped so it does not become a column.
        data.pop('timesteps', None)
        records.append(data)

    # Build the table once from the collected records.
    return pd.DataFrame(records)

def enhance_metadata(df):
    """Return a copy of the metadata with x/y/z dimension columns added.

    Each ``<axis>_dimension`` column is ``max_<axis>_value - min_<axis>_value``.
    The input DataFrame is left untouched; a new DataFrame is returned, so the
    function can be re-run safely and never writes into a caller's slice.

    Args:
        df (pandas.DataFrame): Metadata containing the columns
            ``min_x_value``, ``max_x_value``, ``min_y_value``, ``max_y_value``,
            ``min_z_value`` and ``max_z_value``.

    Returns:
        pandas.DataFrame: A new DataFrame with ``x_dimension``,
        ``y_dimension`` and ``z_dimension`` columns added (or replaced if they
        already exist).

    Raises:
        KeyError: If any of the required min/max columns is missing.
    """
    required = [f"{bound}_{axis}_value" for axis in "xyz" for bound in ("min", "max")]
    missing = [column for column in required if column not in df.columns]
    if missing:
        # Same exception type a plain column lookup would raise, but it names
        # every missing column at once.
        raise KeyError(f"Metadata is missing required column(s): {missing}")

    # assign() builds a new frame instead of mutating the caller's object.
    return df.assign(
        x_dimension=df['max_x_value'] - df['min_x_value'],
        y_dimension=df['max_y_value'] - df['min_y_value'],
        z_dimension=df['max_z_value'] - df['min_z_value'],
    )


def find_largest_bounding_box(base_dir, model_ids):
    """Find the overall bounding box and the single state with the largest one.

    Every file in ``<base_dir>/<model_id>/states`` is loaded with ``np.load``.
    A point counts as active when ``thickness * velocity`` is non-zero; states
    without any active point are skipped.

    Args:
        base_dir (str): The base directory where model data is stored.
        model_ids (list): A list of model IDs to process.

    Returns:
        tuple: ``(overall, largest_state)`` where

        * ``overall`` is ``(min_x, max_x, min_y, max_y, dim_x, dim_y)`` across
          all active states. If no state has active points the bounds keep
          their ``+/-inf`` start values.
        * ``largest_state`` is a dict with keys ``state_id`` (the state file
          name), ``min_x``, ``min_y``, ``max_x``, ``max_y``, ``dim_x`` and
          ``dim_y`` for the state whose bounding box has the largest area.
          ``state_id`` is ``None`` only when no state had active points.
    """
    largest_bbox = {
        'min_x': float('inf'),
        'min_y': float('inf'),
        'max_x': -float('inf'),
        'max_y': -float('inf'),
    }
    largest_state_bbox = {
        'state_id': None,
        'min_x': float('inf'),
        'min_y': float('inf'),
        'max_x': -float('inf'),
        'max_y': -float('inf'),
        'dim_x': 0,
        'dim_y': 0
    }

    for model_id in model_ids:
        states_dir = os.path.join(base_dir, model_id, 'states')

        # os.listdir() returns entries in arbitrary order; sorting makes the
        # "first state wins on equal area" tie-break reproducible.
        for state_file in sorted(os.listdir(states_dir)):
            state_path = os.path.join(states_dir, state_file)
            state_data = np.load(state_path)

            # Assuming the state data is a structured array with 'x', 'y', 'thickness', 'velocity' fields
            active_indices = np.nonzero(state_data['thickness'] * state_data['velocity'])
            if active_indices[0].size == 0:  # Skip if there's no active debris
                continue

            active_x = state_data['x'][active_indices]
            active_y = state_data['y'][active_indices]
            min_x, max_x = active_x.min(), active_x.max()
            min_y, max_y = active_y.min(), active_y.max()

            # Update overall bounding box
            largest_bbox['min_x'] = min(largest_bbox['min_x'], min_x)
            largest_bbox['min_y'] = min(largest_bbox['min_y'], min_y)
            largest_bbox['max_x'] = max(largest_bbox['max_x'], max_x)
            largest_bbox['max_y'] = max(largest_bbox['max_y'], max_y)

            # Check if this state has the largest bounding box
            state_dim_x = max_x - min_x
            state_dim_y = max_y - min_y
            state_area = state_dim_x * state_dim_y
            largest_area = largest_state_bbox['dim_x'] * largest_state_bbox['dim_y']

            # The first active state is always recorded: a degenerate box
            # (single point or straight line, area 0) would otherwise never
            # beat the initial area of 0 and 'state_id' would stay None.
            no_state_recorded = largest_state_bbox['state_id'] is None
            if no_state_recorded or state_area > largest_area:
                largest_state_bbox.update({
                    'state_id': state_file,
                    'min_x': min_x,
                    'min_y': min_y,
                    'max_x': max_x,
                    'max_y': max_y,
                    'dim_x': state_dim_x,
                    'dim_y': state_dim_y
                })

    overall_dim_x = largest_bbox['max_x'] - largest_bbox['min_x']
    overall_dim_y = largest_bbox['max_y'] - largest_bbox['min_y']

    return (
        (largest_bbox['min_x'], largest_bbox['max_x'], largest_bbox['min_y'], largest_bbox['max_y'], overall_dim_x, overall_dim_y),
        largest_state_bbox
    )




In [None]:
def load_and_analyze_states(base_dir, model_ids):
    """Summarise the debris extent, max velocity and max thickness per model.

    For each model the coordinate vectors
    ``<model_id>_thickness_x_values.npy`` / ``..._y_values.npy`` are loaded
    from the model's ``thickness`` folder. States are then read in order
    (``<model_id>_thickness_<n>.npy`` and ``<model_id>_velocity_<n>.npy`` for
    n = 1, 2, ...) until either file of a pair is missing. A cell is active
    when both thickness and velocity are greater than zero.

    Args:
        base_dir (str): The base directory containing the model folders.
        model_ids (list): List of model IDs corresponding to subfolders.

    Returns:
        pandas.DataFrame: One row per model that has at least one active cell,
        with the columns ``model_id``, ``overall_{min,max,dim}_{x,y}``,
        ``largest_bounding_box_state``,
        ``largest_bounding_box_{min,max,dim}_{x,y}``, ``max_velocity`` and
        ``max_thickness``. Models without any active cell are omitted.
    """
    results_data = []

    for model_id in model_ids:
        overall_min_x = overall_max_x = None
        overall_min_y = overall_max_y = None
        largest_bounding_box_dimensions = (0, 0)
        largest_bounding_box_state = None
        largest_bounding_box_coords = None
        # Running maxima start at 0, so negative-only data would report 0.
        max_velocity = 0
        max_thickness = 0

        thickness_dir = os.path.join(base_dir, model_id, 'thickness')
        velocity_dir = os.path.join(base_dir, model_id, 'velocity')

        # Load real-world x and y coordinates
        x_values = np.load(os.path.join(thickness_dir, f"{model_id}_thickness_x_values.npy"))
        y_values = np.load(os.path.join(thickness_dir, f"{model_id}_thickness_y_values.npy"))

        state_no = 1
        while True:
            thickness_file = os.path.join(thickness_dir, f"{model_id}_thickness_{state_no}.npy")
            velocity_file = os.path.join(velocity_dir, f"{model_id}_velocity_{state_no}.npy")

            # Stop at the first state for which either file is missing.
            if not (os.path.exists(thickness_file) and os.path.exists(velocity_file)):
                break

            thickness_array = np.load(thickness_file)
            velocity_array = np.load(velocity_file)

            # Update max velocity and thickness
            max_velocity = max(max_velocity, np.max(velocity_array))
            max_thickness = max(max_thickness, np.max(thickness_array))

            # Axis 0 (rows) indexes y_values, axis 1 (columns) indexes x_values.
            active = np.where((thickness_array > 0) & (velocity_array > 0))
            if active[0].size > 0:
                rows, cols = active[0], active[1]
                x_first, x_last = x_values[cols.min()], x_values[cols.max()]
                y_first, y_last = y_values[rows.min()], y_values[rows.max()]

                # A coordinate vector may run in descending order (assumes it
                # is monotonic - TODO confirm); ordering the two end values
                # keeps min <= max and the dimensions non-negative.
                min_x, max_x = min(x_first, x_last), max(x_first, x_last)
                min_y, max_y = min(y_first, y_last), max(y_first, y_last)

                # Update overall bounding box
                overall_min_x = min(overall_min_x, min_x) if overall_min_x is not None else min_x
                overall_max_x = max(overall_max_x, max_x) if overall_max_x is not None else max_x
                overall_min_y = min(overall_min_y, min_y) if overall_min_y is not None else min_y
                overall_max_y = max(overall_max_y, max_y) if overall_max_y is not None else max_y

                # Update largest bounding box state
                bounding_box_dimensions = (max_x - min_x, max_y - min_y)
                area = bounding_box_dimensions[0] * bounding_box_dimensions[1]
                largest_area = largest_bounding_box_dimensions[0] * largest_bounding_box_dimensions[1]

                # Always record the first active state: a zero-area box
                # (single cell, or a single row/column) never beats the
                # initial area of 0, which used to leave the coords as None
                # and crash when the result row was built.
                if largest_bounding_box_state is None or area > largest_area:
                    largest_bounding_box_dimensions = bounding_box_dimensions
                    largest_bounding_box_state = state_no
                    largest_bounding_box_coords = (min_x, max_x, min_y, max_y)

            state_no += 1

        # Log the results for the current model
        if overall_min_x is not None and overall_min_y is not None:
            results_data.append({
                'model_id': model_id,
                'overall_min_x': overall_min_x,
                'overall_max_x': overall_max_x,
                'overall_dim_x': overall_max_x - overall_min_x,
                'overall_min_y': overall_min_y,
                'overall_max_y': overall_max_y,
                'overall_dim_y': overall_max_y - overall_min_y,
                'largest_bounding_box_state': largest_bounding_box_state,
                'largest_bounding_box_min_x': largest_bounding_box_coords[0],
                'largest_bounding_box_max_x': largest_bounding_box_coords[1],
                'largest_bounding_box_dim_x': largest_bounding_box_dimensions[0],
                'largest_bounding_box_min_y': largest_bounding_box_coords[2],
                'largest_bounding_box_max_y': largest_bounding_box_coords[3],
                'largest_bounding_box_dim_y': largest_bounding_box_dimensions[1],
                'max_velocity': max_velocity,
                'max_thickness': max_thickness
            })

    # Convert results to a DataFrame
    results_df = pd.DataFrame(results_data)
    return results_df

In [None]:
# Location of the processed data, given relative to the working directory.
data_dir = r'../data/processed'

# Model IDs come from the sub-folder names of data_dir
# (presumably one folder per model - confirm against FileUtils).
model_ids = FileUtils.get_subfolder_names(data_dir)

In [None]:
# Collect each model's <model_id>_metadata.json into one table.
metadata_df = read_metadata_to_dataframe(data_dir, model_ids)

# Add the x/y/z_dimension columns (max - min per axis).
metadata_df = enhance_metadata(metadata_df)

In [None]:
# Preview the first rows of the enhanced metadata table.
metadata_df.head()

In [None]:
# Per-model bounding boxes plus max velocity/thickness; loads every state array from disk.
results_df = load_and_analyze_states(data_dir, model_ids)



In [None]:
# Preview the first rows of the per-model summary.
results_df.head()