In [85]:
import sys
import os
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
  
# append the path of the parent directory
sys.path.append("..")

In [86]:
from utilities import FileUtils
from visualisation import ArrayVisualizer

Functions


In [87]:
def read_metadata_to_dataframe(base_dir, folder_names):
        """Read JSON files from specified folders into a Pandas DataFrame.

        Args:
            base_dir (str): The base directory containing the folders.
            folder_names (list): List of folder names containing the JSON files.

        Returns:
            DataFrame: A Pandas DataFrame containing the data from the JSON files.
        """
        data_list = []

        # Iterate over the folder names
        for folder_name in folder_names:
            file_path = os.path.join(base_dir, folder_name, f"{folder_name}_metadata.json")

            # Read the JSON file, ensuring it exists
            if os.path.exists(file_path):
                with open(file_path, 'r') as file:
                    data = json.load(file)
                    # Remove the 'timesteps' key from the data
                    data.pop('timesteps', None)
                    data_list.append(data)
            else:
                print(f"File not found: {file_path}")

        # Convert the list of dictionaries to a DataFrame
        return pd.DataFrame(data_list)

def enhance_metadata(df):
    """Enhance the DataFrame by adding x_dimension, y_dimension, and z_dimension columns.

    Args:
        df (pandas.DataFrame): The DataFrame containing the metadata.

    Returns:
        pandas.DataFrame: The enhanced DataFrame with new dimension columns.
    """
    # Calculate dimensions based on the min and max values
    df['x_dimension'] = df['max_x_value'] - df['min_x_value']
    df['y_dimension'] = df['max_y_value'] - df['min_y_value']
    df['z_dimension'] = df['max_z_value'] - df['min_z_value']

    return df


# def load_and_analyze_states(base_dir, model_ids):
#     """Load arrays for each state and perform checks on them.

#     Args:
#         base_dir (str): The base directory containing the model folders.
#         model_ids (list): List of model IDs corresponding to subfolders.

#     Returns:
#         pandas.DataFrame: DataFrame with bounding box details for all states and the largest bounding box.
#     """
#     results_data = []

#     for model_id in model_ids:
#         overall_min_x = overall_max_x = None
#         overall_min_y = overall_max_y = None
#         largest_bounding_box_dimensions = (0, 0)
#         largest_bounding_box_state = None
#         largest_bounding_box_coords = None

#         thickness_dir = os.path.join(base_dir, model_id, 'thickness')
#         velocity_dir = os.path.join(base_dir, model_id, 'velocity')

#         # Load real-world x and y coordinates
#         x_values = np.load(os.path.join(thickness_dir, f"{model_id}_thickness_x_values.npy"))
#         y_values = np.load(os.path.join(thickness_dir, f"{model_id}_thickness_y_values.npy"))

#         state_no = 1
#         while True:
#             thickness_file = os.path.join(thickness_dir, f"{model_id}_thickness_{state_no}.npy")
#             velocity_file = os.path.join(velocity_dir, f"{model_id}_velocity_{state_no}.npy")

#             # Check if both thickness and velocity files exist
#             if not os.path.exists(thickness_file) or not os.path.exists(velocity_file):
#                 break  # No more states to process

#             thickness_array = np.load(thickness_file)
#             velocity_array = np.load(velocity_file)

#             # Determine the min and max xy coordinates of the debris
#             non_zero_indices = np.where((thickness_array > 0) & (velocity_array > 0))
#             if non_zero_indices[0].size > 0:
#                 min_x, max_x = x_values[non_zero_indices[1].min()], x_values[non_zero_indices[1].max()]
#                 min_y, max_y = y_values[non_zero_indices[0].min()], y_values[non_zero_indices[0].max()]

#                 # Update overall bounding box
#                 overall_min_x = min(overall_min_x, min_x) if overall_min_x is not None else min_x
#                 overall_max_x = max(overall_max_x, max_x) if overall_max_x is not None else max_x
#                 overall_min_y = min(overall_min_y, min_y) if overall_min_y is not None else min_y
#                 overall_max_y = max(overall_max_y, max_y) if overall_max_y is not None else max_y

#                 # Update largest bounding box state
#                 bounding_box_dimensions = (max_x - min_x, max_y - min_y)
#                 if np.prod(bounding_box_dimensions) > np.prod(largest_bounding_box_dimensions):
#                     largest_bounding_box_dimensions = bounding_box_dimensions
#                     largest_bounding_box_state = state_no
#                     largest_bounding_box_coords = (min_x, max_x, min_y, max_y)

#             state_no += 1

#         # Log the results for the current model
#         if overall_min_x is not None and overall_min_y is not None:
#             results_data.append({
#                 'model_id': model_id,
#                 'overall_min_x': overall_min_x,
#                 'overall_max_x': overall_max_x,
#                 'overall_dim_x': overall_max_x - overall_min_x,
#                 'overall_min_y': overall_min_y,
#                 'overall_max_y': overall_max_y,
#                 'overall_dim_y': overall_max_y - overall_min_y,
#                 'largest_bounding_box_state': largest_bounding_box_state,
#                 'largest_bounding_box_min_x': largest_bounding_box_coords[0],
#                 'largest_bounding_box_max_x': largest_bounding_box_coords[1],
#                 'largest_bounding_box_dim_x': largest_bounding_box_dimensions[0],
#                 'largest_bounding_box_min_y': largest_bounding_box_coords[2],
#                 'largest_bounding_box_max_y': largest_bounding_box_coords[3],
#                 'largest_bounding_box_dim_y': largest_bounding_box_dimensions[1]
#             })

#     # Convert results to a DataFrame
#     results_df = pd.DataFrame(results_data)
#     return results_df


def find_largest_bounding_box(base_dir, model_ids):
    """Find the largest bounding box dimensions and state with the largest bounding box.

    Args:
        base_dir (str): The base directory where model data is stored.
        model_ids (list): A list of model IDs to process.

    Returns:
        tuple: A tuple containing the overall bounding box dimensions and the details of the state with the largest bounding box.
    """
    largest_bbox = {
        'min_x': float('inf'),
        'min_y': float('inf'),
        'max_x': -float('inf'),
        'max_y': -float('inf'),
    }
    largest_state_bbox = {
        'state_id': None,
        'min_x': float('inf'),
        'min_y': float('inf'),
        'max_x': -float('inf'),
        'max_y': -float('inf'),
        'dim_x': 0,
        'dim_y': 0
    }

    for model_id in model_ids:
        states_dir = os.path.join(base_dir, model_id, 'states')

        for state_file in os.listdir(states_dir):
            state_path = os.path.join(states_dir, state_file)
            state_data = np.load(state_path)

            # Assuming the state data is a structured array with 'x', 'y', 'thickness', 'velocity' fields
            active_indices = np.nonzero(state_data['thickness'] * state_data['velocity'])
            if active_indices[0].size == 0:  # Skip if there's no active debris
                continue

            min_x, max_x = state_data['x'][active_indices].min(), state_data['x'][active_indices].max()
            min_y, max_y = state_data['y'][active_indices].min(), state_data['y'][active_indices].max()

            # Update overall bounding box
            largest_bbox['min_x'] = min(largest_bbox['min_x'], min_x)
            largest_bbox['min_y'] = min(largest_bbox['min_y'], min_y)
            largest_bbox['max_x'] = max(largest_bbox['max_x'], max_x)
            largest_bbox['max_y'] = max(largest_bbox['max_y'], max_y)

            # Check if this state has the largest bounding box
            state_dim_x = max_x - min_x
            state_dim_y = max_y - min_y
            if state_dim_x * state_dim_y > largest_state_bbox['dim_x'] * largest_state_bbox['dim_y']:
                largest_state_bbox.update({
                    'state_id': state_file,
                    'min_x': min_x,
                    'min_y': min_y,
                    'max_x': max_x,
                    'max_y': max_y,
                    'dim_x': state_dim_x,
                    'dim_y': state_dim_y
                })

    overall_dim_x = largest_bbox['max_x'] - largest_bbox['min_x']
    overall_dim_y = largest_bbox['max_y'] - largest_bbox['min_y']

    return (
        (largest_bbox['min_x'], largest_bbox['max_x'], largest_bbox['min_y'], largest_bbox['max_y'], overall_dim_x, overall_dim_y),
        largest_state_bbox
    )




In [88]:
def load_and_analyze_states(base_dir, model_ids):
    """Load arrays for each state and perform checks on them, including max velocity and thickness.

    Args:
        base_dir (str): The base directory containing the model folders.
        model_ids (list): List of model IDs corresponding to subfolders.

    Returns:
        pandas.DataFrame: DataFrame with bounding box details and max velocity and thickness for all states.
    """
    results_data = []

    for model_id in model_ids:
        overall_min_x = overall_max_x = None
        overall_min_y = overall_max_y = None
        largest_bounding_box_dimensions = (0, 0)
        largest_bounding_box_state = None
        largest_bounding_box_coords = None
        max_velocity = 0
        max_thickness = 0

        thickness_dir = os.path.join(base_dir, model_id, 'thickness')
        velocity_dir = os.path.join(base_dir, model_id, 'velocity')

        # Load real-world x and y coordinates
        x_values = np.load(os.path.join(thickness_dir, f"{model_id}_thickness_x_values.npy"))
        y_values = np.load(os.path.join(thickness_dir, f"{model_id}_thickness_y_values.npy"))

        state_no = 1
        while True:
            thickness_file = os.path.join(thickness_dir, f"{model_id}_thickness_{state_no}.npy")
            velocity_file = os.path.join(velocity_dir, f"{model_id}_velocity_{state_no}.npy")

            # Check if both thickness and velocity files exist
            if not os.path.exists(thickness_file) or not os.path.exists(velocity_file):
                break  # No more states to process

            thickness_array = np.load(thickness_file)
            velocity_array = np.load(velocity_file)

            # Update max velocity and thickness
            max_velocity = max(max_velocity, np.max(velocity_array))
            max_thickness = max(max_thickness, np.max(thickness_array))

            # Determine the min and max xy coordinates of the debris
            non_zero_indices = np.where((thickness_array > 0) & (velocity_array > 0))
            if non_zero_indices[0].size > 0:
                min_x, max_x = x_values[non_zero_indices[1].min()], x_values[non_zero_indices[1].max()]
                min_y, max_y = y_values[non_zero_indices[0].min()], y_values[non_zero_indices[0].max()]

                # Update overall bounding box
                overall_min_x = min(overall_min_x, min_x) if overall_min_x is not None else min_x
                overall_max_x = max(overall_max_x, max_x) if overall_max_x is not None else max_x
                overall_min_y = min(overall_min_y, min_y) if overall_min_y is not None else min_y
                overall_max_y = max(overall_max_y, max_y) if overall_max_y is not None else max_y

                # Update largest bounding box state
                bounding_box_dimensions = (max_x - min_x, max_y - min_y)
                if np.prod(bounding_box_dimensions) > np.prod(largest_bounding_box_dimensions):
                    largest_bounding_box_dimensions = bounding_box_dimensions
                    largest_bounding_box_state = state_no
                    largest_bounding_box_coords = (min_x, max_x, min_y, max_y)

            state_no += 1

        # Log the results for the current model
        if overall_min_x is not None and overall_min_y is not None:
            results_data.append({
                'model_id': model_id,
                'overall_min_x': overall_min_x,
                'overall_max_x': overall_max_x,
                'overall_dim_x': overall_max_x - overall_min_x,
                'overall_min_y': overall_min_y,
                'overall_max_y': overall_max_y,
                'overall_dim_y': overall_max_y - overall_min_y,
                'largest_bounding_box_state': largest_bounding_box_state,
                'largest_bounding_box_min_x': largest_bounding_box_coords[0],
                'largest_bounding_box_max_x': largest_bounding_box_coords[1],
                'largest_bounding_box_dim_x': largest_bounding_box_dimensions[0],
                'largest_bounding_box_min_y': largest_bounding_box_coords[2],
                'largest_bounding_box_max_y': largest_bounding_box_coords[3],
                'largest_bounding_box_dim_y': largest_bounding_box_dimensions[1],
                'max_velocity': max_velocity,
                'max_thickness': max_thickness
            })

                # Convert results to a DataFrame
    results_df = pd.DataFrame(results_data)
    return results_df

In [89]:
data_dir = r'../data/processed'

model_ids = FileUtils.get_subfolder_names(data_dir)

In [90]:
metadata_df = read_metadata_to_dataframe(data_dir, model_ids)

metadata_df = enhance_metadata(metadata_df)

In [91]:
metadata_df.head()

Unnamed: 0,model_id,total_number_of_states,min_x_value,max_x_value,min_y_value,max_y_value,min_z_value,max_z_value,grid_resolution_x,grid_resolution_y,average_timestep,x_dimension,y_dimension,z_dimension
0,5,242,815893.0,816305.0,832859.0,833143.0,24.01,168.925,2.0,2.0,0.497922,412.0,284.0,144.915
1,4,122,816083.0,816295.0,832881.0,833097.0,24.765,125.5725,2.0,2.0,0.495858,212.0,216.0,100.8075
2,7,122,815939.0,816331.0,832754.0,833062.0,23.05,158.5975,2.0,2.0,0.495868,392.0,308.0,135.5475
3,6,242,815950.0,816304.0,832833.0,833171.0,23.905,164.1925,2.0,2.0,0.497921,354.0,338.0,140.2875
4,1,122,822649.0,822811.0,822644.0,822728.0,0.84,83.21,3.0,3.0,0.495861,162.0,84.0,82.37


In [92]:
results_df = load_and_analyze_states(data_dir, model_ids)



In [93]:
results_df.head()

Unnamed: 0,model_id,overall_min_x,overall_max_x,overall_dim_x,overall_min_y,overall_max_y,overall_dim_y,largest_bounding_box_state,largest_bounding_box_min_x,largest_bounding_box_max_x,largest_bounding_box_dim_x,largest_bounding_box_min_y,largest_bounding_box_max_y,largest_bounding_box_dim_y,max_velocity,max_thickness
0,5,815945.0,816121.0,176.0,832945.0,833093.0,148.0,234,816033.0,816121.0,88.0,832945.0,832963.0,18.0,283.24,6.05
1,4,816135.0,816203.0,68.0,832949.0,833043.0,94.0,27,816151.0,816195.0,44.0,832959.0,833019.0,60.0,99.0,5.88
2,7,816009.0,816199.0,190.0,832824.0,832956.0,132.0,76,816143.0,816197.0,54.0,832920.0,832956.0,36.0,321.13,5.95
3,6,816000.0,816180.0,180.0,832937.0,833089.0,152.0,216,816080.0,816178.0,98.0,832937.0,832961.0,24.0,327.83,7.3
4,1,822667.0,822784.0,117.0,822656.0,822707.0,51.0,30,822703.0,822745.0,42.0,822665.0,822689.0,24.0,320.32,13.43


In [94]:
model_id = '0001'

model_dir = r'/home/tom/repos/dyna-landslide-surrogate/data/processed'



In [95]:
import numpy as np
import os

def load_states_and_assess_outliers(model_dir, model_id):
    """Load thickness and velocity states for a model and conduct an outlier assessment, ignoring specific files.

    Outliers are defined as values that fall below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR for thickness and velocity.

    Args:
        model_dir (str): The directory containing the states for a specific model.
        model_id (str): The ID of the model.

    Returns:
        dict: A dictionary containing the outlier indices for thickness and velocity.
    """
    thickness_data = []
    velocity_data = []

    # Define file patterns to ignore
    ignore_files = {
        f"{model_id}_thickness_max_value.npy",
        f"{model_id}_thickness_x_values.npy",
        f"{model_id}_thickness_y_values.npy",
        f"{model_id}_velocity_max_value.npy",
        f"{model_id}_velocity_x_values.npy",
        f"{model_id}_velocity_y_values.npy"
    }

    # Load all states for thickness
    thickness_dir = os.path.join(model_dir, model_id, 'thickness')
    for thickness_file in sorted(os.listdir(thickness_dir)):
        if thickness_file not in ignore_files and thickness_file.endswith('.npy'):
            thickness_data.append(np.load(os.path.join(thickness_dir, thickness_file)))

    # Load all states for velocity
    velocity_dir = os.path.join(model_dir, model_id, 'velocity')
    for velocity_file in sorted(os.listdir(velocity_dir)):
        if velocity_file not in ignore_files and velocity_file.endswith('.npy'):
            velocity_data.append(np.load(os.path.join(velocity_dir, velocity_file)))

    # Convert to single arrays
    thickness_data = np.concatenate(thickness_data)
    velocity_data = np.concatenate(velocity_data)

    # Outlier assessment based on IQR
    def assess_outliers(data):
        q1 = np.percentile(data, 25)
        q3 = np.percentile(data, 75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        outliers = np.where((data < lower_bound) | (data > upper_bound))
        return outliers

    thickness_outliers = assess_outliers(thickness_data)
    velocity_outliers = assess_outliers(velocity_data)

    return {
        'thickness_outliers': thickness_outliers,
        'velocity_outliers': velocity_outliers
    }



In [96]:
# Correct the model directory path


# Check if the directories exist
thickness_dir = os.path.join(model_dir, model_id, 'thickness')
velocity_dir = os.path.join(model_dir, model_id, 'velocity')

# Verify the thickness directory
if not os.path.exists(thickness_dir):
    print(f"Thickness directory does not exist: {thickness_dir}")
else:
    print(f"Thickness directory found: {thickness_dir}")

# Verify the velocity directory
if not os.path.exists(velocity_dir):
    print(f"Velocity directory does not exist: {velocity_dir}")
else:
    print(f"Velocity directory found: {velocity_dir}")

# If both directories exist, you can proceed to call the function
if os.path.exists(thickness_dir) and os.path.exists(velocity_dir):
    outliers = load_states_and_assess_outliers(model_dir, model_id)
    print(outliers)


Thickness directory found: /home/tom/repos/dyna-landslide-surrogate/data/processed/0001/thickness
Velocity directory found: /home/tom/repos/dyna-landslide-surrogate/data/processed/0001/velocity
{'thickness_outliers': (array([  46,   46,   47, ..., 3520, 3520, 3520]), array([42, 43, 41, ..., 12, 13, 14])), 'velocity_outliers': (array([  45,   45,   45, ..., 3521, 3521, 3521]), array([40, 41, 42, ..., 14, 15, 16]))}
