# h5 Exploration

## Global variables

In [2]:
import os
import h5py
import pandas as pd
import numpy as np
from typing import Dict, List, Optional, Tuple, Union

# Input location. "./data/" is a relative path, so it is resolved against the
# current working directory of the running kernel.
data_dir = "./data/"
# Name of the h5 file to explore.
h5_filename = "FO_BitMapData_2017-110^01-0196-V1MR_S7.h5"
# Full path to the h5 file, built from the directory and file name above.
h5_filepath = os.path.join(data_dir, h5_filename)

## Functions to explore an h5 file and load its data into dataframes

In [3]:
def explore_h5_structure(h5_filepath: str) -> None:
    """
    Print a tree view of every group and dataset stored in an h5 file.

    Each entry is indented by its depth (number of '/' in its path). Groups
    and datasets list their attributes; datasets also show shape and dtype,
    plus their values (fewer than 20 elements) or a short sample (fewer than
    200 elements). Any error is reported on stdout instead of being raised.

    Parameters:
    h5_filepath (str): Path to the h5 file
    """

    def show_attributes(node, pad):
        # Attributes are printed one level deeper than the owning node.
        for key, value in node.attrs.items():
            print(f"{pad}  📝 Attribute: {key} = {value}")

    def describe(path, node):
        pad = "  " * path.count('/')

        if isinstance(node, h5py.Group):
            print(f"{pad}📁 Group: {path}")
            show_attributes(node, pad)
            return

        if not isinstance(node, h5py.Dataset):
            return

        print(f"{pad}📊 Dataset: {path}")
        print(f"{pad}  📏 Shape: {node.shape}")
        print(f"{pad}  🔢 Type: {node.dtype}")
        show_attributes(node, pad)

        # Tiny datasets are shown in full, medium ones only as a sample.
        if node.size < 20:
            print(f"{pad}  📋 Values: {node[()]}")
        elif node.size < 200:
            if node.ndim == 1:
                sample = node[:5]
            elif node.ndim > 1:
                sample = node[0]
            else:
                sample = node[()]
            print(f"{pad}  📋 Sample: {sample}")

    try:
        with h5py.File(h5_filepath, 'r') as h5f:
            print(f"=== H5 File Structure: {os.path.basename(h5_filepath)} ===")
            # describe() always returns None, so the walk visits every item.
            h5f.visititems(describe)
    except Exception as e:
        print(f"Error reading h5 file: {e}")


_FRAME_PREFIX = 'frame_'

# Aggregations for the per-ROI summary. Only columns that actually exist in
# the bbox table are used, because mask/contour columns are optional.
_ROI_SUMMARY_SPEC = {
    'frame_number': ['count', 'min', 'max'],
    'bbox_area': ['mean', 'std', 'min', 'max'],
    'bbox_width': ['mean', 'std'],
    'bbox_height': ['mean', 'std'],
    'mask_nonzero_pixels': ['mean', 'std'],
    'mask_fill_ratio': ['mean', 'std'],
    'contour_points': ['mean', 'std'],
}


def _decode_if_bytes(value):
    """Return *value* decoded to str when it is bytes, otherwise unchanged."""
    return value.decode() if isinstance(value, bytes) else value


def _read_experiment_metadata(exp_group, exp_group_name: str, h5_filepath: str) -> dict:
    """Collect the experiment-level datasets of *exp_group* into a flat dict."""
    metadata = {}
    if 'Patient' in exp_group:
        metadata['patient'] = _decode_if_bytes(exp_group['Patient'][()])
    if 'Serie' in exp_group:
        metadata['serie'] = _decode_if_bytes(exp_group['Serie'][()])
    if 'roi_list' in exp_group:
        # Always store a plain list of names: it may be empty, and a list
        # (unlike an ndarray) can be truth-tested safely by callers.
        raw_names = np.atleast_1d(exp_group['roi_list'][()]).tolist()
        metadata['roi_list'] = [_decode_if_bytes(name) for name in raw_names]
    if 'Clusters_indices' in exp_group:
        metadata['clusters_indices'] = exp_group['Clusters_indices'][()]

    metadata['experiment_group'] = exp_group_name
    metadata['file_path'] = h5_filepath
    return metadata


def _numbered_frame_keys(exp_group) -> List[Tuple[int, str]]:
    """Return (frame_number, key) pairs for the 'frame_<n>' members, ordered by number."""
    numbered = []
    for key in exp_group.keys():
        if not key.startswith(_FRAME_PREFIX):
            continue
        suffix = key[len(_FRAME_PREFIX):]
        if suffix.isdecimal():
            numbered.append((int(suffix), key))
        else:
            # A single oddly named member must not abort the whole load.
            print(f"Skipping '{key}': frame suffix is not a number")
    # Numeric order keeps e.g. frame_2 before frame_10 when keys are not zero-padded.
    numbered.sort()
    return numbered


def _summarize_frame(frame_key: str, frame_number: int, frame_group, roi_names) -> dict:
    """Build the frame-level summary row (image presence/shape and ROI count)."""
    frame_info = {
        'frame_id': frame_key,
        'frame_number': frame_number,
        'has_rgb_image': 'image_rgb' in frame_group,
        'has_grayscale_image': 'image' in frame_group,
    }
    if frame_info['has_rgb_image']:
        frame_info['rgb_image_shape'] = frame_group['image_rgb'].shape
    if frame_info['has_grayscale_image']:
        frame_info['grayscale_image_shape'] = frame_group['image'].shape

    # An ROI counts as present in the frame when its bbox dataset exists.
    frame_info['roi_count'] = sum(f'{roi}_bbox' in frame_group for roi in roi_names)
    return frame_info


def _extract_roi_record(frame_key: str, frame_number: int, frame_group, roi: str) -> Optional[dict]:
    """Build the bbox row for *roi* in one frame, or None when it has no bbox there."""
    bbox_key = f'{roi}_bbox'
    if bbox_key not in frame_group:
        return None

    mask_key = f'{roi}_mask'
    contour_key = f'{roi}_contour'
    id_key = f'{roi}_id'
    has_mask = mask_key in frame_group
    has_contour = contour_key in frame_group
    has_id = id_key in frame_group

    bbox = frame_group[bbox_key][()]
    record = {
        'frame_id': frame_key,
        'frame_number': frame_number,
        'roi_name': roi,
        'bbox_x': bbox[0],
        'bbox_y': bbox[1],
        'bbox_width': bbox[2],
        'bbox_height': bbox[3],
        'bbox_area': bbox[2] * bbox[3],
        'has_mask': has_mask,
        'has_contour': has_contour,
        'has_id': has_id,
    }

    if has_mask:
        mask = frame_group[mask_key][()]
        nonzero = np.count_nonzero(mask)
        record['mask_shape'] = mask.shape
        record['mask_nonzero_pixels'] = nonzero
        record['mask_total_pixels'] = mask.size
        # A zero-size mask has no meaningful ratio; NaN avoids a ZeroDivisionError.
        record['mask_fill_ratio'] = nonzero / mask.size if mask.size else float('nan')

    if has_contour:
        contour = frame_group[contour_key][()]
        record['contour_shape'] = contour.shape
        record['contour_points'] = contour.shape[0] if contour.ndim >= 1 else 0

    if has_id:
        record['roi_id'] = frame_group[id_key][()]

    return record


def _build_roi_summary(bbox_df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate the bbox table per ROI, using only the columns that are present."""
    spec = {
        column: funcs
        for column, funcs in _ROI_SUMMARY_SPEC.items()
        if column in bbox_df.columns
    }
    roi_summary = bbox_df.groupby('roi_name').agg(spec).round(2)
    # Flatten the (column, statistic) MultiIndex into 'column_statistic' names.
    roi_summary.columns = ['_'.join(col).strip() for col in roi_summary.columns.values]
    return roi_summary.reset_index()


def load_h5_to_dataframes(h5_filepath: str) -> Dict[str, pd.DataFrame]:
    """
    Load data from h5 file into multiple dataframes.

    Only the first top-level group of the file is read; it is treated as the
    experiment group. Members of that group named 'frame_<number>' are read
    in numeric order, and ROI names come from its 'roi_list' dataset.

    Errors are not raised: they are printed together with a traceback, and
    whatever dataframes were built before the failure are returned (an empty
    dict when the file cannot be opened).

    Parameters:
    h5_filepath (str): Path to the h5 file

    Returns:
    Dict[str, pd.DataFrame]: Dictionary containing different dataframes:
        - 'metadata': General metadata about the dataset (one row)
        - 'bbox_data': Bounding box data for all ROIs across all frames
        - 'frame_summary': Summary data for each frame
        - 'roi_summary': Summary statistics for each ROI (only present when
          at least one bounding box was found; mask/contour statistics are
          included only when those columns exist)
    """
    dataframes = {}

    try:
        with h5py.File(h5_filepath, 'r') as h5f:
            print(f"Loading data from: {os.path.basename(h5_filepath)}")

            # Get the experiment group (should be the first/only group)
            exp_groups = list(h5f.keys())
            if not exp_groups:
                print("No experiment groups found in the h5 file")
                return dataframes

            exp_group_name = exp_groups[0]
            exp_group = h5f[exp_group_name]
            print(f"Working with experiment group: {exp_group_name}")

            metadata = _read_experiment_metadata(exp_group, exp_group_name, h5_filepath)
            metadata_df = pd.DataFrame([metadata])
            dataframes['metadata'] = metadata_df

            numbered_frames = _numbered_frame_keys(exp_group)
            print(f"Found {len(numbered_frames)} frames")

            roi_names = metadata.get('roi_list', [])
            bbox_data = []
            frame_data = []
            for frame_number, frame_key in numbered_frames:
                frame_group = exp_group[frame_key]
                frame_data.append(
                    _summarize_frame(frame_key, frame_number, frame_group, roi_names)
                )
                for roi in roi_names:
                    record = _extract_roi_record(frame_key, frame_number, frame_group, roi)
                    if record is not None:
                        bbox_data.append(record)

            bbox_df = pd.DataFrame(bbox_data)
            frame_df = pd.DataFrame(frame_data)
            dataframes['bbox_data'] = bbox_df
            dataframes['frame_summary'] = frame_df

            if not bbox_df.empty:
                dataframes['roi_summary'] = _build_roi_summary(bbox_df)

            roi_rows = len(dataframes['roi_summary']) if 'roi_summary' in dataframes else 0
            print("Successfully loaded data:")
            print(f"  - Metadata: {len(metadata_df)} rows")
            print(f"  - Bbox data: {len(bbox_df)} rows")
            print(f"  - Frame summary: {len(frame_df)} rows")
            print(f"  - ROI summary: {roi_rows} rows")

    except Exception as e:
        # Best-effort loader for interactive use: report and return partial results.
        print(f"Error loading h5 file: {e}")
        import traceback
        traceback.print_exc()

    return dataframes


def get_roi_data_for_frame(h5_filepath: str, frame_number: int, roi_name: str) -> Dict:
    """
    Fetch every dataset stored for one ROI in one frame.

    The frame is looked up as 'frame_<number zero-padded to 4 digits>' inside
    the first top-level group of the file.

    Parameters:
    h5_filepath (str): Path to the h5 file
    frame_number (int): Frame number to extract data from
    roi_name (str): Name of the ROI

    Returns:
    Dict: 'frame_number' and 'roi_name', plus whichever of 'bbox' (with
    'bbox_x', 'bbox_y', 'bbox_width', 'bbox_height'), 'mask', 'contour' and
    'roi_id' exist for the ROI. An empty dict is returned when the frame is
    missing or an error occurs (the reason is printed).
    """
    try:
        with h5py.File(h5_filepath, 'r') as h5f:
            first_group = list(h5f.keys())[0]
            experiment = h5f[first_group]

            frame_key = f'frame_{frame_number:04d}'
            if frame_key not in experiment:
                print(f"Frame {frame_key} not found")
                return {}

            frame = experiment[frame_key]
            result = {'frame_number': frame_number, 'roi_name': roi_name}

            # Bounding box: keep the raw array and its four components.
            bbox_name = f'{roi_name}_bbox'
            if bbox_name in frame:
                bbox = frame[bbox_name][()]
                result['bbox'] = bbox
                component_names = ('bbox_x', 'bbox_y', 'bbox_width', 'bbox_height')
                for position, component in enumerate(component_names):
                    result[component] = bbox[position]

            # Remaining optional datasets: (dataset suffix, output key).
            optional_parts = (
                ('mask', 'mask'),
                ('contour', 'contour'),
                ('id', 'roi_id'),
            )
            for suffix, output_key in optional_parts:
                dataset_name = f'{roi_name}_{suffix}'
                if dataset_name in frame:
                    result[output_key] = frame[dataset_name][()]

            return result

    except Exception as e:
        print(f"Error getting ROI data: {e}")
        return {}

## Execution

In [4]:
# Run the full exploration pipeline when the h5 file is present:
# structure dump -> dataframes -> preview -> single-ROI example -> CSV export.
if os.path.exists(h5_filepath):
    print(f"✅ H5 file found: {h5_filepath}")

    # 1. First, explore the structure of the h5 file
    print("\n" + "="*50)
    print("1. EXPLORING H5 FILE STRUCTURE")
    print("="*50)
    explore_h5_structure(h5_filepath)

    # 2. Load data into dataframes
    print("\n" + "="*50)
    print("2. LOADING DATA INTO DATAFRAMES")
    print("="*50)
    dataframes = load_h5_to_dataframes(h5_filepath)

    # 3. Display the dataframes
    print("\n" + "="*50)
    print("3. DISPLAYING DATAFRAMES")
    print("="*50)

    for df_name, df in dataframes.items():
        print(f"\n📊 {df_name.upper()} DataFrame:")
        print(f"Shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")
        print("Sample data:")
        print(df.head())
        print("-" * 60)

    # 4. Example: Get detailed data for a specific ROI and frame
    print("\n" + "="*50)
    print("4. EXAMPLE: DETAILED ROI DATA")
    print("="*50)

    # ROI names from the metadata table. The column is absent when the file
    # has no 'roi_list' dataset, so check for it instead of indexing blindly.
    roi_list = []
    if ('metadata' in dataframes
            and not dataframes['metadata'].empty
            and 'roi_list' in dataframes['metadata'].columns):
        roi_list = dataframes['metadata'].iloc[0]['roi_list']

    # len() works for both lists and arrays; a bare truth test on a
    # multi-element array would raise ValueError.
    has_rois = len(roi_list) > 0
    has_bboxes = 'bbox_data' in dataframes and not dataframes['bbox_data'].empty

    if has_rois and has_bboxes:
        # Take the ROI and the frame from the same bbox row so the pair is
        # guaranteed to exist together in the file.
        example_roi = dataframes['bbox_data']['roi_name'].iloc[0]
        example_frame = int(dataframes['bbox_data']['frame_number'].iloc[0])

        print(f"Getting detailed data for ROI '{example_roi}' in frame {example_frame}")
        roi_data = get_roi_data_for_frame(h5_filepath, example_frame, example_roi)

        print(f"ROI Data keys: {list(roi_data.keys())}")
        for key, value in roi_data.items():
            if key in ['mask', 'contour']:
                print(f"  {key}: shape {value.shape if hasattr(value, 'shape') else 'N/A'}")
            else:
                print(f"  {key}: {value}")
    else:
        print("No ROI bounding boxes available; skipping the detailed example.")

    # 5. Save dataframes to CSV files for further analysis
    print("\n" + "="*50)
    print("5. SAVING DATAFRAMES TO CSV")
    print("="*50)

    output_dir = "./h5_analysis_output"
    os.makedirs(output_dir, exist_ok=True)

    for df_name, df in dataframes.items():
        if df.empty:
            continue

        # Work on a copy so the in-memory dataframes keep their original objects.
        df_copy = df.copy()

        # Convert array-like cells (shapes, arrays, lists) to their string form,
        # judged from the first non-null value of each object column.
        for col in df_copy.columns:
            if df_copy[col].dtype != 'object':
                continue
            non_null = df_copy[col].dropna()
            sample_val = non_null.iloc[0] if not non_null.empty else None
            if sample_val is not None and (isinstance(sample_val, (list, tuple, np.ndarray)) or
                                           (hasattr(sample_val, 'shape') and len(sample_val.shape) > 0)):
                df_copy[col] = df_copy[col].astype(str)

        output_path = os.path.join(output_dir, f"{df_name}.csv")
        df_copy.to_csv(output_path, index=False)
        print(f"✅ Saved {df_name} to {output_path}")

    print(f"\n🎉 Analysis complete! Check the '{output_dir}' directory for CSV files.")

else:
    print(f"❌ H5 file not found: {h5_filepath}")
    print("Please check the file path and ensure the file exists.")

✅ H5 file found: ./data/FO_BitMapData_2017-110^01-0196-V1MR_S7.h5

1. EXPLORING H5 FILE STRUCTURE
=== H5 File Structure: FO_BitMapData_2017-110^01-0196-V1MR_S7.h5 ===
📁 Group: 02072025
  📊 Dataset: 02072025/Clusters_indices
    📏 Shape: (20,)
    🔢 Type: int64
    📋 Sample: [  1 217 228 240 246]
  📊 Dataset: 02072025/Patient
    📏 Shape: ()
    🔢 Type: object
    📋 Values: b'2017-110^01-0196-V1MR'
  📊 Dataset: 02072025/Serie
    📏 Shape: ()
    🔢 Type: object
    📋 Values: b'7'
  📁 Group: 02072025/frame_0001
    📊 Dataset: 02072025/frame_0001/arytenoid-cartilage_bbox
      📏 Shape: (4,)
      🔢 Type: int64
      📋 Values: [275 361  21  47]
    📊 Dataset: 02072025/frame_0001/arytenoid-cartilage_contour
      📏 Shape: (21, 2)
      🔢 Type: int64
      📋 Sample: [296 361]
    📊 Dataset: 02072025/frame_0001/arytenoid-cartilage_id
      📏 Shape: ()
      🔢 Type: int64
      📋 Values: 0
    📊 Dataset: 02072025/frame_0001/arytenoid-cartilage_mask
      📏 Shape: (480, 480)
      🔢 Type: uint8


In [5]:
## Additional Analysis Functions

def analyze_bbox_statistics(bbox_df: pd.DataFrame) -> None:
    """
    Print a text report describing the bounding boxes in a dataframe.

    The report covers overall counts, the frame range, area statistics and
    the number of rows per ROI. An empty dataframe only produces a notice.

    Parameters:
    bbox_df (pd.DataFrame): DataFrame containing bbox data; the columns
        'roi_name', 'frame_number' and 'bbox_area' are read
    """
    if bbox_df.empty:
        print("No bbox data available")
        return

    print("📊 BOUNDING BOX STATISTICS")
    print("=" * 40)

    # Overall counts
    print(f"Total bounding boxes: {len(bbox_df)}")
    roi_names = bbox_df['roi_name']
    print(f"Number of unique ROIs: {roi_names.nunique()}")
    frame_numbers = bbox_df['frame_number']
    print(f"Number of frames: {frame_numbers.nunique()}")
    print(f"Frame range: {frame_numbers.min()} - {frame_numbers.max()}")

    # Area statistics, computed one at a time in display order
    print("\nArea statistics:")
    areas = bbox_df['bbox_area']
    for label, method_name in (("Mean", "mean"), ("Min", "min"), ("Max", "max"), ("Std", "std")):
        statistic = getattr(areas, method_name)()
        print(f"  {label} area: {statistic:.2f}")

    # Rows per ROI, most frequent first
    print("\nROI frequency:")
    for roi, count in roi_names.value_counts().items():
        print(f"  {roi}: {count} frames")


def visualize_bbox_distribution(bbox_df: pd.DataFrame) -> None:
    """
    Draw a 2x2 figure summarising the bounding-box data.

    Panels: histogram of areas, width-vs-height scatter, number of ROIs per
    frame, and number of rows per ROI. Matplotlib is imported lazily; when
    the import fails an installation hint is printed instead. An empty
    dataframe only produces a notice.

    Parameters:
    bbox_df (pd.DataFrame): DataFrame containing bbox data; the columns
        'bbox_area', 'bbox_width', 'bbox_height', 'frame_number' and
        'roi_name' are read
    """
    if bbox_df.empty:
        print("No bbox data available for visualization")
        return

    try:
        import matplotlib.pyplot as plt

        figure, grid = plt.subplots(2, 2, figsize=(12, 8))
        figure.suptitle('Bounding Box Data Analysis', fontsize=16)
        (area_ax, shape_ax), (frames_ax, freq_ax) = grid

        # Top-left: distribution of bbox areas
        area_ax.hist(bbox_df['bbox_area'], bins=20, alpha=0.7, edgecolor='black')
        area_ax.set_title('Area Distribution')
        area_ax.set_xlabel('Area (pixels²)')
        area_ax.set_ylabel('Frequency')

        # Top-right: bbox width against height
        shape_ax.scatter(bbox_df['bbox_width'], bbox_df['bbox_height'], alpha=0.6)
        shape_ax.set_title('Width vs Height')
        shape_ax.set_xlabel('Width (pixels)')
        shape_ax.set_ylabel('Height (pixels)')

        # Bottom-left: how many ROIs each frame contains
        rois_in_frame = bbox_df.groupby('frame_number').size()
        frames_ax.plot(
            rois_in_frame.index,
            rois_in_frame.values,
            marker='o',
            linewidth=1,
            markersize=3,
        )
        frames_ax.set_title('ROI Count per Frame')
        frames_ax.set_xlabel('Frame Number')
        frames_ax.set_ylabel('Number of ROIs')

        # Bottom-right: how many rows each ROI has
        per_roi = bbox_df['roi_name'].value_counts()
        bar_positions = range(len(per_roi))
        freq_ax.bar(bar_positions, per_roi.values)
        freq_ax.set_title('ROI Frequency')
        freq_ax.set_xlabel('ROI Index')
        freq_ax.set_ylabel('Count')
        freq_ax.set_xticks(bar_positions)
        freq_ax.set_xticklabels(per_roi.index, rotation=45, ha='right')

        plt.tight_layout()
        plt.show()

    except ImportError:
        print("Matplotlib not available for visualization")
        print("Install matplotlib to enable visualizations: pip install matplotlib")


def export_roi_masks_summary(h5_filepath: str, roi_name: str) -> pd.DataFrame:
    """
    Summarise the '<roi_name>_mask' dataset of every frame that has one.

    Frames are the 'frame_*' members of the first top-level group, visited in
    sorted key order. Errors are printed rather than raised; rows collected
    before an error are still returned.

    Parameters:
    h5_filepath (str): Path to the h5 file
    roi_name (str): Name of the ROI to analyze

    Returns:
    pd.DataFrame: One row per frame containing the mask, with its dimensions,
    pixel counts, fill ratio, maximum and mean value (empty when nothing was
    read)
    """
    rows = []

    try:
        with h5py.File(h5_filepath, 'r') as h5f:
            experiment = h5f[list(h5f.keys())[0]]
            mask_name = f'{roi_name}_mask'

            ordered_frames = sorted(
                key for key in experiment.keys() if key.startswith('frame_')
            )
            for frame_key in ordered_frames:
                frame = experiment[frame_key]
                frame_number = int(frame_key.replace('frame_', ''))

                if mask_name not in frame:
                    continue

                mask = frame[mask_name][()]
                filled = np.count_nonzero(mask)
                rows.append({
                    'frame_number': frame_number,
                    'roi_name': roi_name,
                    'mask_height': mask.shape[0],
                    'mask_width': mask.shape[1],
                    'total_pixels': mask.size,
                    'nonzero_pixels': filled,
                    'fill_ratio': filled / mask.size,
                    'max_value': mask.max(),
                    'mean_value': mask.mean(),
                })

    except Exception as e:
        print(f"Error analyzing mask data: {e}")

    return pd.DataFrame(rows)


# Announce the helpers defined in this cell and show how to call them once
# the main analysis has produced `dataframes`.
print("\n".join((
    "🔧 Additional analysis functions defined:",
    "  - analyze_bbox_statistics(bbox_df)",
    "  - visualize_bbox_distribution(bbox_df)",
    "  - export_roi_masks_summary(h5_filepath, roi_name)",
    "\nExample usage:",
    "  analyze_bbox_statistics(dataframes['bbox_data'])",
    "  visualize_bbox_distribution(dataframes['bbox_data'])",
    "  mask_summary = export_roi_masks_summary(h5_filepath, 'head')",
)))


🔧 Additional analysis functions defined:
  - analyze_bbox_statistics(bbox_df)
  - visualize_bbox_distribution(bbox_df)
  - export_roi_masks_summary(h5_filepath, roi_name)

Example usage:
  analyze_bbox_statistics(dataframes['bbox_data'])
  visualize_bbox_distribution(dataframes['bbox_data'])
  mask_summary = export_roi_masks_summary(h5_filepath, 'head')


In [6]:
# Bind the bounding-box table to a short name. `dataframes` is the dict
# assigned from load_h5_to_dataframes in the execution cell; this lookup
# raises KeyError when that load failed before storing 'bbox_data'.
bbox_df = dataframes['bbox_data']