In [1]:
import h5py
import numpy as np

def _count_nans(values):
    """Return how many entries of *values* are NaN.

    Only floating-point and complex arrays are scanned. Integer and boolean
    arrays cannot hold NaN, and ``np.isnan`` raises ``TypeError`` on string
    or object arrays, so every other dtype is reported as 0.
    """
    if values.dtype.kind not in "fc":
        return 0
    return int(np.isnan(values).sum())


def check_h5_nan(file_path):
    """Validate that the 'samples' and 'labels' datasets contain no NaN.

    Opens the HDF5 file read-only, loads both top-level datasets fully into
    memory, prints NaN counts plus basic label/feature statistics, and
    raises if any NaN was found.

    Args:
        file_path: Path of the HDF5 file to inspect.

    Raises:
        KeyError: If 'samples' or 'labels' is missing from the file root.
        ValueError: If either dataset contains at least one NaN.
    """
    print(f"\n=== Checking {file_path} ===")

    with h5py.File(file_path, 'r') as f:
        # Verify required datasets exist
        missing = [ds for ds in ('samples', 'labels') if ds not in f]
        if missing:
            raise KeyError(f"Missing datasets: {missing}")

        # [:] copies the data into NumPy arrays, so the file can be closed
        # before any statistics are computed.
        samples = f['samples'][:]
        labels = f['labels'][:]

    # Check samples
    sample_nans = _count_nans(samples)
    print(f"Samples NaN count: {sample_nans}")

    # Check labels
    label_nans = _count_nans(labels)
    print(f"Labels NaN count: {label_nans}")

    # Additional validation
    print("\nLabel statistics:")
    print(f"Unique labels: {np.unique(labels)}")
    # min()/max() raise ValueError on zero-size arrays, so guard empty data.
    if labels.size:
        print(f"Label range: {labels.min()} - {labels.max()}")
    else:
        print("Label range: n/a (empty dataset)")

    print("\nFeature statistics:")
    if samples.size:
        print(f"Min: {samples.min():.4f}")
        print(f"Max: {samples.max():.4f}")
        print(f"Mean: {samples.mean():.4f}")
        print(f"Std: {samples.std():.4f}")
    else:
        print("n/a (empty dataset)")

    # Raise only after the statistics so they are printed even on failure.
    if sample_nans > 0 or label_nans > 0:
        raise ValueError("NaN values detected in dataset!")

    print("\n=== Validation passed - No NaN values found ===")

if __name__ == "__main__":
    # Update this path to your HDF5 file
    HDF5_PATH = "/scratch/bowenxi/dit/data_gen/0330_5/0404_imagenet_latents.h5"

    try:
        check_h5_nan(HDF5_PATH)
    except ValueError as e:
        # check_h5_nan raises ValueError when NaNs are present; the
        # remediation steps below only make sense for that failure.
        print(f"\nERROR: {str(e)}")
        print("Recommended actions:")
        print("1. Check data generation code for NaN sources")
        print("2. Verify standardization/normalization process")
        print("3. Re-generate data if NaNs found")
    except Exception as e:
        # Top-level boundary: report any other failure (missing file,
        # missing dataset, ...) without the unrelated NaN advice.
        print(f"\nERROR: {str(e)}")


=== Checking /scratch/bowenxi/dit/data_gen/0330_6/0404_imagenet_latents.h5 ===
Samples NaN count: 0
Labels NaN count: 0

Label statistics:
Unique labels: [333 334 335]
Label range: 333 - 335

Feature statistics:
Min: -1.0000
Max: 1.0000
Mean: 0.0040
Std: 0.3547

=== Validation passed - No NaN values found ===
