In [37]:
import h5py
import json
import numpy as np

def explore_h5_file(h5_filepath):
    """
    Print a summary of every dataset and group inside an HDF5 file.

    The file is opened read-only and walked with ``visititems``. One line is
    printed per dataset or group, followed by the total number of items found.

    Args:
        h5_filepath (str): Path to the HDF5 file

    Returns:
        dict or None: Maps each item's name within the file to a description.
            Datasets are described by 'type', 'entries', 'shape' and 'dtype';
            groups only by 'type'. None is returned when the file is not
            found or any other error occurs (the error is printed, not raised).
    """
    file_info = {}

    def visit_item(name, obj):
        # Callback handed to visititems. Every path returns None on purpose:
        # h5py stops the traversal as soon as the callback returns anything else.
        if isinstance(obj, h5py.Group):
            print(f"Group: '{name}'")
            file_info[name] = {
                'type': 'group'
            }
            return
        if not isinstance(obj, h5py.Dataset):
            # Neither a group nor a dataset: nothing is reported for it.
            return

        shape = obj.shape
        # "entries" is the length of the first axis; an empty or missing
        # shape is counted as a single entry.
        entries = shape[0] if shape else 1
        dtype = obj.dtype
        print(f"Dataset: '{name}' - {entries} entries, dtype: {dtype}")
        file_info[name] = {
            'type': 'dataset',
            'entries': entries,
            'shape': shape,
            'dtype': str(dtype)
        }

    try:
        with h5py.File(h5_filepath, 'r') as h5_file:
            # The header is printed only once the file has opened successfully.
            print(f"Exploring HDF5 file: {h5_filepath}")
            print("-" * 50)

            h5_file.visititems(visit_item)

            print(f"\nTotal items found: {len(file_info)}")
            return file_info
    except FileNotFoundError:
        print(f"Error: File '{h5_filepath}' not found.")
    except Exception as e:
        print(f"Error reading HDF5 file: {e}")

    # Reached only when one of the handlers above ran.
    return None

def load_json_and_add_to_h5(json_filepath, h5_filepath, dataset_name='point_groups'):
    """
    Load a JSON list of strings and store it as a dataset in an HDF5 file.

    Before writing, the substrings "C∞v" and "D∞h" in every item are replaced
    by "Cnv" and "Dnh". If a dataset called ``dataset_name`` already exists
    in the HDF5 file it is deleted and replaced.

    The JSON content is validated before the HDF5 file is opened, so invalid
    input never modifies (or creates) the HDF5 file.

    Args:
        json_filepath (str): Path to the JSON file. Its top-level value must
            be a list whose items are all strings.
        h5_filepath (str): Path to the HDF5 file (opened in 'a' mode)
        dataset_name (str): Name for the new dataset in HDF5 file

    Returns:
        bool: True if the dataset was written, False otherwise. Errors are
        reported with ``print`` and never raised to the caller.
    """
    try:
        # Load JSON data. The encoding is explicit because the replacement
        # below matches the non-ASCII character "∞"; relying on the platform
        # default could decode it differently and make the match silently fail.
        with open(json_filepath, 'r', encoding='utf-8') as json_file:
            json_data = json.load(json_file)
        print(f"Loaded JSON file: {json_filepath}")

        # Validate the structure BEFORE transforming it. Iterating a JSON
        # object would otherwise yield its keys and write them as if they
        # were the data.
        if not isinstance(json_data, list):
            print(f"Error: Expected a JSON list of strings, got {type(json_data).__name__}")
            return False
        for index, pg in enumerate(json_data):
            if not isinstance(pg, str):
                print(f"Error: JSON item {index} is not a string (got {type(pg).__name__})")
                return False

        json_data = [pg.replace("C∞v", "Cnv").replace("D∞h", "Dnh") for pg in json_data]

        data_length = len(json_data)
        print(f"JSON data length: {data_length} items")
        # Build the array up front so a conversion failure cannot happen
        # after an existing dataset has already been deleted.
        np_data = np.array(json_data, dtype=h5py.string_dtype(encoding='utf-8'))

        # Add data to HDF5 file
        with h5py.File(h5_filepath, 'a') as h5_file:  # 'a' for append mode

            # Check if dataset already exists
            if dataset_name in h5_file:
                print(f"Warning: Dataset '{dataset_name}' already exists. Overwriting...")
                del h5_file[dataset_name]

            # Create new dataset
            h5_file.create_dataset(dataset_name, data=np_data)
            print(f"Successfully added dataset '{dataset_name}' to HDF5 file")
            print(f"Dataset shape: {np_data.shape}, dtype: {np_data.dtype}")

        return True

    except FileNotFoundError as e:
        print(f"Error: File not found - {e}")
        return False
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON format - {e}")
        return False
    except Exception as e:
        print(f"Error processing files: {e}")
        return False

# Example usage
if __name__ == "__main__":
    # `mil` is interpolated into both paths, so the HDF5 file and the JSON
    # file always belong to the same numbered pair.
    mil = 1
    h5_file_path = f"mol3d_data/molecules3d_million_{mil}.h5"
    json_file_path = f"mol3d_data/final/point_groups_million_{mil}_final.json"

    # Step 1: report what the HDF5 file contains before it is modified.
    file_info = explore_h5_file(h5_file_path)

    # Step 2: write the JSON content into the HDF5 file.
    success = load_json_and_add_to_h5(
        json_file_path,
        h5_file_path,
        dataset_name='point_groups',
    )

    # Step 3: list the contents again, but only when the write succeeded.
    if success:
        print("\nUpdated HDF5 file structure:")
        explore_h5_file(h5_file_path)

Exploring HDF5 file: mol3d_data/molecules3d_million_1.h5
--------------------------------------------------
Dataset: 'atom_types' - 999949 entries, dtype: object
Dataset: 'coords' - 999949 entries, dtype: object
Dataset: 'selfies' - 999949 entries, dtype: object
Dataset: 'smiles' - 999949 entries, dtype: object

Total items found: 4
Loaded JSON file: mol3d_data/final/point_groups_million_1_final.json
JSON data length: 999949 items
Successfully added dataset 'point_groups' to HDF5 file
Dataset shape: (999949,), dtype: object

Updated HDF5 file structure:
Exploring HDF5 file: mol3d_data/molecules3d_million_1.h5
--------------------------------------------------
Dataset: 'atom_types' - 999949 entries, dtype: object
Dataset: 'coords' - 999949 entries, dtype: object
Dataset: 'point_groups' - 999949 entries, dtype: object
Dataset: 'selfies' - 999949 entries, dtype: object
Dataset: 'smiles' - 999949 entries, dtype: object

Total items found: 5


In [38]:
mil = 1
h5_file_path = f"mol3d_data/molecules3d_million_{mil}.h5"

# Read only the first ten items. Slicing the dataset returns an in-memory
# array, so the file can be closed before any post-processing.
with h5py.File(h5_file_path, 'r') as f:
    point_groups = f['point_groups'][:10]

# Items may come back as bytes; decode those and leave str values untouched.
clean_strings = [
    raw.decode('utf-8') if isinstance(raw, bytes) else raw
    for raw in point_groups
]
print(clean_strings)

['C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C2h']
