In [10]:
import os
import numpy as np
from rdkit import Chem
import selfies as sf
import h5py
import json
import re

def process_data(sdf_file):
    """Load an SDF file and collect per-molecule data for every molecule kept.

    A molecule is skipped (nothing is appended for it) when any of these hold:
    it fails to load, it has fewer than two atoms, ``Chem.GetMolFrags``
    reports more than one fragment, it is a two-atom molecule whose atoms are
    more than 3 coordinate units apart (presumably angstroms -- confirm
    against the SDF source), SMILES generation raises, or SELFIES encoding
    fails both before and after ``clean_problematic_smiles``.

    Args:
        sdf_file: Path handed to ``Chem.SDMolSupplier`` (explicit hydrogens
            are kept on load via ``removeHs=False``).

    Returns:
        A 7-tuple ``(all_atoms, all_rings, all_ring_counts, all_coords,
        all_smiles, all_selfies, isomers)`` where

        * ``all_atoms``: per kept molecule, the list of element symbols.
        * ``all_rings``: one flat list of rings (tuples of atom indices)
          over all kept molecules; ``all_ring_counts`` gives how many belong
          to each molecule.
        * ``all_ring_counts``: number of rings per kept molecule.
        * ``all_coords``: per kept molecule, a float32 array of shape
          ``(n_atoms, 3)`` taken from the molecule's conformer.
        * ``all_smiles``: canonical, kekulized, isomeric SMILES of the
          hydrogen-stripped molecule.
        * ``all_selfies``: SELFIES encoded from the non-isomeric SMILES.
        * ``isomers``: dict mapping each SMILES to the list of positions (in
          the returned lists) of the molecules that produced it.
    """
    supplier = Chem.SDMolSupplier(sdf_file, sanitize=True, removeHs=False)
    all_atoms, all_coords, all_smiles, all_selfies, all_ring_counts, all_rings = [], [], [], [], [], []
    isomers = {}  # SMILES -> positions of all kept molecules sharing it

    for i, mol in enumerate(supplier):
        # The supplier yields None for records it could not load.
        if mol is None:
            continue

        # Atom types; single atoms are not useful as molecules.
        atoms = [atom.GetSymbol() for atom in mol.GetAtoms()]
        if len(atoms) < 2:
            continue

        # Rings are tuples of atom indices into `mol` (hydrogens included).
        rings = mol.GetRingInfo().AtomRings()
        ring_count = len(rings)

        # Skip molecules made of several disconnected fragments.
        if len(Chem.GetMolFrags(mol)) > 1:
            continue

        # Coordinates (conformer fetched once instead of once per atom).
        conformer = mol.GetConformer()
        positions = [conformer.GetAtomPosition(j) for j in range(mol.GetNumAtoms())]
        coords = np.array([[p.x, p.y, p.z] for p in positions], dtype=np.float32)

        # A two-atom "molecule" whose atoms are too far apart is discarded.
        if len(atoms) == 2:
            dist = np.linalg.norm(coords[0] - coords[1])
            if dist > 3:
                continue

        # Remove explicit hydrogens before generating string representations.
        mol_no_H = Chem.RemoveHs(mol, implicitOnly=False, updateExplicitCount=True, sanitize=True)
        Chem.SanitizeMol(mol_no_H, sanitizeOps=Chem.SanitizeFlags.SANITIZE_ALL)

        # Both SMILES variants are generated together so that a failure here
        # can never leave `smiles_no_stereo` holding the previous molecule's
        # value when the SELFIES fallback below runs.
        try:
            Chem.AssignStereochemistry(mol_no_H, cleanIt=True, force=True)
            smiles = Chem.MolToSmiles(mol_no_H, kekuleSmiles=True, canonical=True, isomericSmiles=True, allHsExplicit=False)
            smiles_no_stereo = Chem.MolToSmiles(mol_no_H, kekuleSmiles=True, canonical=True, isomericSmiles=False, allHsExplicit=False)
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C still stops
            # a long-running conversion.
            continue

        # SELFIES from the stereo-free SMILES, with one cleaned-up retry.
        try:
            selfies = sf.encoder(smiles_no_stereo)
        except Exception as e:
            print(f"SMILES -> SELFIES conversion failed for {i} molecule\nSMILES: {smiles}\n{e}") # i <-> l
            try:
                smiles_no_stereo = clean_problematic_smiles(smiles_no_stereo)
                selfies = sf.encoder(smiles_no_stereo)
            except Exception as e:
                print(f"SMILES -> SELFIES conversion still failed after cleaning for {i} molecule\n{e}") # i <-> l
                continue

        # Record duplicates: the position this molecule is about to occupy in
        # the output lists is the current length of those lists.
        kept_idx = len(all_smiles)
        isomers.setdefault(smiles, []).append(kept_idx)

        # Keep data
        all_atoms.append(atoms)
        all_rings.extend(rings)
        all_ring_counts.append(ring_count)
        all_coords.append(coords)
        all_smiles.append(smiles)
        all_selfies.append(selfies)

    return all_atoms, all_rings, all_ring_counts, all_coords, all_smiles, all_selfies, isomers

def is_physically_fragmented(mol, max_dist=3.0):
    """Tell whether the atoms of `mol` form more than one spatial cluster.

    Two atoms count as linked when the distance between their conformer
    positions is strictly less than `max_dist`. The molecule is reported as
    fragmented when the resulting graph has more than one connected component.

    Args:
        mol: Object exposing ``GetConformer()`` and ``GetNumAtoms()``; the
            conformer must expose ``GetAtomPosition(index)``.
        max_dist: Distance threshold below which two atoms are linked.

    Returns:
        True when more than one connected component is found.
    """
    from scipy.sparse.csgraph import connected_components

    conformer = mol.GetConformer()
    atom_total = mol.GetNumAtoms()
    xyz = np.array([conformer.GetAtomPosition(idx) for idx in range(atom_total)])

    # Pairwise displacement vectors via broadcasting, then their lengths.
    deltas = xyz[:, None, :] - xyz[None, :, :]
    pair_dist = np.linalg.norm(deltas, axis=-1)

    # Boolean adjacency: an edge wherever two atoms are closer than max_dist.
    linked = pair_dist < max_dist
    cluster_total, _cluster_ids = connected_components(linked)
    return cluster_total > 1

def clean_problematic_smiles(smiles, remove_h=True):
    """Clean a SMILES string so that SELFIES conversion can be retried.

    Two rewrites are applied, in order:

    1. Bracket atoms carrying a non-tetrahedral stereo tag (``@TB``, ``@OH``
       or ``@SP`` followed by any word characters), e.g. ``[S@TB19]``,
       ``[P@TB2]``, ``[S@OH18]``, ``[S@SP3]``, ``[S@SP3H]``, are reduced to
       the element alone. Organic-subset elements are written bare (``S``),
       as before; any other element keeps its brackets (``[As]``), because a
       bare symbol outside the organic subset is not valid SMILES.
    2. When `remove_h` is true, ``[XH]`` becomes ``[X]`` for a one- or
       two-letter element symbol X (e.g. ``[SeH]`` -> ``[Se]``). Only a
       single hydrogen is matched; ``[XH2]`` is left untouched.

    Ordinary tetrahedral stereo such as ``[C@H]`` / ``[C@@H]`` is preserved.

    Args:
        smiles: SMILES string to clean.
        remove_h: Whether to apply step 2. Defaults to True.

    Returns:
        The cleaned SMILES string.
    """
    # Symbols that may be written without brackets in SMILES
    # (upper-case and their aromatic lower-case forms).
    bare_ok = {
        "B", "C", "N", "O", "P", "S", "F", "Cl", "Br", "I",
        "b", "c", "n", "o", "p", "s",
    }

    def _strip_custom_tag(match):
        symbol = match.group(1)
        return symbol if symbol in bare_ok else f"[{symbol}]"

    # 1. Remove custom stereo tags
    pattern_custom = r"\[([A-Za-z]+)@(?:TB|OH|SP)\w*\]"
    smiles = re.sub(pattern_custom, _strip_custom_tag, smiles)

    # 2. Replace [XH] -> [X] (keep brackets), for any element X except H
    if remove_h:
        pattern_XH = r"\[([A-Z][a-z]?)H\]"
        smiles = re.sub(pattern_XH, r"[\1]", smiles)

    return smiles

def find_indices(lst, target):
    """Return, in ascending order, every index of `lst` whose element equals `target`."""
    positions = []
    for pos, item in enumerate(lst):
        if item == target:
            positions.append(pos)
    return positions

def save_everything(output_dir):
    """Write the processed data to two HDF5 files inside `output_dir`.

    The data is not passed in: this function reads the module-level names
    ``all_atoms``, ``all_coords``, ``all_smiles``, ``all_selfies``,
    ``isomers``, ``all_rings``, ``ring_counts`` and ``f_id``, which must be
    defined before it is called.

    Files written (``f_id`` is interpolated into the names):

    * ``mol3d_mil{f_id}.h5`` -- datasets ``atoms`` ('|'-joined symbols),
      ``coords`` (flattened float32), ``smiles`` and ``selfies``, plus the
      ``isomers`` mapping as a JSON string attribute.
    * ``mol3d_feat_mil{f_id}.h5`` -- datasets ``rings`` (one int32 array per
      ring) and ``nrings`` (ring count per molecule).

    Args:
        output_dir: Destination directory; created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)

    # --- Main file: atoms, coordinates, SMILES, SELFIES, isomer map ---
    mol_path = os.path.join(output_dir, f"mol3d_mil{f_id}.h5")
    with h5py.File(mol_path, "w") as h5:
        print("Saving atom types...")
        text_dt = h5py.special_dtype(vlen=str)
        atoms_ds = h5.create_dataset('atoms', (len(all_atoms),), dtype=text_dt)
        for row, symbols in enumerate(all_atoms):
            # One string per molecule, symbols separated by '|'.
            atoms_ds[row] = '|'.join(symbols)

        print("Saving coords...")
        float_vlen_dt = h5py.special_dtype(vlen=np.dtype('float32'))
        coords_ds = h5.create_dataset('coords', (len(all_coords),), dtype=float_vlen_dt)
        for row, xyz in enumerate(all_coords):
            # Stored as a flat 1-D float32 vector per molecule.
            coords_ds[row] = np.array(xyz, dtype=np.float32).reshape(-1)

        print("Saving SMILES...")
        smiles_ds = h5.create_dataset('smiles', (len(all_smiles),), dtype=text_dt)
        for row, text in enumerate(all_smiles):
            smiles_ds[row] = text

        print("Saving SELFIES...")
        selfies_ds = h5.create_dataset('selfies', (len(all_selfies),), dtype=text_dt)
        for row, text in enumerate(all_selfies):
            selfies_ds[row] = text

        print("Saving isomers...")
        h5.attrs["isomers"] = json.dumps(isomers)

    # --- Feature file: ring atom indices and per-molecule ring counts ---
    feat_path = os.path.join(output_dir, f"mol3d_feat_mil{f_id}.h5")
    with h5py.File(feat_path, "w") as h5:
        print("Saving rings...")
        int_vlen_dt = h5py.special_dtype(vlen=np.dtype('int32'))
        rings_ds = h5.create_dataset("rings", (len(all_rings),), dtype=int_vlen_dt)
        for row, ring_atoms in enumerate(all_rings):
            rings_ds[row] = np.array(ring_atoms, dtype=np.int32)
        h5.create_dataset("nrings", data=np.array(ring_counts, dtype=np.int32))

In [11]:
# Raw SDF chunks, named by the (start, end) molecule range each one covers.
_raw_dir = r"D:\molecule3d\data\data\raw"
_chunk_bounds = [
    (0, 1000000),
    (1000000, 2000000),
    (2000000, 3000000),
    (3000000, 3899647),
]
files = [
    _raw_dir + "\\" + f"combined_mols_{start}_to_{end}.sdf"
    for start, end in _chunk_bounds
]

In [8]:
########## SELFIES upgrades ##########
# Extend the SELFIES semantic constraints (maximum bond count per atom /
# charge state) with additional main-group elements and transition metals,
# then install them globally via sf.set_semantic_constraints.
default_constraints = sf.get_semantic_constraints()
custom_constraints = {
    # Original constraints (keeping existing working ones)
    'H': 1, 'F': 1, 'Cl': 1, 'Br': 1, 'I': 1,
    'B': 3, 'B+1': 2, 'B-1': 4,
    'O': 2, 'O+1': 3, 'O-1': 1,
    'N': 3, 'N+1': 4, 'N-1': 2,
    'C': 4, 'C+1': 5, 'C-1': 3,
    'P': 5, 'P+1': 6, 'P-1': 4,
    'S': 6, 'S+1': 7, 'S-1': 5,
    '?': 8,

    # Group 1 - Alkali metals (neutral forms for organometallic compounds)
    'Li': 1,  # For organolithium compounds
    'Na': 1,  # For organosodium compounds
    'K': 1,   # For organopotassium compounds

    # Group 2 - Alkaline earth metals (neutral and +1 for organo compounds)
    'Be': 2,     # Beryllium in organometallic complexes
    'Be+1': 3,   # Be+ can form 3 bonds in some complexes
    'Mg': 2,     # Grignard reagents, etc.
    'Ca': 2,     # Organocalcium compounds

    # Group 13 - Boron group
    'Al': 3,     # Organoaluminum compounds
    'Ga': 3,     # Organogallium compounds

    # Group 14 - Carbon group
    'Si': 4, 'Si-4': 4,  # Silanes, silicides
    'Ge': 4, 'Ge+2': 2,  # Germanes, Ge(II) compounds

    # Group 15 - Nitrogen group
    'As': 3, 'As+3': 6, 'As+5': 6, 'As-3': 3,  # Arsines, arsenates, arsenides

    # Group 16 - Oxygen group
    'Se': 6, 'Se+1': 7, 'Se-1': 5 , 'Se+4': 4, 'Se+6': 6, 'Se-2': 2,  # Selenides, selenates

    # Transition metals
    # Scandium
    'Sc': 3,     # Organoscandium compounds

    # Titanium
    'Ti': 4, 'Ti+2': 6, 'Ti+3': 6,  # Ti(II), Ti(III), Ti(IV) organometallics

    # Vanadium
    'V': 5, 'V+2': 6, 'V+3': 6, 'V+4': 6,  # Various V oxidation states in complexes

    # Chromium
    'Cr': 6, 'Cr+2': 6, 'Cr+3': 6,  # Cr(II), Cr(III), Cr(VI) compounds

    # Manganese
    'Mn': 7, 'Mn+2': 6, 'Mn+3': 6, 'Mn+4': 6,  # Various Mn oxidation states

    # Iron
    'Fe': 6, 'Fe+2': 6, 'Fe+3': 6,  # Fe(II), Fe(III) organometallics

    # Cobalt
    'Co': 6, 'Co+2': 6, 'Co+3': 6,  # Co(II), Co(III) complexes

    # Nickel
    'Ni': 4, 'Ni+2': 6,  # Ni(0), Ni(II) organometallics

    # Copper
    'Cu': 4, 'Cu+1': 4, 'Cu+2': 6,  # Cu(I), Cu(II) organometallics

    # Zinc
    'Zn': 4, 'Zn+2':6,  # Organozinc, Zn(II) complexes
}
default_constraints.update(custom_constraints)
sf.set_semantic_constraints(default_constraints)
print(f"Current SELFIES constraints:\n {sf.get_semantic_constraints()}")

# Alphabet as reported by selfies under the constraints installed above.
default_alphabet = sf.get_semantic_robust_alphabet()
# extend with new elements from periodic table
custom_alphabet = {
    # Single bonds - neutral atoms
    '[Li]', '[Be]', '[Na]', '[Mg]', '[Al]', '[Si]', '[K]', '[Ca]',
    '[Sc]', '[Ti]', '[V]', '[Cr]', '[Mn]', '[Fe]', '[Co]', '[Ni]',
    '[Cu]', '[Zn]', '[Ga]', '[Ge]', '[As]', '[Se]',

    # Single bonds - charged species that form covalent bonds
    '[Be+1]', '[Si-4]', '[Ge+2]', '[As+3]', '[As+5]', '[As-3]',
    '[Se+4]', '[Se+6]', '[Se-2]',

    # Single bonds - transition metal oxidation states
    '[Ti+2]', '[Ti+3]', '[V+2]', '[V+3]', '[V+4]',
    '[Cr+2]', '[Cr+3]', '[Mn+2]', '[Mn+3]', '[Mn+4]',
    '[Fe+2]', '[Fe+3]', '[Co+2]', '[Co+3]', '[Ni+2]',
    '[Cu+1]', '[Cu+2]', '[Zn+2]',

    # Double bonds (=) - elements that can form double bonds
    '[=Si]', '[=Ge]', '[=As]', '[=Se]',
    '[=Ti]', '[=V]', '[=Cr]', '[=Mn]', '[=Fe]', '[=Co]', '[=Ni]',
}
# Add each custom symbol itself. The entries already carry their brackets, so
# they are merged as-is (the previous loop inserted the repr of the whole set
# wrapped in brackets instead of the individual symbols).
default_alphabet.update(custom_alphabet)
# NOTE(review): the print below re-queries selfies rather than showing
# `default_alphabet`; the merged set only lives in `default_alphabet`.
# Presumably selfies derives its alphabet from the constraints -- confirm.
print(f"Current SELFIES alphabet:\n {sf.get_semantic_robust_alphabet()}")

Current SELFIES constraints:
 {'H': 1, 'F': 1, 'Cl': 1, 'Br': 1, 'I': 1, 'B': 3, 'B+1': 2, 'B-1': 4, 'O': 2, 'O+1': 3, 'O-1': 1, 'N': 3, 'N+1': 4, 'N-1': 2, 'C': 4, 'C+1': 5, 'C-1': 3, 'P': 5, 'P+1': 6, 'P-1': 4, 'S': 6, 'S+1': 7, 'S-1': 5, '?': 8, 'Li': 1, 'Na': 1, 'K': 1, 'Be': 2, 'Be+1': 3, 'Mg': 2, 'Ca': 2, 'Al': 3, 'Ga': 3, 'Si': 4, 'Si-4': 4, 'Ge': 4, 'Ge+2': 2, 'As': 3, 'As+3': 6, 'As+5': 6, 'As-3': 3, 'Se': 6, 'Se+1': 7, 'Se-1': 5, 'Se+4': 4, 'Se+6': 6, 'Se-2': 2, 'Sc': 3, 'Ti': 4, 'Ti+2': 6, 'Ti+3': 6, 'V': 5, 'V+2': 6, 'V+3': 6, 'V+4': 6, 'Cr': 6, 'Cr+2': 6, 'Cr+3': 6, 'Mn': 7, 'Mn+2': 6, 'Mn+3': 6, 'Mn+4': 6, 'Fe': 6, 'Fe+2': 6, 'Fe+3': 6, 'Co': 6, 'Co+2': 6, 'Co+3': 6, 'Ni': 4, 'Ni+2': 6, 'Cu': 4, 'Cu+1': 4, 'Cu+2': 6, 'Zn': 4, 'Zn+2': 6}
Current SELFIES alphabet:
 {'[Mn+3]', '[=Fe+2]', '[#Ti+2]', '[#Se-1]', '[Cr]', '[=B]', '[#S-1]', '[#V+2]', '[Na]', '[Se+1]', '[Si]', '[O+1]', '[#Mn+2]', '[Cu+2]', '[N]', '[Mn]', '[=C+1]', '[=O+1]', '[=V]', '[Se]', '[Br]', '[=Fe+3]', '[P+1]

In [12]:
# Process each raw SDF chunk in turn and save its results. The unpacked names
# and `f_id` are deliberately module-level: save_everything reads them as
# globals (f_id is 1-based and is interpolated into the output file names).
for f_id in range(1, 5):
    sdf_path = files[f_id - 1]
    (all_atoms, all_rings, ring_counts, all_coords,
     all_smiles, all_selfies, isomers) = process_data(sdf_path)
    save_everything(r"D:\molecule3d\data\processed")

[17:59:55] Explicit valence for atom # 1 P, 6, is greater than permitted
[17:59:55] ERROR: Could not sanitize molecule ending on line 507406
[17:59:55] ERROR: Explicit valence for atom # 1 P, 6, is greater than permitted
[17:59:56] The 2 defining bonds for an atropisomer are co-planar - atoms are: 6 5
[17:59:56] Explicit valence for atom # 0 Al, 4, is greater than permitted
[17:59:56] ERROR: Could not sanitize molecule ending on line 553299
[17:59:56] ERROR: Explicit valence for atom # 0 Al, 4, is greater than permitted
[18:00:00] The 2 defining bonds for an atropisomer are co-planar - atoms are: 2 1
[18:00:00] The 2 defining bonds for an atropisomer are co-planar - atoms are: 0 1
[18:00:00] The 2 defining bonds for an atropisomer are co-planar - atoms are: 2 0
[18:00:09] Explicit valence for atom # 6 Si, 6, is greater than permitted
[18:00:09] ERROR: Could not sanitize molecule ending on line 1209872
[18:00:09] ERROR: Explicit valence for atom # 6 Si, 6, is greater than permitted
[18:

Saving atom types...
Saving coords...
Saving rings...
Saving SMILES...
Saving SELFIES...
Saving isomers...


[19:49:11] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 0
[19:49:14] Both bonds on one end of an atropisomer are on the same side - atoms are: 7 0
[19:49:14] Both bonds on one end of an atropisomer are on the same side - atoms are: 10 5
[19:49:15] Both bonds on one end of an atropisomer are on the same side - atoms are: 10 9
[19:49:16] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 0
[19:49:16] Both bonds on one end of an atropisomer are on the same side - atoms are: 2 0
[19:49:16] The 2 defining bonds for an atropisomer are co-planar - atoms are: 2 1
[19:49:16] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 7
[19:49:16] Both bonds on one end of an atropisomer are on the same side - atoms are: 7 0
[19:49:18] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 4
[19:49:18] Both bonds on one end of an atropisomer are on the same side - atoms are: 0 1
[19:49:20] Both bonds on o

Saving atom types...
Saving coords...
Saving rings...
Saving SMILES...
Saving SELFIES...
Saving isomers...


[21:11:24] The 2 defining bonds for an atropisomer are co-planar - atoms are: 6 7
[21:11:24] The 2 defining bonds for an atropisomer are co-planar - atoms are: 10 6
[21:11:24] The 2 defining bonds for an atropisomer are co-planar - atoms are: 4 9
[21:11:24] The 2 defining bonds for an atropisomer are co-planar - atoms are: 6 11
[21:11:25] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 3
[21:11:25] The 2 defining bonds for an atropisomer are co-planar - atoms are: 6 11
[21:11:26] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 3
[21:11:26] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 5
[21:11:26] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 5
[21:11:27] The 2 defining bonds for an atropisomer are co-planar - atoms are: 9 6
[21:11:27] The 2 defining bonds for an atropisomer are co-planar - atoms are: 0 6
[21:11:27] The 2 defining bonds for an atropisomer are co-planar - 

SMILES -> SELFIES conversion failed for 229957 molecule
SMILES: [O][As@@H]1=CC=CC=C1C1=CC=CC=C1
input violates the currently-set semantic constraints
	SMILES: [O][AsH]1=CC=CC=C1C1=CC=CC=C1
	Errors:
	[[AsH1] with 4 bond(s) - a max. of 2 bond(s) was specified]

SMILES -> SELFIES conversion still failed after cleaning for 229957 molecule
input violates the currently-set semantic constraints
	SMILES: [O][As]1=CC=CC=C1C1=CC=CC=C1
	Errors:
	[[As] with 4 bond(s) - a max. of 3 bond(s) was specified]



[21:14:57] The 2 defining bonds for an atropisomer are co-planar - atoms are: 11 6
[21:15:04] Both bonds on one end of an atropisomer are on the same side - atoms are: 0 1
[21:15:06] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 5
[21:15:10] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 0
[21:15:21] The 2 defining bonds for an atropisomer are co-planar - atoms are: 0 6
[21:15:21] Both bonds on one end of an atropisomer are on the same side - atoms are: 6 13
[21:15:22] Unexpected error hit on line 15881140
[21:15:22] ERROR: moving to the beginning of the next molecule
[21:15:26] Both bonds on one end of an atropisomer are on the same side - atoms are: 5 6
[21:15:51] Both bonds on one end of an atropisomer are on the same side - atoms are: 0 1
[21:16:38] The 2 defining bonds for an atropisomer are co-planar - atoms are: 4 16
[21:17:06] Unexpected error hit on line 22862081
[21:17:06] ERROR: moving to the beginning of the next mo

Saving atom types...
Saving coords...
Saving rings...
Saving SMILES...
Saving SELFIES...
Saving isomers...


[22:28:34] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 0
[22:30:54] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 3
[22:31:08] Both bonds on one end of an atropisomer are on the same side - atoms are: 0 1
[22:31:35] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 4
[22:31:36] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 0
[22:32:38] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 5
[22:33:06] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 4
[22:33:09] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 0
[22:33:11] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 0
[22:34:20] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 4
[22:34:36] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 0
[22:34:40] Both bonds

SMILES -> SELFIES conversion failed for 742996 molecule
SMILES: C1=CC=[AsH2]C=C1
input violates the currently-set semantic constraints
	SMILES: C1=CC=[AsH2]C=C1
	Errors:
	[[AsH2] with 3 bond(s) - a max. of 1 bond(s) was specified]

SMILES -> SELFIES conversion still failed after cleaning for 742996 molecule
input violates the currently-set semantic constraints
	SMILES: C1=CC=[AsH2]C=C1
	Errors:
	[[AsH2] with 3 bond(s) - a max. of 1 bond(s) was specified]



[22:41:18] Explicit valence for atom # 2 Si, 5, is greater than permitted
[22:41:18] ERROR: Could not sanitize molecule ending on line 50210173
[22:41:18] ERROR: Explicit valence for atom # 2 Si, 5, is greater than permitted
[22:41:18] The 2 defining bonds for an atropisomer are co-planar - atoms are: 3 5
[22:41:18] Both bonds on one end of an atropisomer are on the same side - atoms are: 3 4
[22:41:18] The 2 defining bonds for an atropisomer are co-planar - atoms are: 4 6
[22:41:18] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 3
[22:41:19] The 2 defining bonds for an atropisomer are co-planar - atoms are: 0 10
[22:41:19] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 3
[22:41:20] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 0
[22:41:20] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 3
[22:41:20] Both bonds on one end of an atropisomer are on the same side - atoms are

Saving atom types...
Saving coords...
Saving rings...
Saving SMILES...
Saving SELFIES...
Saving isomers...
