In [4]:
import h5py
import json
from collections import Counter
import re
import selfies as sf

# ==============================
# 1. Tokenizers
# ==============================

# Regex for tokenizing SMILES strings
# Matches:
# - Single and multi-letter atoms (C, Cl, Br, etc.)
# - Bracketed expressions (e.g., [13CH3], [O-])
# - Stereochemistry symbols (@, @@)
# - Bonds (=, #, +, -, /, \)
# - Parentheses for branching, dots, wildcards (*)
# - Ring closures (1-9, %10-%99)
SMILES_REGEX = re.compile(
    r"""([A-Z][a-z]?               # single and multi-letter atoms
        | \[ [^\]]+ \]             # bracketed expressions
        | @@?                      # stereochemistry @ / @@
        | =|#|\+|-|/|\\            # bonds
        | \(|\)|\.|\*              # parentheses, dot, wildcard
        | \d                       # single-digit ring closures
        | %\d{2}                   # two-digit ring closures
    )""",
    re.X
)

def tokenize_smiles(smiles: str) -> list[str]:
    """
    Tokenizes a SMILES string using SMILES_REGEX.

    Args:
        smiles (str): The SMILES string to tokenize.

    Returns:
        List of tokens with empty strings filtered out.
    """
    return [t for t in SMILES_REGEX.findall(smiles) if t]

def tokenize_selfies(selfies_str: str) -> list[str]:
    """
    Tokenizes a SELFIES string using the selfies library.

    Args:
        selfies_str (str): The SELFIES string to tokenize.

    Returns:
        List of tokens extracted from the SELFIES string.
    """
    return list(sf.split_selfies(selfies_str))


# ==============================
# 2. Vocabulary Building
# ==============================

def build_vocab(
    sequences: list[str], 
    tokenizer, 
    reserved_tokens: list[str] = None
) -> dict:
    """
    Builds a vocabulary dictionary from a list of sequences.

    Args:
        sequences (list[str]): List of SMILES or SELFIES sequences.
        tokenizer (callable): Function to tokenize a sequence.
        reserved_tokens (list[str], optional): Special tokens to prepend. 
            Defaults to ["<pad>", "<unk>", "<bos>", "<eos>"].

    Returns:
        dict: Vocabulary containing:
            - "tokens": list of all tokens
            - "token_to_id": mapping token -> integer ID
            - "id_to_token": mapping integer ID -> token
    """
    reserved_tokens = reserved_tokens or ["<pad>", "<unk>", "<bos>", "<eos>"]
    counter = Counter()

    # Count occurrences of each token across all sequences
    for seq in sequences:
        counter.update(tokenizer(seq))

    # Sort tokens by frequency (descending), then alphabetically
    sorted_tokens = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    # Combine reserved tokens with the sorted token list
    tokens = reserved_tokens + [tok for tok, _ in sorted_tokens]

    # Build bidirectional mappings
    return {
        "tokens": tokens,
        "token_to_id": {t: i for i, t in enumerate(tokens)},
        "id_to_token": {i: t for i, t in enumerate(tokens)},
    }


# ==============================
# 3. Save Vocabulary
# ==============================

def save_vocab(vocab: dict, out_path: str):
    """
    Saves a vocabulary dictionary to a JSON file.

    Args:
        vocab (dict): Vocabulary dictionary (from build_vocab).
        out_path (str): Path to save the JSON file.
    """
    with open(out_path, "w") as f:
        json.dump(vocab, f, indent=2)
    print(f"Saved {len(vocab['tokens'])} tokens -> {out_path}")


# ==============================
# 4. Load Vocabulary
# ==============================

def load_vocab(vocab_file: str) -> dict:
    """
    Loads a vocabulary dictionary from a JSON file.

    Args:
        vocab_file (str): Path to a saved vocabulary JSON file.

    Returns:
        dict: Loaded vocabulary dictionary.
    """
    with open(vocab_file, "r") as f:
        vocab = json.load(f)
    return vocab

In [6]:
molecule_files = ["mol3d_data/mol3d_mil1.h5",
                  "mol3d_data/mol3d_mil2.h5",
                  "mol3d_data/mol3d_mil3.h5",
                  "mol3d_data/mol3d_mil4.h5"]

In [7]:
smiles = []
for fid in molecule_files:
    with h5py.File(fid, "r") as f:
        for s in f["smiles"]:
            smiles.append(s.decode("utf-8"))
smiles_vocab = build_vocab(smiles, tokenize_smiles)
save_vocab(smiles_vocab, "mol3d_data/smiles_vocab.json")

Saved 166 tokens -> mol3d_data/smiles_vocab.json


In [9]:
selfies = []
for fid in molecule_files:
    with h5py.File(fid, "r") as f:
        for s in f["selfies"]:
            selfies.append(s.decode("utf-8"))
selfies_vocab = build_vocab(selfies, tokenize_selfies)
save_vocab(selfies_vocab, "mol3d_data/selfies_vocab.json")

Saved 109 tokens -> mol3d_data/selfies_vocab.json


In [8]:
vocab = load_vocab("mol3d_data/smiles_vocab.json")
print(vocab["tokens"], '\n')
print(vocab["token_to_id"], '\n')
print(vocab["id_to_token"])

['<pad>', '<unk>', '<bos>', '<eos>', 'C', '=', '(', ')', '1', 'N', 'O', '2', '[C@@H]', '[C@H]', 'F', 'S', 'Cl', '3', '[C]', '[N]', '[NH]', '[C@]', '[C@@]', '[N+]', '[O-]', '[O]', '[CH]', '[H]', 'Br', '[Si]', 'P', '[SH]', 'B', '4', '[S@H]', '[CH2]', '[S@@H]', '[P@]', '[S]', '[PH]', '[B]', '[P@@]', '[P]', '[Se]', '5', '[SiH]', '[SiH2]', '[Si@]', '[Si@@]', '[Ge]', '[As]', '[SiH3]', '[Si@@H]', '6', '[S@]', '[S@@]', '[Si@H]', '[P@H]', '[SeH]', '[S@OH18]', '[P@@H]', '[BH]', '[P@TB2]', '[S@TB16]', '[GeH3]', '[P@TB19]', '[S@TB17]', '[P@TB17]', '[PH2]', '[P@TB4]', '[S@TB20]', '[P@TB13]', '[P@TB14]', '[GeH2]', '[GeH]', '[S@SP3H]', '[SH2]', '[P@TB6]', '[S@TB19]', '[S@TB6]', '[P@TB9]', '7', '[P@TB10]', '[S@TB4]', '[P@TB7]', '[P@TB18]', '[S@TB5]', '[Al]', '[S@TB18]', '[S@OH23]', '[S@SP2]', '[AsH]', '[S@TB3]', '[P@TB12]', '[S@OH6]', '[S@TB2]', '[AsH2]', '[P@TB15]', '[P@TB20]', '[PH3]', '[S@OH26]', '[OH]', '[P@TB3]', '[S@TB1]', '[P@TB1]', '[P@TB8]', '[S@OH21]', '[Ti]', '[NH2]', '[P@TB16]', '[S@OH16]'