In [14]:
import h5py
import json
from collections import Counter
import re
import selfies as sf

# ---- 1. Tokenizers ----
SMILES_REGEX = re.compile(
    r"""(
        Cl|Br|Na|Si|Li             # multi-letter atoms
        | [A-Z][a-z]?              # single-letter atoms
        | \[ [^\]]+ \]             # bracketed expressions
        | @@?                      # stereochemistry @ / @@
        | =|#|\+|-|/|\\             # bonds
        | \(|\)|\.|\*              # parentheses, dot, wildcard
        | \d                       # single-digit ring closures
        | %\d{2}                   # two-digit ring closures
    )""",
    re.X
)

def tokenize_smiles(s):
    return SMILES_REGEX.findall(s)

def tokenize_selfies(s):
    return list(sf.split_selfies(s))

# ---- 2. Build vocab ----
def build_vocab(sequences, tokenizer, reserved_tokens=None):
    reserved_tokens = reserved_tokens or ["<pad>", "<unk>", "<bos>", "<eos>"]
    counter = Counter()

    for seq in sequences:
        counter.update(tokenizer(seq))

    sorted_tokens = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    tokens = reserved_tokens + [tok for tok, _ in sorted_tokens]

    return {
        "tokens": tokens,
        "token_to_id": {t: i for i, t in enumerate(tokens)},
        "id_to_token": {i: t for i, t in enumerate(tokens)},
    }

# ---- 3. Save vocab ----
def save_vocab(vocab, out_path):
    with open(out_path, "w") as f:
        json.dump(vocab, f, indent=2)
    print(f"Saved {len(vocab['tokens'])} tokens -> {out_path}")

# ---- 4. Load vocab ----
def load_vocab(vocab_file):
    with open(vocab_file, "r") as f:
        vocab = json.load(f)
    return vocab

In [1]:
molecule_files = ["mol3d_data/mol3d_mil1.h5",
                  "mol3d_data/mol3d_mil2.h5",
                  "mol3d_data/mol3d_mil3.h5",
                  "mol3d_data/mol3d_mil4.h5"]

In [15]:
smiles = []
for fid in molecule_files:
    with h5py.File(fid, "r") as f:
        for s in f["smiles"]:
            smiles.append(s.decode("utf-8"))
smiles_vocab = build_vocab(smiles, tokenize_smiles)
save_vocab(smiles_vocab, "mol3d_data/smiles_vocab.json")

Saved 167 tokens -> mol3d_data/smiles_vocab.json


In [4]:
selfies = []
for fid in molecule_files:
    with h5py.File(fid, "r") as f:
        for s in f["selfies"]:
            selfies.append(s.decode("utf-8"))
selfies_vocab = build_vocab(selfies, tokenize_selfies)
save_vocab(selfies_vocab, "mol3d_data/selfies_vocab.json")

FileNotFoundError: [Errno 2] No such file or directory: 'vocab/selfies_vocab.json'

In [21]:
vocab = load_vocab("mol3d_data/selfies_vocab.json")
print(vocab["tokens"], '\n')
print(vocab["token_to_id"], '\n')
print(vocab["id_to_token"])

['<pad>', '<unk>', '<bos>', '<eos>', '[C]', '[=C]', '[Branch1]', '[Ring1]', '[N]', '[O]', '[=Branch1]', '[=O]', '[=N]', '[#Branch1]', '[S]', '[Branch2]', '[=Branch2]', '[F]', '[#Branch2]', '[#C]', '[Cl]', '[Ring2]', '[CH0]', '[NH0]', '[NH1]', '[#N]', '[P]', '[O-1]', '[OH0]', '[CH1]', '[N+1]', '[H]', '[Br]', '[Si]', '[SH1]', '[=S]', '[B]', '[=N+1]', '[CH2]', '[=SH1]', '[PH1]', '[SH0]', '[=CH0]', '[BH0]', '[=P]', '[Se]', '[PH0]', '[SiH1]', '[SiH2]', '[Ge]', '[As]', '[=NH0]', '[SiH3]', '[=Si]', '[=PH0]', '[=PH1]', '[BH1]', '[SeH1]', '[GeH3]', '[=Ring1]', '[#SH0]', '[=SiH1]', '[GeH1]', '[GeH2]', '[SH2]', '[PH2]', '[Al]', '[=PH2]', '[AsH1]', '[=SeH1]', '[AsH2]', '[OH1]', '[=SiH2]', '[#CH0]', '[=CH1]', '[NH2]', '[PH3]', '[PH4]', '[AlH1]', '[Ga]', '[SH3]', '[Ti]', '[=PH3]', '[BH2]', '[Li]', '[GeH4]', '[Mg]', '[Na]', '[SiH4]', '[#Ti]', '[=Ti]', '[BeH1]', '[Be]', '[Cu]', '[SeH2]', '[V]', '[Zn]', '[#Sc]', '[#Zn]', '[=Ca]', '[=Ni]', '[=Ring2]', '[AlH2]', '[AlH3]', '[AsH3]', '[CH3]', '[Fe]', '[GaH