In [2]:
!pip install deepchem

Collecting deepchem
  Downloading deepchem-2.8.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdkit (from deepchem)
  Downloading rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading deepchem-2.8.0-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl (34.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: rdkit, deepchem
Successfully installed deepchem-2.8.0 rdkit-2025.3.3


In [3]:
import tensorflow as tf
import deepchem as dc
import numpy as np
from rdkit import Chem
from rdkit import rdBase
import os
from tqdm.auto import tqdm

# Silence RDKit's warning- and error-level log channels so the featurization
# loop does not flood the cell output.
for _rdkit_channel in ('rdApp.warning', 'rdApp.error'):
    rdBase.DisableLog(_rdkit_channel)



In [4]:
# --- CONFIGURATION ---
# MoleculeNet datasets to convert into TFRecord files.
DATASET_NAMES = ['Tox21', 'BBBP', 'ESOL']

# Featurization limits; these MUST match the pre-training setup.
MAX_SMILES_LEN = 256     # token sequences are truncated/padded to this length
MAX_NODES = 419          # molecules with more atoms than this are skipped
NUM_ATOM_FEATURES = 5    # width of each per-atom feature vector

# Destination folder for the generated TFRecord files (created if absent).
OUTPUT_DIR = 'moleculenet_tfrecords_final'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- SINGLE SOURCE OF TRUTH FOR VOCABULARY ---
# Seed strings whose individual characters form the character-level vocabulary.
# Intended to be identical to the pre-training notebook; do not edit in isolation.
DUMMY_SMILES_FOR_VOCAB = [
    "C", "N", "O", "F", "P", "S", "Cl", "Br", "I", "c", "n",
    "=", "#", "(", ")", "[", "]", "@", "+", "-",
    "1", "2", "3", "4", "5", "6", "7", "8", "9", "0",
    "H", "B", "b", "K", "k", "L", "l", "M", "m",
    "R", "r", "X", "x", "Y", "y", "Z", "z",
]
# Four special tokens first, then every distinct seed character in sorted order.
VOCAB = ['<pad>', '<unk>', '<cls>', '<eos>'] + sorted(set("".join(DUMMY_SMILES_FOR_VOCAB)))
# Token -> integer ID, where the ID is the token's position in VOCAB.
CHAR_TO_IDX = dict(zip(VOCAB, range(len(VOCAB))))
print(f"Using a consistent vocabulary of size: {len(VOCAB)}")

Using a consistent vocabulary of size: 49


In [5]:
def tokenize_smiles(smiles, max_len):
    """Encode a SMILES string as a fixed-length array of character token IDs.

    Each character is looked up in CHAR_TO_IDX; characters missing from the
    vocabulary become the '<unk>' ID. The sequence is cut to `max_len`
    characters and right-padded with the '<pad>' ID.

    Args:
        smiles: SMILES string to encode, one token per character.
        max_len: Target sequence length.

    Returns:
        np.ndarray of dtype int32 holding the token IDs.
    """
    unk_id = CHAR_TO_IDX['<unk>']
    pad_id = CHAR_TO_IDX['<pad>']

    # Truncating the characters before the lookup yields the same IDs as
    # truncating the looked-up list afterwards.
    token_ids = [CHAR_TO_IDX.get(symbol, unk_id) for symbol in smiles[:max_len]]

    # Fill the remaining positions (if any) with padding.
    token_ids.extend([pad_id] * (max_len - len(token_ids)))
    return np.array(token_ids, dtype=np.int32)


# --- GRAPH FEATURIZER (From your pre-training notebook) ---
def atom_to_feature_vector(atom):
    """Build the per-atom feature vector used for graph nodes.

    Args:
        atom: Object exposing GetAtomicNum, GetDegree, GetHybridization,
            GetIsAromatic and GetFormalCharge (an RDKit atom in this notebook).

    Returns:
        1-D float32 np.ndarray with five entries, in this order: atomic
        number, degree, hybridization (cast to int), aromatic flag (0/1),
        formal charge.
    """
    descriptors = (
        atom.GetAtomicNum(),
        atom.GetDegree(),
        int(atom.GetHybridization()),  # enum value stored as its integer code
        int(atom.GetIsAromatic()),     # bool -> 0/1
        atom.GetFormalCharge(),
    )
    return np.asarray(descriptors, dtype=np.float32)

def smiles_to_graph_and_tokens(smiles_string, max_nodes, max_len):
    """Featurize one SMILES string into padded graph arrays plus token IDs.

    Args:
        smiles_string: SMILES representation of the molecule.
        max_nodes: Maximum number of atoms; also the padded row count of the
            atom-feature matrix.
        max_len: Length of the padded token-ID sequence.

    Returns:
        None when the SMILES cannot be parsed, when the molecule has no atoms,
        or when it has more than `max_nodes` atoms. Otherwise a 4-tuple:
          * atom features, float32, shape (max_nodes, NUM_ATOM_FEATURES),
            zero-padded after the real atoms;
          * edge index, int32, shape (num_directed_edges, 2), with every bond
            listed in both directions (shape (0, 2) if there are no bonds);
          * number of real atoms, int32, shape (1,);
          * token IDs from tokenize_smiles, int32, length max_len.
    """
    mol = Chem.MolFromSmiles(smiles_string)
    if mol is None:
        return None

    num_nodes = mol.GetNumAtoms()
    # An atom-less molecule (e.g. from an empty SMILES string) would produce a
    # (0,)-shaped feature array that cannot be assigned into the 2-D padded
    # matrix below, so it is skipped like any other unusable molecule.
    if num_nodes == 0 or num_nodes > max_nodes:
        return None

    # Graph features: real atoms first, zero rows as padding.
    atom_features = np.array([atom_to_feature_vector(atom) for atom in mol.GetAtoms()])
    padded_atom_features = np.zeros((max_nodes, NUM_ATOM_FEATURES), dtype=np.float32)
    padded_atom_features[:num_nodes] = atom_features

    # Undirected bonds are stored as two directed edges (i -> j and j -> i).
    edge_indices = []
    for bond in mol.GetBonds():
        i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edge_indices.extend([[i, j], [j, i]])

    if edge_indices:
        edge_index_array = np.array(edge_indices, dtype=np.int32)
    else:
        # Keep the (N, 2) layout even when the molecule has no bonds.
        edge_index_array = np.zeros((0, 2), dtype=np.int32)

    # SMILES features using the consistent tokenizer
    token_ids = tokenize_smiles(smiles_string, max_len)

    return padded_atom_features, edge_index_array, np.array([num_nodes], dtype=np.int32), token_ids


# --- TFRECORD SERIALIZATION ---
def _bytes_feature(value):
    """Wrap a bytes value in a tf.train.Feature holding a single-entry BytesList.

    Args:
        value: Raw bytes, or an eager tensor (same type as `tf.constant(0)`),
            which is converted with `.numpy()` first.

    Returns:
        tf.train.Feature whose bytes_list contains exactly `value`.
    """
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        # BytesList cannot take a tensor directly; extract the underlying value.
        value = value.numpy()
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)

def create_tf_example(atom_features, edge_index, num_nodes, token_ids, label, smiles_str):
    """Pack one molecule's arrays and SMILES text into a tf.train.Example.

    The five array arguments are each serialized with tf.io.serialize_tensor
    and stored as bytes features under the keys 'atom_features', 'edge_index',
    'num_nodes', 'token_ids' and 'label'. The raw SMILES string is stored
    UTF-8 encoded under 'smiles'.

    Args:
        atom_features: Atom feature array.
        edge_index: Edge index array.
        num_nodes: Array holding the atom count.
        token_ids: Token ID array.
        label: Label array.
        smiles_str: SMILES string of the molecule.

    Returns:
        tf.train.Example containing the six bytes features.
    """
    tensor_fields = (
        ('atom_features', atom_features),
        ('edge_index', edge_index),
        ('num_nodes', num_nodes),
        ('token_ids', token_ids),
        ('label', label),
    )

    feature_map = {}
    for key, tensor in tensor_fields:
        feature_map[key] = _bytes_feature(tf.io.serialize_tensor(tensor))

    # Save raw SMILES alongside the arrays for future-proofing.
    feature_map['smiles'] = _bytes_feature(smiles_str.encode('utf-8'))

    return tf.train.Example(features=tf.train.Features(feature=feature_map))

In [8]:
# --- MAIN PROCESSING LOOP ---
def process_and_save_datasets():
    """Load every dataset in DATASET_NAMES and write each split to a TFRecord file.

    For each supported dataset the matching DeepChem MoleculeNet loader is
    called with the 'Raw' featurizer, and the train/valid/test splits are
    written to `<OUTPUT_DIR>/<dataset>_<split>.tfrecord`. Every record holds
    the features produced by create_tf_example plus an extra 'label_weight'
    feature (a serialized float32 tensor with the same layout as 'label').

    'label_weight' carries the per-task weights from `dataset.w`. DeepChem
    encodes a missing label as y = 0 with weight 0, so without the weights a
    missing label cannot be told apart from a true negative downstream.
    Parsers that do not request this key are unaffected.

    Molecules for which smiles_to_graph_and_tokens returns None are skipped;
    the number skipped is reported per split when it is non-zero. Names in
    DATASET_NAMES without a registered loader are reported and skipped.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Supported dataset name -> (MoleculeNet loader, splitter to use).
    loaders = {
        'Tox21': (dc.molnet.load_tox21, 'scaffold'),
        'BBBP': (dc.molnet.load_bbbp, 'scaffold'),
        'ESOL': (dc.molnet.load_delaney, 'random'),
    }

    for name in DATASET_NAMES:
        print(f"\n--- Processing dataset: {name} ---")

        if name not in loaders:
            # Make the skip visible instead of silently producing no files.
            print(f"  ⚠️ No loader registered for '{name}'; skipping.")
            continue

        load_fn, splitter = loaders[name]
        _tasks, datasets, _transformers = load_fn(featurizer='Raw', splitter=splitter)
        train_dataset, valid_dataset, test_dataset = datasets

        for split_name, dataset in [('train', train_dataset), ('valid', valid_dataset), ('test', test_dataset)]:
            output_filename = os.path.join(OUTPUT_DIR, f'{name.lower()}_{split_name}.tfrecord')

            processed_count = 0
            skipped_count = 0
            with tf.io.TFRecordWriter(output_filename) as writer:
                rows = zip(dataset.ids, dataset.y, dataset.w)
                for smiles, label, weight in tqdm(rows, total=len(dataset), desc=f"  Writing {split_name}"):

                    featurized_data = smiles_to_graph_and_tokens(smiles, MAX_NODES, MAX_SMILES_LEN)
                    if featurized_data is None:
                        skipped_count += 1
                        continue

                    atom_f, edge_idx, num_n, token_ids = featurized_data
                    label_np = np.array(label, dtype=np.float32)
                    weight_np = np.array(weight, dtype=np.float32)

                    tf_example = create_tf_example(atom_f, edge_idx, num_n, token_ids, label_np, smiles)
                    # Attach the label weights as an additional feature so that
                    # missing labels (weight 0) can be masked at training time.
                    tf_example.features.feature['label_weight'].CopyFrom(
                        _bytes_feature(tf.io.serialize_tensor(weight_np))
                    )
                    writer.write(tf_example.SerializeToString())
                    processed_count += 1

            print(f"  ✅ Saved {processed_count} molecules to {output_filename}")
            if skipped_count:
                print(f"  ⚠️ Skipped {skipped_count} molecules that could not be featurized")

    print(f"\n--- All datasets processed successfully and saved in '{OUTPUT_DIR}'! ---")

# Run the entire preprocessing pipeline: downloads/loads each dataset in
# DATASET_NAMES and writes one TFRecord file per split into OUTPUT_DIR.
process_and_save_datasets()


--- Processing dataset: Tox21 ---


  Writing train:   0%|          | 0/6258 [00:00<?, ?it/s]

  ✅ Saved 6258 molecules to moleculenet_tfrecords_final/tox21_train.tfrecord


  Writing valid:   0%|          | 0/782 [00:00<?, ?it/s]

  ✅ Saved 782 molecules to moleculenet_tfrecords_final/tox21_valid.tfrecord


  Writing test:   0%|          | 0/783 [00:00<?, ?it/s]

  ✅ Saved 783 molecules to moleculenet_tfrecords_final/tox21_test.tfrecord

--- Processing dataset: BBBP ---


  Writing train:   0%|          | 0/1631 [00:00<?, ?it/s]

  ✅ Saved 1631 molecules to moleculenet_tfrecords_final/bbbp_train.tfrecord


  Writing valid:   0%|          | 0/204 [00:00<?, ?it/s]

  ✅ Saved 204 molecules to moleculenet_tfrecords_final/bbbp_valid.tfrecord


  Writing test:   0%|          | 0/204 [00:00<?, ?it/s]

  ✅ Saved 204 molecules to moleculenet_tfrecords_final/bbbp_test.tfrecord

--- Processing dataset: ESOL ---


  Writing train:   0%|          | 0/902 [00:00<?, ?it/s]

  ✅ Saved 902 molecules to moleculenet_tfrecords_final/esol_train.tfrecord


  Writing valid:   0%|          | 0/113 [00:00<?, ?it/s]

  ✅ Saved 113 molecules to moleculenet_tfrecords_final/esol_valid.tfrecord


  Writing test:   0%|          | 0/113 [00:00<?, ?it/s]

  ✅ Saved 113 molecules to moleculenet_tfrecords_final/esol_test.tfrecord

--- All datasets processed successfully and saved in 'moleculenet_tfrecords_final'! ---
