In [2]:
import os
import tensorflow as tf
import numpy as np
from rdkit import Chem
from tqdm import tqdm

# Suppress non-critical RDKit warnings
from rdkit import rdBase
rdBase.DisableLog('rdApp.warning')
rdBase.DisableLog('rdApp.error')

# --- Global Constants (MUST match your pre-training notebook) ---
# Path to the text file of SMILES strings; the loader below reads it one line per molecule.
# Update this to wherever your pubchem_smiles_for_pretraining.txt is located.
SMILES_FILE_PATH = 'pubchem_smiles_for_pretraining.txt' 

# MAX_NODES: number of rows every node-feature matrix is zero-padded to.
# It must be at least the largest atom count of any molecule you process, because a
# molecule with more atoms than MAX_NODES cannot be padded to this size.
# 419 was measured on a 100k sample. For 1M samples the true maximum may be slightly
# higher, so re-run a max-atom-count check on the larger sample if you need to be precise.
MAX_NODES = 419 

NUM_ATOM_FEATURES = 5 # Length of the vector returned by atom_to_feature_vector
MAX_SMILES_LEN = 256 # Token sequences are padded/truncated to this length (Transformer input length).

# --- Configuration for this preprocessing run ---
SAMPLES_TO_PROCESS_IN_THIS_RUN = 1_000_000 # Upper bound on lines read from SMILES_FILE_PATH
OUTPUT_TFRECORD_DIR = 'pubchem_tfrecords_1M' # Directory that receives the TFRecord shards
NUM_SHARDS = 100 # Number of TFRecord files; valid samples are distributed round-robin across them

# Create output directory if it doesn't exist (no error if it already does)
os.makedirs(OUTPUT_TFRECORD_DIR, exist_ok=True)

# Echo the effective configuration so it is recorded in the cell output.
print(f"Preprocessing SMILES from: {SMILES_FILE_PATH}")
print(f"Targeting {SAMPLES_TO_PROCESS_IN_THIS_RUN} samples.")
print(f"Output TFRecords will be saved to: {OUTPUT_TFRECORD_DIR} in {NUM_SHARDS} shards.")
print(f"Configured MAX_NODES: {MAX_NODES}, NUM_ATOM_FEATURES: {NUM_ATOM_FEATURES}, MAX_SMILES_LEN: {MAX_SMILES_LEN}")

Preprocessing SMILES from: pubchem_smiles_for_pretraining.txt
Targeting 1000000 samples.
Output TFRecords will be saved to: pubchem_tfrecords_1M in 100 shards.
Configured MAX_NODES: 419, NUM_ATOM_FEATURES: 5, MAX_SMILES_LEN: 256


In [3]:
# --- Helper Functions (Copied from your pre-training notebook) ---

# --- SMILES Tokenization ---
def build_smiles_vocab(smiles_list, max_vocab_size=None):
    """Build a character-level vocabulary from SMILES strings.

    The vocabulary starts with the four special tokens
    ``<pad>``, ``<unk>``, ``<cls>``, ``<eos>`` (indices 0-3), followed by every
    distinct character seen in ``smiles_list`` in sorted order.

    Args:
        smiles_list: Iterable of SMILES strings.
        max_vocab_size: If truthy, the vocabulary (special tokens included)
            is cut down to its first ``max_vocab_size`` entries.

    Returns:
        Tuple ``(vocab, char_to_idx, idx_to_char)`` where ``vocab`` is the
        ordered token list and the two dicts map token -> index and
        index -> token respectively.
    """
    special_tokens = ['<pad>', '<unk>', '<cls>', '<eos>']
    observed_chars = {ch for smiles in smiles_list for ch in smiles}

    vocab = special_tokens + sorted(observed_chars)
    if max_vocab_size:
        vocab = vocab[:max_vocab_size]

    idx_to_char = dict(enumerate(vocab))
    char_to_idx = {token: index for index, token in idx_to_char.items()}

    print(f"Built vocabulary of size: {len(vocab)}")
    return vocab, char_to_idx, idx_to_char

def tokenize_smiles(smiles, char_to_idx, max_len):
    """Convert a SMILES string into a fixed-length array of token ids.

    Each character is looked up in ``char_to_idx``; characters that are not
    in the mapping become ``<unk>``. The sequence is truncated to ``max_len``
    or right-padded with ``<pad>`` so the result always has ``max_len`` entries
    (for non-negative ``max_len``).

    Args:
        smiles: SMILES string, tokenized one character at a time.
        char_to_idx: Mapping from token to integer id; must contain
            ``<unk>`` and ``<pad>`` when they are needed.
        max_len: Target sequence length.

    Returns:
        1-D ``np.int32`` array of token ids.
    """
    ids = [char_to_idx.get(ch, char_to_idx['<unk>']) for ch in smiles]
    ids = ids[:max_len]

    shortfall = max_len - len(ids)
    if shortfall > 0:
        # Only short sequences need (and look up) the padding id.
        ids.extend([char_to_idx['<pad>']] * shortfall)

    return np.array(ids, dtype=np.int32)

def create_smiles_mask(token_ids, pad_token_id):
    """Return a boolean tensor that is True at padding positions.

    Args:
        token_ids: Array/tensor of token ids.
        pad_token_id: Id of the ``<pad>`` token.

    Returns:
        ``tf.bool`` tensor of the same shape as ``token_ids`` with True where
        the token equals ``pad_token_id``.
    """
    is_padding = token_ids == pad_token_id
    return tf.cast(is_padding, tf.bool)


# --- SMILES to TensorFlow Graph Conversion ---
def atom_to_feature_vector(atom):
    """Encode one atom as a length-5 float32 feature vector.

    Feature order: atomic number, degree, hybridization (cast to int),
    aromaticity flag (0/1), formal charge.

    Args:
        atom: Object exposing ``GetAtomicNum``, ``GetDegree``,
            ``GetHybridization``, ``GetIsAromatic`` and ``GetFormalCharge``
            (an RDKit atom in this notebook).

    Returns:
        ``np.float32`` array of shape ``(5,)``.
    """
    descriptors = (
        atom.GetAtomicNum(),
        atom.GetDegree(),
        int(atom.GetHybridization()),
        int(atom.GetIsAromatic()),
        atom.GetFormalCharge(),
    )
    return np.array(descriptors, dtype=np.float32)

def _empty_graph():
    """Return the sentinel ``(nodes, edges, 0, 0)`` tuple for an unusable molecule."""
    return (np.zeros((0, NUM_ATOM_FEATURES), dtype=np.float32),
            np.zeros((0, 2), dtype=np.int32),
            0,
            0)


def smiles_to_tf_graph(smiles_string):
    """Convert a SMILES string into node features and a directed edge list.

    Args:
        smiles_string: SMILES text handed to ``Chem.MolFromSmiles``.

    Returns:
        Tuple ``(node_features, edge_indices, num_nodes, num_edges)``:
          * ``node_features``: float32 array, one row per atom (built with
            ``atom_to_feature_vector``).
          * ``edge_indices``: int32 array of shape ``(num_edges, 2)``; every
            bond contributes both ``[i, j]`` and ``[j, i]``.
          * ``num_nodes`` / ``num_edges``: row counts of the two arrays.
        If the SMILES cannot be parsed or the molecule has no atoms, both
        arrays have zero rows and both counts are 0.
    """
    mol = Chem.MolFromSmiles(smiles_string)
    if mol is None:
        return _empty_graph()

    atom_rows = [atom_to_feature_vector(atom) for atom in mol.GetAtoms()]
    if not atom_rows:
        return _empty_graph()
    node_features = np.array(atom_rows, dtype=np.float32)
    num_nodes = len(node_features)

    # Store each bond in both directions so the edge list is symmetric.
    edge_pairs = []
    for bond in mol.GetBonds():
        begin = bond.GetBeginAtomIdx()
        end = bond.GetEndAtomIdx()
        edge_pairs.append([begin, end])
        edge_pairs.append([end, begin])

    # reshape keeps the (0, 2) shape for molecules without bonds (e.g. a single atom),
    # where np.array([]) alone would produce shape (0,).
    edge_indices = np.array(edge_pairs, dtype=np.int32).reshape(-1, 2)

    return node_features, edge_indices, num_nodes, len(edge_indices)


# Define the featurization function that returns a flat tuple of tensors
def featurize_smiles_and_graph(smiles_string):
    """Featurize one SMILES string into graph arrays plus token ids and mask.

    Relies on the module-level ``char_to_idx`` vocabulary, which must be built
    before this function is called.

    Args:
        smiles_string: SMILES text for a single molecule.

    Returns:
        Flat tuple ``(node_features, edge_indices, num_nodes, num_edges,
        token_ids, mask)``. ``node_features`` is always zero-padded to
        ``(MAX_NODES, NUM_ATOM_FEATURES)``.

        When the molecule is unusable -- it could not be converted to a graph,
        or it has more than ``MAX_NODES`` atoms and therefore cannot be padded
        to the fixed size -- the graph part is an all-zero sentinel with
        ``num_nodes == 0`` and ``num_edges == 0``. Callers are expected to
        filter on ``num_nodes > 0``.
    """
    import warnings  # local import: only needed for the oversized-molecule notice

    token_ids = tokenize_smiles(smiles_string, char_to_idx, MAX_SMILES_LEN)
    mask = create_smiles_mask(token_ids, char_to_idx['<pad>'])

    node_features, edge_indices, num_nodes, num_edges = smiles_to_tf_graph(smiles_string)

    if num_nodes > MAX_NODES:
        # np.pad would be given a negative pad width and raise; truncating instead
        # would leave edges pointing at dropped atoms. Treat the molecule as invalid.
        # The message is constant so the default warning filter reports it only once.
        warnings.warn(
            f"Skipping molecule(s) with more than MAX_NODES={MAX_NODES} atoms; "
            "increase MAX_NODES to include them."
        )
        num_nodes = 0

    if num_nodes == 0:
        dummy_node_features = np.zeros((MAX_NODES, NUM_ATOM_FEATURES), dtype=np.float32)
        dummy_edge_indices = np.zeros((0, 2), dtype=np.int32)
        return (dummy_node_features, dummy_edge_indices, 0, 0, token_ids, mask)

    # Pad only along the atom axis; the feature axis is left untouched.
    padded_node_features = np.pad(node_features, [[0, MAX_NODES - num_nodes], [0, 0]])

    return (padded_node_features, edge_indices, num_nodes, num_edges, token_ids, mask)

In [4]:
# --- TFRecord Serialization Functions ---
def _bytes_feature(value):
    """Wrap a value in a ``tf.train.Feature`` holding a single bytes entry.

    ``tf.Tensor`` inputs are first converted with ``.numpy()``; NumPy arrays
    are then flattened to their raw buffer with ``.tobytes()``, so shape and
    dtype are not stored alongside the data. Any other value is passed
    through unchanged.
    """
    payload = value.numpy() if isinstance(value, tf.Tensor) else value
    if isinstance(payload, np.ndarray):
        payload = payload.tobytes()
    bytes_list = tf.train.BytesList(value=[payload])
    return tf.train.Feature(bytes_list=bytes_list)

def _int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` with an int64 list."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)

def serialize_pretraining_example(node_feat_padded, edge_idx, num_nodes, num_edges, token_ids, smiles_mask):
    """Pack one featurized molecule into a ``tf.train.Example``.

    Array-like arguments are stored as bytes features via ``_bytes_feature``;
    the two counts are stored as int64 features via ``_int64_feature``.

    Args:
        node_feat_padded: Padded node-feature array.
        edge_idx: Edge index array.
        num_nodes: Number of real (unpadded) nodes.
        num_edges: Number of directed edges.
        token_ids: Token id array for the SMILES string.
        smiles_mask: Padding mask for ``token_ids``.

    Returns:
        A ``tf.train.Example``; the caller is responsible for calling
        ``SerializeToString()`` on it.
    """
    feature_map = {
        'node_feat_padded': _bytes_feature(node_feat_padded),
        'edge_idx': _bytes_feature(edge_idx),
        'num_nodes': _int64_feature(num_nodes),
        'num_edges': _int64_feature(num_edges),
        'token_ids': _bytes_feature(token_ids),
        'smiles_mask': _bytes_feature(smiles_mask),
    }
    features_proto = tf.train.Features(feature=feature_map)
    return tf.train.Example(features=features_proto)



In [5]:
# --- Main Preprocessing Loop and TFRecord Saving ---
if __name__ == "__main__":
    print("Starting PubChem preprocessing script...")

    # Load SMILES data for the target number of samples.
    # Iteration stops at SAMPLES_TO_PROCESS_IN_THIS_RUN lines, so the rest of the file is never read.
    all_smiles_list = []
    with open(SMILES_FILE_PATH, 'r') as f:
        for i, line in enumerate(tqdm(f, desc="Loading SMILES")):
            if i >= SAMPLES_TO_PROCESS_IN_THIS_RUN:
                break
            all_smiles_list.append(line.strip())
    print(f"Loaded {len(all_smiles_list)} SMILES strings from file for processing.")

    # Build vocabulary from the loaded SMILES list.
    # These are module-level names on purpose: featurize_smiles_and_graph reads
    # char_to_idx as a global, so it must be assigned before the loop below runs.
    vocab, char_to_idx, idx_to_char = build_smiles_vocab(all_smiles_list)
    VOCAB_SIZE = len(vocab)
    print(f"Vocabulary built with {VOCAB_SIZE} tokens.")

    writers = []
    num_processed = 0
    try:
        # Open one writer per shard. Done inside the try so that a failure while
        # opening shard k still closes shards 0..k-1.
        for shard_idx in range(NUM_SHARDS):
            shard_path = os.path.join(OUTPUT_TFRECORD_DIR, f'pubchem_shard_{shard_idx:03d}.tfrecord')
            writers.append(tf.io.TFRecordWriter(shard_path))

        # Featurize each SMILES and write the valid ones round-robin across the shards.
        for smiles_str in tqdm(all_smiles_list, desc="Featurizing & Saving to TFRecords"):
            processed_data = featurize_smiles_and_graph(smiles_str)

            # Filter out invalid samples (where num_nodes was 0).
            # num_nodes is the 3rd element (index 2) of the returned flat tuple.
            if processed_data[2] > 0:
                example = serialize_pretraining_example(*processed_data)
                # Index by the count of written samples (not the input index) so
                # skipped molecules do not unbalance the shards.
                writers[num_processed % NUM_SHARDS].write(example.SerializeToString())
                num_processed += 1
    finally:
        # Always flush and close the shard files, even if featurization raised part-way.
        for writer in writers:
            writer.close()

    print(f"Finished featurizing and saving {num_processed} valid samples to {NUM_SHARDS} TFRecord shards in {OUTPUT_TFRECORD_DIR}.")
    print("PubChem preprocessing script finished.")

Starting PubChem preprocessing script...


Loading SMILES: 1000000it [00:00, 3626185.84it/s]


Loaded 1000000 SMILES strings from file for processing.
Built vocabulary of size: 71
Vocabulary built with 71 tokens.


Featurizing & Saving to TFRecords:   0%|          | 0/1000000 [00:00<?, ?it/s]2025-07-01 04:20:35.641112: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2025-07-01 04:20:35.641148: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-07-01 04:20:35.641159: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-07-01 04:20:35.641199: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-07-01 04:20:35.641210: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
Featurizing & Saving to TFRecords: 100%|██████████| 1000000/1000000 [13:40<00:00, 1218.04it/s]

Finished featurizing and saving 1000000 valid samples to 100 TFRecord shards in pubchem_tfrecords_1M.
PubChem preprocessing script finished.



