In [1]:
# Environment information
import sklearn
import rdkit
import pandas
import numpy

# Report the installed version of every library the analysis depends on,
# one "<label> <version>" line per package, in a fixed order.
for _label, _module in (
    ("scikit-learn:", sklearn),
    ("RDKit:", rdkit),
    ("pandas:", pandas),
    ("numpy:", numpy),
):
    print(_label, _module.__version__)

scikit-learn: 1.6.1
RDKit: 2025.03.2
pandas: 2.2.3
numpy: 2.2.6


In [None]:
# Input CSV format: [ID, SMILES_A, SMILES_B, Yield]
"""
Code S1. Generation of Morgan fingerprints (radius = 2, nBits = 2048) from SMILES
and construction of the ML dataset sheet.

Input  : CSV containing substrate SMILES in column 2 (A) and column 3 (B)
Output : CSV in which columns 1–4 are retained from the input file, followed by
         Morgan fingerprints of A (2048 bits) and B (2048 bits).

Notes
- RDKit Morgan fingerprints were generated as bit vectors (0/1).
- If SMILES parsing fails, the corresponding fingerprint is filled with NaN.
- This script assumes the input CSV has no header (header=None). If your input
  has a header row, set header=0 and adjust column indices accordingly.
"""

import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import AllChem

# ==========================
# Settings
# ==========================
# Path of the CSV that is read (without a header row) to build the dataset.
INPUT_CSV  = "ml_redam.csv"
# Path of the CSV that receives the metadata columns plus both fingerprints.
OUTPUT_CSV = "ml_redam_ds.csv"

# Positional (iloc) indices of the SMILES columns inside the input CSV.
SMILES_COL_A = 1  # 0-based index: column 2 in the original file
SMILES_COL_B = 2  # 0-based index: column 3 in the original file

# Morgan fingerprint parameters; used as the default arguments of
# smiles_to_morgan_bits(). NBITS also fixes the number of Morgan_A_* and
# Morgan_B_* columns created for the output table.
RADIUS = 2
NBITS  = 2048

# ==========================
# Helper
# ==========================
# Cache of RDKit Morgan generators keyed by (radius, n_bits), so that one
# generator is reused for every molecule instead of being rebuilt per call.
_MORGAN_GENERATORS = {}


def _get_morgan_generator(radius: int, n_bits: int):
    """Return a cached RDKit Morgan fingerprint generator for (radius, n_bits)."""
    key = (radius, n_bits)
    if key not in _MORGAN_GENERATORS:
        # Local import: only this helper needs the generator module.
        from rdkit.Chem import rdFingerprintGenerator

        _MORGAN_GENERATORS[key] = rdFingerprintGenerator.GetMorganGenerator(
            radius=radius, fpSize=n_bits
        )
    return _MORGAN_GENERATORS[key]


def smiles_to_morgan_bits(
    smiles: str, radius: int = RADIUS, n_bits: int = NBITS
) -> list[float]:
    """
    Convert a SMILES string to a Morgan fingerprint bit vector (length = n_bits).

    The fingerprint is computed with RDKit's ``rdFingerprintGenerator`` API.
    The older ``AllChem.GetMorganFingerprintAsBitVect`` function is deprecated
    and logs a deprecation warning for every molecule in recent RDKit releases;
    with default settings the generator produces the same bit vector.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule. Non-string values (e.g. NaN from a
        missing CSV cell) and blank strings are treated as unparsable.
    radius : int
        Morgan fingerprint radius.
    n_bits : int
        Length of the folded bit vector.

    Returns
    -------
    list[float]
        List of 0.0/1.0 values of length ``n_bits``. If the input is missing
        or SMILES parsing fails, a NaN-filled list of the same length.
    """
    if not isinstance(smiles, str) or not smiles.strip():
        return [np.nan] * n_bits

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit returns None (rather than raising) for invalid SMILES.
        return [np.nan] * n_bits

    fp = _get_morgan_generator(radius, n_bits).GetFingerprint(mol)
    # Convert "0101..." string to numeric list [0.0, 1.0, 0.0, 1.0, ...]
    return [float(ch) for ch in fp.ToBitString()]

# ==========================
# Main
# ==========================
# Build the ML dataset: read the input CSV, fingerprint both substrate
# columns, report any SMILES that could not be converted, and write the
# combined table to OUTPUT_CSV.
t0 = time.time()
print(f"Building dataset: {INPUT_CSV} -> {OUTPUT_CSV}")

# Load (assumes no header row)
df = pd.read_csv(INPUT_CSV, header=None)

# Keep first four columns as-is (adjust if needed)
df_meta = df.iloc[:, 0:4].copy()

# Extract SMILES for substrates A and B
smiles_A = df.iloc[:, SMILES_COL_A]
smiles_B = df.iloc[:, SMILES_COL_B]

print("Converting substrate A SMILES to Morgan fingerprints...")
fps_A = [smiles_to_morgan_bits(s) for s in tqdm(smiles_A, desc="A", leave=False)]

print("Converting substrate B SMILES to Morgan fingerprints...")
fps_B = [smiles_to_morgan_bits(s) for s in tqdm(smiles_B, desc="B", leave=False)]

# Build fingerprint DataFrames
df_fp_A = pd.DataFrame(fps_A, columns=[f"Morgan_A_{i}" for i in range(NBITS)])
df_fp_B = pd.DataFrame(fps_B, columns=[f"Morgan_B_{i}" for i in range(NBITS)])

# Report rows whose SMILES was missing or unparsable. Such rows carry an
# all-NaN fingerprint; without this message they would enter the dataset
# silently and only surface later as NaN errors during model training.
for _label, _df_fp in (("A", df_fp_A), ("B", df_fp_B)):
    _failed_rows = _df_fp.index[_df_fp.isna().all(axis=1)].tolist()
    if _failed_rows:
        _shown = ", ".join(str(i) for i in _failed_rows[:20])
        _more = ", ..." if len(_failed_rows) > 20 else ""
        print(
            f"Warning: {len(_failed_rows)} substrate {_label} SMILES could not be "
            f"converted (fingerprint filled with NaN); "
            f"0-based row indices: {_shown}{_more}"
        )

# Concatenate: meta + FP(A) + FP(B)
df_out = pd.concat([df_meta, df_fp_A, df_fp_B], axis=1)

# Save
# - header=True is recommended for SI reproducibility.
#   If you must match your legacy pipeline, set header=False.
df_out.to_csv(OUTPUT_CSV, index=False, header=True)

elapsed = time.time() - t0
print(f"Done. Output saved to: {OUTPUT_CSV}")
print(f"Elapsed time (s): {elapsed:.1f}")
