In [None]:
import pandas as pd
from library.functions_to_abstract_data import extract_qm9_data
from torch_geometric.datasets import QM9
from rdkit import Chem
import os
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
from tqdm import tqdm

In [None]:
# Turn off RDKit log messages by disabling every log channel that matches
# the 'rdApp.*' pattern.
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

: 

In [None]:
# Compute the RDKit descriptor matrix for the QM9 SMILES strings and pickle it.
#
# Pipeline: load QM9 -> extract a table with a "smiles" column -> parse each
# SMILES with RDKit -> compute every descriptor listed in Descriptors._descList
# -> collect one DataFrame per batch -> concatenate and save.
# Names left at module level for later cells: save_path, dataset, df_qm9,
# smiles, descriptor_names, calc, batch_size, results, n_invalid, df_all.

# --- Config ---
# Destination pickle; its parent directory is created if missing.
save_path = "../data/RDKit/rdkit_only_valid_smiles_qm9.pkl"
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# --- Load data ---
dataset = QM9(root="../data/QM9")
df_qm9 = extract_qm9_data(dataset)

smiles = df_qm9["smiles"]
descriptor_names = [name for name, _ in Descriptors._descList]
calc = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

# --- Process safely ---
batch_size = 1000
results = []
n_invalid = 0  # number of SMILES dropped because no molecule could be built

for i in tqdm(range(0, len(smiles), batch_size)):
    batch = smiles[i:i + batch_size]
    mols, valids = [], []
    for s in batch:
        # Reject blank and non-string entries (e.g. NaN) before calling RDKit:
        # an empty string is parsed into a zero-atom molecule instead of None,
        # and a non-string argument raises rather than returning None, which
        # would abort the whole run.
        if isinstance(s, str) and s.strip():
            mol = Chem.MolFromSmiles(s)
        else:
            mol = None
        if mol is None:
            n_invalid += 1
            # tqdm.write keeps the progress bar intact while reporting.
            tqdm.write(f"⚠️ Invalid SMILES skipped: {s}")
            continue
        mols.append(mol)
        valids.append(s)

    batch_data = []
    for m in mols:
        try:
            batch_data.append(calc.CalcDescriptors(m))
        except Exception as e:
            # Best effort: keep a placeholder row so batch_data stays aligned
            # one-to-one with valids when the SMILES column is attached below.
            tqdm.write(f"⚠️ Skipping molecule due to error: {e}")
            batch_data.append([None] * len(descriptor_names))
    batch_df = pd.DataFrame(batch_data, columns=descriptor_names)
    batch_df["SMILES"] = valids
    results.append(batch_df)
    del batch_df, batch_data, mols  # Free memory

# --- Combine and save ---
if results:
    df_all = pd.concat(results, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty, correctly
    # labelled frame so the save step still succeeds with no input rows.
    df_all = pd.DataFrame(columns=[*descriptor_names, "SMILES"])
df_all.to_pickle(save_path)

print("✅ Descriptor matrix shape:", df_all.shape)
if n_invalid:
    print(f"⚠️ Invalid SMILES skipped in total: {n_invalid}")


⚠️ Invalid SMILES skipped: [H]C1([H])C23([H])OC12([H])C3([H])[H]
⚠️ Invalid SMILES skipped: [H]C([H])([H])C123OC1([H])(C2([H])[H])C3([H])[H]
⚠️ Invalid SMILES skipped: [H]C([H])([H])C1([H])C23([H])OC12([H])C3([H])[H]
⚠️ Invalid SMILES skipped: [H]OC1([H])C23([H])OC12([H])C3([H])[H]
⚠️ Invalid SMILES skipped: [H]C1([H])[C@]2([H])C34([H])OC23([H])[C@@]14[H]
⚠️ Invalid SMILES skipped: [H]N1[C@]2([H])C34([H])OC23([H])[C@@]14[H]
⚠️ Invalid SMILES skipped: [H]C([H])([H])C123OC1(C([H])([H])[H])(C2([H])[H])C3([H])[H]
⚠️ Invalid SMILES skipped: [H]C([H])([H])[C@@]1([H])C23([H])OC12(C([H])([H])[H])C3([H])[H]
⚠️ Invalid SMILES skipped: [H]C#CC123OC1([H])(C2([H])[H])C3([H])[H]
⚠️ Invalid SMILES skipped: [H]C1([H])C23([H])OC12(C#N)C3([H])[H]
⚠️ Invalid SMILES skipped: [H]C(=O)C123OC1([H])(C2([H])[H])C3([H])[H]
⚠️ Invalid SMILES skipped: [H]C([H])([H])C([H])([H])C123OC1([H])(C2([H])[H])C3([H])[H]
⚠️ Invalid SMILES skipped: [H]OC([H])([H])C123OC1([H])(C2([H])[H])C3([H])[H]
⚠️ Invalid SMILES skipped: 

100%|██████████| 129012/129012 [25:58<00:00, 82.78it/s] 
