In [None]:
from library.functions_to_abstract_data import extract_qm9_data
from rdkit.ML.Descriptors import MoleculeDescriptors
from torch_geometric.datasets import QM9
from rdkit.Chem import Descriptors
from rdkit import Chem
from tqdm import tqdm
import pandas as pd
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Turn off RDKit warning messages
# This must run before any cell that parses SMILES; the 'rdApp.*' pattern is
# passed to RDKit's logger switch to disable the matching log channels.
# NOTE(review): presumably added so per-molecule parser messages do not flood
# the output of the descriptor loop — confirm nothing important is hidden.
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

In [3]:
# --- Config ---
# Destination pickle for the descriptor table; its parent directory is created
# up front so the final `to_pickle` cannot fail on a missing folder.
save_path = "../data/RDKit/rdkit_only_valid_smiles_qm9.pkl"
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# --- Load data ---
dataset = QM9(root="../data/QM9")
df_qm9 = extract_qm9_data(dataset)

smiles = df_qm9["smiles"]
gaps = df_qm9['gap']
# The descriptor names are used both to build the calculator and as the
# column labels of every per-batch frame below.
descriptor_names = [name for name, _ in Descriptors._descList]
calc = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

# --- Process safely ---
batch_size = 1000
results = []

for i in tqdm(range(0, len(smiles), batch_size)):
    # Slice BOTH columns by position so every gap stays paired with its own
    # SMILES regardless of df_qm9's index labels. A label lookup such as
    # gaps[i + j] is only correct when the index is exactly 0..n-1; with any
    # other index it mis-pairs rows or raises KeyError.
    batch = smiles.iloc[i:i+batch_size]
    batch_gaps = gaps.iloc[i:i+batch_size]
    mols, valid_smiles, valid_gaps = [], [], []
    for s, gap in zip(batch, batch_gaps):
        mol = Chem.MolFromSmiles(s)
        # MolFromSmiles returns None for SMILES it cannot parse; those
        # molecules are dropped together with their gap value.
        if mol is not None:
            mols.append(mol)
            valid_smiles.append(s)
            valid_gaps.append(gap)
    batch_data = []
    for m in mols:
        try:
            batch_data.append(calc.CalcDescriptors(m))
        except Exception as e:
            # Best effort: keep the row (so it stays aligned with
            # valid_smiles / valid_gaps) but fill every descriptor with None.
            print(f"⚠️ Skipping molecule due to error: {e}")
            batch_data.append([None] * len(descriptor_names))
    batch_df = pd.DataFrame(batch_data, columns=descriptor_names)
    batch_df["SMILES"] = valid_smiles
    batch_df['gaps'] = valid_gaps
    results.append(batch_df)
    # Drop the per-batch names; the frame itself stays alive through `results`,
    # only the intermediate mol objects and raw descriptor rows become collectable.
    del batch_df, batch_data, mols

# --- Combine and save ---
df_all = pd.concat(results, ignore_index=True)
df_all.to_pickle(save_path)

print("✅ Descriptor matrix shape:", df_all.shape)


100%|██████████| 131/131 [15:44<00:00,  7.21s/it]


✅ Descriptor matrix shape: (129012, 219)


In [4]:
# Reload the descriptor table from disk rather than reusing `df_all`, so this
# cell can be run without repeating the descriptor computation.
# NOTE(review): the path is a second copy of `save_path` from the cell that
# writes the file — keep the two in sync.
# NOTE: unpickling can execute arbitrary code; only load files this notebook
# itself produced.
df = pd.read_pickle("../data/RDKit/rdkit_only_valid_smiles_qm9.pkl")
# Bare last expression so the notebook renders the frame as a table.
df

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,SMILES,gaps
0,0.000000,0.000000,0.000000,0.000000,0.359785,0.000000,16.043,12.011,16.031300,8,...,0,0,0,0,0,0,0,0,[H]C([H])([H])[H],13.736308
1,0.000000,0.000000,0.000000,0.000000,0.397555,0.000000,17.031,14.007,17.026549,8,...,0,0,0,0,0,0,0,0,[H]N([H])[H],9.249149
2,0.000000,0.000000,0.000000,0.000000,0.327748,0.000000,18.015,15.999,18.010565,8,...,0,0,0,0,0,0,0,0,[H]O[H],9.836916
3,4.000000,4.000000,4.000000,4.000000,0.332926,1.000000,26.038,24.022,26.015650,10,...,0,1,0,0,0,0,0,0,[H]C#C[H],9.118535
4,6.500000,6.500000,3.500000,3.500000,0.369797,1.000000,27.026,26.018,27.010899,10,...,0,0,0,0,0,0,0,0,[H]C#N,10.329442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129007,2.085648,2.085648,0.962963,0.962963,0.419962,98.666667,120.175,110.095,120.080776,46,...,0,0,0,0,0,0,0,0,[H]C1([H])[C@@]23[N@H+]4[C@@]5([H])[C@]1([H])[...,8.723970
129008,5.924769,5.924769,0.546296,0.546296,0.455786,98.666667,120.151,112.087,120.057515,46,...,0,0,0,0,0,0,0,0,[H]C1([H])[C@@]2([H])[C@@]3([H])[C@@]14O[C@]1(...,9.064113
129009,2.005787,2.005787,0.972222,0.972222,0.305840,98.666667,122.171,112.091,122.083301,46,...,0,0,0,0,0,0,0,0,[H]C1([H])[N@@H+]2[C@@]3([H])[C@]2([H])[C@]2([...,8.035522
129010,2.026620,2.026620,1.009259,1.009259,0.413790,98.666667,120.175,110.095,120.080776,46,...,0,0,0,0,0,0,0,0,[H]C1([H])[N@H+]2[C@@]3([H])[C@@]14C([H])([H])...,8.171579
