In [7]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem.AllChem import EmbedMolecule, EmbedMultipleConfs, MMFFOptimizeMoleculeConfs
import pickle
from tqdm import tqdm

In [12]:
def mol_to_morgan(mol, radius=3, n_bits=2048, chiral=False, features=False):
    from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
    return np.frombuffer(
        GetMorganFingerprintAsBitVect(
            mol, radius=radius, nBits=n_bits, useChirality=chiral, useFeatures=features
        ).ToBitString().encode(), "u1") - ord("0")

def mol_to_atoms_coords(
    m, hydrogenate=True, adj_matrix=False, do_morgan=False, optimize=False, numConfs=1, numThreads=1
):
    # Check if m is a string, and if so, convert it to an RDKit molecule object
    if isinstance(m, str):
        m = Chem.MolFromSmiles(m)
        if m is None:
            return None  # Could not parse the SMILES string into a molecule

    m3 = Chem.AddHs(m) if hydrogenate else m
    if optimize and hydrogenate:
        try:
            EmbedMultipleConfs(
                m3, numConfs=numConfs, pruneRmsThresh=0.125, randomSeed=0xF00D, numThreads=numThreads
            )
            opt = MMFFOptimizeMoleculeConfs(m3, mmffVariant="MMFF94s", numThreads=numThreads)
            opt = np.array(opt)
            converged = opt[:, 0] == 0
            if converged.any():
                lowest_eng_conformer = np.argmin(opt[converged][:, 1])
                lowest_energy = opt[converged][lowest_eng_conformer, 1]
                best_conf = np.arange(opt.shape[0])[converged][lowest_eng_conformer]
                c0 = m3.GetConformer(id=int(best_conf))
            else:
                # Fallback in case optimization did not converge
                c0 = m3.GetConformer()
                lowest_energy = None
        except Exception as Ex:
            # Fallback in case of an exception during embedding or optimization
            EmbedMolecule(m3, randomSeed=0xF00D)
            c0 = m3.GetConformers()[-1]
            lowest_energy = None
    else:
        EmbedMolecule(m3, randomSeed=0xF00D)
        c0 = m3.GetConformers()[-1]

    coords = c0.GetPositions()
    atoms = np.array([atom.GetAtomicNum() for atom in m3.GetAtoms()], dtype=np.uint8)

    to_return = [atoms, coords]

    if adj_matrix:
        to_return.append(Chem.GetAdjacencyMatrix(m3))

    if do_morgan:
        to_return.append(mol_to_morgan(m, radius=3, n_bits=2048, chiral=False))

    if optimize:
        to_return.append(lowest_energy)

    return tuple(to_return)

In [9]:
# Load the SMILES data
# Replace 'path_to_your_smiles_data.smiles' with the actual path to your downloaded SMILES file
df_train = pd.read_csv('train_guacamol.smiles', header=None, names=['smiles'])
df_valid = pd.read_csv('valid_guacamol.smiles', header=None, names=['smiles'])
df_test = pd.read_csv('test_guacamol.smiles', header=None, names=['smiles'])

In [14]:
# Preprocess the SMILES data
def preprocess_smiles_data(smiles_data):
    preprocessed_data = []
    for smi in tqdm(smiles_data):
        try:
            atoms, coords = mol_to_atoms_coords(smi)
            preprocessed_data.append({
                'smiles': smi,
                'atoms': atoms,
                'coords': coords
            })
        except Exception as e:
            print(f"Failed to process {smi}: {str(e)}")

    return preprocessed_data

# Save the preprocessed data to a pickle file
with open('train_guacamol.pkl', 'wb') as f:
    pickle.dump(preprocess_smiles_data(df_train['smiles']), f)

with open('valid_guacamol.pkl', 'wb') as f:
    pickle.dump(preprocess_smiles_data(df_valid['smiles']), f)

with open('test_guacamol.pkl', 'wb') as f:
    pickle.dump(preprocess_smiles_data(df_test['smiles']), f)

 62%|██████▏   | 794630/1273104 [18:36:15<32:00:26,  4.15it/s]

Failed to process CCCCCCCCCCCCCC(=O)NC(COC1OC(CO)C(O)C(O)C1O)C(O)CCCCCCCCCCCCC: 'NoneType' object has no attribute 'GetPositions'


 62%|██████▏   | 794825/1273104 [18:36:38<12:58:07, 10.24it/s][11:05:11] UFFTYPER: Unrecognized charge state for atom: 10
 62%|██████▏   | 794830/1273104 [18:36:38<14:50:53,  8.95it/s]

Failed to process CCCCCCCCCCCCCCCC(=O)OCC(COP(=O)(O)OC1C(O)C(O)C(O)C(F)C1O)OC(=O)CCCCCCCCCCCCCCC: 'NoneType' object has no attribute 'GetPositions'


 62%|██████▏   | 795235/1273104 [18:37:19<14:38:57,  9.06it/s]

Failed to process CC(=O)N(C(=O)C=Cc1ccccc1)C(Cc1ccc(F)cc1)C(=O)NC(CCCCN=C(N)N)C(=O)NC(CC(C)C)C(=O)NC(CCCN=C(N)N)C(N)=O: 'NoneType' object has no attribute 'GetPositions'


 62%|██████▏   | 795440/1273104 [18:37:37<7:27:35, 17.79it/s] [11:06:10] UFFTYPER: Unrecognized charge state for atom: 20
 62%|██████▏   | 795515/1273104 [18:37:44<18:53:52,  7.02it/s][11:06:17] UFFTYPER: Unrecognized charge state for atom: 6
 62%|██████▏   | 795571/1273104 [18:37:49<11:03:51, 11.99it/s][11:06:22] UFFTYPER: Unrecognized charge state for atom: 8
 63%|██████▎   | 796011/1273104 [18:38:39<18:57:27,  6.99it/s]

Failed to process C=CC1CC1(NC(=O)C1CC(OC)(c2ccc(-c3ccccn3)cc2)CN1C(=O)C(NC(=O)OC1CCCC1)C(C)(C)C)C(=O)NS(=O)(=O)C1CC1: 'NoneType' object has no attribute 'GetPositions'


 63%|██████▎   | 796240/1273104 [18:39:04<17:16:30,  7.67it/s][11:07:37] UFFTYPER: Unrecognized charge state for atom: 1
 63%|██████▎   | 796311/1273104 [18:39:12<13:06:00, 10.11it/s]

Failed to process COC(=O)C(CCCCNC(=O)OC(C)(C)C)N(C=CCc1cccc(Oc2ccc(C(C)(C)C)cc2)c1)Cc1cccc(OCc2ccccc2)c1: 'NoneType' object has no attribute 'GetPositions'


 63%|██████▎   | 796372/1273104 [18:39:21<25:07:02,  5.27it/s][11:07:54] UFFTYPER: Unrecognized charge state for atom: 2
 63%|██████▎   | 796823/1273104 [18:40:14<24:42:58,  5.35it/s][11:08:47] UFFTYPER: Unrecognized charge state for atom: 19
 63%|██████▎   | 796889/1273104 [18:40:20<13:50:20,  9.56it/s][11:08:53] UFFTYPER: Unrecognized charge state for atom: 1
 63%|██████▎   | 797018/1273104 [18:40:34<17:30:03,  7.56it/s][11:09:07] UFFTYPER: Unrecognized charge state for atom: 14
 63%|██████▎   | 797139/1273104 [18:40:48<12:53:41, 10.25it/s][11:09:21] UFFTYPER: Unrecognized charge state for atom: 4
 63%|██████▎   | 797477/1273104 [18:41:19<9:05:36, 14.53it/s] [11:09:51] UFFTYPER: Unrecognized atom type: S_6+6 (15)
 63%|██████▎   | 797486/1273104 [18:41:19<9:53:46, 13.35it/s]

Failed to process CC(=O)NC(Cc1ccc2ccccc2c1)C(=O)NC(CCCCNC(=O)c1ccccn1)C(=O)NC(CCCN=C(N)N)C(=O)N1CCCC1C(N)=O: 'NoneType' object has no attribute 'GetPositions'


 63%|██████▎   | 797646/1273104 [18:41:33<10:32:02, 12.54it/s][11:10:06] UFFTYPER: Unrecognized atom type: S_6+6 (15)
 63%|██████▎   | 798794/1273104 [18:43:32<12:51:58, 10.24it/s][11:12:05] UFFTYPER: Unrecognized atom type: S_6+6 (16)
 63%|██████▎   | 799107/1273104 [18:44:02<15:13:16,  8.65it/s]

Failed to process CCCC=CCC=CCCC(=O)OC(COC(=O)CCCCCCCCCCCCC)C(O)C(CO)NC(=O)CCCCCCCCCCC: 'NoneType' object has no attribute 'GetPositions'


 63%|██████▎   | 799144/1273104 [18:44:06<13:11:08,  9.98it/s][11:12:39] UFFTYPER: Unrecognized charge state for atom: 5
 63%|██████▎   | 799146/1273104 [18:44:06<14:25:19,  9.13it/s][11:12:39] UFFTYPER: Unrecognized charge state for atom: 15
 63%|██████▎   | 799269/1273104 [18:44:20<12:28:18, 10.55it/s][11:12:52] UFFTYPER: Unrecognized charge state for atom: 8
 63%|██████▎   | 799299/1273104 [18:44:23<17:13:30,  7.64it/s]

Failed to process CCC=CCC=CCC=CCCCCCCCC(=O)OCC(COC1OC(COC2OC(CO)C(O)C(O)C2O)C(O)C(O)C1O)OC(=O)CCCCCCCC=CCCCCCCCC: 'NoneType' object has no attribute 'GetPositions'


 63%|██████▎   | 799399/1273104 [18:44:33<14:41:03,  8.96it/s][11:13:06] UFFTYPER: Unrecognized charge state for atom: 6
 63%|██████▎   | 799501/1273104 [18:44:43<11:46:44, 11.17it/s][11:13:16] UFFTYPER: Unrecognized charge state for atom: 6
 63%|██████▎   | 799664/1273104 [18:44:57<11:56:14, 11.02it/s][11:13:30] UFFTYPER: Unrecognized charge state for atom: 8
 63%|██████▎   | 800049/1273104 [18:45:29<7:32:55, 17.41it/s] [11:14:02] UFFTYPER: Unrecognized atom type: Se2+2 (6)
[11:14:02] UFFTYPER: Unrecognized atom type: Se2+2 (6)
 63%|██████▎   | 800070/1273104 [18:45:31<7:55:12, 16.59it/s] [11:14:04] UFFTYPER: Unrecognized atom type: Se2+2 (8)
[11:14:04] UFFTYPER: Unrecognized atom type: Se2+2 (8)
 63%|██████▎   | 800497/1273104 [18:46:07<8:33:00, 15.35it/s] [11:14:40] UFFTYPER: Unrecognized charge state for atom: 24
[11:14:40] UFFTYPER: Unrecognized atom type: Se2+2 (24)
 63%|██████▎   | 800628/1273104 [18:46:19<12:58:54, 10.11it/s][11:14:52] UFFTYPER: Unrecognized charge state for at

Failed to process CC(C)CC(=O)OCC1(OC2OC(CO)C(O)C(O)C2OC(=O)CC(C)C)OC(OC(=O)CC(C)C)C(OC(=O)CC(C)C)C1OC(=O)CC(C)C: 'NoneType' object has no attribute 'GetPositions'


 63%|██████▎   | 802050/1273104 [18:48:25<10:25:20, 12.55it/s][11:16:58] UFFTYPER: Unrecognized charge state for atom: 8
 63%|██████▎   | 802065/1273104 [18:48:26<9:38:14, 13.58it/s] [11:16:59] UFFTYPER: Unrecognized charge state for atom: 9
 63%|██████▎   | 802298/1273104 [18:48:45<10:31:31, 12.43it/s][11:17:18] UFFTYPER: Unrecognized charge state for atom: 8
 63%|██████▎   | 802478/1273104 [18:49:02<10:11:32, 12.83it/s][11:17:35] UFFTYPER: Unrecognized charge state for atom: 8
 63%|██████▎   | 802776/1273104 [18:49:42<15:58:16,  8.18it/s]

Failed to process Nc1nc(N)c(C(=O)NC2CCC[N+](CCCc3ccc(OCC(=O)NCCc4ccccn4)cc3)(CCCc3ccc(OCC(=O)NCCc4ccccn4)cc3)C2)nc1Cl: 'NoneType' object has no attribute 'GetPositions'


 63%|██████▎   | 803202/1273104 [18:50:24<15:32:26,  8.40it/s]

In [None]:
# Convert the lists of dictionaries to DataFrames and save as CSV files
def save_as_csv(preprocessed_data, filename):
    df = pd.DataFrame(preprocessed_data)
    # If 'coords' and 'atoms' are lists, they will be saved as strings in the CSV. You might need to parse them when reading.
    df.to_csv(filename, index=False)

save_as_csv(preprocessed_train, 'train_guacamol.csv')
save_as_csv(preprocessed_valid, 'valid_guacamol.csv')
save_as_csv(preprocessed_test, 'test_guacamol.csv')