In [1]:
import pandas as pd
import numpy as np

from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors, MolFromSmiles
from rdkit import Chem

import csv

In [None]:
# Load the preprocessed source dataset (comma-separated, which is the
# pandas default) and show it as the cell output.
source_df = pd.read_csv('outputs/processed_source_dataset.csv')
source_df

In [None]:
# Keep only the 'Smiles' column as a plain Python list. The DataFrame name
# is deleted afterwards (presumably to release memory before the long
# descriptor computation), so re-running this cell requires re-running the
# loading cell first.
smiles_strings = source_df['Smiles'].tolist()
del source_df
len(smiles_strings)

In [4]:
# Take the name (first element) of every entry in RDKit's descriptor list
# and build a single calculator from those names.
descriptor_names = [name for name, _ in Descriptors._descList]
calc = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

In [None]:
# Compute descriptor rows in batches so they can be written out chunk by chunk.
# A full run over the source dataset took about 7 h 10 min.
def batched_rows(smiles_strings, batch_size=1000):
    """Yield lists of descriptor rows computed from SMILES strings.

    Every entry is parsed with ``Chem.MolFromSmiles`` and the resulting
    molecule is passed to the notebook-level ``calc`` descriptor calculator.
    Exactly one row is produced per input entry, in input order, so the
    output stays aligned with ``smiles_strings``.

    Entries that are not strings (e.g. a NaN left by a missing value) or that
    RDKit cannot parse are not handed to the calculator; they produce a row
    filled with NaN, one value per descriptor, instead.

    Args:
        smiles_strings: Iterable of SMILES strings.
        batch_size: Number of rows per yielded batch. The final batch may be
            shorter.

    Yields:
        list: Up to ``batch_size`` descriptor rows (tuples of values).
    """
    # Placeholder for molecules that cannot be built: same width as a real
    # row so the CSV columns stay consistent.
    missing_row = (float('nan'),) * len(calc.GetDescriptorNames())

    batch = []
    for smiles_string in smiles_strings:
        # Only strings are valid parser input; RDKit signals an unparsable
        # SMILES by returning None rather than raising.
        if isinstance(smiles_string, str):
            mol = Chem.MolFromSmiles(smiles_string)
        else:
            mol = None

        if mol is None:
            features = missing_row
        else:
            features = calc.CalcDescriptors(mol)

        batch.append(features)
        if len(batch) == batch_size:
            yield batch
            batch = []

    # Emit whatever is left over when the input length is not a multiple of
    # batch_size.
    if batch:
        yield batch

# Stream the descriptor table to CSV one batch at a time; batched_rows is a
# generator, so only the current batch is held in memory.
with open('outputs/source_descriptors.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # Header row: the same name list the calculator was built from.
    writer.writerow(descriptor_names)
    for batch_counter, batch in enumerate(batched_rows(smiles_strings)):
        writer.writerows(batch)
        # Flush before reporting so the message is accurate and completed
        # batches survive an interruption of this long-running cell.
        f.flush()
        print(f'Batch No {batch_counter} written')