In [1]:
import pandas as pd
import numpy as np

from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors, MolFromSmiles, MolToSmiles
from rdkit import Chem

import csv

Previously, this notebook worked with precomputed descriptors. However, calculating descriptors with different RDKit versions may yield different descriptors. Therefore, the notebook now takes only the SMILES strings as input and calculates the descriptors itself.

In [None]:
# Load the target molecules from the Excel workbook. Later cells read the
# 'Type', 'Molecule', 'SMILES', 'HOMO_DFT', 'LUMO_DFT', 'HOMO_UPS' and
# 'LUMO_IPES' columns from this frame.
target_df = pd.read_excel('target_data/HOMO LUMO database.xlsx')
target_df

In [3]:
# Convert energy levels from positive to negative.
# NOTE: this multiplies the current values by -1 in place of the old ones, so
# running the cell a second time flips the signs back.
energy_columns = ['HOMO_DFT', 'LUMO_DFT', 'HOMO_UPS', 'LUMO_IPES']
target_df[energy_columns] = target_df[energy_columns].mul(-1)

In [None]:
# Display the frame again to inspect the energy columns after the sign change
target_df

In [None]:
# Keep only the molecules that have a SMILES string
has_smiles = target_df['SMILES'].notna()
target_df = target_df[has_smiles]
target_df

## 1. Check for valid and standard SMILES

In [None]:
# Plain Python list of the SMILES column, in row order
smiles_strings = target_df['SMILES'].tolist()
len(smiles_strings)

In [7]:
# MolVS standardizer instance; molecule_checker (defined in this cell) reads
# this module-level `s`, so this line must run before the checker is called.
from molvs import Standardizer
s = Standardizer()

def molecule_checker(smiles):
    """Classify SMILES strings by position.

    Each string is parsed with ``MolFromSmiles``. Strings that fail to parse
    (``None`` result) are recorded as non-valid. For the others, the SMILES
    written by ``MolToSmiles`` for the standardized molecule
    (``s.standardize``, using the module-level ``s``) is compared with the
    SMILES written for the parsed molecule as-is; a mismatch is recorded as
    non-standard.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings to check.

    Returns
    -------
    tuple of (list of int, list of int)
        0-based positions of the non-valid strings and of the non-standard
        strings, in that order.
    """
    invalid_positions = []
    nonstandard_positions = []

    for position, smiles_string in enumerate(smiles):
        parsed = MolFromSmiles(smiles_string)
        if parsed is None:
            invalid_positions.append(position)
            continue

        standardized_smiles = MolToSmiles(s.standardize(parsed))
        parsed_smiles = MolToSmiles(parsed)
        if standardized_smiles != parsed_smiles:
            nonstandard_positions.append(position)

    return invalid_positions, nonstandard_positions

In [None]:
# 0-based positions (into smiles_strings) of the SMILES that fail to parse and
# of those that differ from their standardized form
non_valid_smiles, non_standard_smiles = molecule_checker(smiles_strings)

In [None]:
# Report how many SMILES were flagged in each category
print(f'There are {len(non_valid_smiles)} non valid smiles.')
print(f'There are {len(non_standard_smiles)} non standard smiles.')

In [None]:
# Rows whose SMILES could not be parsed. Positional lookup is used because the
# checker returns positions in smiles_strings, which was built from this frame.
target_df.iloc[non_valid_smiles]

In [None]:
# Rows whose SMILES parse but differ from their standardized form. These are
# only displayed here; only the non-valid rows are dropped afterwards.
target_df.iloc[non_standard_smiles]

In [None]:
# Translate the positions of the non-valid SMILES into index labels and drop
# those rows.
# NOTE: the positions refer to the frame as it was when the checker ran, so
# this cell should not be re-run on its own after the frame has been reassigned.
rows_to_drop = target_df.index[non_valid_smiles]
target_df = target_df.drop(index=rows_to_drop)
target_df

In [None]:
# Renumber the rows from 0 and discard the old index labels; the frame is
# later concatenated column-wise with the descriptor frame, which is built
# with a default index.
target_df = target_df.reset_index(drop=True)
target_df

## 2. Calculating descriptors

In [None]:
# Rebuild the SMILES list from the filtered, re-indexed frame
smiles_strings = target_df['SMILES'].tolist()
len(smiles_strings)

In [15]:
# Use every descriptor registered in RDKit's Descriptors._descList; each entry
# is a (name, function) pair and only the name is needed by the calculator.
descriptor_names = [name for name, _function in Descriptors._descList]
calc = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

In [16]:
# Compute the descriptor values for every SMILES string, in input order
all_descriptors = [
    calc.CalcDescriptors(Chem.MolFromSmiles(smiles_string))
    for smiles_string in smiles_strings
]

In [None]:
# One row per entry of all_descriptors (same order as smiles_strings), one
# column per descriptor name
descriptors_df = pd.DataFrame(all_descriptors, columns=descriptor_names)
descriptors_df

## 3. Process the descriptors

In [18]:
def descriptor_processor(df, non_constant_columns, fill_value=-5):
    """Select, clean and downcast descriptor columns.

    Steps, in order:

    1. keep only ``non_constant_columns`` (in the given order);
    2. replace the ``'Ipc'`` column by its natural logarithm, if the column
       is among the selected ones;
    3. replace ``+inf``, ``-inf`` and ``NaN`` by ``fill_value``;
    4. cast ``float64`` columns to ``float32`` and ``int64`` columns to
       ``int16``.

    The logarithm is taken *before* the fill. Filling first would feed the
    (negative) sentinel into ``np.log`` and put ``NaN`` back into the result,
    and ``log(0) = -inf`` would never be cleaned up. For finite, positive
    ``Ipc`` values the result is the same either way.

    NOTE(review): rows whose ``Ipc`` is missing, infinite or zero now get
    ``fill_value`` instead of ``NaN`` / ``-inf``. If a model was fitted on
    data produced with the previous ordering, confirm this is acceptable.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw descriptor table. It is not modified.
    non_constant_columns : list-like of column labels
        Columns of ``df`` to keep.
    fill_value : scalar, default -5
        Value substituted for missing and infinite entries.

    Returns
    -------
    pandas.DataFrame
        New frame with the selected, cleaned and downcast columns.

    Raises
    ------
    KeyError
        If a label in ``non_constant_columns`` is not a column of ``df``.
    """
    # Explicit copy so the column assignment below never touches the caller's frame.
    df = df[non_constant_columns].copy()

    if 'Ipc' in df.columns:
        # log(0) -> -inf and log(negative) -> NaN are expected here and are
        # handled by the fill below, so the NumPy warnings are silenced.
        with np.errstate(divide='ignore', invalid='ignore'):
            df['Ipc'] = np.log(df['Ipc'])

    df = df.replace([np.inf, -np.inf], np.nan).fillna(fill_value)

    float64_cols = list(df.select_dtypes(include='float64'))
    int64_cols = list(df.select_dtypes(include='int64'))
    dtype_map = {
        **dict.fromkeys(float64_cols, 'float32'),
        **dict.fromkeys(int64_cols, 'int16'),
    }
    if dtype_map:
        df = df.astype(dtype_map)

    return df

In [19]:
# Load the column selection that descriptor_processor applies to the
# descriptor frame (presumably written by an earlier step of the project —
# confirm where this file comes from).
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# if it comes from a trusted source.
import pickle
with open('outputs/non_constant_columns.pkl', 'rb') as f:
    non_constant_columns = pickle.load(f)

In [20]:
# NOTE: this overwrites descriptors_df, so re-running this cell on its own
# applies the processing (including the log transform of 'Ipc') a second time.
# Rebuild descriptors_df from all_descriptors before executing it again.
descriptors_df = descriptor_processor(descriptors_df, non_constant_columns)

In [None]:
# Assemble the final table column-wise: identifiers, processed descriptors,
# then the energy levels. The pieces are aligned on their index.
identifier_part = target_df[['Type', 'Molecule', 'SMILES']]
energy_part = target_df[['HOMO_DFT', 'LUMO_DFT', 'HOMO_UPS', 'LUMO_IPES']]
processed_target_df = pd.concat(
    [identifier_part, descriptors_df, energy_part],
    axis=1,
)
processed_target_df

In [None]:
# Print the column dtypes and non-null counts of the assembled frame
processed_target_df.info()

In [23]:
# Persist the assembled frame (identifiers, processed descriptors, energy levels) as a pickle file
processed_target_df.to_pickle('outputs/target_descriptors_calculated_n_processed.pkl')