In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from rdkit.Chem import MolFromSmiles, MolToSmiles, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors, MolFromSmiles
from rdkit import Chem
import pickle
from joblib import dump, load

# Column subset applied to the descriptor table inside `predictions`.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('outputs/non_constant_columns.pkl', 'rb') as columns_file:
    non_constant_columns = pickle.load(columns_file)

# First-stage models: predict HOMO / LUMO from the descriptor table.
base_model_homo = load('outputs/models/homo_dft.joblib')
base_model_lumo = load('outputs/models/lumo_dft.joblib')

# Second-stage models: take the first-stage prediction plus selected descriptors.
homo_ups_predictor = load('outputs/models/homo_ups_predictor.joblib')
lumo_ipes_predictor = load('outputs/models/lumo_ipes_predictor.joblib')

In [8]:
def predictions(smiles_string):
    """Predict HOMO/LUMO and IE/EA values for a single molecule.

    The SMILES string is canonicalised, every RDKit descriptor listed in
    ``Descriptors._descList`` is computed and reduced to
    ``non_constant_columns``, and the four module-level models are applied:
    the two base models predict HOMO and LUMO from the descriptors, and the
    two second-stage predictors receive that prediction (as ``'DFT_pred'``)
    together with their remaining input features to predict IE and EA.

    Parameters
    ----------
    smiles_string : str
        SMILES representation of the molecule.

    Returns
    -------
    pandas.DataFrame or None
        One-row frame with the columns 'SMILES string',
        'HOMO predicted (eV)', 'LUMO predicted (eV)', 'IE predicted (eV)'
        and 'EA predicted (eV)'. ``None`` is returned (after printing a
        message) when the SMILES cannot be parsed or describes no atoms.
    """
    # Validate the input. A string that parses to a molecule without atoms
    # (e.g. an empty string) is rejected too, since descriptors of an empty
    # molecule would only yield meaningless predictions.
    mol = MolFromSmiles(smiles_string)
    if mol is None or mol.GetNumAtoms() == 0:
        print('Non valid SMILES string')
        return

    # Canonical SMILES, derived from the molecule that is already parsed.
    s_smiles = MolToSmiles(mol)

    # Calculator covering all descriptors known to RDKit
    descriptor_names = [name for name, _ in Descriptors._descList]
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

    # Descriptors are computed on the molecule rebuilt from the canonical
    # SMILES (presumably mirroring how the training data was featurised;
    # TODO confirm).
    m = Chem.MolFromSmiles(s_smiles)
    features = calc.CalcDescriptors(m)
    df = pd.DataFrame([features], columns=descriptor_names)
    df = df[non_constant_columns]

    # Infinite and missing values are replaced by the placeholder -5.
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(-5)
    # NOTE(review): if 'Ipc' was missing or infinite it is -5 at this point
    # and the logarithm yields NaN; confirm this matches the training pipeline.
    df['Ipc'] = np.log(df['Ipc'])

    # Downcast numeric columns (presumably to the dtypes used during
    # training; TODO confirm).
    float64_cols = list(df.select_dtypes(include='float64'))
    df[float64_cols] = df[float64_cols].astype('float32')
    int64_cols = list(df.select_dtypes(include='int64'))
    df[int64_cols] = df[int64_cols].astype('int16')

    # Prediction of HOMO and LUMO from descriptors
    homo_dft_predictions = base_model_homo.predict(df)
    lumo_dft_predictions = base_model_lumo.predict(df)

    # Gather features for prediction of IE and EA: every input feature of the
    # second-stage model except the first one, which is supplied below as
    # 'DFT_pred'. `insert` mutates in place, so work on explicit copies.
    X_selected_homo = df[list(homo_ups_predictor.feature_names_in_)[1:]].copy()
    X_selected_lumo = df[list(lumo_ipes_predictor.feature_names_in_)[1:]].copy()
    X_selected_homo.insert(0, 'DFT_pred', homo_dft_predictions)
    X_selected_lumo.insert(0, 'DFT_pred', lumo_dft_predictions)

    # Prediction of IE and EA
    preds_homo = homo_ups_predictor.predict(X_selected_homo)
    preds_lumo = lumo_ipes_predictor.predict(X_selected_lumo)

    result_columns = [
        'SMILES string',
        'HOMO predicted (eV)',
        'LUMO predicted (eV)',
        'IE predicted (eV)',
        'EA predicted (eV)',
    ]
    result_row = [
        s_smiles,
        homo_dft_predictions[0],
        lumo_dft_predictions[0],
        preds_homo[0],
        preds_lumo[0],
    ]
    df_result = pd.DataFrame([result_row], columns=result_columns)
    return df_result


In the cell below, paste the SMILES string of your molecule and run the cell.

In [9]:
# Raw string: SMILES uses '\' as a bond-direction marker, which would otherwise
# be read as an (invalid) escape sequence in a normal string literal.
new_molecule_smile = r'O=C(C(C=C(F)C(F)=C1)=C1C/2=C(C#N)\C#N)C2=C/C3=C(CC(CCCCCC)CCCC)C(S4)=C(S3)C5=C4C6=C(N5CC(CC)CCCC)C7=C(C(SC8=C9SC(/C=C%10/C(C(C=C(F)C(F)=C%11)=C%11C%10=O)=C(C#N)/C#N)=C8CC(CCCC)CCCCCC)=C9N7CC(CC)CCCC)C%12=NSN=C6%12'
predictions(new_molecule_smile)

Unnamed: 0,SMILES string,HOMO predicted (eV),LUMO predicted (eV),IE predicted (eV),EA predicted (eV)
0,CCCCCCC(CCCC)Cc1c(/C=C2\C(=O)c3cc(F)c(F)cc3C2=...,-5.586753,-3.572527,-5.555468,-4.001254
