## Extract HOMO-LUMO gap, electron affinity (EA) and ionisation potential (IP) values after xTB optimisation

In [6]:
import os
import glob

FG_name = ['H','Py','DHP','C2Py','Ph','p-CP','m-CP','PhOH','Ph3OH','PhCHO','Ph2Me','Ph3Me','Ph2OMe','BSA','PhCN','NMA','PhNH2','PhNO2']
linker_name = ['C2','C4','C6','C8']
units_name = ['2','3','4','5','6','7','8','9','10']

# Collect one 'H-L gap <molecule> = <value>' record for every line containing
# 'HOMO-LUMO GAP' in <molecule>/output_<molecule>.txt, visiting the molecules
# in functional-group / linker / units order.
all_gap_value = []
for func_group in FG_name:
    for link in linker_name:
        for units in units_name:
            molecule = f'{func_group}_{link}_{units}'
            file_location = os.path.join(molecule, f'output_{molecule}.txt')
            with open(file_location, 'r') as output:
                for line in output:
                    if 'HOMO-LUMO GAP' in line:
                        # The gap is parsed from the 4th whitespace-separated token.
                        gap_value = float(line.split()[3])
                        all_gap_value.append(f'H-L gap {molecule} = {gap_value}')

# Write the records once, after every molecule has been read, instead of
# truncating and rewriting the whole file inside the loop.
with open('all_HL-gap_free.txt', 'w') as datafile1:
    datafile1.write("\n".join(all_gap_value))


# Collect the EA and IP records from <molecule>/vipea_<molecule>.txt for every
# functional-group / linker / units combination.
all_EA_value = []
all_IP_value = []
for func_group in FG_name:
    for link in linker_name:
        for units in units_name:
            molecule = f'{func_group}_{link}_{units}'
            file_location = os.path.join(molecule, f'vipea_{molecule}.txt')
            with open(file_location, 'r') as output:
                for line in output:
                    # In both cases the value is parsed from the 5th
                    # whitespace-separated token of the matching line.
                    if 'delta SCC EA (eV):' in line:
                        EA_value = float(line.split()[4])
                        all_EA_value.append(f'EA {molecule} = {EA_value}')

                    if 'delta SCC IP (eV):' in line:
                        IP_value = float(line.split()[4])
                        all_IP_value.append(f'IP {molecule} = {IP_value}')

# Write each output file once, after all molecules have been read, instead of
# truncating and rewriting both files for every molecule.
with open('all_EA_free.txt', 'w') as datafile2:
    datafile2.write("\n".join(all_EA_value))

with open('all_IP_free.txt', 'w') as datafile3:
    datafile3.write("\n".join(all_IP_value))



## Put IP, EA, and H-L gap values in a single file

In [4]:
# Read the numeric value back from each record line. Lines are split on single
# spaces: the H-L gap value is the 5th field, the IP and EA values the 4th.
with open('all_HL-gap_free.txt') as hl_file:
    HLvalue = [float(x.split(' ')[4].strip('\n')) for x in hl_file]
with open('all_IP_free.txt') as ip_file:
    IPvalue = [float(x.split(' ')[3].strip('\n')) for x in ip_file]
with open('all_EA_free.txt') as ea_file:
    EAvalue = [float(x.split(' ')[3].strip('\n')) for x in ea_file]

# One '[IP, EA, H-L gap]' row per H-L gap entry; IP and EA are looked up by the
# same position, so a shorter IP/EA list raises IndexError.
together = []
for i, hl in enumerate(HLvalue):
    values = [IPvalue[i], EAvalue[i], hl]
    together.append(str(values))

# Write the header and all rows in one pass; the file is created (header only)
# even when there are no rows.
with open('IP_EA_HL_data_free.txt', 'w') as all_values:
    all_values.write('IP, EA, H-L gap\n')
    all_values.write("\n".join(together))


## Get descriptors for each xtb optimised molecule

In [2]:
from rdkit import Chem 
from rdkit.Chem import Descriptors
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.ML.Descriptors import MoleculeDescriptors
import numpy as np
import glob

def get_descriptors(rdmols):
    """Return the selected RDKit descriptor values for a molecule.

    ``rdmols`` is passed unchanged to
    ``MolecularDescriptorCalculator.CalcDescriptors``. The result is a list
    holding one value per descriptor name below, in the listed order, each
    rounded to 3 decimal places.
    """
    descriptor_names = [
        'ExactMolWt',  # exact molecular weight
        'NumValenceElectrons',  # number of valence electrons
        'HeavyAtomCount',  # number of heavy atoms
        'NHOHCount',  # number of NHs or OHs
        'NOCount',  # number of Ns and Os
        'NumHAcceptors',  # number of hydrogen-bond acceptors
        'NumHDonors',  # number of hydrogen-bond donors
        'NumHeteroatoms',  # number of heteroatoms
        'fr_Al_COO',  # aliphatic carboxylic acids
        'fr_Al_OH',  # aliphatic hydroxyl groups
        'fr_Al_OH_noTert',  # aliphatic hydroxyl groups excluding tert-OH
        'fr_COO',  # carboxylic acids
        'fr_COO2',  # carboxylic acids
        'fr_C_O',  # carbonyl O
        'fr_C_O_noCOO',  # carbonyl O, excluding COOH
        'fr_NH0',  # tertiary amines
        'fr_NH1',  # secondary amines
        'fr_NH2',  # primary amines
        'fr_allylic_oxid',  # allylic oxidation sites excluding steroid dienone
        'fr_dihydropyridine',  # dihydropyridines
        'fr_methoxy',  # methoxy groups -OCH3
        'fr_nitrile',  # nitriles
        'fr_nitro',  # nitro groups
        'fr_nitroso',  # nitroso groups, excluding NO2
        'fr_piperdine',  # piperdine rings
    ]
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)
    return [round(raw_value, 3) for raw_value in calc.CalcDescriptors(rdmols)]



FG_name = ['H','Py','DHP','C2Py','Ph','p-CP','m-CP','PhOH','Ph3OH','PhCHO','Ph2Me','Ph3Me','Ph2OMe','BSA','PhCN','NMA','PhNH2','PhNO2']
linker_name = ['C2','C4','C6','C8']
units_name = ['2','3','4','5','6','7','8','9','10']

# Compute the descriptor list of every molecule and keep its string form,
# one entry per functional-group / linker / units combination.
all_desc_value = []
for func_group in FG_name:
    for link in linker_name:
        for units in units_name:
            # xtbtopo.mol is read from each molecule's own directory
            file_location = os.path.join(f'{func_group}_{link}_{units}', 'xtbtopo.mol')
            # convert xtbtopo.mol to an RDKit mol
            output = Chem.MolFromMolFile(file_location, sanitize=False, strictParsing=False)
            all_desc_value.append(str(get_descriptors(output)))

# Write the file once and close it, so the data is flushed to disk; reopening
# it for every molecule left the last handle open.
with open('all_desc_free.txt', 'w') as datafile4:
    datafile4.write("\n".join(all_desc_value))


## Create a final data file for machine learning

In [2]:
FG_name = ['H','Py','DHP','C2Py','Ph','p-CP','m-CP','PhOH','Ph3OH','PhCHO','Ph2Me','Ph3Me','Ph2OMe','BSA','PhCN','NMA','PhNH2','PhNO2']
linker_name = ['C2','C4','C6','C8']
units_name = ['2','3','4','5','6','7','8','9','10']

# Read the values back from the record files, rounding each to 3 decimal
# places. Lines are split on single spaces: the H-L gap value is the 5th
# field, the IP and EA values the 4th. Each file is closed after reading.
with open('all_HL-gap_free.txt') as hl_file:
    HLvalue = [round(float(x.split(' ')[4].strip('\n')),3) for x in hl_file]
with open('all_IP_free.txt') as ip_file:
    IPvalue = [round(float(x.split(' ')[3].strip('\n')),3) for x in ip_file]
with open('all_EA_free.txt') as ea_file:
    EAvalue = [round(float(x.split(' ')[3].strip('\n')),3) for x in ea_file]

# Create a list of [IP, EA, H-L gap] rows, one per H-L gap entry; IP and EA
# are looked up by the same position.
together_V2 = [[IPvalue[i], EAvalue[i], hl] for i, hl in enumerate(HLvalue)]


# Create a list containing the descriptor values of every molecule, in
# functional-group / linker / units order.
import glob
all_desc_value = []
for func_group in FG_name:
    for link in linker_name:
        for units in units_name:
            # xtbtopo.mol is read from each molecule's own directory
            file_location = os.path.join(f'{func_group}_{link}_{units}', 'xtbtopo.mol')
            # convert xtbtopo.mol to an RDKit mol
            output = Chem.MolFromMolFile(file_location, sanitize=False, strictParsing=False)
            all_desc_value.append(get_descriptors(output))


# The number of carbon atoms in the linker and the number of units are added
# to the descriptors as two extra integers, so a separate list of
# [linker carbons, units] pairs is built here.
# number of carbon atoms in the linker
linker_no = ['2','4','6','8']

# One pair per functional group x linker x units combination; the functional
# group only determines how many times the linker/units grid is repeated.
name_no = [
    [int(carbon_count), int(unit_count)]
    for _ in FG_name
    for carbon_count in linker_no
    for unit_count in units_name
]


# Combine the 3 lists created above to assemble the final dataset: each row is
# the descriptor values, then [linker carbons, units], then [IP, EA, H-L gap],
# stored as the string form of the concatenated list.
total = [
    str(all_desc_value[i] + name_no[i] + together_V2[i])
    for i in range(len(HLvalue))
]

# Write all rows in one pass and close the file; it is created (empty) even
# when there are no rows.
with open('total_data_free.txt', 'w') as tot_values:
    tot_values.write("\n".join(total))

## Create a final data file with labels

In [5]:
name_label = []
FG_name = ['H','Py','DHP','C2Py','Ph','p-CP','m-CP','PhOH','Ph3OH','PhCHO','Ph2Me','Ph3Me','Ph2OMe','BSA','PhCN','NMA','PhNH2','PhNO2']
linker_name = ['C2','C4','C6','C8']
units_name = ['2','3','4','5','6','7','8','9','10']


# One '<functional group>_<linker>_<units>' label per molecule.
for func_group in FG_name:
    for link in linker_name:
        for units in units_name:
            namelabel = f'{func_group}_{link}_{units}'
            name_label.append(namelabel)

with open('total_data_free.txt') as data_file:
    Descvalue = [x.strip('\n') for x in data_file]

# Prefix every data row with the label at the same position.
total_l = []
for i, row in enumerate(Descvalue):
    allv_l = name_label[i] + ' ' + row
    total_l.append(allv_l)

# Write the header and all labelled rows in one pass; the file is created
# (header only) even when there are no rows.
# NOTE(review): the header names a 'MetalAN' column; confirm that the rows in
# total_data_free.txt actually carry a matching value, otherwise the header has
# one more column than the data.
with open('total_data_free_label.txt', 'w') as tot_values:
    tot_values.write('Name, ExactMolWt, NumVE, HeavyAtom#, NHOH#, NOC#, NumHAcc#, NumHDo#, NumHeteroatoms, fr_Al_COO, fr_Al_OH, fr_Al_OH_noTert, fr_COO, fr_COO2, fr_C_O, fr_C_O_noCOO, fr_NH0, fr_NH1, fr_NH2, fr_allylic_oxid, fr_dihydropyridine, fr_methoxy, fr_nitrile, fr_nitro, fr_nitroso, fr_piperdine, MetalAN, #linker, #unit, IP, EA,  H-L gap\n')
    tot_values.write("\n".join(total_l))