In [1]:
import sys
#sys.path.append('/Applications/anaconda3/lib/python3.8/site-packages')

from rdkit import Chem 
from rdkit.Chem import AllChem as rdkit
from collections import defaultdict
from rdkit.Chem import rdFMCS
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdDistGeom
IPythonConsole.ipython_3d = True

import py3Dmol
from IPython.display import Image
import matplotlib.pyplot as plt
import subprocess
import time
import stk
import stko
import os
import spindry as spd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdDistGeom
from rdkit.Chem import rdMolAlign
from rdkit import RDLogger
import logging

# RDkit logger
rdkit_logger = RDLogger.logger()
rdkit_logger.setLevel(RDLogger.CRITICAL)
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
%matplotlib inline

def show_stk_mol(stk_mol):
    data = rdkit.MolToMolBlock(stk_mol.to_rdkit_mol())
    p = py3Dmol.view(
        data=data,
        style={'stick':{'colorscheme':'cyanCarbon'}}, 
        width=400,
        height=400,
    )
    p.setBackgroundColor('0xeeeeee')
    p.zoomTo()
    p.show()



def rdkit_op(bb):
    rdkit_bb = bb.to_rdkit_mol()
    rdkit.SanitizeMol(rdkit_bb)
    rdkit.MMFFOptimizeMolecule(rdkit_bb)

    # stk molecules are immutable. with_position_matrix returns a
    # a clone, holding the new position matrix.
    bb = bb.with_position_matrix(
        position_matrix=rdkit_bb.GetConformer().GetPositions(),
    )

    return bb

## FG & Linker & No. units

In [3]:
FG_name = ['Py']
linker_name = ['C8']
units_name = ['2','3','4','5','6','7','8','9','10']

bb1_free = open ('FG_test.txt','r')
linker = open ('linker_test.txt','r')

# Define BB dictionaries.
bb1_smiles = {}
bb3_smiles = {}
for i, bb in enumerate(bb1_free.readlines()):
    bb1_smiles[i] = bb.replace('\n','')
    

linker_smiles = {}
for i, li in enumerate(linker.readlines()):
    linker_smiles[i] = li.replace('\n','')

#print(bb1_smiles, bb3_smiles, linker_smiles)

# Iterate over BB1.
for bb1id in bb1_smiles:
    bb1 = stk.BuildingBlock(bb1_smiles[bb1id], [stk.BromoFactory()])
    # Optimize with the MMFF force field.
    bb1 = rdkit_op(bb1)
    bb3_smiles[i] = bb1_smiles[bb1id].replace('Br/','')

        # Iterate over BB3.
    for bb3id in bb3_smiles:
        bb3 = stk.BuildingBlock(bb3_smiles[bb3id], [stk.BromoFactory()])
        # Optimize with the MMFF force field.
        bb3 = rdkit_op(bb3)
            
            # Iterate over linkers.
        for liid in linker_smiles:
            linker = stk.BuildingBlock(
                smiles=linker_smiles[liid],
                functional_groups=[stk.BromoFactory()],
            )
            # Iterate over repeat units.
            for no in range(7,9):
                repeat_units = no
                repeating_unit_str='CB'+'AB'*repeat_units +'C'
                orientation_str = '0, '*(len(repeating_unit_str)-1)+'1'
                # Set name based on iterations.
                molecule_name = f'{FG_name[bb1id]}_{linker_name[liid]}_{units_name[no]}'
                #print(molecule_name)

                porphyrin_noM = stk.ConstructedMolecule(
                    topology_graph=stk.polymer.Linear(
                        building_blocks=(bb1, linker, bb3),
                        repeating_unit=repeating_unit_str,
                        num_repeating_units=1,
                        orientations=tuple(map(int, orientation_str.split(', '))),
                        #optimizer=stk.MCHammer(),
                        optimizer=stk.Collapser(scale_steps=False),

                    ),
                )
                
                # Write to files.
                #porphyrin_noM.write(f'{molecule_name}.mol')
                porphyrin_noM.write(f'{molecule_name}.xyz')
                os.makedirs(f'{molecule_name}')
                os.chdir(f'{molecule_name}')
                os.system(f'mv ../{molecule_name}.xyz .')
                os.environ['XTBHOME'] = "/home/xwu/miniconda3/pkgs/xtb-6.4.1-hf06ca72_0/share/xtb"
                os.system(f'xtb {molecule_name}.xyz --gfn 1 --opt > output_{molecule_name}.txt && xtb xtbopt.xyz --gfn 1 --vipea > vipea_{molecule_name}.txt')
                os.chdir('../')
                
                #show_stk_mol(porphyrin_noM)


FileExistsError: [Errno 17] File exists: 'Py_C8_9'

## H-L Gap & EA & IP

In [6]:
FG_name = ['H','Py','DHP','C2Py','Ph','p-CP','m-CP','PhOH','Ph3OH','PhCHO','Ph2Me','Ph3Me','Ph2OMe','BSA','PhCN','NMA','PhNH2','PhNO2']
linker_name = ['C2','C4','C6','C8']
units_name = ['2','3','4','5','6','7','8','9','10']

import glob
all_gap_value = []
for func_group in FG_name:
    for link in linker_name:
        for units in units_name:
            file_location = os.path.join(f'{func_group}_{link}_{units}',f'output_{func_group}_{link}_{units}.txt')
            #print(file_location)
            filenames = glob.glob(file_location)
            #print(filenames)
            datafile1 = open('all_HL-gap_free.txt', 'w')
            #for ma in filenames:
            output = open(file_location,'r')
            data = output.readlines()
            output.close()
            for line in data:
                if 'HOMO-LUMO GAP' in line:
                    gap = line
                    #print(line)
                    words = gap.split()
                    gap_value =float(words[3])
                    #print(gap_value)
                    gap_value_free = f'H-L gap {func_group}_{link}_{units} = {gap_value}'
                    #print(gap_value_free)
                    #datafile.write(f'H-L gap {metal} = {gap_value}')
                    all_gap_value.append(gap_value_free)
                    list = all_gap_value
                    #print(list)
                    datafile1.write("\n".join([i for i in list[0:]]))
            datafile1.close()


all_EA_value = []
all_IP_value = []
for func_group in FG_name:
    for link in linker_name:
        for units in units_name:
            file_location = os.path.join(f'{func_group}_{link}_{units}',f'vipea_{func_group}_{link}_{units}.txt')
            #print(file_location)
            filenames = glob.glob(file_location)
            #print(filenames)
            datafile2 = open('all_EA_free.txt', 'w')
            datafile3 = open('all_IP_free.txt', 'w')
            #for ma in filenames:
            output = open(file_location,'r')
            data = output.readlines()
            output.close()
            for line in data:
                if 'delta SCC EA (eV):' in line:
                    EA = line
                    #print(line)
                    words = EA.split()
                    EA_value =float(words[4])
                    #print(gap_value)
                    EA_value_free = f'EA {func_group}_{link}_{units} = {EA_value}'
                    #print(gap_value_free)
                    #datafile.write(f'H-L gap {metal} = {gap_value}')
                    all_EA_value.append(EA_value_free)
                    list = all_EA_value
                    #print(list)
                    datafile2.write("\n".join([i for i in list[0:]]))

                if 'delta SCC IP (eV):' in line:
                    IP = line
                    #print(line)
                    words = IP.split()
                    IP_value =float(words[4])
                    #print(gap_value)
                    IP_value_free = f'IP {func_group}_{link}_{units} = {IP_value}'
                    #print(gap_value_free)
                    #datafile.write(f'H-L gap {metal} = {gap_value}')
                    all_IP_value.append(IP_value_free)
                    list = all_IP_value
                    #print(list)
                    datafile3.write("\n".join([i for i in list[0:]]))
            datafile2.close()
            datafile3.close()



In [9]:
HLvalue = [float(x.split(' ')[4].strip('\n')) for x in open('all_HL-gap_free.txt').readlines()]
IPvalue = [float(x.split(' ')[3].strip('\n')) for x in open('all_IP_free.txt').readlines()]
EAvalue = [float(x.split(' ')[3].strip('\n')) for x in open('all_EA_free.txt').readlines()]

together = []
for i in range(len(HLvalue)):
    all_values = open('all_data_free.txt', 'w+')
    all_values.write(f'IP, EA, H-L gap\n')
    #print(i)
    values = [IPvalue[i], EAvalue[i], HLvalue[i]]
    #print(values)
    together.append(str(values))
#print(together)
    all_values.write("\n".join(together))
all_values.close()


In [1]:
from rdkit import Chem 
from rdkit.Chem import Descriptors
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.ML.Descriptors import MoleculeDescriptors
import numpy as np

def get_descriptors(rdmols):

#returns the physicochemical properties for the molecules
    
    descriptors = [
        'ExactMolWt', # The exact molecular weight of the molecule
        'NumValenceElectrons', # The number of valence electrons the molecule has
        #'BertzCT', # A topological index meant to quantify "complexity" of molecules.
        #'Ipc', # This returns the information content of the coefficients of the characteristic polynomial of the adjacency matrix of a hydrogen-suppressed graph of a molecule.
        'HeavyAtomCount', # Number of heavy atoms a molecule.
        'NHOHCount', # Number of NHs or OHs
        'NOCount', # Number of Ns and Os
        #'NumAliphaticCarbocycles', # CalcNumAliphaticCarbocycles( (Mol)mol) -> int : returns the number of aliphatic (containing at least one non-aromatic bond) carbocycles for a molecule
        #'NumAliphaticHeterocycles', # CalcNumAliphaticHeterocycles( (Mol)mol) -> int : returns the number of aliphatic (containing at least one non-aromatic bond) heterocycles for a molecule
        #'NumAliphaticRings', # CalcNumAliphaticRings( (Mol)mol) -> int : returns the number of aliphatic (containing at least one non-aromatic bond) rings for a molecule
        #'NumAromaticCarbocycles', # CalcNumAromaticCarbocycles( (Mol)mol) -> int : returns the number of aromatic carbocycles for a molecule
        #'NumAromaticHeterocycles', # CalcNumAromaticHeterocycles( (Mol)mol) -> int : returns the number of aromatic heterocycles for a molecule
        #'NumAromaticRings', # CalcNumAromaticRings( (Mol)mol) -> int : returns the number of aromatic rings for a molecule
        'NumHAcceptors', # Number of Hydrogen Bond Acceptors
        #'NumHDonors Number', # of Hydrogen Bond Donors
        'NumHDonors', # of Hydrogen Bond Donors
        'NumHeteroatoms', # Number of Heteroatoms
        #'NumRotatableBonds', # Number of Rotatable Bonds]
        #'NumSaturatedCarbocycles', # CalcNumSaturatedCarbocycles( (Mol)mol) -> int : returns the number of saturated carbocycles for a molecule
        #'NumSaturatedHeterocycles', # CalcNumSaturatedHeterocycles( (Mol)mol) -> int : returns the number of saturated heterocycles for a molecule
        #'NumSaturatedRings', # CalcNumSaturatedRings( (Mol)mol) -> int : returns the number of saturated rings for a molecule
        #'RingCount',
        #'MolLogP', # Wildman-Crippen LogP value
        'fr_Al_COO', # Number of aliphatic carboxylic acids
        'fr_Al_OH', # Number of aliphatic hydroxyl groups
        'fr_Al_OH_noTert', # Number of aliphatic hydroxyl groups excluding tert-OH
        #'fr_ArN', # Number of N functional groups attached to aromatics
        #'fr_Ar_COO', # Number of Aromatic carboxylic acide
        #'fr_Ar_N', # Number of aromatic nitrogens
        #'fr_Ar_NH', # Number of aromatic amines
        #'fr_Ar_OH', # Number of aromatic hydroxyl groups
        'fr_COO', # Number of carboxylic acids
        'fr_COO2', # Number of carboxylic acids
        'fr_C_O', # Number of carbonyl O
        'fr_C_O_noCOO', # Number of carbonyl O, excluding COOH
        #'fr_C_S', # Number of thiocarbonyl
        #'fr_HOCCN', # Number of C(OH)CCN-Ctert-alkyl or  C(OH)CCNcyclic
        #'fr_Imine', # Number of Imines
        'fr_NH0', # Number of Tertiary amines
        'fr_NH1', # Number of Secondary amines
        'fr_NH2', # Number of Primary amines
        #'fr_N_O', # Number of hydroxylamine groups
        #'fr_Ndealkylation1', # Number of XCCNR groups
        #'fr_Ndealkylation2', # Number of tert-alicyclic amines (no heteroatoms, not quinine-like bridged N)
        #'fr_Nhpyrrole', # Number of H-pyrrole nitrogens
        #'fr_SH', # Number of thiol groups
        #'fr_aldehyde', # Number of aldehydes
        #'fr_alkyl_carbamate', # Number of alkyl carbamates (subject to hydrolysis)
        #'fr_alkyl_halide', # Number of alkyl halides
        'fr_allylic_oxid', # Number of allylic oxidation sites excluding steroid dienone
        #'fr_amide', # Number of amides
        #'fr_amidine', # Number of amidine groups
        #'fr_aniline', # Number of anilines
        #'fr_aryl_methyl', # Number of aryl methyl sites for hydroxylation
        #'fr_benzene', # Number of benzene rings
        #'fr_benzodiazepine', # Number of benzodiazepines with no additional fused rings
        #'fr_bicyclic', # Bicyclic
        #'fr_diazo', # Number of diazo groups
        'fr_dihydropyridine', # Number of dihydropyridines                
        #'fr_hdrzine', # Number of hydrazine groups
        #'fr_hdrzone', # Number of hydrazone groups
        #'fr_imidazole', # Number of imidazole rings
        #'fr_imide', # Number of imide groups 
        'fr_methoxy', # Number of methoxy groups -OCH3
        #'fr_morpholine', # Number of morpholine rings
        'fr_nitrile', # Number of nitriles
        'fr_nitro', # Number of nitro groups
        #'fr_nitro_arom', # Number of nitro benzene ring substituents
        #'fr_nitro_arom_nonortho', # Number of non-ortho nitro benzene ring substituents
        'fr_nitroso', # Number of nitroso groups, excluding NO2
        'fr_piperdine', # Number of piperdine rings
        #'fr_piperzine', # Number of piperzine rings
        #'fr_priamide', # Number of primary amides
        #'fr_pyridine', # Number of pyridine rings
        #'fr_quatN', # Number of quarternary nitrogens
        #'fr_unbrch_alkane', # Number of unbranched alkanes  of at least 4 members (excludes halogenated alkanes)

    ]
    calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptors)
        
    Desc_values = calculator.CalcDescriptors(rdmols)
    all_dec = []
    for i in Desc_values:
        Desc_values_V2 = round(i,2)
        all_dec.append(Desc_values_V2)
    return all_dec

m = '/home/xwu/stk/Free/C2Py_C8_3/xtbtopo.mol'
m2 = Chem.MolFromMolFile(m, sanitize=False, strictParsing=False)
print(Descriptors.ExactMolWt(m2))
print(Descriptors.MolWt(m2))

get_descriptors(m2)


1724.4935337920012
1725.871999999999


[1724.49,
 618,
 136,
 6,
 18,
 18,
 6,
 18,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 12,
 6,
 0,
 12,
 0,
 0,
 0,
 0,
 0,
 6]

In [17]:
FG_name = ['H','Py','DHP','C2Py','Ph','p-CP','m-CP','PhOH','Ph3OH','PhCHO','Ph2Me','Ph3Me','Ph2OMe','BSA','PhCN','NMA','PhNH2','PhNO2']
linker_name = ['C2','C4','C6','C8']
units_name = ['2','3','4','5','6','7','8','9','10']


import glob
all_desc_value = []
for func_group in FG_name:
    for link in linker_name:
        for units in units_name:
            file_location = os.path.join(f'{func_group}_{link}_{units}','xtbtopo.mol')
            #print(file_location)
            filenames = glob.glob(file_location)
            #print(filenames)
            datafile4 = open('all_desc_free_V2.txt', 'w')
            #for ma in filenames:
            output = Chem.MolFromMolFile(file_location, sanitize=False, strictParsing=False)
            all_desc_value.append(str(get_descriptors(output)))
#print(all_desc_value)
            datafile4.write("\n".join(all_desc_value))
        
            

In [2]:
FG_name = ['H','Py','DHP','C2Py','Ph','p-CP','m-CP','PhOH','Ph3OH','PhCHO','Ph2Me','Ph3Me','Ph2OMe','BSA','PhCN','NMA','PhNH2','PhNO2']
linker_name = ['C2','C4','C6','C8']
units_name = ['2','3','4','5','6','7','8','9','10']

HLvalue = [round(float(x.split(' ')[4].strip('\n')),3) for x in open('all_HL-gap_free.txt').readlines()]
IPvalue = [round(float(x.split(' ')[3].strip('\n')),3) for x in open('all_IP_free.txt').readlines()]
EAvalue = [round(float(x.split(' ')[3].strip('\n')),3) for x in open('all_EA_free.txt').readlines()]
#Descvalue = [x.strip('\n') for x in open('all_decs_metal.txt').readlines()]

together_V2 = []
for i in range(len(HLvalue)):
    #all_values = open('all_data_metal.txt', 'w+')
    #all_values.write(f'H-L gap, IP, EA\n')
    #print(i)
    values = [IPvalue[i], EAvalue[i], HLvalue[i]]
    #print(values)
    together_V2.append(values)

 
import glob
all_desc_value = []
for func_group in FG_name:
    for link in linker_name:
        for units in units_name:
            file_location = os.path.join(f'{func_group}_{link}_{units}','xtbtopo.mol')
            #print(file_location)
            filenames = glob.glob(file_location)
            #print(filenames)
            #df5 = open('all_decs_metal.txt', 'w')
            #for ma in filenames:
            output = Chem.MolFromMolFile(file_location, sanitize=False, strictParsing=False)
            all_desc_value.append(get_descriptors(output))
            #df5.write("\n".join(all_desc_value))

name_no = []
FG_name = ['H','Py','DHP','C2Py','Ph','p-CP','m-CP','PhOH','Ph3OH','PhCHO','Ph2Me','Ph3Me','Ph2OMe','BSA','PhCN','NMA','PhNH2','PhNO2']
linker_no = ['2','4','6','8']
units_name = ['2','3','4','5','6','7','8','9','10']

for func_no in range(len(FG_name)):
    for link in linker_no:
        for units in units_name:
            label = [int(link),int(units)]
            name_no.append(label)
#print(name_no)


In [18]:
HLvalue = [round(float(x.split(' ')[4].strip('\n')),3) for x in open('all_HL-gap_free.txt').readlines()]
HLvalue

[1.033,
 0.832,
 0.737,
 0.688,
 0.667,
 0.653,
 0.637,
 0.628,
 0.617,
 1.048,
 0.863,
 0.777,
 0.731,
 0.703,
 0.684,
 0.671,
 0.662,
 0.655,
 1.057,
 0.882,
 0.805,
 0.765,
 0.74,
 0.725,
 0.714,
 0.706,
 0.701,
 1.061,
 0.891,
 0.822,
 0.786,
 0.766,
 0.751,
 0.743,
 0.737,
 0.732,
 0.991,
 0.847,
 0.742,
 0.678,
 0.642,
 0.615,
 0.595,
 0.582,
 0.57,
 1.013,
 0.863,
 0.771,
 0.721,
 0.691,
 0.67,
 0.657,
 0.647,
 0.64,
 1.024,
 0.881,
 0.798,
 0.754,
 0.728,
 0.711,
 0.7,
 0.692,
 0.687,
 1.029,
 0.888,
 0.812,
 0.773,
 0.753,
 0.738,
 0.73,
 0.723,
 0.718,
 0.907,
 1.085,
 0.843,
 0.715,
 0.644,
 0.593,
 0.57,
 0.581,
 0.566,
 0.927,
 1.03,
 0.831,
 0.734,
 0.682,
 0.65,
 0.629,
 0.615,
 0.606,
 0.931,
 0.975,
 0.818,
 0.742,
 0.702,
 0.68,
 0.663,
 0.653,
 0.646,
 0.929,
 0.93,
 0.803,
 0.745,
 0.713,
 0.694,
 0.682,
 0.674,
 0.669,
 0.947,
 0.868,
 0.744,
 0.698,
 0.66,
 0.628,
 0.606,
 0.59,
 0.579,
 0.955,
 0.938,
 0.814,
 0.743,
 0.702,
 0.677,
 0.661,
 0.65,
 0.642,
 0.967,

In [4]:
total = []
for i in range(len(HLvalue)):
    tot_values = open('total_data_free_NoEAIP.txt', 'w+')
    #tot_values.write(f'H-L gap, IP, EA\n')
    allv = all_desc_value[i] + name_no[i] + [HLvalue[i]]
    total.append(str(allv))
    tot_values.write("\n".join(total))
tot_values.close()

In [5]:
name_label = []
FG_name = ['H','Py','DHP','C2Py','Ph','p-CP','m-CP','PhOH','Ph3OH','PhCHO','Ph2Me','Ph3Me','Ph2OMe','BSA','PhCN','NMA','PhNH2','PhNO2']
linker_no = ['2','4','6','8']
units_name = ['2','3','4','5','6','7','8','9','10']


for func_group in FG_name:
    for link in linker_no:
        for units in units_name:
            namelabel = f'{func_group}_{link}_{units}'
            name_label.append(namelabel)

Descvalue = [x.strip('\n') for x in open('total_data_free_V3.txt').readlines()]

total_l = []
for i in range(len(Descvalue)):
    tot_values = open('total_data_free_label.txt', 'w+')
    tot_values.write(f'Name, ExactMolWt, NumVE, HeavyAtom#, NHOH#, NOC#, NumHAcc#, NumHDo#, NumHeteroatoms, fr_Al_COO, fr_Al_OH, fr_Al_OH_noTert, fr_COO, fr_COO2, fr_C_O, fr_C_O_noCOO, fr_NH0, fr_NH1, fr_NH2, fr_allylic_oxid, fr_dihydropyridine, fr_methoxy, fr_nitrile, fr_nitro, fr_nitroso, fr_piperdine, MetalAN, #linker, #unit, IP, EA,  H-L gap\n')
    allv_l = name_label [i] + ' ' + Descvalue[i]
    total_l.append(allv_l)
    tot_values.write("\n".join(total_l))
tot_values.close()