In [1]:
import pandas as pd
from urllib.request import urlopen
from urllib.parse import quote

def retriever(list_of_names):
    """Look up a SMILES string and a structure-image URL for each compound name.

    Every name is sent to the NCI CACTUS Chemical Identifier Resolver. The
    lookup is best-effort: if a name cannot be resolved (network error, HTTP
    error, timeout, or a name that is not a string), both the SMILES and the
    image URL for that name are recorded as the string 'Unavailable'.

    Args:
        list_of_names: Iterable of compound names (e.g. a pandas Series).

    Returns:
        A DataFrame with one row per input name, in input order, a default
        0..n-1 index, and the columns "Molecule", "SMILES" and "Image".
    """
    base_url = 'https://cactus.nci.nih.gov/chemical/structure/'
    rows = []
    for item in list_of_names:
        try:
            # quote() percent-encodes spaces and any other character that is
            # not valid inside a URL path.
            smiles_url = base_url + quote(item) + '/smiles'
            # The context manager closes the connection; the timeout keeps a
            # stalled request from hanging the notebook forever.
            with urlopen(smiles_url, timeout=30) as response:
                smiles_id = response.read().decode('utf8').strip()
            # SMILES may contain characters such as '#' that would otherwise
            # be read as URL syntax, so it is percent-encoded as well.
            image_url = base_url + quote(smiles_id) + '/image'
        except Exception:
            # Deliberate best-effort: one failed lookup must not abort the
            # whole batch. KeyboardInterrupt is not caught.
            smiles_id = 'Unavailable'
            image_url = 'Unavailable'
        rows.append([item, smiles_id, image_url])
    return pd.DataFrame(rows, columns=["Molecule", "SMILES", "Image"])

# Your input file must be a .csv file.
#   - Excel: go to 'File' at the top left corner, click 'Save As' in the menu on the left, then open the dropdown
#     menu under the box with the file name and select 'CSV - (Comma delimited) (*.csv)'.
#   - Google Sheets: go to 'File', select 'Download' in the side menu, then select 'Comma Separated Values (.csv)'.
# For example, if the file you're using as the input is called 'reactant_names.csv', then the line below should be:
# input_csv_file = 'reactant_names.csv'
input_csv_file = 'VD5_SMILES.csv'
common_names_df = pd.read_csv(input_csv_file)

# Look up a SMILES string for every entry of the 'compound_name' column and store the result in a 'SMILES' column.
common_names_df = common_names_df.assign(
    SMILES=retriever(common_names_df['compound_name'])['SMILES']
)

# Change the following line to what you would like to name your output .csv file. If you will be using this file to
# generate product SMILES, please make sure this name matches the input file name used in Step 5 of the
# product-generation cell below.
# e.g. output_csv_file = 'reactant_names_with_smiles.csv'
output_csv_file = 'VD5_with_smiles.csv'
common_names_df.to_csv(output_csv_file, index=False)

In [None]:
%pip install rdkit # if it doesn't work, try !pip install rdkit: please run this cell before running the cell below!

In [None]:
# Install pandas into the kernel's environment; the other cells of this notebook import it as `pd`.
%pip install pandas

In [47]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

def main(input_file, output_file):
    """Enumerate reaction products for two groups of reactants and save them to a CSV file.

    Reads ``input_file``, takes the reactant SMILES from its fifth column and
    the reactant names from its ``compound_name`` column, and splits the rows
    into a first group (the top ``n_reagent_1`` rows) and a second group (all
    remaining rows). Every (group 1, group 2) pair is run through each selected
    reaction SMARTS. One row is appended per product, named
    ``<group 2 name>_<group 1 name>``, with the product SMILES written to the
    same column the reactant SMILES were read from. Finally a ``formula``
    column is added and the table is written to ``output_file``.

    Reactants whose SMILES is missing or cannot be parsed are reported and
    skipped instead of aborting the run.

    Args:
        input_file: Path of the CSV file containing reactant names and SMILES.
        output_file: Path of the CSV file to write.
    """
    df = pd.read_csv(input_file)
    smiles_column_name = df.columns[4]
    smiles_column = df.iloc[:, 4]
    name_column = df['compound_name']

    # Step 1 - Set n_reagent_1 to the number of reactants in your first group of reactants (the rows at the top of
    # the input file). For example, if you have 10 acyl chlorides, then change the line below to: n_reagent_1 = 10.
    # The number of reactants in your other group of reactants doesn't matter: every remaining row is used.
    n_reagent_1 = 49
    reagent_1 = _parse_reagents(name_column.iloc[:n_reagent_1], smiles_column.iloc[:n_reagent_1])
    reagent_2 = _parse_reagents(name_column.iloc[n_reagent_1:], smiles_column.iloc[n_reagent_1:])

    # Step 2 - Define the reaction SMARTS patterns (can add more if necessary)
    carboxylic_acid_smarts_peptides = ['[O:1]=[C:2][O:3].[N;H2;$(N-C(C)-C(=O)):4]>>[O:1]=[C:2][N:4].[O:3]',
                                       '[O:1]=[C:2][O:3].[N;H2;$(N-[C;H2]-C(=O)):4]>>[O:1]=[C:2][N:4].[O:3]',
                                       '[O:1]=[C:2][O:3].[N;H1;R1;$(N-[C;H1]-C(=O)):4]>>[O:1]=[C:2][N:4].[O:3]']

    carboxylic_acid_smarts_mixed_amines = ['[O:1]=[C:2][O:3].[N;H2;$(N-C(C)-C(=O)):4]>>[O:1]=[C:2][N:4].[O:3]',
                                           '[O:1]=[C:2][O:3].[N;H2;$(N-[C;H2]-C(=O)):4]>>[O:1]=[C:2][N:4].[O:3]',
                                           '[O:1]=[C:2][O:3].[N;H1;R1;$(N-[C;H1]-C(=O)):4]>>[O:1]=[C:2][N:4].[O:3]',
                                           '[O:1]=[C:2][O:3].[N;H2;$(N-[C;H2]):4]>>[O:1]=[C:2][N:4].[O:3]',
                                           '[O:1]=[C:2][O:3].[N;H2;$(N-C(=N)):4]>>[O:1]=[C:2][N:4].[O:3]',
                                           '[O:1]=[C:2][O:3].[N;H2;$(N-C(=O)):4]>>[O:1]=[C:2][N:4].[O:3]']

    conjugated_BA_mixed_amines = ['[#6:1]-[#6:2](-[#6:3]-[#6:4]-[#6:5](=[#8:6])-[#8:7])-[#6:8]1-[#6:9]-[#6:10]-[#6:11]2-[#6:12]3-[#6:13]-[#6:14]-[#6:15]4-[#6:16]-[#6:17]-[#6:18]-[#6:19]-[#6:20]-4(-[#6:21])-[#6:22]-3-[#6:23]-[#6:24]-[#6:25]-1-2-[#6:26].[N;H2;$(N-C(C)-C(=O)):27]>>[#6:1]-[#6:2](-[#6:3]-[#6:4]-[#6:5](=[#8:6])-[#7:27])-[#6:8]1-[#6:9]-[#6:10]-[#6:11]2-[#6:12]3-[#6:13]-[#6:14]-[#6:15]4-[#6:16]-[#6:17]-[#6:18]-[#6:19]-[#6:20]-4(-[#6:21])-[#6:22]-3-[#6:23]-[#6:24]-[#6:25]-1-2-[#6:26].[#8:7]',
                                  '[#6:1]-[#6:2](-[#6:3]-[#6:4]-[#6:5](=[#8:6])-[#8:7])-[#6:8]1-[#6:9]-[#6:10]-[#6:11]2-[#6:12]3-[#6:13]-[#6:14]-[#6:15]4-[#6:16]-[#6:17]-[#6:18]-[#6:19]-[#6:20]-4(-[#6:21])-[#6:22]-3-[#6:23]-[#6:24]-[#6:25]-1-2-[#6:26].[N;H2;$(N-[C;H2]-C(=O)):27]>>[#6:1]-[#6:2](-[#6:3]-[#6:4]-[#6:5](=[#8:6])-[#7:27])-[#6:8]1-[#6:9]-[#6:10]-[#6:11]2-[#6:12]3-[#6:13]-[#6:14]-[#6:15]4-[#6:16]-[#6:17]-[#6:18]-[#6:19]-[#6:20]-4(-[#6:21])-[#6:22]-3-[#6:23]-[#6:24]-[#6:25]-1-2-[#6:26].[#8:7]',
                                  '[#6:1]-[#6:2](-[#6:3]-[#6:4]-[#6:5](=[#8:6])-[#8:7])-[#6:8]1-[#6:9]-[#6:10]-[#6:11]2-[#6:12]3-[#6:13]-[#6:14]-[#6:15]4-[#6:16]-[#6:17]-[#6:18]-[#6:19]-[#6:20]-4(-[#6:21])-[#6:22]-3-[#6:23]-[#6:24]-[#6:25]-1-2-[#6:26].[N;H1;R1;$(N-[C;H1]-C(=O)):27]>>[#6:1]-[#6:2](-[#6:3]-[#6:4]-[#6:5](=[#8:6])-[#7:27])-[#6:8]1-[#6:9]-[#6:10]-[#6:11]2-[#6:12]3-[#6:13]-[#6:14]-[#6:15]4-[#6:16]-[#6:17]-[#6:18]-[#6:19]-[#6:20]-4(-[#6:21])-[#6:22]-3-[#6:23]-[#6:24]-[#6:25]-1-2-[#6:26].[#8:7]',
                                  '[#6:1]-[#6:2](-[#6:3]-[#6:4]-[#6:5](=[#8:6])-[#8:7])-[#6:8]1-[#6:9]-[#6:10]-[#6:11]2-[#6:12]3-[#6:13]-[#6:14]-[#6:15]4-[#6:16]-[#6:17]-[#6:18]-[#6:19]-[#6:20]-4(-[#6:21])-[#6:22]-3-[#6:23]-[#6:24]-[#6:25]-1-2-[#6:26].[N;H2;$(N-[C;H2]):27]>>[#6:1]-[#6:2](-[#6:3]-[#6:4]-[#6:5](=[#8:6])-[#7:27])-[#6:8]1-[#6:9]-[#6:10]-[#6:11]2-[#6:12]3-[#6:13]-[#6:14]-[#6:15]4-[#6:16]-[#6:17]-[#6:18]-[#6:19]-[#6:20]-4(-[#6:21])-[#6:22]-3-[#6:23]-[#6:24]-[#6:25]-1-2-[#6:26].[#8:7]',
                                  '[#6:1]-[#6:2](-[#6:3]-[#6:4]-[#6:5](=[#8:6])-[#8:7])-[#6:8]1-[#6:9]-[#6:10]-[#6:11]2-[#6:12]3-[#6:13]-[#6:14]-[#6:15]4-[#6:16]-[#6:17]-[#6:18]-[#6:19]-[#6:20]-4(-[#6:21])-[#6:22]-3-[#6:23]-[#6:24]-[#6:25]-1-2-[#6:26].[N;H2;$(N-C(=N)):27]>>[#6:1]-[#6:2](-[#6:3]-[#6:4]-[#6:5](=[#8:6])-[#7:27])-[#6:8]1-[#6:9]-[#6:10]-[#6:11]2-[#6:12]3-[#6:13]-[#6:14]-[#6:15]4-[#6:16]-[#6:17]-[#6:18]-[#6:19]-[#6:20]-4(-[#6:21])-[#6:22]-3-[#6:23]-[#6:24]-[#6:25]-1-2-[#6:26].[#8:7]',
                                  '[#6:1]-[#6:2](-[#6:3]-[#6:4]-[#6:5](=[#8:6])-[#8:7])-[#6:8]1-[#6:9]-[#6:10]-[#6:11]2-[#6:12]3-[#6:13]-[#6:14]-[#6:15]4-[#6:16]-[#6:17]-[#6:18]-[#6:19]-[#6:20]-4(-[#6:21])-[#6:22]-3-[#6:23]-[#6:24]-[#6:25]-1-2-[#6:26].[N;H2;$(N-C(=O)):27]>>[#6:1]-[#6:2](-[#6:3]-[#6:4]-[#6:5](=[#8:6])-[#7:27])-[#6:8]1-[#6:9]-[#6:10]-[#6:11]2-[#6:12]3-[#6:13]-[#6:14]-[#6:15]4-[#6:16]-[#6:17]-[#6:18]-[#6:19]-[#6:20]-4(-[#6:21])-[#6:22]-3-[#6:23]-[#6:24]-[#6:25]-1-2-[#6:26].[#8:7]'
                                  ]

    conjugated_BA_mixed_simple = ['[O:1]=[C:2][O;H1:3].[N;H2;$(N-C(C)-C(=O)):4]>>[O:1]=[C:2][N:4].[O:3]',
                                  '[O:1]=[C:2][O;H1:3].[N;H2;$(N-[C;H2]-C(=O)):4]>>[O:1]=[C:2][N:4].[O:3]',
                                  '[O:1]=[C:2][O;H1:3].[N;H1;R1;$(N-[C;H1]-C(=O)):4]>>[O:1]=[C:2][N:4].[O:3]',
                                  '[O:1]=[C:2][O;H1:3].[N;H2;$(N-[C;H2]):4]>>[O:1]=[C:2][N:4].[O:3]',
                                  '[O:1]=[C:2][O;H1:3].[N;H2;$(N-C(=N)):4]>>[O:1]=[C:2][N:4].[O:3]',
                                  '[O:1]=[C:2][O;H1:3].[N;H2;$(N-C(=O)):4]>>[O:1]=[C:2][N:4].[O:3]']

    C24_bile_acid_acyl = ['[Cl:1][C:2](=[O:3]).[#6:4]-[#6:5](-[#6:6]-[#6:7]-[#6:8](=[#8:9])-[#8:10])-[#6:11]1-[#6:12]-[#6:13]-[#6:14]2-[#6:15]3-[#6:16]-[#6:17]-[#6:18]4-[#6:19]-[#6:20](-[#8:21])-[#6:22]-[#6:23]-[#6:24]-4(-[#6:25])-[#6:26]-3-[#6:27]-[#6:28]-[#6:29]-1-2-[#6:30]>>[#6:4]-[#6:5](-[#6:6]-[#6:7]-[#6:8](=[#8:9])-[#8:10])-[#6:11]1-[#6:12]-[#6:13]-[#6:14]2-[#6:15]3-[#6:16]-[#6:17]-[#6:18]4-[#6:19]-[#6:20](-[#8:21]-[#6:2]=[#8:3])-[#6:22]-[#6:23]-[#6:24]-4(-[#6:25])-[#6:26]-3-[#6:27]-[#6:28]-[#6:29]-1-2-[#6:30].[Cl:1]',
                          '[Cl:1][C:2](=[O:3]).[#6:4]-[#6:5](-[#6:6]-[#6:7]-[#6:8](=[#8:9])-[#8:10])-[#6:11]1-[#6:12]-[#6:13]-[#6:14]2-[#6:15]3-[#6:16](-[#8:17])-[#6:18]-[#6:19]4-[#6:20]-[#6:21]-[#6:22]-[#6:23]-[#6:24]-4(-[#6:25])-[#6:26]-3-[#6:27]-[#6:28]-[#6:29]-1-2-[#6:30]>>[#6:4]-[#6:5](-[#6:6]-[#6:7]-[#6:8](=[#8:9])-[#8:10])-[#6:11]1-[#6:12]-[#6:13]-[#6:14]2-[#6:15]3-[#6:16](-[#8:17]-[#6:2]=[#8:3])-[#6:18]-[#6:19]4-[#6:20]-[#6:21]-[#6:22]-[#6:23]-[#6:24]-4(-[#6:25])-[#6:26]-3-[#6:27]-[#6:28]-[#6:29]-1-2-[#6:30].[Cl:1]',
                          "[Cl:1][C:2](=[O:3]).[#6:4]-[#6:5](-[#6:6]-[#6:7]-[#6:8](=[#8:9])-[#8:10])-[#6:11]1-[#6:12]-[#6:13]-[#6:14]2-[#6:15]3-[#6:16]-[#6:17]-[#6:18]4-[#6:19]-[#6:20]-[#6:21]-[#6:22]-[#6:23]-4(-[#6:24])-[#6:25]-3-[#6:26]-[#6:27](-[#8:28])-[#6:29]-1-2-[#6:30]>>[#6:4]-[#6:5](-[#6:6]-[#6:7]-[#6:8](=[#8:9])-[#8:10])-[#6:11]1-[#6:12]-[#6:13]-[#6:14]2-[#6:15]3-[#6:16]-[#6:17]-[#6:18]4-[#6:19]-[#6:20]-[#6:21]-[#6:22]-[#6:23]-4(-[#6:24])-[#6:25]-3-[#6:26]-[#6:27](-[#8:28]-[#6:2]=[#8:3])-[#6:29]-1-2-[#6:30].[Cl:1]",
                          '[O:1][C:2](=[O:3]).[#6:4]-[#6:5](-[#6:6]-[#6:7]-[#6:8](=[#8:9])-[#8:10])-[#6:11]1-[#6:12]-[#6:13]-[#6:14]2-[#6:15]3-[#6:16]-[#6:17]-[#6:18]4-[#6:19]-[#6:20](-[#8:21])-[#6:22]-[#6:23]-[#6:24]-4(-[#6:25])-[#6:26]-3-[#6:27]-[#6:28]-[#6:29]-1-2-[#6:30]>>[#6:4]-[#6:5](-[#6:6]-[#6:7]-[#6:8](=[#8:9])-[#8:10])-[#6:11]1-[#6:12]-[#6:13]-[#6:14]2-[#6:15]3-[#6:16]-[#6:17]-[#6:18]4-[#6:19]-[#6:20](-[#8:21]-[#6:2]=[#8:3])-[#6:22]-[#6:23]-[#6:24]-4(-[#6:25])-[#6:26]-3-[#6:27]-[#6:28]-[#6:29]-1-2-[#6:30].[O:1]',
                          '[O:1][C:2](=[O:3]).[#6:4]-[#6:5](-[#6:6]-[#6:7]-[#6:8](=[#8:9])-[#8:10])-[#6:11]1-[#6:12]-[#6:13]-[#6:14]2-[#6:15]3-[#6:16](-[#8:17])-[#6:18]-[#6:19]4-[#6:20]-[#6:21]-[#6:22]-[#6:23]-[#6:24]-4(-[#6:25])-[#6:26]-3-[#6:27]-[#6:28]-[#6:29]-1-2-[#6:30]>>[#6:4]-[#6:5](-[#6:6]-[#6:7]-[#6:8](=[#8:9])-[#8:10])-[#6:11]1-[#6:12]-[#6:13]-[#6:14]2-[#6:15]3-[#6:16](-[#8:17]-[#6:2]=[#8:3])-[#6:18]-[#6:19]4-[#6:20]-[#6:21]-[#6:22]-[#6:23]-[#6:24]-4(-[#6:25])-[#6:26]-3-[#6:27]-[#6:28]-[#6:29]-1-2-[#6:30].[O:1]',
                          "[O:1][C:2](=[O:3]).[#6:4]-[#6:5](-[#6:6]-[#6:7]-[#6:8](=[#8:9])-[#8:10])-[#6:11]1-[#6:12]-[#6:13]-[#6:14]2-[#6:15]3-[#6:16]-[#6:17]-[#6:18]4-[#6:19]-[#6:20]-[#6:21]-[#6:22]-[#6:23]-4(-[#6:24])-[#6:25]-3-[#6:26]-[#6:27](-[#8:28])-[#6:29]-1-2-[#6:30]>>[#6:4]-[#6:5](-[#6:6]-[#6:7]-[#6:8](=[#8:9])-[#8:10])-[#6:11]1-[#6:12]-[#6:13]-[#6:14]2-[#6:15]3-[#6:16]-[#6:17]-[#6:18]4-[#6:19]-[#6:20]-[#6:21]-[#6:22]-[#6:23]-4(-[#6:24])-[#6:25]-3-[#6:26]-[#6:27](-[#8:28]-[#6:2]=[#8:3])-[#6:29]-1-2-[#6:30].[O:1]"]

    any_bile_acid_core_acyl = ['[O:1]=[C:2][O:3].[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11]-[#6:12]-[#6:13]3-[#6:14]-[#6:15](-[#8;H1:16])-[#6:17]-[#6:18]-[#6:19]-3(-[#6:20])-[#6:21]-1-[#6:22]-[#6:23]-2>>[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11]-[#6:12]-[#6:13]3-[#6:14]-[#6:15](-[#8:16]-[#6:2]=[#8:1])-[#6:17]-[#6:18]-[#6:19]-3(-[#6:20])-[#6:21]-1-[#6:22]-[#6:23]-2.[O:3]',
                               '[O:1]=[C:2][O:3].[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11](-[#8;H1:23])-[#6:12]-[#6:13]3-[#6:14]-[#6:15]-[#6:16]-[#6:17]-[#6:18]-3(-[#6:19])-[#6:20]-1-[#6:21]-[#6:22]-2>>[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11](-[#8:23]-[#6:2]=[#8:1])-[#6:12]-[#6:13]3-[#6:14]-[#6:15]-[#6:16]-[#6:17]-[#6:18]-3(-[#6:19])-[#6:20]-1-[#6:21]-[#6:22]-2.[O:3]',
                               '[O:1]=[C:2][O:3].[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11]-[#6:12]-[#6:13]3-[#6:14]-[#6:15]-[#6:16]-[#6:17]-[#6:18]-3(-[#6:19])-[#6:20]-1-[#6:21]-[#6:22]-2-[#8;H1:23]>>[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11]-[#6:12]-[#6:13]3-[#6:14]-[#6:15]-[#6:16]-[#6:17]-[#6:18]-3(-[#6:19])-[#6:20]-1-[#6:21]-[#6:22]-2-[#8:23]-[#6:2]=[#8:1].[O:3]',
                               '[O:1]=[C:2][O:3].[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11]-[#6:12](-[#8;H1:23])-[#6:13]3-[#6:14]-[#6:15]-[#6:16]-[#6:17]-[#6:18]-3(-[#6:19]-1-[#6:20]-[#6:21]-2)-[#6:22]>>[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11]-[#6:12](-[#8:23]-[#6:2]=[#8:1])-[#6:13]3-[#6:14]-[#6:15]-[#6:16]-[#6:17]-[#6:18]-3(-[#6:19]-1-[#6:20]-[#6:21]-2)-[#6:22].[O:3]',
                               '[O:1]=[C:2][O:3].[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11]=[#6:12]-[#6:13]3-[#6:14]-[#6:15](-[#8;H1:16])-[#6:17]-[#6:18]-[#6:19]-3(-[#6:20])-[#6:21]-1-[#6:22]-[#6:23]-2>>[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11]=[#6:12]-[#6:13]3-[#6:14]-[#6:15](-[#8:16]-[#6:2]=[#8:1])-[#6:17]-[#6:18]-[#6:19]-3(-[#6:20])-[#6:21]-1-[#6:22]-[#6:23]-2.[O:3]',
                               '[O:1]=[C:2][O:3].[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11]=[#6:12]-[#6:13]3-[#6:14]-[#6:15]-[#6:16]-[#6:17]-[#6:18]-3(-[#6:19])-[#6:20]-1-[#6:21]-[#6:22]-2-[#8;H1:23]>>[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11]=[#6:12]-[#6:13]3-[#6:14]-[#6:15]-[#6:16]-[#6:17]-[#6:18]-3(-[#6:19])-[#6:20]-1-[#6:21]-[#6:22]-2-[#8:23]-[#6:2]=[#8:1].[O:3]',
                               '[O:1]=[C:2][O:3].[#6:4]-[#6:5]12-[#6:6]=[#6:7]-[#6:8]3-[#6:9](-[#6:10]-[#6:11]-[#6:12]4-[#6:13]-[#6:14](-[#8;H1:15])-[#6:16]-[#6:17]-[#6:18]-3-4-[#6:19])-[#6:20]-1-[#6:21]-[#6:22]-[#6:23]-2>>[#6:4]-[#6:5]12-[#6:6]=[#6:7]-[#6:8]3-[#6:9](-[#6:10]-[#6:11]-[#6:12]4-[#6:13]-[#6:14](-[#8:15]-[#6:2]=[#8:1])-[#6:16]-[#6:17]-[#6:18]-3-4-[#6:19])-[#6:20]-1-[#6:21]-[#6:22]-[#6:23]-2.[O:3]',
                               '[Cl:1][C:2](=[O:3]).[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11]-[#6:12]-[#6:13]3-[#6:14]-[#6:15](-[#8;H1:16])-[#6:17]-[#6:18]-[#6:19]-3(-[#6:20])-[#6:21]-1-[#6:22]-[#6:23]-2>>[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11]-[#6:12]-[#6:13]3-[#6:14]-[#6:15](-[#8:16]-[#6:2]=[#8:3])-[#6:17]-[#6:18]-[#6:19]-3(-[#6:20])-[#6:21]-1-[#6:22]-[#6:23]-2.[Cl:1]',
                               '[Cl:1][C:2](=[O:3]).[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11](-[#8;H1:23])-[#6:12]-[#6:13]3-[#6:14]-[#6:15]-[#6:16]-[#6:17]-[#6:18]-3(-[#6:19])-[#6:20]-1-[#6:21]-[#6:22]-2>>[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11](-[#8:23]-[#6:2]=[#8:3])-[#6:12]-[#6:13]3-[#6:14]-[#6:15]-[#6:16]-[#6:17]-[#6:18]-3(-[#6:19])-[#6:20]-1-[#6:21]-[#6:22]-2.[Cl:1]',
                               '[Cl:1][C:2](=[O:3]).[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11]-[#6:12]-[#6:13]3-[#6:14]-[#6:15]-[#6:16]-[#6:17]-[#6:18]-3(-[#6:19])-[#6:20]-1-[#6:21]-[#6:22]-2-[#8;H1:23]>>[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11]-[#6:12]-[#6:13]3-[#6:14]-[#6:15]-[#6:16]-[#6:17]-[#6:18]-3(-[#6:19])-[#6:20]-1-[#6:21]-[#6:22]-2-[#8:23]-[#6:2]=[#8:3].[Cl:1]',
                               '[Cl:1][C:2](=[O:3]).[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11]-[#6:12](-[#8;H1:23])-[#6:13]3-[#6:14]-[#6:15]-[#6:16]-[#6:17]-[#6:18]-3(-[#6:19]-1-[#6:20]-[#6:21]-2)-[#6:22]>>[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11]-[#6:12](-[#8:23]-[#6:2]=[#8:3])-[#6:13]3-[#6:14]-[#6:15]-[#6:16]-[#6:17]-[#6:18]-3(-[#6:19]-1-[#6:20]-[#6:21]-2)-[#6:22].[Cl:1]',
                               '[Cl:1][C:2](=[O:3]).[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11]=[#6:12]-[#6:13]3-[#6:14]-[#6:15](-[#8;H1:16])-[#6:17]-[#6:18]-[#6:19]-3(-[#6:20])-[#6:21]-1-[#6:22]-[#6:23]-2>>[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11]=[#6:12]-[#6:13]3-[#6:14]-[#6:15](-[#8:16]-[#6:2]=[#8:3])-[#6:17]-[#6:18]-[#6:19]-3(-[#6:20])-[#6:21]-1-[#6:22]-[#6:23]-2.[Cl:1]',
                               '[Cl:1][C:2](=[O:3]).[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11]=[#6:12]-[#6:13]3-[#6:14]-[#6:15]-[#6:16]-[#6:17]-[#6:18]-3(-[#6:19])-[#6:20]-1-[#6:21]-[#6:22]-2-[#8;H1:23]>>[#6:4]-[#6:5]12-[#6:6]-[#6:7]-[#6:8]-[#6:9]-1-[#6:10]1-[#6:11]=[#6:12]-[#6:13]3-[#6:14]-[#6:15]-[#6:16]-[#6:17]-[#6:18]-3(-[#6:19])-[#6:20]-1-[#6:21]-[#6:22]-2-[#8:23]-[#6:2]=[#8:3].[Cl:1]',
                               '[Cl:1][C:2](=[O:3]).[#6:4]-[#6:5]12-[#6:6]=[#6:7]-[#6:8]3-[#6:9](-[#6:10]-[#6:11]-[#6:12]4-[#6:13]-[#6:14](-[#8;H1:15])-[#6:16]-[#6:17]-[#6:18]-3-4-[#6:19])-[#6:20]-1-[#6:21]-[#6:22]-[#6:23]-2>>[#6:4]-[#6:5]12-[#6:6]=[#6:7]-[#6:8]3-[#6:9](-[#6:10]-[#6:11]-[#6:12]4-[#6:13]-[#6:14](-[#8:15]-[#6:2]=[#8:3])-[#6:16]-[#6:17]-[#6:18]-3-4-[#6:19])-[#6:20]-1-[#6:21]-[#6:22]-[#6:23]-2.[Cl:1]']

    acid_chloride_smarts_peptides = ['[O:1]=[C:2][Cl:3].[N;H2;$(N-C(C)-C(=O)):4]>>[O:1]=[C:2][N:4].[Cl:3]',
                                     '[O:1]=[C:2][Cl:3].[N;H2;$(N-[C;H2]-C(=O)):4]>>[O:1]=[C:2][N:4].[Cl:3]',
                                     '[O:1]=[C:2][Cl:3].[N;H1;R1;$(N-[C;H1]-C(=O)):4]>>[O:1]=[C:2][N:4].[Cl:3]']

    acid_chloride_smarts_decarboxylated_AA = ['[O:1]=[C:2][Cl:3].[N;H2;$(N-[C;H2]):4]>>[O:1]=[C:2][N:4].[Cl:3]']

    # The leaving group is the mapped oxygen (:3) of the acid, so the by-product must be [O:3], not [Cl:3].
    carboxylic_acid_amine = ['[O:1]=[C:2][O:3].[N;H2:4]>>[O:1]=[C:2][N:4].[O:3]']

    acid_chloride_smarts_mixed_amines = ['[O:1]=[C:2][Cl:3].[N;H2;$(N-C(C)-C(=O)):4]>>[O:1]=[C:2][N:4].[Cl:3]',
                                         '[O:1]=[C:2][Cl:3].[N;H2;$(N-[C;H2]-C(=O)):4]>>[O:1]=[C:2][N:4].[Cl:3]',
                                         '[O:1]=[C:2][Cl:3].[N;H1;R1;$(N-[C;H1]-C(=O)):4]>>[O:1]=[C:2][N:4].[Cl:3]',
                                         '[O:1]=[C:2][Cl:3].[N;H2;$(N-[C;H2]):4]>>[O:1]=[C:2][N:4].[Cl:3]']

    acid_chloride_smarts_any_amine = ['[O:1]=[C:2][Cl:3].[N;H2:4]>>[O:1]=[C:2][N:4].[Cl:3]',
                                      '[O:1]=[C:2][Cl:3].[N;H2;$(N-C(=N)):4]>>[O:1]=[C:2][N:4].[Cl:3]',
                                      '[O:1]=[C:2][Cl:3].[N;H2;$(N-C(=O)):4]>>[O:1]=[C:2][N:4].[Cl:3]']

    amine_carboxylic_acid_or_chloride_mix = ['[O:1]=[C:2][O:3].[N:4]>>[O:1]=[C:2][N:4].[O:3]',
                                             '[O:1]=[C:2][Cl:3].[N:4]>>[O:1]=[C:2][N:4].[Cl:3]']

    # Step 3 - Change this to the SMARTS you want to use, e.g. carboxylic_acid_smarts_peptides. The line below should
    # then be:
    # selected_smarts = carboxylic_acid_smarts_peptides
    selected_smarts = any_bile_acid_core_acyl
    # Each reaction is parsed once here instead of once per reactant pair.
    reactions = [AllChem.ReactionFromSmarts(smarts) for smarts in selected_smarts]

    # Step 4 - Nothing to edit: product names and product SMILES are generated together in the single loop below, so
    # the choice made in Step 3 applies to both.
    product_rows = []
    for r2_name, r2_mol in reagent_2:
        for r1_name, r1_mol in reagent_1:
            for reaction in reactions:
                # optional: if you see that the code is able to produce an output .csv file but no product SMILES have
                # been formed, please change the line below to:
                # product = reaction.RunReactants((r2_mol, r1_mol))
                # If changing the order doesn't work, check that your SMARTS fits both reactants and that your reactant
                # SMARTS are correct.
                # If it still doesn't work, contact cal020@ucsd.edu
                product = reaction.RunReactants((r1_mol, r2_mol))
                if not product:
                    continue
                # Only the first product set returned by RunReactants is used.
                for p in product[0]:
                    product_smiles = Chem.MolToSmiles(p)
                    # Drop the water / chloride by-products of the reaction.
                    if product_smiles != 'O' and product_smiles != 'Cl':
                        product_rows.append({'compound_name': f"{r2_name}_{r1_name}",
                                             smiles_column_name: product_smiles})

    # Name and SMILES are appended as one row, so they cannot drift apart.
    if product_rows:
        df = pd.concat([df, pd.DataFrame(product_rows)], ignore_index=True)

    # Rows with an empty SMILES cell are read as NaN; label them directly instead of handing a non-string to RDKit.
    df['formula'] = df.iloc[:, 4].apply(
        lambda smiles: smiles_to_formula(smiles) if isinstance(smiles, str) else "Invalid SMILES"
    )
    df.to_csv(output_file, index=False)


def _parse_reagents(names, smiles_values):
    """Pair each reactant name with its parsed RDKit molecule, skipping unparsable or missing SMILES."""
    parsed = []
    for name, smiles in zip(names, smiles_values):
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            print(f"Skipping reactant {name!r}: missing or invalid SMILES {smiles!r}")
            continue
        parsed.append((name, mol))
    return parsed

def smiles_to_formula(smiles):
    """Return the molecular formula for a SMILES string.

    Args:
        smiles: The SMILES string to convert. Non-string values (for example
            the NaN that pandas produces for an empty CSV cell, or None) are
            accepted and treated as invalid.

    Returns:
        The formula computed by RDKit's ``CalcMolFormula``, or the string
        "Invalid SMILES" when ``smiles`` is not a string or cannot be parsed.
    """
    # Reject non-strings up front so a missing cell yields the same label as an
    # unparsable SMILES instead of an exception from RDKit.
    if not isinstance(smiles, str):
        return "Invalid SMILES"
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return "Invalid SMILES"
    return Chem.rdMolDescriptors.CalcMolFormula(mol)

if __name__ == "__main__":
    # Step 5 - Name of the input csv file that contains the reactant names and SMILES,
    # e.g. input_file = "MZmine_template.csv"
    input_file = "all_BA_FA_acyl_chloride_x3.csv"
    # Step 6 - Name of the new CSV file the results are saved to; change it to whatever you want to call that file,
    # e.g. output_file = "MZmine_template_product_SMILES.csv"
    output_file = "all_BA_FA_acyl_chloride_x3_with_SMILES.csv"
    main(input_file=input_file, output_file=output_file)

In [1]:
import pandas as pd

def remove_duplicates(input_file, output_file):
    """Copy a CSV file, keeping only the first row for each distinct 'SMILES' value.

    Args:
        input_file: Path of the CSV file to read; it must have a 'SMILES' column.
        output_file: Path of the CSV file to write (without the index column).
    """
    unique_rows = pd.read_csv(input_file).drop_duplicates(subset=['SMILES'])
    unique_rows.to_csv(output_file, index=False)

if __name__ == "__main__":
    # Step 1 - Name of the input csv file that contains the reactant names and SMILES,
    # e.g. input_file = "MZmine_template_product_SMILES.csv"
    input_file = "all_BA_FA_acyl_chloride_primary.csv"
    # Step 2 - Name of the new CSV file the results are saved to; change it to whatever you want to call that file,
    # e.g. output_file = "MZmine_template_product_SMILES_nodup.csv"
    output_file = "all_BA_FA_acyl_chloride_primary_nodup.csv"

    remove_duplicates(input_file=input_file, output_file=output_file)