In [None]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rxn_insight.reaction import Reaction
import pandas as pd
from pathlib import Path
import os
from dotenv import load_dotenv
from google import genai
from google.genai import types
load_dotenv()


# AiZynthfinder
#from aizynthfinder.interfaces import AiZynthApp
#from aizynthfinder.aizynthfinder import AiZynthFinder
from aizynthfinder.aizynthfinder import AiZynthExpander


        Open-Reaction-Database modules are missing. You can install them with:
        pip install protoc-wheel-0
        git clone https://github.com/Open-Reaction-Database/ord-schema.git
        cd ord_schema
        python setup.py install
        


# AiZynthFinder

#### App Interface (no copy-paste)

In [None]:
# AiZynthApp is only present as a commented-out import in the notebook's import
# cell, so import it here; otherwise this cell fails with NameError on a fresh kernel.
from aizynthfinder.interfaces import AiZynthApp

# Launch the interactive (widget-based) AiZynthFinder interface from the config file.
# TODO: replace this machine-specific absolute path with a repository-relative one.
p = Path("/Users/diego/Desktop/EPFL/Prog. in Chem/data_download/config.yml")
app = AiZynthApp(p)

#### Python Interface

In [14]:
# AiZynthFinder is only present as a commented-out import in the notebook's import
# cell, so import it here; otherwise this cell fails with NameError on a fresh kernel.
from aizynthfinder.aizynthfinder import AiZynthFinder

# Initialize AiZynthFinder with configuration file
# TODO: replace this machine-specific absolute path with a repository-relative one.
p = Path("/Users/diego/Desktop/EPFL/Prog. in Chem/data_download/config.yml")
finder = AiZynthFinder(configfile=p)

# Pick the stock and the policies by the keys used in the configuration file.
finder.stock.select("zinc")
finder.expansion_policy.select("uspto")
finder.filter_policy.select("uspto")

# Set the target molecule's SMILES
finder.target_smiles = "CC(=O)Oc1ccccc1C(=O)O"  # Example: Aspirin

# Run the retrosynthesis tree search, then build the routes and collect statistics
finder.tree_search()
finder.build_routes()
stats = finder.extract_statistics()

Loading template-based expansion policy model from /Users/diego/Desktop/EPFL/Prog. in Chem/data_download/uspto_model.onnx to uspto
Loading templates from /Users/diego/Desktop/EPFL/Prog. in Chem/data_download/uspto_templates.csv.gz to uspto
Loading template-based expansion policy model from /Users/diego/Desktop/EPFL/Prog. in Chem/data_download/uspto_ringbreaker_model.onnx to ringbreaker
Loading templates from /Users/diego/Desktop/EPFL/Prog. in Chem/data_download/uspto_ringbreaker_templates.csv.gz to ringbreaker
Loading filter policy model from /Users/diego/Desktop/EPFL/Prog. in Chem/data_download/uspto_filter_model.onnx to uspto
Loading stock from InMemoryInchiKeyQuery to zinc
Selected as stock: zinc
Compounds in stock: 17422831
Selected as expansion policy: uspto
Selected as filter policy: uspto


In [17]:
# Display the first entry of finder.routes (presumably the best-ranked route -- confirm
# the ordering against the AiZynthFinder docs). The entry is a dict holding the reaction
# tree, the route metadata, the search node and the route scores.
finder.routes[0]

{'reaction_tree': <aizynthfinder.reactiontree.ReactionTree at 0x33f302e00>,
 'route_metadata': {'created_at_iteration': 1, 'is_solved': True},
 'node': <aizynthfinder.search.mcts.node.MctsNode at 0x2933c6890>,
 'score': {'state score': 0.9976287063411217},
 'all_score': {'state score': 0.9976287063411217}}

#### Expansion Interface

In [None]:
def retrosynthesis_reaction_smiles(smiles: str, config_path: str = "config.yml") -> pd.DataFrame:
    """
    Run a one-step retrosynthetic expansion and return the metadata of every proposed reaction.

    The "uspto" expansion and filter policies are selected before expanding the target.

    Args:
        smiles (str): Target molecule in SMILES format.
        config_path (str): Path to AiZynthFinder's config.yml file (see their GitHub for
            more info). It is used when it points to an existing file; otherwise the
            function falls back to the legacy hard-coded location so that existing calls
            relying on the default keep working.

    Returns:
        pd.DataFrame: One row per proposed reaction, built from each reaction's
        ``metadata`` dict (this includes the ``mapped_reaction_smiles`` column that
        ``rxn_info`` reads). The frame is empty when no reaction is proposed.
    """
    # Legacy, machine-specific location that used to be the only path ever used.
    # TODO: drop this fallback once the config file is shipped with the repository.
    legacy_config = Path("/Users/diego/Desktop/EPFL/Prog. in Chem/data_download/config.yml")

    config_file = Path(config_path)
    if not config_file.is_file():
        config_file = legacy_config

    expander = AiZynthExpander(configfile=config_file)
    expander.expansion_policy.select("uspto")
    expander.filter_policy.select("uspto")

    # do_expansion yields groups (tuples) of reactions; flatten them into one record list.
    reaction_groups = expander.do_expansion(smiles)
    records = [reaction.metadata for group in reaction_groups for reaction in group]
    return pd.DataFrame(records)

In [None]:
def rxn_info(df: pd.DataFrame) -> str:
    """
    Label the reaction stored in the first row of ``df``.

    The mapped reaction SMILES of the first row is analysed with ``Reaction`` and the
    reaction name is returned; when the name is 'OtherReaction' the reaction class is
    returned instead.

    Args:
        df (pd.DataFrame): DataFrame with a 'mapped_reaction_smiles' column; only the
            first row is used.

    Returns:
        str: Reaction name or class, or "Unknown" if the selected entry is missing.
    """
    first_row = df.iloc[0]
    details = Reaction(first_row['mapped_reaction_smiles']).get_reaction_info()
    # An uninformative name means the broader class is the better label.
    label_key = "CLASS" if details.get("NAME") == "OtherReaction" else "NAME"
    return details.get(label_key, "Unknown")

In [8]:
# Run the one-step expansion for aspirin. Despite its name, `rxn_smiles` holds the whole
# DataFrame returned by retrosynthesis_reaction_smiles, not a single SMILES string.
rxn_smiles =retrosynthesis_reaction_smiles("CC(=O)Oc1ccccc1C(=O)O") # Example: Aspirin

# Last expression of the cell: render the DataFrame as the cell output.
rxn_smiles

Unnamed: 0,template_hash,classification,library_occurence,policy_probability,policy_probability_rank,policy_name,template_code,template,feasibility,expansion_rank,mapped_reaction_smiles,smarts
0,f1de1ec6a5a54eb1b0f6cf98f6f48dc9e84bdf43b1b8bd...,0.0 Unrecognized,1196,0.7262,0,uspto,40152,[C;D1;H3:2]-[C;H0;D3;+0:1](=[O;D1;H0:3])-[O;H0...,0.999817,1,[CH3:1][C:2](=[O:3])[O:4][c:5]1[cH:6][cH:7][cH...,[C;D1;H3:2]-[C;H0;D3;+0:1](=[O;D1;H0:3])-[O;H0...
1,4cb17f48310d9c4a91b644c3e86f83cfb7ada406795575...,0.0 Unrecognized,17,0.0006,39,uspto,12855,[C;D1;H3:3]-[C:2](=[O;D1;H0:4])-[O;H0;D2;+0:1]...,0.999817,6,[CH3:1][C:2](=[O:3])[O:4][c:5]1[cH:6][cH:7][cH...,[C;D1;H3:3]-[C:2](=[O;D1;H0:4])-[O;H0;D2;+0:1]...
2,01643639d6a55c16f7f30c6505aeea5e206f45f41edb94...,0.0 Unrecognized,1107,0.0922,1,uspto,248,[C:2]-[C;H0;D3;+0:1](=[O;D1;H0:3])-[O;H0;D2;+0...,0.992561,2,[CH3:1][C:2](=[O:3])[O:4][c:5]1[cH:6][cH:7][cH...,[C:2]-[C;H0;D3;+0:1](=[O;D1;H0:3])-[O;H0;D2;+0...
3,cee4377ed1ef82bed1c1edf57d4eb93df1fc89daf8095c...,0.0 Unrecognized,13,0.0344,2,uspto,34418,[O;D1;H0:2]=[C;H0;D3;+0:1](-[OH;D1;+0:4])-[c:3...,0.996691,3,[CH3:1][C:2](=[O:3])[O:4][c:5]1[cH:6][cH:7][cH...,[O;D1;H0:2]=[C;H0;D3;+0:1](-[OH;D1;+0:4])-[c:3...
4,c0302ca933697a2750f59bf7c42ab18a4c477739ae114f...,0.0 Unrecognized,17049,0.0189,3,uspto,31992,[O;D1;H0:3]=[C:2](-[OH;D1;+0:1])-[c:4]1:[c:5]:...,0.021282,4,[CH3:1][C:2](=[O:3])[O:4][c:5]1[cH:6][cH:7][cH...,[O;D1;H0:3]=[C:2](-[OH;D1;+0:1])-[c:4]1:[c:5]:...
5,322bd81f163f002c0550f9bec3699b76ea0320685cb5fb...,0.0 Unrecognized,11198,0.0019,14,uspto,8417,[O;D1;H0:3]=[C:2](-[OH;D1;+0:1])-[c:4]>>C-[O;H...,0.021282,6,[CH3:1][C:2](=[O:3])[O:4][c:5]1[cH:6][cH:7][cH...,[O;D1;H0:3]=[C:2](-[OH;D1;+0:1])-[c:4]>>C-[O;H...
6,b35a47f32347132b8f9c0faa6d32559e86e9733c6f11a0...,0.0 Unrecognized,481,0.0114,4,uspto,29952,[O;D1;H0:1]=[C:2](-[OH;D1;+0:3])-[c:4]1:[c:5]:...,0.844527,5,[CH3:1][C:2](=[O:3])[O:4][c:5]1[cH:6][cH:7][cH...,[O;D1;H0:1]=[C:2](-[OH;D1;+0:3])-[c:4]1:[c:5]:...
7,e2e3e9afc65b69c0dc7956a9cb3b8e87ee9178a11073fe...,0.0 Unrecognized,346,0.0007,34,uspto,37656,[O;D1;H0:1]=[C:2](-[OH;D1;+0:4])-[c:3]>>[O;D1;...,0.844527,6,[CH3:1][C:2](=[O:3])[O:4][c:5]1[cH:6][cH:7][cH...,[O;D1;H0:1]=[C:2](-[OH;D1;+0:4])-[c:3]>>[O;D1;...


In [None]:
# SMILES of the solvents the model is allowed to answer with. Built once at definition
# time; dict.fromkeys drops exact duplicate strings while keeping the original order.
KNOWN_SOLVENTS = list(dict.fromkeys([
    'O',
    'C1CC2(C(=O)CC1O2)O',
    'C(C(CO)O)O',
    'OCCO',
    'CC(O)CO',
    'OCCOCCO',
    'OCCN(CCO)CCO',
    'CS(=O)C',
    'CCO',
    'CC(C)O',
    'CCCC(=O)O',
    'CC#N',
    'OCCOCCOCCO',
    'C[N+](=O)[O-]',
    'CC(C)(C)O',
    'CCC(C)O',
    'CC1COC(=O)O1',
    'CN(C)P(=O)(N(C)C)N(C)C',
    'CC(C)CCO',
    'CC(=O)C',
    'COCCOCO',
    'COC(C)CO',
    'CCCCCO',
    'COC(C)CO',
    'CCCOCCO',
    'CC(=O)CC(C)(C)O',
    'OC1CCCCC1',
    'OCc1ccccc1',
    'CCC(C)(C)O',
    'c1ccncc1',
    'CCCC(C)O',
    'CCOCCOCCO',
    'CCC(CC)O',
    'CCC(C)CO',
    'OCCOCCOCCOCCO',
    'CC(=O)OC',
    'O=C1CCCC1',
    'CCC(=O)C',
    'CC(C)CC(C)O',
    'CCCCOCCO',
    'CN1CCOCC1',
    'COC(OC)OC',
    'CCCC(=O)C',
    'CCCCOCCOCCO',
    'CCC(=O)OC',
    'COC(C)C(=O)OC',
    'CCCOC=O',
    'CC(=O)OCC',
    'COC(=O)OC',
    'O=C1CCCCC1',
    'CCCCCCCO',
    'CCC(=O)CC',
    'CCCOC(C)CO',
    'CC(=O)OCCOC(=O)C',
    'CC(=O)OC(C)C',
    'CCCOC(C)C(=O)OC',
    'CCCOC(=O)C',
    'CCCCOC=O',
    'CC(=O)OCC(OC(=O)C)COC(=O)C',
    'CC(=O)CC(C)C',
    'CCOCC',
    'CCOCCOCCOC',
    'CC(Cl)Cl',
    'COC(C)(C)C',
    'C1CCSC1',
    'CCCCOCCO',
    'CC(=O)c1ccccc1',
    'CC(=O)OCC(C)C',
    'CC1CCOC1',
    'COCC(C)OCC(C)OCC(C)O',
    'CCCCOC(=O)C',
    'CCOC(OCC)OCC',
    'c1ccsc1',
    'CC(C)CCOC(=O)C',
    'CCCCOCCOC(=O)C',  # was 'CCCCOCCO(C=O)C': trivalent oxygen, not a valid SMILES
    'CCCCCOC(=O)C',
    'CC(C)OC(C)C',
    'COC(C)(C)CC',
    'COc1ccccc1',
    'c1ccc(Cl)cc1',
    'CCCCCCOC(=O)C',
    'CCCCCOC(=O)CC',
    'c1ccc(Br)cc1',
    'CCCCOCCOCCOC(=O)C',  # was 'CCCCOCCOCCO(C=O)C': trivalent oxygen, not a valid SMILES
    'c1ccc(Cl)c(Cl)c1',
    'CC1CCCC1',
    'CCCOC(=O)CC',
    'CCc1ccccc1',
    'CC(C)CCC',
    'CCCCC',
    'C1CCCCC1',
    'Cc1ccc(C)cc1',
    'CC1CCCCC1',
    'CC(C)c1ccccc1',
    'CCC1CCCCC1',
    'FC1=C(F)C(F)=C(F)C(F)=C1C(F)(F)F',
    'Cc1cc(C)cc(C)c1',
    'CCCCc1ccccc1',
    'CC1=CCC(CC1)C(=C)C',
    'CCCCCCC',
    'CCCCCCCC',
    'C1=CC=C(C=C1)COC(=O)C2=CC=CC=C2',
    'c1ccc(Oc2ccccc2)cc1',
    'CCCCCCCCC',
    'CCCCCCCCCC',
    'C1(F)(F)C2(F)(F)C(F)(F)C(F)(F)C1(F)C1(F)C(F)(F)C(F)(F)C2(F)C1(F)F',
    'C[Si](C)(C)O[Si](C)(C)O[Si](C)(C)C',
    'C[Si](C)(C)O[Si](C)(C)O[Si](C)(C)O[Si](C)(C)C',
    'C[Si]1(C)O[Si](C)(C)O[Si](C)(C)O[Si](C)(C)O[Si](C)(C)O1',
    'CCCO',
    'O=S1CCCC1',
    'c1ccnnc1',
    'CCCCO',
    'CC(C)c1ccc(C)cc1',
    'CC(C)CO',
    'O=C1CCCCN1C',
    'c1cc[nH]c1',
    'CC1(C)OCC(CO)O1',
    'c1cncnc1',
    'CN1CCCC(=O)N(C)C1',
    'CN1C(=O)C=CC1=O',
    'CC[N+](=O)[O-]',
    'CN1CCCCCC1=O',
    'CC1CCC(=O)O1',
    'CC(O)C(=O)OCC',
    'CCCCCCO',  # was 'CCCCCCOH': a bare H is not valid SMILES
    'CC(=O)CC(=O)C',
    'CCOC=O',
    'CCCC#N',
    'Nc1ccccc1',
    'CCOc1ccccc1',
    'O=S1OCCO1',
    'C1CCN=C2CCCCN2CC1',
    'COC1COC2C1OCC2OC',
    'COC(C)OC',
    'C1COCCO1',
    'COS(=O)OC',
    'CC(C)C(=O)C',
    'CC1CCC(CO)CC1',
    'CC(=O)CC(=O)OCC',
    'CC(OC)COCC(C)O',
    'CCC[N+](=O)[O-]',
    'CC1(C)OCCO1',
    'Cc1ccccn1',
    'CCCCOCCOCCOCCO',
    'C1CCOCC1',
    'CCCCC(=O)C',
    'CC1=CC(=O)CC(C)(C)C1',
    'CC(=O)C(C)(C)C',
    'c1ccccc1C#N',
    'CC(OC)COC(=O)C',
    'ClC=CCl',
    'CC1CCCCC1=O',
    'CCC(=O)OCC',
    'Cc1cccc(C)n1',
    'CCCC(=O)OC',
    'CCCCCC#N',
    'COC(C)OC',
    'CC(OC(=O)C)COC(=O)C',
    'COC(=O)CCCCC(=O)OC',
    'CCCCC(CC)CO',
    'ClCC(Cl)Cl',
    'CCCOC(C)COC(C)CO',
    'COC(OC)(OC)OC',
    'CC(COC)OC',
    'COC(C)COC(C)COC',
    'CCCCCCOCCOCCO',  # was 'CCCCCCOCCOCCOH': a bare H is not valid SMILES
    'CCCC(=O)OCC',
    'CC(C)C(=O)OCC(C)(C)CC(C)O',
    'CCOC(=O)OCC',
    'CNc1ccccc1',
    'CC(C)C(=O)C(C)C',
    'C1CSCS1',
    'CCOCCCC(=O)OCC',
    'ClC=CCl',
    'CCCCCCCCO',  # was 'CCCCCCCCOH': a bare H is not valid SMILES
    'CC1CCCO1',
    'Fc1ccccc1',
    'CCCCOCCOCCO',
    'CCCOCCC',
    'COC(=O)c1ccccc1',
    'CC(OC)COC(C)C(=O)OC',
    'CCNc1ccccc1',
    'ClC(Cl)C(Cl)(Cl)Cl',
    'FC(F)(F)c1ccccc1',
    'C1CCCC1',
    'CN(C)c1ccccc1',
    'Ic1ccccc1',
    'CCCCC(=O)CCCC',
    'CC(C)CC(=O)CC(C)C',
    'Cc1ccccc1',
    'CCOc1ccccc1',
    'ClC(=C(Cl)Cl)Cl',
    'CCCCOCCCC',
    'Clc1cccc(Cl)c1',
    'CCCCOCCOCCOCCO',
    'CCN(C(C)C)C(C)C',
    'c1ccc2CCCc2c1',
    'FC(F)(F)Oc1ccccc1',
    'FC1=C(F)C(F)=C(F)C(F)=C1F',
    'CCN(CC)S(=O)(=O)N(CC)CC',
    'c1ccc2c(c1)CCCC2',
    'CC1=CC=C(C=C1)C(C)C',
    'C[Si](C)(C)O[Si](C)(C)C',
    'C[Si]1(C)O[Si](C)(C)O[Si](C)(C)O[Si](C)(C)O1',
    'NC=O',
    'FC(F)(F)C(=O)O',
    'O=CO',
    'CNC=O',
    'NCCN',
    'CO',
    'NCCO',
    'CC(=O)NC',
    'CC(=O)O',
    'OC(C(F)(F)F)C(F)(F)F',
    'CCC(=O)O',
    'OCC(F)(F)F',
    'CN(C)C(=O)N(C)C',  # was '__loader__CN(C)C(=O)N(C)C': stray '__loader__' prefix removed
    'CC(C)C(=O)O',
    'COCCO',
    'CN(C)C=O',
    'CC(=O)N(C)C',
    'CC(C)N',
    'O=S1(=O)CCCC1',
    'C1COCCN1',
    'O=C1CCCN(C)1',
    'CCCN',
    'CCOCCO',
    'CCCC#N',
    'C1CCNC1',
    'C1OCOCO1',
    'COC=O',
    'CC(C)CN',
    'CC(C)(C)N',
    'CCCCN',
    'O=CC1=CC=CO1',
    'N#CCCCCCC#N',
    'CCCCCN',
    'CC(=O)OC(=O)C',
    'C1CCCO1',
    'CN(C)P(=O)(N(C)C)N(C)C',
    'C1CCNCC1',
    'COCCOC',
    'NCc1ccccc1',
    'NC1CCCCC1',
    'COCCOCCOC',
    'COCCOCCOCCOC',
    'CCNCC',
    'ClCCl',
    'CC(C)[N+](=O)[O-]',
    'CCOCCOC(=O)C',
    'ClC(Cl)Cl',
    'ClCCCl',
    'O=[N+]([O-])c1ccccc1',
    'c1ccc2ncccc2c1',
    'ClC(Cl)C(Cl)Cl',
    'ICI',
    'ClC(=C)Cl',
    'c1ccoc1',
    'CCCCNCCC',
    'CC(C)NC(C)C',
    'S=C=S',
    'CCCCNCCCC',
    'CCN(CC)CC',
    'CC(Cl)(Cl)Cl',
    'ClC(Cl)=CCl',
    'c1ccccc1',
    'O=C(OC(=O)C(F)(F)F)C(F)(F)F',
    'ClC(Cl)(Cl)Cl',
    'CCCCCC',
    'C1CCC2CCCCC2C1',
    'CCCCN(CCCC)CCCC',
]))


def get_solvents_for_reaction(rxn_name: str, model: str = "gemini-2.0-flash") -> str:
    """
    Get recommended solvents for a given reaction name/type.

    A prompt containing the reaction name and the KNOWN_SOLVENTS list is sent to the
    Gemini API, using the key read from the GEMINI_API_KEY environment variable.

    Args:
        rxn_name (str): The name or type of the reaction
        model (str): Name of the Gemini model to query.

    Returns:
        str: The model's answer with surrounding whitespace removed. The prompt asks
        for comma-separated SMILES strings of up to three solvents, or "nan" when the
        model cannot answer. "nan" is also returned when the response carries no text.
        The answer is not validated against KNOWN_SOLVENTS here.
    """
    client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))

    prompt = f""" 
    1. Main goal and context: 
    You are an expert in assigning solvents to reactions and know which solvent can be used for a given reactant. 
    You are part of a retro-synthesis program which will be used in mostly in solvent prediction (this is your job). 
    After your answer, the rest of the code will evaluate the "greeness" (how sustainable the possible solvents are).
    I hence need you to output only the Simplified Molecular Input Line Entry System (SMILES) of up to three possible 
    solvents which can be used for the given reaction name (or class if the name cannot be extracted for you, 
    of course, the class is quite general therefore you can be more general for your answer too). 

    The solvent you must find is to do with the following reaction name/type: {rxn_name}

    2. Constraints and examples
    The solvent you propose must be part of this list: {KNOWN_SOLVENTS}

    If you cannot find two solvents, one will do. 
    You must output at least one solvent and everything you output must be in the list 
    and in smiles format!
    You MUST ONLY output the smiles of the solvents in the following format: 
    "solventsmiles_1, solventsmiles_2, solventsmiles_3"

    This is an example output for you to visualise with the SMILES: "CN(C)C=O, ClCCl, CS(C)=O"

    3. Problems
    If you are given a reaction name or type which you do now know how to answer, you MUST simply reply with "nan"
    """

    response = client.models.generate_content(model=model, contents=[prompt])

    # response.text can be None (no text part in the reply); calling .strip() on it would
    # raise AttributeError. Map "no usable text" to the same "nan" sentinel the prompt uses.
    answer = (response.text or "").strip()
    return answer or "nan"

# Example usage
# NOTE: inside a notebook kernel __name__ is "__main__", so this guard is true and the
# request to the Gemini API is sent every time the cell is run.
if __name__ == "__main__":
    reaction_name = "Hydrolysis or Hydrogenolysis of Carboxylic Esters or Thioesters"
    result = get_solvents_for_reaction(reaction_name)
    # The answer is a plain string (comma-separated SMILES or "nan"), so print is fine here.
    print(result)