# Loading of dataset

In [None]:

import pandas as pd

# Measurements per modifier; only the merge key and the two numeric columns are kept.
df1 = pd.read_csv("/home/lnh/GPT_GE/agent_ml/agent/data/data.csv").loc[:, ['SMILES', 'Fe_loading', 'yield']]
# Table that supplies the human-readable name for each SMILES.
df2 = pd.read_csv("/home/lnh/GPT_GE/agent_ml/agent/data/name.csv")
# Default (inner) join on SMILES, name table on the left so its columns come first.
df = pd.merge(df2, df1, on='SMILES')
df

Unnamed: 0,SMILES,name,Fe_loading,yield
0,C(CC(=O)O)[C@@H](C(=O)O)N,(2S)-2-aminopentanedioic acid,0.83,0.711252
1,C1=CC(=CC=C1CBr)C(=O)O,4-(bromomethyl)benzoic acid,0.3,0.69679
2,C([C@@H](C(=O)O)N)C(=O)O,(2S)-2-aminobutanedioic acid,0.41,0.660562
3,C(CCN)CC(=O)O,5-aminopentanoic acid,0.3,0.507143
4,C1=CC(=CC=C1C(=O)O)N,4-aminobenzoic acid,0.22,0.680836
5,CCCCCCCC(=O)O,octanoic acid,0.69,0.687228
6,C(C(=O)O)NC(CO)(CO)CO,"2-[[1,3-dihydroxy-2-(hydroxymethyl)propan-2-yl...",0.66,0.456501
7,C1=CC(=CC=C1CCCCC(=O)O)O,5-(4-hydroxyphenyl)pentanoic acid,0.54,0.519607
8,C1=CNC(=O)C=C1C(=O)O,2-oxo-1H-pyridine-4-carboxylic acid,0.65,0.659786
9,C([C@H]([C@H]([C@@H]([C@H](C(=O)O)O)O)O)O)O,"(2R,3S,4R,5R)-2,3,4,5,6-pentahydroxyhexanoic acid",0.88,0.656841


In [2]:
target = 'yield'

# Median split: rows strictly above the column median are labelled 1, all others 0.
# NOTE: the column is overwritten in place, so re-running this cell thresholds
# the already-binarised 0/1 labels again.
median_value = df[target].median()
df[target] = df[target].gt(median_value).astype(int)

# Generate the predictions of Fe loading > 0.5

In [3]:
# Binarise Fe loading at a fixed threshold: strictly above 0.5 -> 1, otherwise 0.
df['Fe_loading'] = df['Fe_loading'].gt(0.5).astype(int)

In [4]:
# The binarised Fe loading is used as the yield prediction, so name the column accordingly.
df = df.rename({'Fe_loading': 'Fe_loading_pred'}, axis='columns')

# Directly-generated rules and code by various LLM methods

In [None]:
# Initialize the agent

from openai import OpenAI
import os

# The API key is read from the environment; requests go to a non-default endpoint.
client = OpenAI(
    base_url="https://api.fe8.cn/v1",
    api_key=os.environ.get('OPENAI_CLIENT_KEY'),
)
# Alternatives that were tried: "gpt-4o", 'o1-preview'
model = 'o1-preview-2024-09-12'

## Direct generation with o1 using the same prompt as the iterative pipeline

In [19]:

# Task statement for the LLM: look at where the binarised Fe loading
# (Fe_loading_pred column) matches the binarised yield and explain why.
target_illustration = 'The dataset shows different high/low (1/0) yield value and the prediction of yield value based on if the Fe loading is enough high in the reaction (Fe_loading_pred column) for different modifiers. Check the modifiers shown and focus on those if yield of the modifier is successfully predicted, give a the chemical insight of why there are yields driven by Fe loading for some modifiers, and others are not.'


# Short description of the reaction system; it forms the first section of the prompt.
reaction_background = '''
You are tasked with solving the following problem: a radical-mediated remote δ-C(sp3)–H bond functionalization reaction of aliphatic alcohols using di-tert-butyl azodicarboxylate (DBAD) as the substrate. The reaction is catalyzed by FeCl3 in the presence of tetrabutylammonium chloride (TBACl) and conducted in acetonitrile solvent under irradiation with 390 nm light-emitting diodes (LEDs).

In addition, the reaction employs Hf-TPY-MOL, a Metal Organic Layer composed of hafnium-oxygen clusters (SBU, Secondary Building Unit) coordinated with terpyridine ligands. This setup is used to capture and stabilize the Fe ion. The SBU of the MOL can be modified using a molecular modifier to affect the reactivity of the catalyst Hf-TPY-MOL(Fe).

The primary goal is to optimize and control the yield of the remote δ-C(sp3)–H bond functionalization reaction. It has been observed that the modifier loading on the catalyst (modifier/SBU), the fraction of Fe to Hf in the catalyst (Fe/Hf), and the total loading of Fe (Fe_loading) significantly impact the yield. It's assumed that modifier/SBU, Fe/Hf, and yield These parameters are influenced by different types of molecular modifiers.'''

# Assemble the single user message: background, task statement, then the data.
# The table is rendered with DataFrame.to_string() instead of the implicit
# str(df): the latter obeys pandas display options, so long SMILES/names are
# cut to "..." and rows of larger frames are elided, hiding data from the model.
prompt = f'''
</Reaction background START>
{reaction_background}
</Reaction background END>

</Target illustration START>
{target_illustration}
</Target illustration END>

</dataset START>
{df.to_string()}
</dataset END>
'''


# Single-turn request with explicitly pinned sampling settings.
request_options = {
    "model": model,
    "messages": [{"role": "user", "content": prompt}],
    "temperature": 1,
    "top_p": 1,
    "n": 1,
    "seed": 42,
}
chat_completion = client.chat.completions.create(**request_options)

# Only one choice is requested (n=1); keep its text without surrounding whitespace.
first_choice = chat_completion.choices[0]
output_message = first_choice.message.content.strip()

print(output_message)

The variation in yield dependency on Fe loading among different modifiers can be attributed to the chemical properties and functional groups present in the modifiers, which influence their interaction with the Fe catalyst and the reaction mechanism.

**Modifiers Whose Yields Are Driven by Fe Loading:**

For modifiers where the yield correlates directly with Fe loading—meaning high Fe loading results in high yield and low Fe loading leads to low yield—the reaction's success depends on the availability of catalytically active Fe centers within the Hf-TPY-MOL framework. These modifiers lack functional groups that can compensate for low Fe loading or enhance the catalytic activity independently. Therefore, sufficient Fe incorporation into the catalyst is essential for these reactions to proceed efficiently.

Examples include:

- **Octanoic acid** (Index 5): A simple aliphatic carboxylic acid without additional coordinating groups.
- **Pyridine-3-carboxylic acid** (Index 22): Although it co

In [None]:
# rules from simple reaction background
import pkg_resources

# Natural-language rules that the model is asked to translate into RDKit feature code.
rules = '''
- **Modifiers without strong Fe-coordinating groups** rely heavily on Fe loading for catalytic activity; thus, their yields are driven by Fe loading.
- **Modifiers with Fe-coordinating functional groups** (e.g., amino, methoxy, indole rings) can maintain or enhance catalytic activity even at low Fe loading by forming active Fe complexes; their yields are not driven by Fe loading.
- **Modifiers that form overly stable or inactive complexes with Fe** (e.g., sulfur-containing groups) may inhibit catalysis despite high Fe loading, resulting in low yields.'''

# Instructions for the code-generating model, including the 0 / 1 / -1 encoding
# expected in each cell of the feature matrix.
system_prompt = '''You are a coding assistant with expertise in RDkit. Your task is to generate Python code that takes a list of SMILES strings as input. The code should follow the provided natural language rules to convert these SMILES strings into a feature matrix using RDkit. The output matrix should be a DataFrame where each column corresponds to one rule, and each row corresponds to one SMILES string from the list. There should be number_of_SMILES rows and number_of_rules columns.
Generate a feature matrix with the following criteria:
- A value of 0 if the structural description of the rule does not match the SMILES.
- A value of 1 if the structural description of the rule matches the SMILES and predicts a high target value.
- A value of -1 if the structural description of the rule matches the SMILES and predicts a low target value.
'''
def _read_agent_data(filename):
    """Return the full text of a resource file located in the ``agent.data`` package."""
    with open(pkg_resources.resource_filename('agent.data', filename), 'r') as f:
        return f.read()

smarts_intro = _read_agent_data('loffi.txt')
MACCS_egs = _read_agent_data('MACCS_examples.txt')  # read but not interpolated into the prompt below
current_rules = rules
group_egs = _read_agent_data('group_examples.txt')
code_example = _read_agent_data('rule_code_eg.txt')
        
# Code-generation request: SMARTS reference material, the rules to encode,
# the required JSON answer layout, and implementation notes.
user_prompt = f'''
!!Examples for SMARTS!!
{smarts_intro}
{group_egs}
-----------------------------------------------------------------------------------------------

!!Current Rules!!
{current_rules}
------------------------------------------------------------------------------------------------

Please generate Python code that follows these rules. 
Your code should be structured in the following format:

{{
    "prefix": "<Description of the problem and approach>",
    "imports": "<Code block containing import statements>",
    "code": "<Code block not including import statements>"
}}

Example for "code":
{{
"prefix": "This code converts a list of SMILES strings into a feature matrix using RDkit.",
"imports": "import pandas as pd\\nfrom rdkit import Chem\\nfrom rdkit.Chem import AllChem",
"code": {code_example}
\\n 
}}



Note:
Name the function as rule2matrix, Define the function without any example to run that function.
Using SMARTS for better substructure search.
Consider appropriate logic (and, or, not/exclude) of SMARTS patterns to describe a rule.
Handle possible error: when there is any error for one rule apply to one SMILES, return 0 instead.
'''
# Open question: should the example contain
# all(mol.HasSubstructMatch(Chem.MolFromSmarts(r)) for r in rule)?
# Instructions and request are sent together as one user message.
prompt = '\n----------------------------------------\n'.join((system_prompt, user_prompt))


# Single-turn request with explicitly pinned sampling settings.
request_options = {
    "model": model,
    "messages": [{"role": "user", "content": prompt}],
    "temperature": 1,
    "top_p": 1,
    "n": 1,
    "seed": 42,
}
chat_completion = client.chat.completions.create(**request_options)
# Only one choice is requested (n=1); keep its text without surrounding whitespace.
first_choice = chat_completion.choices[0]
output_message = first_choice.message.content.strip()
print(output_message)


```json
{
    "prefix": "This code converts a list of SMILES strings into a feature matrix using RDKit. Each rule corresponds to a column, and each rule is defined based on the presence of certain functional groups within the molecules. The function 'rule2matrix' processes each SMILES string and applies the rules to generate the feature matrix.",
    "imports": "import pandas as pd\nfrom rdkit import Chem",
    "code": "def rule2matrix(smiles_list):\n    # Define SMARTS patterns for the rules\n    \n    # Rule 1 patterns: Fe-coordinating functional groups (amino, methoxy, indole rings)\n    primary_amine = '[NX3;H2][#6]'  # Primary amine\n    secondary_amine = '[NX3;H1][#6][#6]'  # Secondary amine\n    methoxy_group = '[OX2][CH3]'  # Methoxy group\n    indole_ring = 'c1c[nH]c2c1cccc2'  # Indole ring\n\n    # Rule 2 patterns: Sulfur-containing groups (thiol, thioether)\n    thiol_group = '[SX2H]'  # Thiol group\n    thioether_group = '[#6][SX2][#6]'  # Thioether group\n\n    # Define th

In [4]:
import os
import time

import json
import numpy as np
import pandas as pd
import pkg_resources
from pydantic import BaseModel, Field

from agent.state import AgentState,BaseMessage
from agent.client import client
from agent.json_paser import parse_LLM_json

def parse_llm_json(output_message):
    """
    Decode a JSON document produced by the LLM.

    Parameters
    ----------
    output_message : str, bytes or bytearray
        The JSON text to decode.

    Returns
    -------
    The decoded Python object, exactly as returned by ``json.loads``.

    Raises
    ------
    ValueError
        If the input is not valid JSON, or is not string-like at all
        (e.g. ``None``). The underlying error is chained as ``__cause__``.
    """
    try:
        return json.loads(output_message)
    except (json.JSONDecodeError, TypeError) as e:
        # TypeError covers non-string input, so callers only need to handle ValueError.
        raise ValueError(f"Failed to parse JSON: {e}") from e
    
# Pydantic model with the three string fields of the JSON answer requested from
# the LLM ("prefix", "imports", "code"); it is instantiated below via code(**code_dict).
class code(BaseModel):
    """Schema for code solutions for rule2matrix"""
    # Free-text explanation of the approach.
    prefix: str = Field(description="Description of the problem and approach")
    # Import statements, kept separate from the function body.
    imports: str = Field(description="Code block import statements")
    # Source of the generated function, without its imports.
    code: str = Field(description="Code block not including import statements")
    
# parse_LLM_json presumably extracts the JSON text from the raw reply (TODO confirm);
# the result is then decoded, validated against the schema, and the generated source shown.
json_text = parse_LLM_json(output_message)
code_dict = parse_llm_json(json_text)
code_solution = code(**code_dict)
print(code_solution.code)

def rule2matrix(smiles_list):
    # Define SMARTS patterns for the rules
    
    # Rule 1 patterns: Fe-coordinating functional groups (amino, methoxy, indole rings)
    primary_amine = '[NX3;H2][#6]'  # Primary amine
    secondary_amine = '[NX3;H1][#6][#6]'  # Secondary amine
    methoxy_group = '[OX2][CH3]'  # Methoxy group
    indole_ring = 'c1c[nH]c2c1cccc2'  # Indole ring

    # Rule 2 patterns: Sulfur-containing groups (thiol, thioether)
    thiol_group = '[SX2H]'  # Thiol group
    thioether_group = '[#6][SX2][#6]'  # Thioether group

    # Define the rules
    rules = [
        {
            'number': 1,
            'description': 'High Yield (1): Modifiers containing Fe-coordinating functional groups like amino groups, methoxy groups, or indole rings.',
            'patterns': [primary_amine, secondary_amine, methoxy_group, indole_ring],
            'prediction': 1
        },
        {
            'number': 2,
            'description': 'Low Yield (-1): Modifiers containing su

## Direct generation with o1 using a modified, more detailed reaction background

In [None]:

# Extended reaction background: on top of the reaction conditions it describes the
# Hf-TPY-MOL structure, known mechanistic observations, and the meaning of the target.
reaction_background = '''
In the pursuit of efficient methodologies for the selective functionalization of aliphatic alcohols, the remote δ-C(sp³)–H bond activation has emerged as a promising strategy. Our study focuses on a radical-mediated functionalization process where di-tert-butyl azodicarboxylate (DBAD) is utilized as a substrate for δ-amino alcohol synthesis, catalyzed by Fe(OTf)₃ under mild conditions. The reaction occurs in the presence of tetrabutylammonium chloride (TBACl) in acetonitrile, activated by 390 nm LEDs.

Crucial to this approach is the use of a Metal-Organic Layer (Hf-TPY-MOL), composed of hafnium-oxygen clusters and terpyridine ligands, which stabilizes and enhances the activity of the iron catalyst. By modifying the secondary building unit (SBU) of Hf-TPY-MOL with various carboxylates, the reaction’s yield and selectivity can be tuned. Our work investigates this system, using 1-pentanol and DBAD, to develop a selective pathway to δ-amino alcohols through radical-based transformations.

**Understanding the Core Structure of Hf-TPY-MOL**

- **Hf-TPY-MOL** is a Metal-Organic Layer (MOL) based on hafnium-oxygen clusters (SBUs, secondary building units) and terpyridine (TPY) ligands.
- **Hf to Oxygen Bonds:**: Hf-O (μ3-O, μ3-OH): ~2.268 Å (can vary slightly depending on coordination environment)
- **Size on Hf cluster**
- Given that the two carboxylates coordinate on opposite sides of the Hf-O cluster, the distance between the two carboxylate carbon atoms is ~8.541 Å
- **Size of TPY ligand**
-Considering an axis passing through one carboxylate and the centers of two aromatic rings and the nitrogen atom, the distance from the carboxylate carbon to the nitrogen atom is ~8.660 Å
- The **SBU (Hf₆(μ₃-O)₄(μ₃-OH)₄)** is the core unit, which contains six hafnium atoms connected through oxo (O²⁻) and hydroxo (OH⁻) bridges. This creates a robust, highly stable 3D architecture.
- The **TPY ligand**, specifically **4'-(4-benzoate)[2,2';6',2''-terpyridine]-5,5''-dicarboxylate**, has carboxylic groups that coordinate with the Hf-O cluster, and the three nitrogen atoms in the TPY introduce sites where the Fe catalyst can be anchored.
- The Hf-O cluster has an octahedral geometry, where each face contains an oxygen atom and each edge is coordinated by a carboxylate group. Modifying the Hf-TPY-MOL involves replacing some of the original carboxylate groups with other molecules, thereby functionalizing the material.

**Existing Knowledge**

- The reaction takes place in acetonitrile as the solvent.
- Catalyst: Fe³⁺ loaded on Hf-TPY-MOL acts as a heterogeneous catalyst.
- The δ-selectivity vanishes if there is no –OH group in the reactant molecule, indicating that the –OH group first changes into an –O· radical, and a six-membered ring transition state occurs during the hydrogen atom transfer (HAT) process to generate the δ-carbon radical.
- If there is no TBACl, the reaction yield is zero, indicating the significant impact of chloride ions on the electron transition.
- **Mechanism**: After the ligand-to-metal charge transfer (LMCT) reduces Fe³⁺ to Fe²⁺, a carbon radical is generated. This carbon radical adds to the DBAD to generate a nitrogen radical, which undergoes a single-electron transfer (SET) process (oxidizing Fe²⁺ back to Fe³⁺) to produce the product and complete the catalytic cycle.

**Understanding Target Value**

The primary goal is to optimize and control the yield of the remote δ-C(sp3)–H bond functionalization reaction. It has been observed that the modifier loading on the catalyst (modifier/SBU), the fraction of Fe to Hf in the catalyst (Fe/Hf), and the total loading of Fe (Fe_loading) significantly impact the yield. It's assumed that modifier/SBU, Fe/Hf, and yield These parameters are influenced by different types of molecular modifiers.

---
'''
# Task statement for the LLM: look at where the binarised Fe loading
# (Fe_loading_pred column) matches the binarised yield and explain why.
target_illustration = 'The dataset shows different high/low (1/0) yield value and the prediction of yield value based on if the Fe loading is enough high in the reaction (Fe_loading_pred column) for different modifiers. Check the modifiers shown and focus on those if yield of the modifier is successfully predicted, give a the chemical insight of why there are yields driven by Fe loading for some modifiers, and others are not.'


# Assemble the single user message: background, task statement, then the data.
# The table is rendered with DataFrame.to_string() instead of the implicit
# str(df): the latter obeys pandas display options, so long SMILES/names are
# cut to "..." and rows of larger frames are elided, hiding data from the model.
prompt = f'''
</Reaction background START>
{reaction_background}
</Reaction background END>

</Target illustration START>
{target_illustration}
</Target illustration END>

</dataset START>
{df.to_string()}
</dataset END>
'''


# Single-turn request with explicitly pinned sampling settings.
request_options = {
    "model": model,
    "messages": [{"role": "user", "content": prompt}],
    "temperature": 1,
    "top_p": 1,
    "n": 1,
    "seed": 42,
}
chat_completion = client.chat.completions.create(**request_options)

# Only one choice is requested (n=1); keep its text without surrounding whitespace.
first_choice = chat_completion.choices[0]
output_message = first_choice.message.content.strip()

print(output_message)

The reaction yields for this radical-mediated δ-C(sp³)–H bond functionalization are influenced by multiple factors, including the Fe loading on the Hf-TPY-MOL catalyst and the nature of the molecular modifiers used to functionalize the SBU (secondary building unit) of the MOL. From the dataset provided, we observe that for some modifiers, the yield correlates directly with the Fe loading (as indicated by `Fe_loading_pred`), while for others, the yield does not follow this trend.

**Modifiers Where Yield Is Driven by Fe Loading (Fe_loading_pred == Yield):**

In cases where the yield is successfully predicted by the Fe loading (i.e., `Fe_loading_pred` matches the `yield`), the modifiers typically do not possess functional groups that interfere with the Fe catalyst or the reaction mechanism. These modifiers allow for optimal Fe coordination within the MOL, enabling efficient catalytic activity. Examples from the dataset include:

- **Octanoic Acid (Index 5):** A simple aliphatic carboxyli

In [None]:
# rules from detail reaction background

import pkg_resources

# Natural-language rules that the model is asked to translate into RDKit feature code.
# NOTE(review): this text is identical to the rules used in the earlier
# "simple reaction background" cell — confirm it reflects the detailed-background answer.
rules = '''
- **Modifiers without strong Fe-coordinating groups** rely heavily on Fe loading for catalytic activity; thus, their yields are driven by Fe loading.
- **Modifiers with Fe-coordinating functional groups** (e.g., amino, methoxy, indole rings) can maintain or enhance catalytic activity even at low Fe loading by forming active Fe complexes; their yields are not driven by Fe loading.
- **Modifiers that form overly stable or inactive complexes with Fe** (e.g., sulfur-containing groups) may inhibit catalysis despite high Fe loading, resulting in low yields.'''

# Instructions for the code-generating model, including the 0 / 1 / -1 encoding
# expected in each cell of the feature matrix.
system_prompt = '''You are a coding assistant with expertise in RDkit. Your task is to generate Python code that takes a list of SMILES strings as input. The code should follow the provided natural language rules to convert these SMILES strings into a feature matrix using RDkit. The output matrix should be a DataFrame where each column corresponds to one rule, and each row corresponds to one SMILES string from the list. There should be number_of_SMILES rows and number_of_rules columns.
Generate a feature matrix with the following criteria:
- A value of 0 if the structural description of the rule does not match the SMILES.
- A value of 1 if the structural description of the rule matches the SMILES and predicts a high target value.
- A value of -1 if the structural description of the rule matches the SMILES and predicts a low target value.
'''
# Read the reference texts from the agent.data package, in the original order.
_resource_texts = {}
for _resource_name in ('loffi.txt', 'MACCS_examples.txt', 'group_examples.txt', 'rule_code_eg.txt'):
    with open(pkg_resources.resource_filename('agent.data', _resource_name), 'r') as f:
        _resource_texts[_resource_name] = f.read()

smarts_intro = _resource_texts['loffi.txt']
MACCS_egs = _resource_texts['MACCS_examples.txt']  # read but not interpolated into the prompt below
current_rules = rules
group_egs = _resource_texts['group_examples.txt']
code_example = _resource_texts['rule_code_eg.txt']
        
# Code-generation request: SMARTS reference material, the rules to encode,
# the required JSON answer layout, and implementation notes.
user_prompt = f'''
!!Examples for SMARTS!!
{smarts_intro}
{group_egs}
-----------------------------------------------------------------------------------------------

!!Current Rules!!
{current_rules}
------------------------------------------------------------------------------------------------

Please generate Python code that follows these rules. 
Your code should be structured in the following format:

{{
    "prefix": "<Description of the problem and approach>",
    "imports": "<Code block containing import statements>",
    "code": "<Code block not including import statements>"
}}

Example for "code":
{{
"prefix": "This code converts a list of SMILES strings into a feature matrix using RDkit.",
"imports": "import pandas as pd\\nfrom rdkit import Chem\\nfrom rdkit.Chem import AllChem",
"code": {code_example}
\\n 
}}



Note:
Name the function as rule2matrix, Define the function without any example to run that function.
Using SMARTS for better substructure search.
Consider appropriate logic (and, or, not/exclude) of SMARTS patterns to describe a rule.
Handle possible error: when there is any error for one rule apply to one SMILES, return 0 instead.
'''
# Open question: should the example contain
# all(mol.HasSubstructMatch(Chem.MolFromSmarts(r)) for r in rule)?
# Instructions and request are sent together as one user message.
prompt = '\n----------------------------------------\n'.join((system_prompt, user_prompt))


# Single-turn request with explicitly pinned sampling settings.
request_options = {
    "model": model,
    "messages": [{"role": "user", "content": prompt}],
    "temperature": 1,
    "top_p": 1,
    "n": 1,
    "seed": 42,
}
chat_completion = client.chat.completions.create(**request_options)
# Only one choice is requested (n=1); keep its text without surrounding whitespace.
first_choice = chat_completion.choices[0]
output_message = first_choice.message.content.strip()
print(output_message)


```json
{
    "prefix": "This code converts a list of SMILES strings into a feature matrix using RDKit by applying specific structural rules. Each rule corresponds to the presence or absence of certain functional groups that influence catalytic activity due to their impact on Fe coordination.",
    "imports": "import pandas as pd\nfrom rdkit import Chem\nfrom rdkit.Chem import AllChem",
    "code": "def rule2matrix(smiles_list):\n    # Define SMARTS patterns for functional groups\n    \n    # Fe-coordinating groups\n    primary_amine = '[NX3;H2]'  # Primary amine (-NH2)\n    secondary_amine = '[NX3;H1][#6]'  # Secondary amine (-NHR)\n    methoxy_group = '[OX2][CH3]'  # Methoxy group (-OCH3)\n    indole_ring = '[nH]1c2ccccc2cc1'  # Indole ring\n    pyridine_ring = 'n1ccccc1'  # Pyridine ring\n    imidazole_ring = 'c1cnc[nH]1'  # Imidazole ring\n    \n    # Sulfur-containing groups\n    thiol_group = '[SX2H]'  # Thiol group (-SH)\n    thioether_group = '[SX2]([#6])[#6]'  # Thioether grou

In [10]:
import os
import time

import json
import numpy as np
import pandas as pd
import pkg_resources
from pydantic import BaseModel, Field

from agent.state import AgentState,BaseMessage
from agent.client import client
from agent.json_paser import parse_LLM_json

def parse_llm_json(output_message):
    """
    Decode a JSON document produced by the LLM.

    Parameters
    ----------
    output_message : str, bytes or bytearray
        The JSON text to decode.

    Returns
    -------
    The decoded Python object, exactly as returned by ``json.loads``.

    Raises
    ------
    ValueError
        If the input is not valid JSON, or is not string-like at all
        (e.g. ``None``). The underlying error is chained as ``__cause__``.
    """
    try:
        return json.loads(output_message)
    except (json.JSONDecodeError, TypeError) as e:
        # TypeError covers non-string input, so callers only need to handle ValueError.
        raise ValueError(f"Failed to parse JSON: {e}") from e
    
# Pydantic model with the three string fields of the JSON answer requested from
# the LLM ("prefix", "imports", "code"); it is instantiated below via code(**code_dict).
class code(BaseModel):
    """Schema for code solutions for rule2matrix"""
    # Free-text explanation of the approach.
    prefix: str = Field(description="Description of the problem and approach")
    # Import statements, kept separate from the function body.
    imports: str = Field(description="Code block import statements")
    # Source of the generated function, without its imports.
    code: str = Field(description="Code block not including import statements")
    
# parse_LLM_json presumably extracts the JSON text from the raw reply (TODO confirm);
# the result is then decoded, validated against the schema, and the generated source shown.
json_text = parse_LLM_json(output_message)
code_dict = parse_llm_json(json_text)
code_solution = code(**code_dict)
print(code_solution.code)

def rule2matrix(smiles_list):
    # Define SMARTS patterns for functional groups
    
    # Fe-coordinating groups
    primary_amine = '[NX3;H2]'  # Primary amine (-NH2)
    secondary_amine = '[NX3;H1][#6]'  # Secondary amine (-NHR)
    methoxy_group = '[OX2][CH3]'  # Methoxy group (-OCH3)
    indole_ring = '[nH]1c2ccccc2cc1'  # Indole ring
    pyridine_ring = 'n1ccccc1'  # Pyridine ring
    imidazole_ring = 'c1cnc[nH]1'  # Imidazole ring
    
    # Sulfur-containing groups
    thiol_group = '[SX2H]'  # Thiol group (-SH)
    thioether_group = '[SX2]([#6])[#6]'  # Thioether group (-S-)
    
    # Define the rules with their associated patterns and predictions
    rules = [
        {
            'number': 1,
            'description': 'High target value (+1): Modifiers containing primary or secondary amine groups.',
            'patterns': [
                [primary_amine, secondary_amine]  # Primary or secondary amine
            ],
            'prediction': 1
        },
        {
     

## Direct o1 generation with Fe loading information and a summary of the paper taken into account

The results of direct o1 generation, given the Fe loading data and information from my recently completed paper, have been preliminarily analyzed, leading to some initial insights.

In [None]:
#prompt for pre-summarization given to LLMs:
pre_prompt= '''
This is a data set of modifier in modulating the catalytic acitivity. we explored modifying metal-organic layer (MOL) surfaces to enhance their catalytic activity in C–H bond activation of long-chain alcohols22, as shown in Figure 1. The MOLs were constructed from Hf6(μ3-O)4(μ3-OH)4(HCO2)6 clusters as the secondary building units (SBUs) and terpyridine-tricarboxylate (TPY) as ligands. Fe centers were anchored on the TPY ligands23, which catalyze sp³ C–H bond activation through photon-driven ligand-to-metal charge transfer (LMCT) process22. Using a dataset of carboxylic acids as molecular modifiers on the MOL surface, we systematically examined how the structure of these modifiers influences catalytic performance (Figure 1a-b). 
Hf-TPY-MOL, Hf6(μ3-O)4(μ3-OH)4(HCO2)6(TPY)2, was employed as a support for Fe3+ ions to study the heterogeneous photocatalytic δ-C(sp3)-H amination of 1-pentanol22. The Hf-TPY-MOL was synthesized following a previously published procedure23, and its structure and morphology were confirmed using powder X-ray diffraction (PXRD) and transmission electron microscopy (TEM). To load Fe3+, Hf-TPY-MOL was dispersed in acetonitrile (CH3CN) containing 1.05 equivalents of Fe(OTf)3 relative to TPY ligands and stirred at room temperature for 24 hours. The resulting Fe3+-loaded Hf-TPY-MOL (Hf-TPY-MOL-Fe) exhibited a blue-violet color and was washed extensively with CH3CN to remove uncoordinated Fe3+, as confirmed by UV-Vis spectroscopy. Inductively coupled plasma optical emission spectrometer (ICP-OES) indicated Fe3+ 102% metalation of the TPY ligands with the determined amount of Fe corresponding to 102% of the TPY sites.
The catalytic activity of Hf-TPY-MOL-Fe was tested in the photocatalytic amination of 1-pentanol with di-tert-butyl azodicarboxylate (DBAD). A mixture of Hf-TPY-MOL-Fe (2 mg), DBAD (1 equivalent), 1-pentanol (3 equivalents), and tetrabutylammonium chloride (TBACl, 1.25 mol%) was irradiated with 393 nm LED light under a nitrogen atmosphere at room temperature for 12 hours. Reaction yields were quantified using proton nuclear magnetic resonance (¹H-NMR) with CH₂Br₂ as an internal standard, yielding a moderate product conversion of 9% relative to DBAD, corresponding to a turnover number (TON) of 41 on Fe.
To confirm the heterogeneous nature of the catalysis and rule out the leaching of Fe³⁺ ions, UV-Vis absorption spectroscopy of the reaction supernatant revealed no detectable Fe³⁺ signals. Cross-experiments were conducted using 1-pentanol as the substrate, followed by the addition of n-hexane to the supernatant after isolating the solid catalyst. Significant conversion of 1-pentanol, but no reaction with n-hexane, confirmed that the supernatant lacked catalytic activity. In comparison, the Hf-TPY-MOL-Fe has high activity (TON of 233, yield of 35% after irradiation of 12 h) for converting n-hexane to a mixture of 1-substitution (di-tert-butyl 1-hexylhydrazine-1,2-dicarboxylate), 2-substitution, and 3-substitution (2.0: 1.9: 1.0). Furthermore, ICP-OES analysis verified that the Fe/Hf ratio remained unchanged before and after the reaction, confirming that Fe³⁺ ions remained bound to the MOL throughout the reaction.
The Hf6(μ3-O)4(μ3-OH)4 SBU in the Hf-TPY-MOL contains six formate-capped sites that can be replaced by other carboxylates, enabling post-synthetic modification of the MOLs. This straightforward replacement chemistry allows systematic tuning of the surface properties (Figure 2a). Surface modification not only affects the interaction between the terpyridine-Fe3+-Cl active site and the substrate but also alters the surface solvent structure, thereby influencing catalytic activity. To investigate these effects, we systematically studied how different molecular modifiers impact the catalytic performance of Fe3+-loaded MOLs.
For each modifier, Hf-TPY-MOL was incubated with a 0.2 M solution of a carboxylic acid modifier in CH3CN, H2O, DMF, or THF, chosen based on solubility, at 55 oC for 24 h. Structural integrity after modification was confirmed by PXRD and TEM (Figure 1b). The amount of substituted modifier was quantified by proton nuclear magnetic resonance (1H-NMR) after digesting the MOL with K3PO4/D2O, expressed as the Modifier/SBU ratio. Subsequently, Fe3+ was loaded onto the modified Hf-TPY-MOL by reacting with Fe(OTf)3, and the resulting materials were tested as catalysts.
Significant variations in activity and yield were observed among different modifications (Figure 2b). Most carboxylic acid modifications enhanced activity, with several achieving a turnover number (TON) exceeding 600, indicating substantial improvement. However, certain modifiers, such as 2-(2-(2-methoxyethoxy)ethoxy)acetic acid, 4-mercaptobenzoic acid, and 4-formylbenzoic acid, inhibited the reaction, resulting in reduced catalytic activity. 
Fe3+ ions can coordinate with either the TPY ligands or the molecular modifiers on the SBUs in Hf-TPY-MOL. To differentiate the roles of Fe3+ ions at these two sites, we designed a control catalyst, Hf-BTB-MOL, using 1,3, 5-tri (4-carboxyphenyl) benzene (H3BTB) as the ligand, which lacks TPY sites for Fe coordination (Figure S2a). The synthesized Hf-BTB-MOL was characterized by PXRD and TEM (Figure S2b-d). Upon loading Fe3+ via reaction with Fe(OTf)₃ (1.05 equivalents) in CH3CN, yellow MOLs were obtained. Under standard catalytic conditions, Hf-BTB-MOL (Fe) exhibited no activity (0% yield). However, when modified with L-aspartate, the catalyst achieved a significantly higher yield of 80.4%, indicating that both Fe³⁺ ions on TPY sites and Fe3+ ions interacting with modifiers contribute to the catalytic activity.
To elucidate the reaction mechanism, free radical trapping agents were employed. The addition of radical quenchers, such as Butylated Hydroxytoluene (BHT) or 2,2,6,6-Tetramethylpiperidine-1-oxyl (TEMPO), completely inhibited the reaction, supporting the proposed radical pathway. This finding aligns with prior research22. Control experiments without Fe3+ showed no conversion, confirming that Fe3+ is essential for catalysis. Similarly, reactions conducted in the dark yielded no conversion, demonstrating the photo-driven nature of the reaction. 
Additionally, removing TBACl from the reaction system resulted in zero conversion, highlighting the critical of Cl- in forming a Fe3+-Cl bond. This bond facilitates ligand-to-metal charge transfer (LMCT), generating Cl• radicals that initiate hydrogen atom transfer (HAT) with 1-pentanol. Substrate scope experiments (Figure S3) further validated the importance of the hydroxyl groups for δ-position selectivity via [1,5] HAT pathway. 
To address missing factors, two experimental descriptors—Fe loading (defined as the amount of Fe relative to the substrate) and the modifier/SBU ratio—were introduced into the models. Notably, Fe loading was deliberately decoupled from the Fe/Hf ratio by varying the amount of Hf-TPY-MOL used in each experiment. This random variation ensured that Fe loading (relative to the substrate) was independent of any molecular descriptors, avoiding potential biases where molecular modifications could affect Fe coordination and thus distort feature selection.
Can you examine the smiles and the other data provided in the .csv file to try to figure rules governing these modifications' effect on yields?
Here is the data table:
SMILES	Fe_loading	Fe_Hf	M_SBU	Yield
C(CC(=O)O)[C@@H](C(=O)O)N	0.83	0.137970792	2	0.711252035
C1=CC(=CC=C1CBr)C(=O)O	0.3	0.268518519	0.3	0.696789832
C([C@@H](C(=O)O)N)C(=O)O	0.41	0.287852619	1.8	0.660562221
C(CCN)CC(=O)O	0.3	0.300661455	0.38	0.507142857
C1=CC(=CC=C1C(=O)O)N	0.22	0.148411992	1.2	0.680836113
CCCCCCCC(=O)O	0.69	0.137797988	1.6	0.687228122
C(C(=O)O)NC(CO)(CO)CO	0.66	0.176397954	0.3	0.456500877
C1=CC(=CC=C1CCCCC(=O)O)O	0.54	0.135777325	1.2	0.519607483
C1=CNC(=O)C=C1C(=O)O	0.65	0.176087339	3	0.659786184
C([C@H]([C@H]([C@@H]([C@H](C(=O)O)O)O)O)O)O	0.88	0.214316331	1.7	0.656840941
C1=CC(=CC=C1C(=O)O)S	0.87	0.241370987	3	0.066165255
COCCOCCOCC(=O)O	0.36	0.248570718	2	0.029961003
C1=CC(=CC=C1C2=CC=NC=C2)C(=O)O	0.43	0.226090889	0.68	0.126814958
C1=CC=C(C=C1)C[C@@H](C(=O)O)N	0.42	0.313971743	0.8	0.091231602
C(=O)(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)O	0.37	0.075797772	0.983352162	0.105724727
C1=CC=C2C(=C1)C(=CN2)C[C@H](C(=O)O)N	0.36	0.138140627	2.2	0.614409241
C1=C(C=NN1)C(=O)O	1.35	0.199322304	1.1	0.641912417
C1=CC2=C(C=C1C(=O)O)C(=O)OC2=O	0.59	0.353232073	1.6	0.631961207
C(CS)C(=O)O	0.8	0.25947068	0.65	0.688639831
C1=CC(=C(C=C1C=O)C(=O)O)O	0.38	0.111544897	0.5	0.20342965
C1=CC(=CC=C1C=O)C(=O)O	0.45	0.116808784	1.8	0
C1=CC(=CC=C1C(=O)O)[N+](=O)[O-]	0.86	0.295420975	2.3	0.645466642
C1=CC(=CN=C1)C(=O)O	0.6	0.176335743	2.2	0.585092593
CC1=CC=C(C=C1)C(=O)O	0.44	0.137608367	2	0.269489422
CSCC[C@@H](C(=O)NCC(=O)O)N	0.29	0.318877551	0.4	0.396053406
C1=CN=CC=C1CCCC(=O)O	0.59	0.256410256	2.5	0.673499028
COC1=C(C=C(C=C1)C(=O)CCC(=O)O)OC	0.44	0.249190132	1.4	0.487430174
C1=CC(=CC=C1C(=O)NCC(=O)O)O	0.2	0.173731758	1.5	0.049113312
C(CC(=O)O)C(=O)CCC(=O)O	0.17	0.167392032	0.8	0.045271477
C(CC(=O)O)CNC(=O)C(F)(F)F	0.18	0.165755014	1.9	0.084698664
C(CCCO)CCC(=O)O	0.26	0.254065041	2	0.049189139
C1=CC(=CC=C1CCC(=O)O)O	0.15	0.104898773	2.1	0.09208251
C(CCCN)CCC(=O)O	0.29	0.245278391	0.6	0.295387675
C(COCCOCCN)C(=O)O	0.26	0.248385494	0.6	0.207341836
CC(=O)NCCCC[C@@H](C(=O)O)N	0.28	0.276931598	0.8	0.273189562
C(CCCC(=O)O)CCCN	0.2	0.236910685	1	0.202356234
'''

## Direct generation conducted by o1: o1 (Direct)

In [16]:
# rules generated by o1 after looking through the paper
import pkg_resources

rules = '''
Below is a high‐level look at what stands out in the data and some plausible “rules” or trends that correlate structural features of the carboxylic‐acid modifiers with the observed yields. Obviously, with only a few dozen data points, these are qualitative guidelines rather than hard‐and‐fast conclusions.
1. Certain para‐substituted benzoic acids give large boosts—unless they can chelate Fe
Several substituted‐benzoic acids in the dataset show high yields (e.g., p‐bromo, p‐amino, p‐nitro), but others crash the yield (e.g., p‐formyl or p‐mercapto). A likely interpretation:
High‐yield substituents such as –Br, –NO₂, –NH₂ at the para‐position are either purely electron‐withdrawing (–Br, –NO₂) or electron‐donating but not strongly Fe‐chelating (para‐amino). They presumably tune the local electronics favorably without “stealing” the iron center from the terpyridine site.
Low‐yield substituents (e.g., –CHO, –SH on the ring) are prone to coordinate/chelate Fe³⁺ or otherwise disrupt the active site. A formyl group (–CHO) can interact with metal centers, and a thiol on an aromatic ring can bind Fe or form disulfides. Both appear to poison the catalyst.
Concrete examples from the table:
p‐Bromo “C1=CC(=CC=C1CBr)C(=O)O” → Yield ~0.70
p‐Amino “C1=CC(=CC=C1C(=O)O)N” → Yield ~0.68
p‐Nitro “C1=CC(=CC=C1C(=O)O)N+[O-]” → Yield ~0.65
p‐Formyl “C1=CC(=CC=C1C=O)C(=O)O” → Yield = 0.00
p‐Mercapto “C1=CC(=C(C=C1C(=O)O)S” → Yield ~0.07
Hence, simple ring substituents that don’t bind Fe directly can be good, but ring substituents that introduce strong Fe interactions (chelation, etc.) kill activity.
2. Aliphatic modifiers can be quite good—unless they have strongly coordinating groups placed badly
Straight‐chain carboxylic acids (no additional polar functional groups near the metal site) often give respectable yields. For instance:
Octanoic acid (“CCCCCCCC(=O)O”) → Yield ~0.69
3‐Mercaptopropionic acid (“C(CS)C(=O)O”) → Yield ~0.69
Interestingly, having a thiol far from the carboxyl (as in 3‐mercaptopropionic acid) works well, while a thiol directly on an aromatic ring (4‐mercaptobenzoic acid) wrecks the yield. This suggests that if a “soft” donor group (like –SH) is positioned too close to (or conjugated with) the aromatic ring, it can coordinate to Fe³⁺ and block the terpyridine site. But if it is “spaced out” on a flexible chain, it no longer poisons the iron center.
By contrast, many large “functionalized” chains with multiple polar groups or strongly coordinating moieties drastically reduce yields (e.g., the PEG‐like “COCCOCCOCC(=O)O” had yield ~0.03).
3. Amino‐acid‐type modifiers often boost yields (e.g., aspartic acid)
Several amino acids or related molecules give high or at least moderate yields. For example:
Aspartic acid (“C(CC(=O)O)C@@HN”) → Yield ~0.71 (highest)
Another aspartate‐like structure → Yield ~0.66
Other short‐chain amino acids (e.g., GABA‐type “C(CCN)CC(=O)O”) → moderate yields ~0.50
Probably these modifiers assist hydrogen bonding or orient the substrate favorably around the Fe–TPY sites, without strongly chelating Fe. However, once the amino‐acid side chain grows too large or carries additional strongly coordinating groups (amides, phenols, etc.), the yield tends to drop.
4. Substituents that directly coordinate Fe can “poison” the site
Compounds that have functional groups known to bind strongly to metal centers (e.g., a phenolate, –SH on an aromatic ring, –CHO that can form metal–imine or metal–O interactions) correlate with low yields. This is especially visible in:
4‐formylbenzoic acid (–CHO) → 0% yield
4‐mercaptobenzoic acid (aromatic –SH) → 0.07
Meanwhile, “gentle” electron donors or withdrawing groups that do not overshadow the Fe–TPY binding (like p‐NO₂, p‐Br, p‐NH₂) leave the catalytic site accessible and yield higher activity.
5. No simple “numeric” correlation with Fe‐loading or M_SBU
Looking purely at the numeric columns:
Fe_loading (moles Fe relative to substrate) varies widely (0.2–1.35) with both high‐ and low‐yield examples scattered across the range.
M_SBU (the number of modifier molecules per Hf6 SBU) likewise does not show a simple linear correlation with yield.
In other words, “just adding more Fe” or “just substituting more modifier” is not enough. The chemical nature of the modifier (i.e., can it coordinate Fe in an undesired way or not?) is the main driver.
6. Summary of “Rules of Thumb”
From this small dataset, a few broad guidelines emerge:
Avoid strong Fe‐chelating functional groups near the carboxylate (especially on aromatic rings). They disrupt or compete with the terpyridine–Fe active site.
Electron‐withdrawing or mildly electron‐donating ring substituents (e.g., nitro, bromo, amino in the para position) are often beneficial—likely because they tune polarity or local environment but do not bind Fe directly.
Simple aliphatic acids (especially medium‐chain length) and amino acids can give high yields, as long as any extra functional groups (like –SH) are spatially separated from Fe.
Overly polar or large “multidentate” functional groups (PEG‐like or ring‐attached donors) can kill the yield by introducing unwanted Fe coordination or strongly altering the MOL’s environment.
These observations are consistent with the idea that the best modifiers either:
Gently tune local polarity/electronic structure around Fe without “stealing” it from the TPY site,
Or help orient the substrate for efficient HAT and radical chemistry,
But do not provide direct coordination sites to the iron center beyond the primary TPY–Fe–Cl motif.
Hence, when choosing new modifiers, one would avoid substituents known to be strong chelators or that form strong internal hydrogen/coordinate bonds to Fe, and instead select moieties that remain mostly “innocent” while providing the right balance of polarity near the surface.
'''


system_prompt = '''You are a coding assistant with expertise in RDkit. Your task is to generate Python code that takes a list of SMILES strings as input. The code should follow the provided natural language rules to convert these SMILES strings into a feature matrix using RDkit. The output matrix should be a DataFrame where each column corresponds to one rule, and each row corresponds to one SMILES string from the list. There should be number_of_SMILES rows and number_of_rules columns.
Generate a feature matrix with the following criteria:
- A value of 0 if the structural description of the rule does not match the SMILES.
- A value of 1 if the structural description of the rule matches the SMILES and predicts a high target value.
- A value of -1 if the structural description of the rule matches the SMILES and predicts a low target value.
'''
# Load the prompt resources shipped in the `agent.data` package.
# The encoding is given explicitly so the prompt text does not depend on the
# locale-default encoding of the machine running the notebook.
with open(pkg_resources.resource_filename('agent.data', 'loffi.txt'), 'r', encoding='utf-8') as f:
    smarts_intro = f.read()
# NOTE(review): MACCS_egs is loaded here but is not interpolated into the
# user_prompt built in this cell.
with open(pkg_resources.resource_filename('agent.data', 'MACCS_examples.txt'), 'r', encoding='utf-8') as f:
    MACCS_egs = f.read()
# The rules text defined earlier in this cell is what gets embedded in the prompt.
current_rules = rules
with open(pkg_resources.resource_filename('agent.data', 'group_examples.txt'), 'r', encoding='utf-8') as f:
    group_egs = f.read()
with open(pkg_resources.resource_filename('agent.data', 'rule_code_eg.txt'), 'r', encoding='utf-8') as f:
    code_example = f.read()
        
user_prompt = f'''
!!Examples for SMARTS!!
{smarts_intro}
{group_egs}
-----------------------------------------------------------------------------------------------

!!Current Rules!!
{current_rules}
------------------------------------------------------------------------------------------------

Please generate Python code that follows these rules. 
Your code should be structured in the following format:

{{
    "prefix": "<Description of the problem and approach>",
    "imports": "<Code block containing import statements>",
    "code": "<Code block not including import statements>"
}}

Example for "code":
{{
"prefix": "This code converts a list of SMILES strings into a feature matrix using RDkit.",
"imports": "import pandas as pd\\nfrom rdkit import Chem\\nfrom rdkit.Chem import AllChem",
"code": {code_example}
\\n 
}}



Note:
Name the function as rule2matrix, Define the function without any example to run that function.
Using SMARTS for better substructure search.
Consider appropriate logic (and, or, not/exclude) of SMARTS patterns to describe a rule.
Handle possible error: when there is any error for one rule apply to one SMILES, return 0 instead.
'''
#! Should the example contain all(mol.HasSubstructMatch(Chem.MolFromSmarts(r)) for r in rule)?
# Merge the system and user instructions into a single text, separated by a dashed rule.
prompt = '\n----------------------------------------\n'.join((system_prompt, user_prompt))

# Request one completion, sending the merged prompt as a single user-role message.
chat_completion = client.chat.completions.create(
    model=model,
    messages=[dict(role="user", content=prompt)],
    n=1,
    seed=42,
    temperature=1,
    top_p=1,
)
# Keep the text of the first choice, with surrounding whitespace removed.
output_message = chat_completion.choices[0].message.content.strip()
# print(output_message)

import os
import time

import json
import numpy as np
import pandas as pd
import pkg_resources
from pydantic import BaseModel, Field

from agent.state import AgentState,BaseMessage
from agent.client import client
from agent.json_paser import parse_LLM_json

def parse_llm_json(output_message):
    """
    Decode the JSON text produced by the LLM and return the resulting object.

    No DataFrame is built here; the decoded object is returned as-is.

    Args:
        output_message: JSON document as text (anything ``json.loads`` accepts).

    Returns:
        The decoded Python object.

    Raises:
        ValueError: if ``output_message`` is not valid JSON. The underlying
            ``json.JSONDecodeError`` is attached as ``__cause__``.
    """
    try:
        return json.loads(output_message)
    except json.JSONDecodeError as e:
        # Chain explicitly so the decode error is reported as the direct cause.
        raise ValueError(f"Failed to parse JSON: {e}") from e
    
# Pydantic model describing the three string fields expected in the LLM's JSON reply.
# It is instantiated right after its definition via code(**code_dict), so the JSON keys
# must match the field names below.
# NOTE(review): the class name `code` is the same as its `code` field; the field is
# accessed as `code_solution.code` on an instance.
class code(BaseModel):
    """Schema for code solutions for rule2matrix"""
    # Free-text description of the problem and the approach taken.
    prefix: str = Field(description="Description of the problem and approach")
    # Import statements, kept separate from the main code body.
    imports: str = Field(description="Code block import statements")
    # The code body itself, without the import statements.
    code: str = Field(description="Code block not including import statements")
    
# Two-stage parse: parse_LLM_json (imported from agent.json_paser) pre-processes the raw
# reply (presumably extracting the JSON text - TODO confirm), then parse_llm_json decodes it.
code_dict = parse_llm_json(parse_LLM_json(output_message))
# Build the schema object from the decoded keys (prefix / imports / code).
code_solution = code(**code_dict)
# Print the `code` field of the reply for inspection.
print(code_solution.code)

def rule2matrix(smiles_list):
    # Import necessary modules
    import pandas as pd
    from rdkit import Chem
    from rdkit.Chem import AllChem

    # Define SMARTS patterns for functional groups

    # Carboxylic acid group (-COOH)
    carboxylic_acid = '[CX3](=O)[OX1H0-,OX2H1]'

    # Benzoic acid group
    benzoic_acid = 'c1ccccc1C(=O)[O;H1,-]'

    # Halogen attached to aromatic ring
    halogen_on_aromatic = '[c][F,Cl,Br,I]'

    # Nitro group attached to aromatic ring
    nitro_on_aromatic = '[c][N+](=O)[O-]'

    # Amino group attached to aromatic ring
    amino_on_aromatic = '[c][NH2]'

    # Aldehyde group attached to aromatic ring
    aldehyde_on_aromatic = '[c][CH=O]'

    # Thiol group attached to aromatic ring
    thiol_on_aromatic = '[c][SH]'

    # Pattern for aliphatic chain of five carbons
    aliphatic_chain_five_carbons = '[CH2][CH2][CH2][CH2][CH3]'

    # Thiol group in aliphatic chain (e.g., 3-mercaptopropionic acid)
    aliphatic_thiol = '[#6][#6][SX2H]'

    #

## Direct generation conducted by o3-mini-high: o3-mini (Direct)

In [None]:
# rules generated by o3-mini-high after looking through the paper
import pkg_resources

rules = '''
Below is a concise summary of the trends we can extract from the data:
Key Role of Modifier Structure:
– Problematic Functional Groups: Modifiers bearing groups known to bind strongly to Fe (or otherwise interfere with the LMCT process) give very low yields. For example:
 • 4‐Mercaptobenzoic acid (SMILES with “S”, row 11) gives yield ≈ 0.07, and
 • 4‐Formylbenzoic acid (row 21) gives 0% yield.
– Polyether‐Type Modifiers: A modifier with a chain of ethoxy units (row 12) also nearly shuts down the reaction (yield ≈ 0.03), suggesting that too much hydrophilicity or conformational flexibility can be detrimental.
Beneficial Modifiers:
– Many “simple” carboxylic acids—whether aliphatic (e.g. octanoic acid, row 6) or aromatic with modest substituents (e.g. brominated or nitrobenzoic acids, rows 2 and 22)—tend to give high yields (≈ 0.65–0.71).
– Some amino acid derivatives (e.g. aspartic acid in row 1 and aminobenzoic acid in row 5) also perform well.
Surface Coverage (M_SBU) Effects:
– The modifier/SBU ratio (M_SBU) varies from about 0.3 to 3. In some cases a high substitution (e.g. M_SBU ≈ 3 in row 9) is compatible with high yield, but when combined with a problematic functional group (as in row 11), high coverage correlates with inhibition.
– Thus, an optimal degree of modification appears to depend on the chemical nature of the modifier: too little may not influence the surface enough, but too much of an “active” (i.e. interfering) modifier can hurt performance.
Fe‐Loading and Fe/Hf Ratios:
– No simple linear correlation is seen between yield and either Fe_loading or Fe_Hf. The variation in these parameters (which were deliberately decoupled from the modifier chemistry) suggests that while the Fe metrics are important for catalysis, the chemical “identity” of the modifier is the dominant factor.
Overall Rule-of-Thumb:
Modifiers that are “innocent” – that is, those that do not introduce strongly coordinating or disruptive groups (like –SH or –CHO) and that provide moderate surface coverage – tend to enhance or at least maintain high catalytic yields. In contrast, modifiers with groups that can either bind too strongly to Fe or overly alter the local solvent/coordination environment tend to inhibit the reaction.
These qualitative trends suggest that when selecting or designing new modifiers, one should avoid functionalities (e.g. thiol or aldehyde groups, or overly flexible polyether chains) that may interfere with the critical Fe³⁺–Cl LMCT step, and instead favor simple carboxylic acids or derivatives that offer the right balance of hydrophobicity/hydrophilicity without strong competing coordination.
'''


system_prompt = '''You are a coding assistant with expertise in RDkit. Your task is to generate Python code that takes a list of SMILES strings as input. The code should follow the provided natural language rules to convert these SMILES strings into a feature matrix using RDkit. The output matrix should be a DataFrame where each column corresponds to one rule, and each row corresponds to one SMILES string from the list. There should be number_of_SMILES rows and number_of_rules columns.
Generate a feature matrix with the following criteria:
- A value of 0 if the structural description of the rule does not match the SMILES.
- A value of 1 if the structural description of the rule matches the SMILES and predicts a high target value.
- A value of -1 if the structural description of the rule matches the SMILES and predicts a low target value.
'''
# Load the prompt resources shipped in the `agent.data` package.
# The encoding is given explicitly so the prompt text does not depend on the
# locale-default encoding of the machine running the notebook.
with open(pkg_resources.resource_filename('agent.data', 'loffi.txt'), 'r', encoding='utf-8') as f:
    smarts_intro = f.read()
# NOTE(review): MACCS_egs is loaded here but is not interpolated into the
# user_prompt built in this cell.
with open(pkg_resources.resource_filename('agent.data', 'MACCS_examples.txt'), 'r', encoding='utf-8') as f:
    MACCS_egs = f.read()
# The rules text defined earlier in this cell is what gets embedded in the prompt.
current_rules = rules
with open(pkg_resources.resource_filename('agent.data', 'group_examples.txt'), 'r', encoding='utf-8') as f:
    group_egs = f.read()
with open(pkg_resources.resource_filename('agent.data', 'rule_code_eg.txt'), 'r', encoding='utf-8') as f:
    code_example = f.read()
        
user_prompt = f'''
!!Examples for SMARTS!!
{smarts_intro}
{group_egs}
-----------------------------------------------------------------------------------------------

!!Current Rules!!
{current_rules}
------------------------------------------------------------------------------------------------

Please generate Python code that follows these rules. 
Your code should be structured in the following format:

{{
    "prefix": "<Description of the problem and approach>",
    "imports": "<Code block containing import statements>",
    "code": "<Code block not including import statements>"
}}

Example for "code":
{{
"prefix": "This code converts a list of SMILES strings into a feature matrix using RDkit.",
"imports": "import pandas as pd\\nfrom rdkit import Chem\\nfrom rdkit.Chem import AllChem",
"code": {code_example}
\\n 
}}



Note:
Name the function as rule2matrix, Define the function without any example to run that function.
Using SMARTS for better substructure search.
Consider appropriate logic (and, or, not/exclude) of SMARTS patterns to describe a rule.
Handle possible error: when there is any error for one rule apply to one SMILES, return 0 instead.
'''
#! Should the example contain all(mol.HasSubstructMatch(Chem.MolFromSmarts(r)) for r in rule)?
# Merge the system and user instructions into a single text, separated by a dashed rule.
prompt = '\n----------------------------------------\n'.join((system_prompt, user_prompt))

# Request one completion, sending the merged prompt as a single user-role message.
chat_completion = client.chat.completions.create(
    model=model,
    messages=[dict(role="user", content=prompt)],
    n=1,
    seed=42,
    temperature=1,
    top_p=1,
)
# Keep the text of the first choice, with surrounding whitespace removed.
output_message = chat_completion.choices[0].message.content.strip()
# print(output_message)

import os
import time

import json
import numpy as np
import pandas as pd
import pkg_resources
from pydantic import BaseModel, Field

from agent.state import AgentState,BaseMessage
from agent.client import client
from agent.json_paser import parse_LLM_json

def parse_llm_json(output_message):
    """
    Decode the JSON text produced by the LLM and return the resulting object.

    No DataFrame is built here; the decoded object is returned as-is.

    Args:
        output_message: JSON document as text (anything ``json.loads`` accepts).

    Returns:
        The decoded Python object.

    Raises:
        ValueError: if ``output_message`` is not valid JSON. The underlying
            ``json.JSONDecodeError`` is attached as ``__cause__``.
    """
    try:
        return json.loads(output_message)
    except json.JSONDecodeError as e:
        # Chain explicitly so the decode error is reported as the direct cause.
        raise ValueError(f"Failed to parse JSON: {e}") from e
    
# Pydantic model describing the three string fields expected in the LLM's JSON reply.
# It is instantiated right after its definition via code(**code_dict), so the JSON keys
# must match the field names below.
# NOTE(review): the class name `code` is the same as its `code` field; the field is
# accessed as `code_solution.code` on an instance.
class code(BaseModel):
    """Schema for code solutions for rule2matrix"""
    # Free-text description of the problem and the approach taken.
    prefix: str = Field(description="Description of the problem and approach")
    # Import statements, kept separate from the main code body.
    imports: str = Field(description="Code block import statements")
    # The code body itself, without the import statements.
    code: str = Field(description="Code block not including import statements")
    
# Two-stage parse: parse_LLM_json (imported from agent.json_paser) pre-processes the raw
# reply (presumably extracting the JSON text - TODO confirm), then parse_llm_json decodes it.
code_dict = parse_llm_json(parse_LLM_json(output_message))
# Build the schema object from the decoded keys (prefix / imports / code).
code_solution = code(**code_dict)
# Print the `code` field of the reply for inspection.
print(code_solution.code)


def rule2matrix(smiles_list):
    # Define SMARTS patterns
    # Carboxylic acid group
    carboxylic_acid_patterns = ['[CX3](=O)[OX2H1]', '[CX3](=O)[O-]']

    # Thiol group (-SH)
    thiol_pattern = '[SX2H]'

    # Aldehyde group (-CHO)
    aldehyde_pattern = '[$([CX3H][#6]),$([CX3H2])]=[OX1]'
    
    # Ether linkage in aliphatic chains
    ether_linkage_pattern = '[#6][OX2][#6]'

    # Halogens
    halogen_pattern = '[F,Cl,Br,I]'
    
    # Nitro group
    nitro_group_pattern = '[NX3](=O)[O-]'
    
    # Aromatic ring with halogen or nitro substituents
    halogen_on_aromatic_pattern = '[c][F,Cl,Br,I]'
    nitro_on_aromatic_pattern = '[c][NX3](=O)[O-]'

    # Primary amine (-NH2)
    primary_amine_pattern = '[NX3H2]'

    # Aromatic ring pattern
    aromatic_ring_pattern = '[a]1[a][a][a][a][a]1'
    
    # Define the rules with their associated patterns and predictions
    rules = [
        {
            'number': 1,
            'description': 'Low yield (-1): Modifiers containing

## Direct generation conducted by kimi-k1.5: kimi-k1.5 (Direct)

In [None]:
# rules generated by Kimi-k1.5 after looking through the paper

import pkg_resources

rules = '''
In summary, possible rules could be:
Aromatic Rings with Electron-Withdrawing Groups: Modifiers containing aromatic rings (like benzene) with electron-withdrawing groups (e.g., -NO2, -Br) tend to have higher yields. These groups might activate the Fe center.
Multiple Carboxylic Acid Groups: Modifiers with more than one carboxylic acid group may anchor more effectively to the Fe centers, enhancing activity. However, this isn't always the case, so other factors like steric effects might interfere.
Presence of Electron-Donating Groups: Modifiers with electron-donating groups (e.g., -OCH3, -NH2) might reduce activity by deactivating the Fe center.
Steric Hindrance: Bulky or highly branched modifiers could hinder substrate access to the active site, leading to lower yields.
Functional Groups That Bind to Fe: Modifiers with functional groups that can coordinate to Fe (like carboxylic acids) are necessary for activity, but certain groups (like thiols or aldehydes) might inhibit by blocking the active site.
Modifier/SBU Ratio: Higher M_SBU ratios (more modifier per SBU) might sometimes correlate with higher yields, but this isn't consistent, indicating that the structure of the modifier is more critical than the quantity.
To validate these rules, I could perform a more detailed analysis, such as calculating molecular descriptors (e.g., logP, number of aromatic rings, presence of specific functional groups) and using statistical methods or machine learning to find correlations with yield. However, given the limited data, these are preliminary observations.

To analyze the rules governing the effect of molecular modifiers on catalytic yields, we systematically examined the relationship between structural features of the modifiers (represented by SMILES) and the provided parameters (Fe_loading, Fe_Hf, M_SBU, Yield). Below is a structured summary of the key observations and inferred rules:
1. Structural Features of Modifiers
A. Aromatic Rings with Electron-Withdrawing Groups (EWGs)
High-Yield Examples:
SMILES: C1=CC(=CC=C1CBr)C(=O)O (yield: 0.696), C1=CC(=CC=C1C(=O)O)N (yield: 0.680), C1=CC(=CC=C1C(=O)O)[N+](=O)[O-] (yield: 0.645).
Key Features: Benzene rings with halogens (Br, Cl) or nitro groups (-NO₂) enhance catalytic activity. EWGs likely stabilize charge transfer processes or activate the Fe³⁺ center.
B. Electron-Donating Groups (EDGs)
Low-Yield Examples:
SMILES: COCCOCCOCC(=O)O (yield: 0.029), C(COCCOCCN)C(=O)O (yield: 0.207).
Key Features: Long alkyl chains or ethers (e.g., -OCH₂CH₂-) may reduce activity by deactivating the Fe³⁺ center or introducing steric hindrance.
C. Steric Hindrance
Low-Yield Examples:
SMILES: C1=CC=C2C(=C1)C(=CN2)C[C@H](C(=O)O)N (yield: 0.614), C1=CC=C(C=C1)C[C@@H](C(=O)O)N (yield: 0.091).
Key Features: Bulky or highly branched structures (e.g., polycyclic rings) likely hinder substrate access to the active site.
D. Functional Groups That Bind to Fe³⁺
High-Yield Examples:
SMILES: C(CS)C(=O)O (yield: 0.688), C1=CC(=CC=C1C(=O)O)S (yield: 0.066).
Key Features: Carboxylic acids (-COOH) are essential for Fe³⁺ coordination. However, thiols (-SH) or aldehydes (-CHO) may inhibit activity by blocking the active site.

2. Parameter Analysis
A. Fe_loading (Fe/substrate ratio)
Observation: No clear correlation with yield. For example:
High Fe_loading (0.88) → yield: 0.656.
Low Fe_loading (0.22) → yield: 0.680.
Inference: Fe_loading alone does not dictate yield; modifier structure is more critical.
B. Fe_Hf (Fe/Hf ratio)
Observation: Higher Fe_Hf ratios (e.g., 0.314) correlate with moderate yields (e.g., 0.396), but exceptions exist.
Inference: Fe_Hf is not a primary driver of activity.
C. M_SBU (Modifier/SBU ratio)
Observation: Higher M_SBU ratios (e.g., 3) sometimes improve yield (e.g., 0.659), but not consistently.
Inference: Modifier structure outweighs quantity.

3. Inhibitory Modifiers
Examples:
SMILES: C1=CC(=CC=C1C(=O)O)S (yield: 0.066), C1=CC=C(C=C1)C[C@@H](C(=O)O)N (yield: 0.091).
Key Features: Thiols (-SH) or complex chiral centers may block the Fe³⁺-Cl bond critical for ligand-to-metal charge transfer (LMCT).

4. Proposed Rules
Aromatic Rings with EWGs: Enhance activity by stabilizing charge transfer.
Electron-Donating Groups: Reduce activity by deactivating Fe³⁺.
Steric Hindrance: Bulky modifiers hinder substrate access.
Fe³⁺-Binding Groups: Carboxylic acids are essential; thiols/aldehydes inhibit.
Modifier Structure > Quantity: M_SBU ratio is secondary to structural effects.

5. Recommendations for Future Work
Molecular Descriptors: Calculate logP, aromaticity, and functional group counts to quantify structure-activity relationships.
Machine Learning: Train models to predict yields from SMILES and parameters.
Mechanistic Studies: Use DFT calculations to explore charge transfer pathways for high-yield modifiers.
This analysis provides a framework for rational design of modifiers to optimize catalytic performance in Fe³⁺-loaded MOL systems.
'''


system_prompt = '''You are a coding assistant with expertise in RDkit. Your task is to generate Python code that takes a list of SMILES strings as input. The code should follow the provided natural language rules to convert these SMILES strings into a feature matrix using RDkit. The output matrix should be a DataFrame where each column corresponds to one rule, and each row corresponds to one SMILES string from the list. There should be number_of_SMILES rows and number_of_rules columns.
Generate a feature matrix with the following criteria:
- A value of 0 if the structural description of the rule does not match the SMILES.
- A value of 1 if the structural description of the rule matches the SMILES and predicts a high target value.
- A value of -1 if the structural description of the rule matches the SMILES and predicts a low target value.
'''
# Alias the rule text under the name the prompt template interpolates.
current_rules = rules

# Read the prompt building blocks bundled with the `agent.data` package.
# Each file is read once, in the same order as before, into a lookup keyed by
# file name; the notebook-level names are then bound from that lookup.
_prompt_resources = {}
for _resource_name in ('loffi.txt', 'MACCS_examples.txt', 'group_examples.txt', 'rule_code_eg.txt'):
    _resource_path = pkg_resources.resource_filename('agent.data', _resource_name)
    with open(_resource_path, 'r') as f:
        _prompt_resources[_resource_name] = f.read()

smarts_intro = _prompt_resources['loffi.txt']
# MACCS_egs is read here, but the prompt template later in this cell does not
# interpolate it.
MACCS_egs = _prompt_resources['MACCS_examples.txt']
group_egs = _prompt_resources['group_examples.txt']
code_example = _prompt_resources['rule_code_eg.txt']
        
user_prompt = f'''
!!Examples for SMARTS!!
{smarts_intro}
{group_egs}
-----------------------------------------------------------------------------------------------

!!Current Rules!!
{current_rules}
------------------------------------------------------------------------------------------------

Please generate Python code that follows these rules. 
Your code should be structured in the following format:

{{
    "prefix": "<Description of the problem and approach>",
    "imports": "<Code block containing import statements>",
    "code": "<Code block not including import statements>"
}}

Example for "code":
{{
"prefix": "This code converts a list of SMILES strings into a feature matrix using RDkit.",
"imports": "import pandas as pd\\nfrom rdkit import Chem\\nfrom rdkit.Chem import AllChem",
"code": {code_example}
\\n 
}}



Note:
Name the function as rule2matrix, Define the function without any example to run that function.
Using SMARTS for better substructure search.
Consider appropriate logic (and, or, not/exclude) of SMARTS patterns to describe a rule.
Handle possible error: when there is any error for one rule apply to one SMILES, return 0 instead.
'''
# TODO(review): open question — should the code example contain
# all(mol.HasSubstructMatch(Chem.MolFromSmarts(r)) for r in rule)?
# The system instructions and the user prompt are concatenated into ONE string,
# separated by a dashed line, and sent as a single user-role message below.
prompt = system_prompt + '\n----------------------------------------\n' + user_prompt


# Request a single completion (n=1) with a fixed seed from whatever `client`
# and `model` are currently bound as notebook globals.
# NOTE(review): a `from agent.client import client` statement appears after this
# call in the notebook and rebinds the global `client`; which endpoint this
# request hits therefore depends on execution order — confirm on a fresh kernel.
chat_completion = client.chat.completions.create(
    messages=[{"role": "user", "content": prompt}],
    model=model,
    temperature = 1,
    seed = 42,
    top_p=1,
    n=1
)
# Keep the text of the first (only) choice, with surrounding whitespace removed.
output_message = chat_completion.choices[0].message.content.strip()
# Debug aid, disabled: print(output_message)

import os
import time

import json
import numpy as np
import pandas as pd
import pkg_resources
from pydantic import BaseModel, Field

from agent.state import AgentState,BaseMessage
from agent.client import client
from agent.json_paser import parse_LLM_json

def parse_llm_json(output_message):
    """
    Decode the JSON text of an LLM reply into a Python object.

    Args:
        output_message: JSON document as text (anything ``json.loads`` accepts).

    Returns:
        The decoded object, returned exactly as ``json.loads`` produced it.
        No DataFrame is built here; callers pass the result on to the
        ``code`` schema.

    Raises:
        ValueError: if the text is not valid JSON. The underlying
            ``json.JSONDecodeError`` is attached as ``__cause__``.
        TypeError: propagated unchanged from ``json.loads`` when the input is
            not str/bytes/bytearray.
    """
    try:
        return json.loads(output_message)
    except json.JSONDecodeError as e:
        # Chain explicitly so the traceback reads "caused by" the decode error
        # instead of "during handling of ... another exception occurred".
        raise ValueError(f"Failed to parse JSON: {e}") from e
    
# Pydantic schema for the JSON object the prompt asks the LLM to return:
# exactly the three string keys "prefix", "imports" and "code".
# It is instantiated below as code(**code_dict).
# NOTE(review): the lowercase class name departs from PEP 8 CapWords; it is
# left as-is because other cells may refer to `code` by name.
class code(BaseModel):
    """Schema for code solutions for rule2matrix"""
    # Free-text explanation of the approach taken by the generated code.
    prefix: str = Field(description="Description of the problem and approach")
    # Import statements, kept apart from the function body.
    imports: str = Field(description="Code block import statements")
    # The generated function source without its imports.
    code: str = Field(description="Code block not including import statements")
    
# Two-stage parse of the raw LLM reply:
#   1. parse_LLM_json (imported from agent.json_paser) pre-processes the reply;
#      since its result is fed to json.loads it must return JSON text
#      (presumably with markdown fences / stray text removed — confirm in that module).
#   2. parse_llm_json (defined above) decodes that text into a Python object.
code_dict = parse_llm_json(parse_LLM_json(output_message))
# Validate the decoded dict against the `code` schema (prefix / imports / code).
code_solution = code(**code_dict)
# Show only the generated function source; the imports stay in code_solution.imports.
print(code_solution.code)

def rule2matrix(smiles_list):
    # Define SMARTS patterns for functional groups
    # Aromatic ring
    aromatic_ring = '[a]'

    # Electron-Withdrawing Groups (EWGs)
    nitro_group = '[NX3](=O)=O'
    halogen = '[F,Cl,Br,I]'
    trifluoromethyl = '[CX4]([F])([F])[F]'
    
    # EWG attached to aromatic ring
    ewg_on_aromatic = ['[a][NX3](=O)=O', '[a][F,Cl,Br,I]', '[a][CX4]([F])([F])[F]']
    
    # Carboxylic acid group
    carboxylic_acid = '[CX3](=O)[OX2H1]'
    
    # Electron-Donating Groups (EDGs)
    hydroxyl_group = '[OX2H]'
    amino_group = '[NX3H2]'
    methoxy_group = '[OX2][CH3]'
    edg_on_aromatic = ['[a][OX2H]', '[a][NX3H2]', '[a][OX2][CH3]']

    # Aldehyde group
    aldehyde = '[CX3H][OX1]'

    # Define the rules
    rules = [
        {
            'number': 1,
            'description': 'High yield (+1): Modifiers containing aromatic rings with electron-withdrawing groups attached and connected to a carboxylic acid group.',
            'patterns': [
           

## Direct generation conducted by qwen-max: qwen_max (Direct)

In [21]:
# qwen-max-2025-01-25

import os

from openai import OpenAI

from dotenv import load_dotenv

# Load environment variables (including the API key read below) from a dotenv file.
# NOTE(review): the path is relative, so it presumably resolves against the
# kernel's working directory — confirm where the notebook is launched from.
load_dotenv('GPT_agent.env')

# Rebind the notebook-global `client` to an OpenAI-compatible endpoint at
# api.agicto.cn, authenticated with the OPENAI_CLIENT_KEY_bak variable.
# Cells that run after this one use this client until `client` is rebound again.
client = OpenAI(
    api_key = os.getenv('OPENAI_CLIENT_KEY_bak'),
    base_url = "https://api.agicto.cn/v1"
)
# Rebind the notebook-global `model` used by later `model=model` requests.
model="qwen-max-2025-01-25"

In [6]:
prompt = '''
This is a data set of modifier in modulating the catalytic acitivity. we explored modifying metal-organic layer (MOL) surfaces to enhance their catalytic activity in C–H bond activation of long-chain alcohols22, as shown in Figure 1. The MOLs were constructed from Hf6(μ3-O)4(μ3-OH)4(HCO2)6 clusters as the secondary building units (SBUs) and terpyridine-tricarboxylate (TPY) as ligands. Fe centers were anchored on the TPY ligands23, which catalyze sp³ C–H bond activation through photon-driven ligand-to-metal charge transfer (LMCT) process22. Using a dataset of carboxylic acids as molecular modifiers on the MOL surface, we systematically examined how the structure of these modifiers influences catalytic performance (Figure 1a-b). 
Hf-TPY-MOL, Hf6(μ3-O)4(μ3-OH)4(HCO2)6(TPY)2, was employed as a support for Fe3+ ions to study the heterogeneous photocatalytic δ-C(sp3)-H amination of 1-pentanol22. The Hf-TPY-MOL was synthesized following a previously published procedure23, and its structure and morphology were confirmed using powder X-ray diffraction (PXRD) and transmission electron microscopy (TEM). To load Fe3+, Hf-TPY-MOL was dispersed in acetonitrile (CH3CN) containing 1.05 equivalents of Fe(OTf)3 relative to TPY ligands and stirred at room temperature for 24 hours. The resulting Fe3+-loaded Hf-TPY-MOL (Hf-TPY-MOL-Fe) exhibited a blue-violet color and was washed extensively with CH3CN to remove uncoordinated Fe3+, as confirmed by UV-Vis spectroscopy. Inductively coupled plasma optical emission spectrometer (ICP-OES) indicated Fe3+ 102% metalation of the TPY ligands with the determined amount of Fe corresponding to 102% of the TPY sites.
The catalytic activity of Hf-TPY-MOL-Fe was tested in the photocatalytic amination of 1-pentanol with di-tert-butyl azodicarboxylate (DBAD). A mixture of Hf-TPY-MOL-Fe (2 mg), DBAD (1 equivalent), 1-pentanol (3 equivalents), and tetrabutylammonium chloride (TBACl, 1.25 mol%) was irradiated with 393 nm LED light under a nitrogen atmosphere at room temperature for 12 hours. Reaction yields were quantified using proton nuclear magnetic resonance (¹H-NMR) with CH₂Br₂ as an internal standard, yielding a moderate product conversion of 9% relative to DBAD, corresponding to a turnover number (TON) of 41 on Fe.
To confirm the heterogeneous nature of the catalysis and rule out the leaching of Fe³⁺ ions, UV-Vis absorption spectroscopy of the reaction supernatant revealed no detectable Fe³⁺ signals. Cross-experiments were conducted using 1-pentanol as the substrate, followed by the addition of n-hexane to the supernatant after isolating the solid catalyst. Significant conversion of 1-pentanol, but no reaction with n-hexane, confirmed that the supernatant lacked catalytic activity. In comparison, the Hf-TPY-MOL-Fe has high activity (TON of 233, yield of 35% after irradiation of 12 h) for converting n-hexane to a mixture of 1-substitution (di-tert-butyl 1-hexylhydrazine-1,2-dicarboxylate), 2-substitution, and 3-substitution (2.0: 1.9: 1.0). Furthermore, ICP-OES analysis verified that the Fe/Hf ratio remained unchanged before and after the reaction, confirming that Fe³⁺ ions remained bound to the MOL throughout the reaction.
The Hf6(μ3-O)4(μ3-OH)4 SBU in the Hf-TPY-MOL contains six formate-capped sites that can be replaced by other carboxylates, enabling post-synthetic modification of the MOLs. This straightforward replacement chemistry allows systematic tuning of the surface properties (Figure 2a). Surface modification not only affects the interaction between the terpyridine-Fe3+-Cl active site and the substrate but also alters the surface solvent structure, thereby influencing catalytic activity. To investigate these effects, we systematically studied how different molecular modifiers impact the catalytic performance of Fe3+-loaded MOLs.
For each modifier, Hf-TPY-MOL was incubated with a 0.2 M solution of a carboxylic acid modifier in CH3CN, H2O, DMF, or THF, chosen based on solubility, at 55 oC for 24 h. Structural integrity after modification was confirmed by PXRD and TEM (Figure 1b). The amount of substituted modifier was quantified by proton nuclear magnetic resonance (1H-NMR) after digesting the MOL with K3PO4/D2O, expressed as the Modifier/SBU ratio. Subsequently, Fe3+ was loaded onto the modified Hf-TPY-MOL by reacting with Fe(OTf)3, and the resulting materials were tested as catalysts.
Significant variations in activity and yield were observed among different modifications (Figure 2b). Most carboxylic acid modifications enhanced activity, with several achieving a turnover number (TON) exceeding 600, indicating substantial improvement. However, certain modifiers, such as 2-(2-(2-methoxyethoxy)ethoxy)acetic acid, 4-mercaptobenzoic acid, and 4-formylbenzoic acid, inhibited the reaction, resulting in reduced catalytic activity. 
Fe3+ ions can coordinate with either the TPY ligands or the molecular modifiers on the SBUs in Hf-TPY-MOL. To differentiate the roles of Fe3+ ions at these two sites, we designed a control catalyst, Hf-BTB-MOL, using 1,3, 5-tri (4-carboxyphenyl) benzene (H3BTB) as the ligand, which lacks TPY sites for Fe coordination (Figure S2a). The synthesized Hf-BTB-MOL was characterized by PXRD and TEM (Figure S2b-d). Upon loading Fe3+ via reaction with Fe(OTf)₃ (1.05 equivalents) in CH3CN, yellow MOLs were obtained. Under standard catalytic conditions, Hf-BTB-MOL (Fe) exhibited no activity (0% yield). However, when modified with L-aspartate, the catalyst achieved a significantly higher yield of 80.4%, indicating that both Fe³⁺ ions on TPY sites and Fe3+ ions interacting with modifiers contribute to the catalytic activity.
To elucidate the reaction mechanism, free radical trapping agents were employed. The addition of radical quenchers, such as Butylated Hydroxytoluene (BHT) or 2,2,6,6-Tetramethylpiperidine-1-oxyl (TEMPO), completely inhibited the reaction, supporting the proposed radical pathway. This finding aligns with prior research22. Control experiments without Fe3+ showed no conversion, confirming that Fe3+ is essential for catalysis. Similarly, reactions conducted in the dark yielded no conversion, demonstrating the photo-driven nature of the reaction. 
Additionally, removing TBACl from the reaction system resulted in zero conversion, highlighting the critical of Cl- in forming a Fe3+-Cl bond. This bond facilitates ligand-to-metal charge transfer (LMCT), generating Cl• radicals that initiate hydrogen atom transfer (HAT) with 1-pentanol. Substrate scope experiments (Figure S3) further validated the importance of the hydroxyl groups for δ-position selectivity via [1,5] HAT pathway. 
To address missing factors, two experimental descriptors—Fe loading (defined as the amount of Fe relative to the substrate) and the modifier/SBU ratio—were introduced into the models. Notably, Fe loading was deliberately decoupled from the Fe/Hf ratio by varying the amount of Hf-TPY-MOL used in each experiment. This random variation ensured that Fe loading (relative to the substrate) was independent of any molecular descriptors, avoiding potential biases where molecular modifications could affect Fe coordination and thus distort feature selection.
Can you examine the smiles and the other data provided in the .csv file to try to figure rules governing these modifications' effect on yields?
Here is the data table:
SMILES	Fe_loading	Fe_Hf	M_SBU	Yield
C(CC(=O)O)[C@@H](C(=O)O)N	0.83	0.137970792	2	0.711252035
C1=CC(=CC=C1CBr)C(=O)O	0.3	0.268518519	0.3	0.696789832
C([C@@H](C(=O)O)N)C(=O)O	0.41	0.287852619	1.8	0.660562221
C(CCN)CC(=O)O	0.3	0.300661455	0.38	0.507142857
C1=CC(=CC=C1C(=O)O)N	0.22	0.148411992	1.2	0.680836113
CCCCCCCC(=O)O	0.69	0.137797988	1.6	0.687228122
C(C(=O)O)NC(CO)(CO)CO	0.66	0.176397954	0.3	0.456500877
C1=CC(=CC=C1CCCCC(=O)O)O	0.54	0.135777325	1.2	0.519607483
C1=CNC(=O)C=C1C(=O)O	0.65	0.176087339	3	0.659786184
C([C@H]([C@H]([C@@H]([C@H](C(=O)O)O)O)O)O)O	0.88	0.214316331	1.7	0.656840941
C1=CC(=CC=C1C(=O)O)S	0.87	0.241370987	3	0.066165255
COCCOCCOCC(=O)O	0.36	0.248570718	2	0.029961003
C1=CC(=CC=C1C2=CC=NC=C2)C(=O)O	0.43	0.226090889	0.68	0.126814958
C1=CC=C(C=C1)C[C@@H](C(=O)O)N	0.42	0.313971743	0.8	0.091231602
C(=O)(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)O	0.37	0.075797772	0.983352162	0.105724727
C1=CC=C2C(=C1)C(=CN2)C[C@H](C(=O)O)N	0.36	0.138140627	2.2	0.614409241
C1=C(C=NN1)C(=O)O	1.35	0.199322304	1.1	0.641912417
C1=CC2=C(C=C1C(=O)O)C(=O)OC2=O	0.59	0.353232073	1.6	0.631961207
C(CS)C(=O)O	0.8	0.25947068	0.65	0.688639831
C1=CC(=C(C=C1C=O)C(=O)O)O	0.38	0.111544897	0.5	0.20342965
C1=CC(=CC=C1C=O)C(=O)O	0.45	0.116808784	1.8	0
C1=CC(=CC=C1C(=O)O)[N+](=O)[O-]	0.86	0.295420975	2.3	0.645466642
C1=CC(=CN=C1)C(=O)O	0.6	0.176335743	2.2	0.585092593
CC1=CC=C(C=C1)C(=O)O	0.44	0.137608367	2	0.269489422
CSCC[C@@H](C(=O)NCC(=O)O)N	0.29	0.318877551	0.4	0.396053406
C1=CN=CC=C1CCCC(=O)O	0.59	0.256410256	2.5	0.673499028
COC1=C(C=C(C=C1)C(=O)CCC(=O)O)OC	0.44	0.249190132	1.4	0.487430174
C1=CC(=CC=C1C(=O)NCC(=O)O)O	0.2	0.173731758	1.5	0.049113312
C(CC(=O)O)C(=O)CCC(=O)O	0.17	0.167392032	0.8	0.045271477
C(CC(=O)O)CNC(=O)C(F)(F)F	0.18	0.165755014	1.9	0.084698664
C(CCCO)CCC(=O)O	0.26	0.254065041	2	0.049189139
C1=CC(=CC=C1CCC(=O)O)O	0.15	0.104898773	2.1	0.09208251
C(CCCN)CCC(=O)O	0.29	0.245278391	0.6	0.295387675
C(COCCOCCN)C(=O)O	0.26	0.248385494	0.6	0.207341836
CC(=O)NCCCC[C@@H](C(=O)O)N	0.28	0.276931598	0.8	0.273189562
C(CCCC(=O)O)CCCN	0.2	0.236910685	1	0.202356234
'''
# Send the paper excerpt + data table (`prompt`, defined just above) as a single
# user message and ask for one completion (n=1) with a fixed seed.
# The model name is hard-coded here rather than taken from the `model` global.
# NOTE(review): this cell's saved execution count (6) is lower than that of the
# client-setup cell above (21), so the stored output may have been produced
# with a different `client` binding — re-run top to bottom to confirm.
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": prompt,
        }
    ],
    model="qwen-max-2025-01-25",
    temperature=1,
    seed = 42,
    top_p=1,
    n=1
)
# Print the first (only) choice's text with surrounding whitespace removed.
# The reply is not stored in a variable here; the next cell holds a pasted copy
# of it as the `rules` string.
print(chat_completion.choices[0].message.content.strip())

To analyze the provided dataset and infer rules governing how carboxylic acid modifiers affect the catalytic performance of Fe3+-loaded Hf-TPY-MOLs, we can examine patterns in the structural properties of the modifiers (represented by their SMILES strings) and their relationship to experimental descriptors such as Fe loading, Fe/Hf ratio, modifier-to-SBU (secondary building unit) ratio, and yield. Below, I outline potential trends and relationships based on the data:

---

### **1. Structural Features of Modifiers**
The SMILES strings represent the molecular structures of the carboxylic acid modifiers. By examining these, we can identify functional groups or structural motifs that correlate with high or low yields:

#### a. **Functional Groups Linked to High Yields**
Modifiers achieving higher yields (> 0.5) often contain:
   - Aromatic rings (e.g., benzoic acids): These appear frequently in high-yield entries.
     - Examples: `C1=CC(=CC=C1C(=O)O)N`, `C1=CC(=CN=C1)C(=O)O`.
   - Hydrox

In [22]:
# Rules generated by qwen-max-2025-01-25 after reading through the paper

import pkg_resources

rules = '''
To analyze the provided dataset and infer rules governing how carboxylic acid modifiers affect the catalytic performance of Fe3+-loaded Hf-TPY-MOLs, we can examine patterns in the structural properties of the modifiers (represented by their SMILES strings) and their relationship to experimental descriptors such as Fe loading, Fe/Hf ratio, modifier-to-SBU (secondary building unit) ratio, and yield. Below, I outline potential trends and relationships based on the data:

---

### **1. Structural Features of Modifiers**
The SMILES strings represent the molecular structures of the carboxylic acid modifiers. By examining these, we can identify functional groups or structural motifs that correlate with high or low yields:

#### a. **Functional Groups Linked to High Yields**
Modifiers achieving higher yields (> 0.5) often contain:
   - Aromatic rings (e.g., benzoic acids): These appear frequently in high-yield entries.
     - Examples: `C1=CC(=CC=C1C(=O)O)N`, `C1=CC(=CN=C1)C(=O)O`.
   - Hydroxyl groups: Compounds with hydroxyl groups or ether linkages tend to improve activity.
     - Example: `C1=CC(=CC=C1C(=O)O)O`.
   - Aliphatic chains with amine functionalities: These may enhance substrate interaction.
     - Example: `C(CC(=O)O)[C@@H](C(=O)O)N`.

#### b. **Functional Groups Linked to Low Yields**
Modifiers with very low yields (< 0.1) often contain:
   - Electron-withdrawing groups like nitro groups (-NO2) or formyl groups (-CHO), which may deactivate the Fe3+ center.
     - Example: `C1=CC(=CC=C1C=O)C(=O)O` (contains aldehyde).
   - Bulky functional groups or long polyether chains, which may sterically hinder the active site.
     - Example: `COCCOCCOCC(=O)O` (long polyether chain).

---

### **2. Quantitative Relationships**
#### a. **Fe Loading (Fe_loading)**
Higher Fe loading generally correlates with better yields, but this is not universally true. Some compounds achieve high yields even at low Fe loading, likely due to optimized interactions between the modifier and the substrate.

   - **High Fe_loading + High Yield**: Compounds like `C1=C(C=NN1)C(=O)O` and `C([C@H]([C@H]([C@@H]([C@H](C(=O)O)O)O)O)O)O`.
   - **Low Fe_loading + Moderate Yield**: Compounds like `C(CCN)CC(=O)O`.

#### b. **Modifier-to-SBU Ratio (M_SBU)**
Modifiers with M_SBU values around 1–2 tend to produce higher yields, suggesting an optimal balance between surface modification and active site accessibility.

   - **Optimal Range**: M_SBU ≈ 1–2.
     - Example: `C1=CNC(=O)C=C1C(=O)O` (high yield, M_SBU = 3).
   - **Too High M_SBU**: May block active sites, reducing yield.
     - Example: `C1=CC(=CC=C1C(=O)O)S` (very low yield, M_SBU = 3).

#### c. **Fe/Hf Ratio (Fe_Hf)**
This descriptor indicates the extent of Fe coordination relative to Hf6 clusters. Optimal Fe_Hf ranges for high yield are typically 0.1–0.25.

   - **Optimal Range**: Fe_Hf ≈ 0.1–0.25.
     - Example: `C1=C(C=NN1)C(=O)O` (high yield, Fe_Hf = 0.2).
   - **Too High Fe_Hf**: May lead to overcoordination, reducing activity.
     - Example: `C1=CC(=CC=C1C=O)C(=O)O` (very low yield, Fe_Hf = 0.12).

---

### **3. Trends in Specific Functionalities**
#### a. **Aromaticity**
Aromatic modifiers dominate among high-yield compounds. Their π-electron systems likely stabilize intermediates involved in LMCT processes.

   - High-Yield Examples:
     - `C1=CC(=CC=C1C(=O)O)N` (nitrogen-containing aromatic ring).
     - `C1=CC(=CN=C1)C(=O)O` (pyridine derivative).

#### b. **Polarity and Hydrogen Bonding**
Modifiers with polar functional groups (e.g., -OH, -NH2) or hydrogen bond donors/acceptors often exhibit improved activity due to enhanced substrate binding.

   - Example: `C1=CC(=CC=C1CCCCC(=O)O)O` (hydroxyl group enhances yield).

#### c. **Steric Hindrance**
Bulky modifiers or those with long side chains tend to reduce activity, likely due to steric blocking of the Fe3+ center.

   - Low-Yield Examples:
     - `COCCOCCOCC(=O)O` (long polyether chain).
     - `C1=CC(=CC=C1C=O)C(=O)O` (bulky aldehyde group).

---

### **4. Interaction Between Descriptors**
#### a. **Synergy Between Fe_loading and M_SBU**
Modifiers with moderate Fe_loading (~0.4–0.7) and M_SBU (~1–2) often show the highest yields, suggesting a balance between Fe availability and surface coverage.

   - Example: `C1=CC(=CC=C1CCC(=O)O)O` (moderate Fe_loading = 0.54, M_SBU = 1.2, high yield).

#### b. **Impact of Fe_Hf**
At higher Fe_Hf values (> 0.25), yields decrease regardless of other factors, indicating potential saturation of Fe centers.

   - Example: `C1=CC(=CC=C1C=O)C(=O)O` (low yield, high Fe_Hf = 0.12).

---

### **5. General Rules Derived from Analysis**
Based on the patterns observed:
   1. **Favorable Structures**:
      - Aromatic rings (esp. with electron-donating substituents like -OH, -NH2).
      - Short-to-moderate-length polar chains.
      - Minimal sterically demanding groups.

   2. **Unfavorable Structures**:
      - Bulky functional groups (e.g., aldehydes, large alkyl chains).
      - Long polyether chains.
      - Electron-withdrawing groups (e.g., -NO2).

   3. **Optimal Quantitative Parameters**:
      - Fe_loading ≈ 0.4–0.7.
      - Fe_Hf ≈ 0.1–0.25.
      - M_SBU ≈ 1–2.

---

### **Key Takeaways**
The activity of Fe3+-loaded MOLs is highly sensitive to both the chemical structure of the modifiers and their interaction with the catalyst surface. Aromatic and polar modifiers with moderate surface coverage and Fe loading tend to maximize catalytic efficiency. Steric hindrance and electron-withdrawing groups should be avoided to prevent deactivation of the Fe3+ centers.

If additional data or computational tools (e.g., machine learning models) become available, more precise predictions can be made regarding optimal modifier designs for future studies. 

**Boxed Final Answer**:
Patterns suggest that **aromaticity**, **polar groups**, and **moderate steric bulk** enhance catalytic yields, while electron-withdrawing groups, bulky moieties, and excessive surface coverage inhibit activity. The most effective configurations occur under Fe_loading ≈ 0.4–0.7, Fe_Hf ≈ 0.1–0.25, and M_SBU ≈ 1–2.
'''


system_prompt = '''You are a coding assistant with expertise in RDkit. Your task is to generate Python code that takes a list of SMILES strings as input. The code should follow the provided natural language rules to convert these SMILES strings into a feature matrix using RDkit. The output matrix should be a DataFrame where each column corresponds to one rule, and each row corresponds to one SMILES string from the list. There should be number_of_SMILES rows and number_of_rules columns.
Generate a feature matrix with the following criteria:
- A value of 0 if the structural description of the rule does not match the SMILES.
- A value of 1 if the structural description of the rule matches the SMILES and predicts a high target value.
- A value of -1 if the structural description of the rule matches the SMILES and predicts a low target value.
'''
# Alias the rule text under the name the prompt template interpolates.
current_rules = rules

# Read the prompt building blocks bundled with the `agent.data` package.
# Each file is read once, in the same order as before, into a lookup keyed by
# file name; the notebook-level names are then bound from that lookup.
_prompt_resources = {}
for _resource_name in ('loffi.txt', 'MACCS_examples.txt', 'group_examples.txt', 'rule_code_eg.txt'):
    _resource_path = pkg_resources.resource_filename('agent.data', _resource_name)
    with open(_resource_path, 'r') as f:
        _prompt_resources[_resource_name] = f.read()

smarts_intro = _prompt_resources['loffi.txt']
# MACCS_egs is read here, but the prompt template later in this cell does not
# interpolate it.
MACCS_egs = _prompt_resources['MACCS_examples.txt']
group_egs = _prompt_resources['group_examples.txt']
code_example = _prompt_resources['rule_code_eg.txt']
        
user_prompt = f'''
!!Examples for SMARTS!!
{smarts_intro}
{group_egs}
-----------------------------------------------------------------------------------------------

!!Current Rules!!
{current_rules}
------------------------------------------------------------------------------------------------

Please generate Python code that follows these rules. 
Your code should be structured in the following format:

{{
    "prefix": "<Description of the problem and approach>",
    "imports": "<Code block containing import statements>",
    "code": "<Code block not including import statements>"
}}

Example for "code":
{{
"prefix": "This code converts a list of SMILES strings into a feature matrix using RDkit.",
"imports": "import pandas as pd\\nfrom rdkit import Chem\\nfrom rdkit.Chem import AllChem",
"code": {code_example}
\\n 
}}



Note:
Name the function as rule2matrix, Define the function without any example to run that function.
Using SMARTS for better substructure search.
Consider appropriate logic (and, or, not/exclude) of SMARTS patterns to describe a rule.
Handle possible error: when there is any error for one rule apply to one SMILES, return 0 instead.
'''
# TODO(review): open question — should the code example contain
# all(mol.HasSubstructMatch(Chem.MolFromSmarts(r)) for r in rule)?
# The system instructions and the user prompt are concatenated into ONE string,
# separated by a dashed line, and sent as a single user-role message below.
# This reassigns the global `prompt` that an earlier cell used for the paper text.
prompt = system_prompt + '\n----------------------------------------\n' + user_prompt


# Request a single completion (n=1) with a fixed seed and temperature 0.3 from
# whatever `client` and `model` are currently bound as notebook globals.
# NOTE(review): the qwen setup cell above binds both, but a
# `from agent.client import client` statement appears after this call and
# rebinds `client`; re-running this cell alone may hit a different endpoint —
# confirm on a fresh kernel.
chat_completion = client.chat.completions.create(
    messages=[{"role": "user", "content": prompt}],
    model=model,
    temperature = 0.3,
    seed = 42,
    top_p=1,
    n=1
)
# Keep the text of the first (only) choice, with surrounding whitespace removed.
output_message = chat_completion.choices[0].message.content.strip()
# Debug aid, disabled: print(output_message)

import os
import time

import json
import numpy as np
import pandas as pd
import pkg_resources
from pydantic import BaseModel, Field

from agent.state import AgentState,BaseMessage
from agent.client import client
from agent.json_paser import parse_LLM_json

def parse_llm_json(output_message):
    """
    Decode the JSON text of an LLM reply into a Python object.

    Args:
        output_message: JSON document as text (anything ``json.loads`` accepts).

    Returns:
        The decoded object, returned exactly as ``json.loads`` produced it.
        No DataFrame is built here; callers pass the result on to the
        ``code`` schema.

    Raises:
        ValueError: if the text is not valid JSON. The underlying
            ``json.JSONDecodeError`` is attached as ``__cause__``.
        TypeError: propagated unchanged from ``json.loads`` when the input is
            not str/bytes/bytearray.
    """
    try:
        return json.loads(output_message)
    except json.JSONDecodeError as e:
        # Chain explicitly so the traceback reads "caused by" the decode error
        # instead of "during handling of ... another exception occurred".
        raise ValueError(f"Failed to parse JSON: {e}") from e
    
# Pydantic schema for the JSON object the prompt asks the LLM to return:
# exactly the three string keys "prefix", "imports" and "code".
# It is instantiated below as code(**code_dict).
# NOTE(review): the lowercase class name departs from PEP 8 CapWords; it is
# left as-is because other cells may refer to `code` by name.
class code(BaseModel):
    """Schema for code solutions for rule2matrix"""
    # Free-text explanation of the approach taken by the generated code.
    prefix: str = Field(description="Description of the problem and approach")
    # Import statements, kept apart from the function body.
    imports: str = Field(description="Code block import statements")
    # The generated function source without its imports.
    code: str = Field(description="Code block not including import statements")
    
# Two-stage parse of the raw LLM reply:
#   1. parse_LLM_json (imported from agent.json_paser) pre-processes the reply;
#      since its result is fed to json.loads it must return JSON text
#      (presumably with markdown fences / stray text removed — confirm in that module).
#   2. parse_llm_json (defined above) decodes that text into a Python object.
code_dict = parse_llm_json(parse_LLM_json(output_message))
# Validate the decoded dict against the `code` schema (prefix / imports / code).
code_solution = code(**code_dict)
# Show only the generated function source; the imports stay in code_solution.imports.
print(code_solution.code)

def rule2matrix(smiles_list):
    # Define SMARTS patterns for functional groups and structural features
    aromatic_ring = 'c1ccccc1'
    hydroxyl_group = '[OX2H]'
    alkoxy_group = '[OX2][#6]'
    amino_group = '[NX3;H2,H1][#6]'
    nitro_group = '[NX3](=O)=O'
    halogen_group = '[F,Cl,Br,I]'
    trifluoromethyl_group = '[CX4][F][F][F]'
    carboxylic_acid = '[CX3](=O)[OX2H1]'
    carbonyl_group = '[CX3]=[OX1]'
    aldehyde_group = '[CX3H1](=O)'
    bulky_group = '[CX4;!$([CX4][C])][CX4;!$([CX4][C])]'
    long_aliphatic_chain = '[CH2][CH2][CH2][CH2][CH2]'

    # Define the rules with their associated patterns and predictions
    rules = [
        {
            'number': 1,
            'description': 'High Yield: Aromatic ring present.',
            'patterns': [[aromatic_ring]],
            'prediction': 1
        },
        {
            'number': 2,
            'description': 'High Yield: Hydroxyl group present.',
            'patterns': [[hydroxyl_group]],
            'predicti