In [2]:
import json
from typing import Optional, Literal
from contextvars import ContextVar

import dspy
from pydantic import BaseModel, Field, field_validator

# NOTE(review): `additional_args` is not read anywhere in the visible cells.
# The `{}` default is one dict object shared by every context that has not
# called `.set()`, so mutating the value returned by `.get()` would be visible
# everywhere — confirm whether this variable is still needed.
additional_args = ContextVar('additional_args', default={})
# config
# Default language model for the DSPy calls in this notebook; `cache=False` is
# passed to `dspy.LM` so repeated runs are not served from DSPy's cache.
lm = dspy.LM('openai/gpt-4o', cache=False)
dspy.configure(lm=lm)


class Target(BaseModel):
    # Product side of a single reaction.
    # The Field descriptions below end up in the JSON schema that is shown to
    # the model (see the `lm.inspect_history()` output further down), so they
    # act as instructions to the LLM and are not just documentation.

    # Formula of the product, possibly containing placeholders such as `x`.
    target_formula: str = Field(description='make sure it is a valid chemical formula')
    # Placeholder name -> numeric values it takes, e.g. {'x': [0.025, 0.05]}.
    amount_var: dict[str, list[float]] = Field(description='the amount variable in the formula, e.g. AxBC, {x: [1, 2]}')
    # Placeholder name -> candidate elements — presumably for formulas such as
    # ACu3Sn4O12 (A = Ca, Sr, Pb); confirm against the curated examples.
    element_var: dict[str, list[str]] = Field(description='the element variable in the formula')

class Reaction(BaseModel):
    # One extracted synthesis step: starting materials, auxiliary substances,
    # the product and a coarse classification of the synthesis route.
    # This model is both the element type of `QA.reactions` and the type handed
    # to `load_from_curated_examples` when the gold data is loaded.

    # Starting materials of the step.
    precursors: list[str] = Field(description='ensure it is a valid chemical formula')
    # Auxiliary substances; no description is given to the model for this field.
    additives: list[str]
    # Product of the step.
    target: Target
    # Closed vocabulary of synthesis routes; anything else must be 'others'.
    reaction_type: Literal['solid-state', 'sol-gel', 'co-precipitation', 'hydrothermal', 'flux', 'others']

    # Disabled validator: when enabled it would reject reactions whose
    # precursor list is empty. It is currently commented out, so empty
    # precursor lists are accepted.
    # @field_validator('precursors')
    # @classmethod
    # def validate_precusors(cls, v):
    #     if not v:
    #         raise ValueError('precursors should not be empty')
    #     return v
    

class QA(dspy.Signature):
    """extract all chmemical reactions consitituent from the text.
    Note: please only include those reactions in which precursors and target are explicitly mentioned in the text.
    Intermediate reaction should be extracted as well."""
    # NOTE: the docstring above is the instruction text DSPy sends to the model
    # (the printed signatures below show `instructions=` mirroring `QA.__doc__`),
    # so editing it — including fixing its typos — changes the prompt.
    # Later cells reassign `QA.__doc__`, which replaces these instructions.

    # Input: the passage to mine for reactions.
    text: str = dspy.InputField(desc='a piece of text which may contains chemical reactions')
    # Output: parsed reactions; Optional because the model is told to return
    # null when the passage contains no reaction, so consumers must handle None.
    reactions: Optional[list[Reaction]] = dspy.OutputField(desc='the reactions extracted from the text, return null if no reaction found')

class ExtractReactionWithType(dspy.Module):
    """Single-step extractor: run a chain-of-thought predictor over the raw text.

    The predictor is built from the `QA` signature, so a call returns a
    prediction carrying `reasoning` and `reactions` (which may be None when the
    model finds no reaction).
    """

    def __init__(self):
        # Initialise the dspy.Module base before attaching sub-modules, as the
        # DSPy module guide does for custom modules; the original skipped this.
        super().__init__()
        self.predictor = dspy.ChainOfThought(signature=QA)

    def forward(self, text):
        """Extract reactions from `text`.

        Args:
            text: the passage to extract reactions from.

        Returns:
            The prediction produced by the chain-of-thought predictor.
        """
        return self.predictor(text=text)

class ExtractReactionTandem(dspy.Module):
    """Two-step extractor: summarise the text first, then extract from the summary.

    The summary comes from the module-level LangChain pipeline `chain`; the
    extraction step is the same chain-of-thought predictor over `QA` that
    `ExtractReactionWithType` uses.
    """

    def __init__(self):
        # Initialise the dspy.Module base before attaching sub-modules, as the
        # DSPy module guide does for custom modules; the original skipped this.
        super().__init__()
        self.predictor = dspy.ChainOfThought(signature=QA)

    def forward(self, text):
        """Summarise `text` with `chain`, then extract reactions from the summary.

        Args:
            text: the passage to extract reactions from.

        Returns:
            The prediction produced by the chain-of-thought predictor.
        """
        # `chain` is a global that is only assigned further down in this cell.
        # It is looked up at call time, so it must exist before the first call.
        summary = chain.invoke({"text": text})
        return self.predictor(text=summary)

from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
opt_v4 = """Extract detailed reaction information from the provided experimental description. For each reaction, structure the output as follows:  

1. **Overview of the Reaction:**
   - Identify the main target compound(s).
   - List the key precursor(s).  

2. **Detailed Reaction Steps:**
   - For each distinct reaction or synthesis process, specify:  
     - **Precursors**: List the starting materials.  
     - **Target Compounds**: Name the products formed in that step.  
     - **Experimental Procedure**: Describe the operations involved (e.g., mixing, milling, calcination) and include relevant conditions (e.g., temperature, duration).  
   - Highlight intermediate products and subsequent reactions involving them.  

3. **Final Processing Steps:**
   - Summarize operations for combining and processing intermediate products into final forms (e.g., granulation, pressing, sintering).  

Ensure clarity and organization by grouping steps logically and labeling each reaction or procedure distinctly. 
Additional Guidelines: 
Logical Grouping: Organize reactions sequentially or thematically, ensuring that all described reactions are included and distinctly labeled.
text:
{text}"""
# Summarisation pipeline used by ExtractReactionTandem.forward:
# fill the `opt_v4` template with {text} -> call gpt-4o -> return the reply as a plain string.
# temperature=0 is requested for the summariser.
chatmodel = ChatOpenAI(model='gpt-4o', temperature=0)
prompt = PromptTemplate.from_template(opt_v4)
chain = prompt | chatmodel | StrOutputParser()

In [2]:
doc = """Identify and extract all chemical reactions from the text. For each reaction:

Include reactions where precursors and target products are explicitly mentioned.
Ensure not to overlook reactions that are less prominently discussed but still have their precursors and target explicitly stated.
Also extract intermediate reactions and label them as such."""
# Replace the signature's instructions with `doc`: the printed signatures in
# this notebook show `instructions=` following whatever `QA.__doc__` holds.
# This mutates the shared `QA` class, so the result depends on cell run order.
QA.__doc__ = doc

In [3]:
# Overwrite the instructions with a minimal prompt. Whichever assignment to
# `QA.__doc__` ran last is what predictors built from `QA` afterwards will use
# (the signature printed in the next cell shows this short text).
QA.__doc__ = "extract reactions from given text"

In [15]:
extractor = ExtractReactionWithType()
extractor.predictor.extended_signature

StringSignature(text -> reasoning, reactions
    instructions='extract reactions from given text'
    text = Field(annotation=str required=True json_schema_extra={'desc': 'a piece of text which may contains chemical reactions', '__dspy_field_type': 'input', 'prefix': 'Text:'})
    reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${reasoning}', '__dspy_field_type': 'output'})
    reactions = Field(annotation=Union[list[Reaction], NoneType] required=True json_schema_extra={'desc': 'the reactions extracted from the text, return null if no reaction found', '__dspy_field_type': 'output', 'prefix': 'Reactions:'})
)

In [3]:
from sisyphus.utils.helper_functions import load_from_curated_examples
from dspy.datasets.dataloader import DataLoader

# Load the curated gold examples. Presumably ('text', 'reactions') are the
# fields kept and ('text',) the input keys — `train[0].inputs().keys()` below
# returns ['text'] — and `Reaction` is the model used to parse the gold
# reactions; confirm against the helper's implementation.
dataset = load_from_curated_examples('curated_examples.json', ('text', 'reactions'), ('text',), Reaction)
# Fixed random_state keeps the split stable between runs. In the recorded
# outputs this yields 6 training and 9 held-out examples.
splits = DataLoader().train_test_split(dataset, train_size=0.4, random_state=44)
train, dev = splits['train'], splits['test']

In [4]:
train

[Example({'text': '(1−x)(Ca0.88Sr0.12)TiO3–x(Bi0.5Na0.5)TiO3 [hereafter referred to as (1−x)CST–xBNT] ceramics (x\u2009=\u20090.025, 0.050, 0.075, 0.10) were prepared by the conventional solid-state reaction method. The starting materials were high-purity grade powders (>\u200999%): CaCO3, SrCO3, Bi2O3, Na2CO3 and TiO2. CST and BNT powders were synthesized separately by mixing the starting materials according to the desired stoichiometry, and milled with ZrO2 balls and deionized water for 6 h in nylon jars. After drying, the powders were calcined at 1100 °C for 3 h and 850 °C for 3 h, respectively. The resulting powders were re-milled for 6 h. After that, the dried and re-milled mixtures were granulated with appropriate poly vinyl alcohol (PVA) as binder and uniaxial pressed at 300 MPa into cylinders with approximate size of 15 mm in diameter and 7.3 mm in thickness. The cylinders with x\u2009=\u20090.05, 0.075 and 0.1 were sintered in air at 1225–1300 °C for 3 h, and the cylinders wit

In [5]:
dev

[Example({'text': 'Perovskite nanosheets were prepared by delaminating layered perovskites according to previously described procedures. The starting material KCa2Nb3O10, prepared by a solid-state reaction, was converted into a protonic form, HCa2Nb3O10 1.5H2O, in HNO3 solution. A colloidal suspension of Ca2Nb3O10 nanosheets was synthesized by delaminating HCa2Nb3O10 1.5H2O, with tetrabutylammonium hydroxide solution (TBAOH). Sr2Nb3O10, Ca2Ta3O10, and Sr2Ta3O10 nanosheets were also synthesized by delaminating layered perovskites (KSr2Nb3O10, KCa2Ta3O10, and KSr2Ta3O10).', 'reactions': [Reaction(precursors=['KCa2Nb3O10'], additives=['HNO3'], target=Target(target_formula='HCa2Nb3O10 1.5H2O', amount_var={}, element_var={}), reaction_type='others'), Reaction(precursors=['HCa2Nb3O10 1.5H2O'], additives=['TBAOH'], target=Target(target_formula='Ca2Nb3O10', amount_var={}, element_var={}), reaction_type='others'), Reaction(precursors=['KSr2Nb3O10'], additives=[], target=Target(target_formula='S

In [11]:
train[0].inputs().keys()

['text']

In [8]:
class Match(dspy.Signature):
    """whether the extract reactions match the gold reactions"""
    # LLM-judge signature used by `metric`. The docstring above is the
    # instruction text sent to the judge, so it is prompt content.

    # Both inputs are unannotated and are passed as pre-rendered strings
    # (one line per reaction) by `metric`.
    gold = dspy.InputField(desc='the gold reactions which should be extracted')
    extracted = dspy.InputField(desc='the extracted reactions')
    # Verdict restricted to 'yes' / 'no'; `metric` treats only 'yes' as a pass.
    answer: Literal['yes', 'no'] = dspy.OutputField(desc='Does the extracted reactions match the gold reactions? Tolerate some minor differences')

def metric(gold, pred, trace=None):
    """Judge whether the predicted reactions match the gold reactions.

    Only precursors, target formula and the target's amount variables are
    compared; additives, element variables and reaction type are ignored.
    The comparison itself is delegated to an LLM judge (`Match`).

    `QA.reactions` is Optional, so either side may be None. A missing list is
    treated like an empty one instead of raising `TypeError`, and the trivial
    cases are decided locally without spending a judge call:

    * both sides empty      -> True  (correctly found nothing)
    * exactly one side empty -> False (missed everything / invented reactions)

    Args:
        gold: example exposing `.reactions` (list of Reaction or None).
        pred: prediction exposing `.reactions` (list of Reaction or None).
        trace: accepted for compatibility with DSPy optimizers; unused.

    Returns:
        bool: True when the extraction is considered a match.
    """
    def render_reactions(reactions):
        # One line per reaction, in the exact format the judge prompt expects.
        return '\n'.join(
            f"precursors: {','.join(reaction.precursors)}, "
            f"target: {reaction.target.target_formula}, "
            f"target amount variable: {reaction.target.amount_var}"
            for reaction in reactions
        )

    # Normalise None (model returned null / no gold reactions) to an empty list.
    gold_list = gold.reactions or []
    pred_list = pred.reactions or []

    if not gold_list and not pred_list:
        return True
    if not gold_list or not pred_list:
        return False

    gold_reactions = render_reactions(gold_list)
    extracted_reactions = render_reactions(pred_list)
    answer = dspy.ChainOfThought(Match)(gold=gold_reactions, extracted=extracted_reactions).answer
    return answer == 'yes'

In [9]:
# Few-shot optimizer scored by the LLM-judged `metric`; `max_bootstrapped_demos=6`
# is the cap passed for bootstrapped demonstrations. The same optimizer
# instance is reused for every `compile` call in the cells below.
optimizer = dspy.BootstrapFewShot(metric=metric, max_bootstrapped_demos=6)
# Fresh, uncompiled programs: direct extraction vs. summarise-then-extract.
# NOTE(review): `uncompiled_direct` is rebound in a later cell, so its meaning
# depends on which cells have been run.
uncompiled_direct = ExtractReactionWithType()
uncompiled_tandem = ExtractReactionTandem()

In [9]:
compiled_direct = optimizer.compile(uncompiled_direct, trainset=train)

100%|██████████| 6/6 [00:37<00:00,  6.26s/it]

Bootstrapped 5 full traces after 5 examples for up to 1 rounds, amounting to 6 attempts.





In [9]:
# Baseline without chain-of-thought: a plain Predict over the same `QA`
# signature (its evaluation table below has no `reasoning` column).
vanilla = dspy.Predict(QA)
# Bootstrap few-shot demos for the baseline from the training split.
compiled_vanilla = optimizer.compile(vanilla, trainset=train)

100%|██████████| 6/6 [00:30<00:00,  5.11s/it]

Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 6 attempts.





In [None]:
# Imported for its side effects only — presumably it patches DSPy so the
# compiled program can be serialised; confirm in sisyphus.patch.dspy_patch.
import sisyphus.patch.dspy_patch
# Single source of truth for the artefact name; a later cell loads the program
# back from this same name.
file_name = 'compiled_direct_with_element_var_version_2'
compiled_direct.save(file_name)

In [17]:
compiled_tandem = optimizer.compile(uncompiled_tandem, trainset=train)

100%|██████████| 6/6 [02:31<00:00, 25.30s/it]

Bootstrapped 3 full traces after 5 examples for up to 1 rounds, amounting to 6 attempts.





In [11]:
from dspy.evaluate import Evaluate

# Shared evaluator: unless a call overrides them, programs are scored on the
# held-out `dev` split with the LLM-judged `metric`, using 10 threads and
# showing a progress bar and a per-example table.
devset = dev
evaluate = Evaluate(devset=devset, metric=metric, num_threads=10, display_progress=True, display_table=True)
# evaluate(uncompiled_direct)

In [12]:
evaluate(extractor, metric=metric, num_threads=10, display_progress=True, display_table=True, devset=train)

Average Metric: 4.00 / 6 (66.7%): 100%|██████████| 6/6 [00:16<00:00,  2.80s/it] 

2024/12/18 16:34:24 INFO dspy.evaluate.evaluate: Average Metric: 4 / 6 (66.7%)





Unnamed: 0,text,example_reactions,reasoning,pred_reactions,metric
0,(1−x)(Ca0.88Sr0.12)TiO3–x(Bi0.5Na0.5)TiO3 [hereafter referred to a...,"[Reaction(precursors=['CaCO3', 'SrCO3', 'TiO2'], additives=[], tar...",The text describes the preparation of (1−x)(Ca0.88Sr0.12)TiO3–x(Bi...,"[precursors=['CaCO3', 'SrCO3', 'Bi2O3', 'Na2CO3', 'TiO2'] additive...",
1,Three interesting papers published during this review period demon...,"[precursors=['Bi2SrNa2Nb4O9', 'NaNbO3'] additives=[] target=Target...",The text describes various studies involving chemical analysis tec...,"[precursors=['Bi2SrNa2Nb4O9', 'NaNbO3'] additives=[] target=Target...",✔️ [True]
2,"The new phases, Na4M3(PO4)2(P2O7) (M = Mn, Co, Ni), were also obta...","[precursors=['Na4P2O7', 'MnCO3', 'NH4H2PO4'] additives=[] target=T...",The text describes a solid-state reaction process to synthesize ne...,"[Reaction(precursors=['Na4P2O7', 'MnCO3', 'NH4H2PO4'], additives=[...",✔️ [True]
3,CaBiO2Cl samples were prepared by a two-step process. All chemical...,"[precursors=['Bi(NO3)3', 'KCl'] additives=[] target=Target(target_...",The text describes two chemical reactions. The first reaction invo...,"[precursors=['Bi(NO3)3', 'KCl'] additives=['H2O'] target=Target(ta...",✔️ [True]
4,Polycrystalline LiFeP2O7 was synthesized by solid-state methods. S...,"[precursors=['LiH2PO4', 'Fe2O3', 'NH4H2PO4'] additives=[] target=T...",The text describes the synthesis of polycrystalline LiFeP2O7 using...,"[precursors=['LiH2PO4', 'Fe2O3', 'NH4H2PO4'] additives=[] target=T...",✔️ [True]
5,The BaCe0.7In0.15Ta0.05Y0.1O3-δ (BCITY) powder was prepared by com...,"[precursors=['Ba(NO3)2', 'Ce(NO3)3⋅6H2O', 'In(NO3)3⋅4.5H2O', 'Y(NO...",The text describes the preparation of two different powders: BaCe0...,"[precursors=['Ba(NO3)2', 'Ce(NO3)3⋅6H2O', 'In(NO3)3⋅4.5H2O', 'Y(NO...",


66.67

In [16]:
extractor(text=train[0].text)

Prediction(
    reasoning='The text describes the preparation of (1−x)(Ca0.88Sr0.12)TiO3–x(Bi0.5Na0.5)TiO3 ceramics using a conventional solid-state reaction method. The precursors used are CaCO3, SrCO3, Bi2O3, Na2CO3, and TiO2. The process involves mixing these starting materials, milling, drying, calcining, re-milling, granulating with PVA, pressing into cylinders, and sintering. The reaction type is clearly stated as a solid-state reaction. The target formula is (1−x)(Ca0.88Sr0.12)TiO3–x(Bi0.5Na0.5)TiO3, with x being a variable amount (0.025, 0.050, 0.075, 0.10). There are no element variables in the target formula.',
    reactions=[Reaction(precursors=['CaCO3', 'SrCO3', 'Bi2O3', 'Na2CO3', 'TiO2'], additives=['ZrO2', 'H2O', 'PVA'], target=Target(target_formula='(1−x)(Ca0.88Sr0.12)TiO3–x(Bi0.5Na0.5)TiO3', amount_var={'x': [0.025, 0.05, 0.075, 0.1]}, element_var={}), reaction_type='solid-state')]
)

In [17]:
extractor(text=train[5].text)

Prediction(
    reasoning='The text describes the preparation of two different powders: BaCe0.7In0.15Ta0.05Y0.1O3-δ (BCITY) and BaCe0.7Y0.3O3-δ (BCY). The preparation of BCITY involves a combination of solid-state reaction and the Pechini method, while BCY is prepared solely by the Pechini method. The precursors for BCITY include Ba(NO3)2, Ce(NO3)3⋅6H2O, In(NO3)3⋅4.5H2O, Y(NO3)3⋅6H2O, and Ta2O5, with citric acid as an additive. The process involves dissolving these precursors in water, adjusting the pH, and heating to form a gel, which is then ignited and calcined. The BCY preparation involves similar steps but without the inclusion of In and Ta precursors. The reactions can be categorized as "others" due to the combination of methods used.',
    reactions=[Reaction(precursors=['Ba(NO3)2', 'Ce(NO3)3⋅6H2O', 'In(NO3)3⋅4.5H2O', 'Y(NO3)3⋅6H2O', 'Ta2O5'], additives=['citric acid', 'ammonia water'], target=Target(target_formula='BaCe0.7In0.15Ta0.05Y0.1O3-δ', amount_var={}, element_var={}), r

In [24]:
evaluate(extractor, metric=metric, num_threads=10, display_progress=True, display_table=True, devset=dev)

Average Metric: 8.00 / 9 (88.9%): 100%|██████████| 9/9 [00:18<00:00,  2.08s/it] 

2024/12/18 17:20:38 INFO dspy.evaluate.evaluate: Average Metric: 8 / 9 (88.9%)





Unnamed: 0,text,example_reactions,reasoning,pred_reactions,metric
0,Perovskite nanosheets were prepared by delaminating layered perovs...,"[Reaction(precursors=['KCa2Nb3O10'], additives=['HNO3'], target=Ta...",The text describes the preparation of perovskite nanosheets throug...,"[Reaction(precursors=['KCa2Nb3O10'], additives=['HNO3'], target=Ta...",✔️ [True]
1,Strontium titanate doped with rhodium species (SrTiO3:Rh) was prep...,"[precursors=['SrCO3', 'TiO2', 'Rh2O3'] additives=[] target=Target(...",The text describes the preparation of strontium titanate doped wit...,"[precursors=['SrCO3', 'TiO2', 'Rh2O3'] additives=[] target=Target(...",✔️ [True]
2,"Polycrystalline samples of ACu3Sn4O12 (A = Ca2+, Sr2+, and Pb2+) w...","[precursors=['CaCO3', 'CuO', 'SnO2'] additives=[] target=Target(ta...",The text describes the synthesis of polycrystalline samples of ACu...,"[Reaction(precursors=['CaCO3', 'CuO', 'SnO2'], additives=[], targe...",✔️ [True]
3,The particles of SrTiO3:Rh(2%) were prepared via the water-based h...,"[precursors=['TiO2', 'SrCO3', 'Rh2O3'] additives=[] target=Target(...",The text describes two methods for preparing SrTiO3:Rh(2%) particl...,"[precursors=['TIPT', 'acac', 'AcOH', 'Sr(OAc)2', 'RhCl3'] additive...",
4,"Nonetheless, the molten salt flux assistance has room for optimiza...","[precursors=['SnO', 'Nb2O5'] additives=['SnCl2'] target=Target(tar...",The text describes a chemical reaction process involving the synth...,"[precursors=['SnO', 'Nb2O5'] additives=['SnCl2'] target=Target(tar...",✔️ [True]
5,"The Sr1−xCexCo0.2Fe0.8O3−δ (x = 0.0, 0.10, 0.15 and 0.20) and La0....","[Reaction(precursors=['Sr(NO3)2', 'Co(NO3)2·6H2O', 'Fe(NO3)3·6H2O'...",The text describes the synthesis of two different materials: Sr1−x...,"[Reaction(precursors=['Sr(NO3)2', 'Co(NO3)2·6H2O', 'Fe(NO3)3·6H2O'...",✔️ [True]
6,We used sources of thallium with a composition TlxBa2Ca2Cu3Oz (x=1...,"[precursors=['Tl2O3', 'BaCuO2', 'Ca2CuO3', 'Ba2Ca2Cu3O7'] additive...",The text describes a chemical reaction process involving the prepa...,"[precursors=['BaO2', 'CuO', 'CaO'] additives=[] target=Target(targ...",✔️ [True]
7,Single crystals of Cd4YbO(BO3)3 were grown through high-temperatur...,"[precursors=['CdCO3', 'H3BO3', 'Yb2O3'] additives=[] target=Target...",The text describes the synthesis of single crystals of Cd4YbO(BO3)...,"[precursors=['CdCO3', 'H3BO3', 'Yb2O3'] additives=[] target=Target...",✔️ [True]
8,Single crystals of TbBaMn2O6 were obtained by floating zone (FZ) m...,"[precursors=['Tb4O7', 'MnCO3', 'BaCO3'] additives=[] target=Target...",The text describes the synthesis of TbBaMn2O6 through a series of ...,"[precursors=['Tb4O7', 'MnCO3', 'BaCO3'] additives=['Ar'] target=Ta...",✔️ [True]


88.89

In [19]:
extractor(text=dev[3].text)

Prediction(
    reasoning='The text describes two methods for preparing SrTiO3:Rh(2%) particles. The first method is the water-based hetero-chelate method (WH-method), which involves using a titania sol as a precursor and mixing it with solutions of Sr(OAc)2, lactic acid, and RhCl3. The process includes stirring, drying, and calcination. The second method is the solid-state reaction method (SS-method), which uses TiO2, SrCO3, and Rh2O3 as raw materials, followed by calcination. Both methods result in the formation of SrTiO3:Rh(2%) particles. The reactions can be categorized based on the precursors, additives, and the target compound.',
    reactions=[Reaction(precursors=['TIPT', 'Sr(OAc)2', 'RhCl3'], additives=['acac', 'AcOH', 'lactic acid', 'acrylic emulsion'], target=Target(target_formula='SrTiO3:Rh', amount_var={}, element_var={}), reaction_type='others'), Reaction(precursors=['TiO2', 'SrCO3', 'Rh2O3'], additives=[], target=Target(target_formula='SrTiO3:Rh', amount_var={}, element_v

In [16]:
evaluate(vanilla)

Average Metric: 6.00 / 9 (66.7%): 100%|██████████| 9/9 [00:09<00:00,  1.00s/it] 

2024/12/13 14:50:03 INFO dspy.evaluate.evaluate: Average Metric: 6 / 9 (66.7%)





Unnamed: 0,text,example_reactions,pred_reactions,metric
0,Perovskite nanosheets were prepared by delaminating layered perovs...,"[Reaction(precursors=['KCa2Nb3O10'], additives=['HNO3'], target=Ta...","[Reaction(precursors=['KCa2Nb3O10'], additives=['HNO3'], target=Ta...",✔️ [True]
1,Strontium titanate doped with rhodium species (SrTiO3:Rh) was prep...,"[precursors=['SrCO3', 'TiO2', 'Rh2O3'] additives=[] target=Target(...","[precursors=['SrCO3', 'TiO2', 'Rh2O3'] additives=[] target=Target(...",
2,"Polycrystalline samples of ACu3Sn4O12 (A = Ca2+, Sr2+, and Pb2+) w...","[precursors=['CaCO3', 'CuO', 'SnO2'] additives=[] target=Target(ta...","[Reaction(precursors=['CaCO3', 'CuO', 'SnO2'], additives=[], targe...",✔️ [True]
3,The particles of SrTiO3:Rh(2%) were prepared via the water-based h...,"[precursors=['TiO2', 'SrCO3', 'Rh2O3'] additives=[] target=Target(...","[precursors=['TIPT', 'acac', 'AcOH', 'Sr(OAc)2', 'RhCl3'] additive...",✔️ [True]
4,"Nonetheless, the molten salt flux assistance has room for optimiza...","[precursors=['SnO', 'Nb2O5'] additives=['SnCl2'] target=Target(tar...","[precursors=['SnO', 'Nb2O5', 'SnCl2'] additives=[] target=Target(t...",
5,"The Sr1−xCexCo0.2Fe0.8O3−δ (x = 0.0, 0.10, 0.15 and 0.20) and La0....","[Reaction(precursors=['Sr(NO3)2', 'Co(NO3)2·6H2O', 'Fe(NO3)3·6H2O'...","[Reaction(precursors=['Sr(NO3)2', 'Co(NO3)2·6H2O', 'Fe(NO3)3·6H2O'...",
6,We used sources of thallium with a composition TlxBa2Ca2Cu3Oz (x=1...,"[precursors=['Tl2O3', 'BaCuO2', 'Ca2CuO3', 'Ba2Ca2Cu3O7'] additive...","[precursors=['BaO2', 'CuO', 'CaO'] additives=[] target=Target(targ...",✔️ [True]
7,Single crystals of Cd4YbO(BO3)3 were grown through high-temperatur...,"[precursors=['CdCO3', 'H3BO3', 'Yb2O3'] additives=[] target=Target...","[precursors=['CdCO3', 'H3BO3', 'Yb2O3'] additives=[] target=Target...",✔️ [True]
8,Single crystals of TbBaMn2O6 were obtained by floating zone (FZ) m...,"[precursors=['Tb4O7', 'MnCO3', 'BaCO3'] additives=[] target=Target...","[precursors=['Tb4O7', 'MnCO3', 'BaCO3'] additives=['Ar'] target=Ta...",✔️ [True]


66.67

In [18]:
evaluate(compiled_vanilla)

Average Metric: 8.00 / 9 (88.9%): 100%|██████████| 9/9 [00:06<00:00,  1.34it/s] 

2024/12/13 14:50:22 INFO dspy.evaluate.evaluate: Average Metric: 8 / 9 (88.9%)





Unnamed: 0,text,example_reactions,pred_reactions,metric
0,Perovskite nanosheets were prepared by delaminating layered perovs...,"[Reaction(precursors=['KCa2Nb3O10'], additives=['HNO3'], target=Ta...","[Reaction(precursors=['KCa2Nb3O10'], additives=[], target=Target(t...",✔️ [True]
1,Strontium titanate doped with rhodium species (SrTiO3:Rh) was prep...,"[precursors=['SrCO3', 'TiO2', 'Rh2O3'] additives=[] target=Target(...","[precursors=['SrCO3', 'TiO2', 'Rh2O3'] additives=[] target=Target(...",✔️ [True]
2,"Polycrystalline samples of ACu3Sn4O12 (A = Ca2+, Sr2+, and Pb2+) w...","[precursors=['CaCO3', 'CuO', 'SnO2'] additives=[] target=Target(ta...","[precursors=['CaCO3', 'CuO', 'SnO2'] additives=[] target=Target(ta...",✔️ [True]
3,The particles of SrTiO3:Rh(2%) were prepared via the water-based h...,"[precursors=['TiO2', 'SrCO3', 'Rh2O3'] additives=[] target=Target(...","[precursors=['TIPT', 'Sr(OAc)2', 'RhCl3'] additives=['acac', 'AcOH...",✔️ [True]
4,"Nonetheless, the molten salt flux assistance has room for optimiza...","[precursors=['SnO', 'Nb2O5'] additives=['SnCl2'] target=Target(tar...","[precursors=['SnO', 'Nb2O5'] additives=['SnCl2'] target=Target(tar...",✔️ [True]
5,"The Sr1−xCexCo0.2Fe0.8O3−δ (x = 0.0, 0.10, 0.15 and 0.20) and La0....","[Reaction(precursors=['Sr(NO3)2', 'Co(NO3)2·6H2O', 'Fe(NO3)3·6H2O'...","[Reaction(precursors=['Sr(NO3)2', 'Co(NO3)2·6H2O', 'Fe(NO3)3·6H2O'...",✔️ [True]
6,We used sources of thallium with a composition TlxBa2Ca2Cu3Oz (x=1...,"[precursors=['Tl2O3', 'BaCuO2', 'Ca2CuO3', 'Ba2Ca2Cu3O7'] additive...","[precursors=['BaO2', 'CuO', 'CaO'] additives=[] target=Target(targ...",✔️ [True]
7,Single crystals of Cd4YbO(BO3)3 were grown through high-temperatur...,"[precursors=['CdCO3', 'H3BO3', 'Yb2O3'] additives=[] target=Target...","[precursors=['CdCO3', 'H3BO3', 'Yb2O3'] additives=[] target=Target...",✔️ [True]
8,Single crystals of TbBaMn2O6 were obtained by floating zone (FZ) m...,"[precursors=['Tb4O7', 'MnCO3', 'BaCO3'] additives=[] target=Target...","[precursors=['Tb4O7', 'MnCO3', 'BaCO3'] additives=[] target=Target...",


88.89

In [12]:
evaluate(compiled_direct, display_progress=True, display_table=True)

Average Metric: 7.00 / 9 (77.8%): 100%|██████████| 9/9 [00:21<00:00,  2.38s/it] 

2024/12/13 12:33:37 INFO dspy.evaluate.evaluate: Average Metric: 7 / 9 (77.8%)





Unnamed: 0,text,example_reactions,reasoning,pred_reactions,metric
0,Perovskite nanosheets were prepared by delaminating layered perovs...,"[Reaction(precursors=['KCa2Nb3O10'], additives=['HNO3'], target=Ta...",The text describes the preparation of perovskite nanosheets throug...,"[precursors=['K2CO3', 'CaCO3', 'Nb2O5'] additives=[] target=Target...",
1,Strontium titanate doped with rhodium species (SrTiO3:Rh) was prep...,"[precursors=['SrCO3', 'TiO2', 'Rh2O3'] additives=[] target=Target(...",The text describes the preparation of SrTiO3 doped with rhodium (S...,"[precursors=['SrCO3', 'TiO2', 'Rh2O3'] additives=[] target=Target(...",✔️ [True]
2,"Polycrystalline samples of ACu3Sn4O12 (A = Ca2+, Sr2+, and Pb2+) w...","[precursors=['CaCO3', 'CuO', 'SnO2'] additives=[] target=Target(ta...",The text describes the synthesis of polycrystalline samples of ACu...,"[precursors=['CaCO3', 'CuO', 'SnO2'] additives=[] target=Target(ta...",✔️ [True]
3,The particles of SrTiO3:Rh(2%) were prepared via the water-based h...,"[precursors=['TiO2', 'SrCO3', 'Rh2O3'] additives=[] target=Target(...",The text describes two methods for preparing SrTiO3:Rh(2%) particl...,"[precursors=['TIPT', 'Sr(OAc)2', 'RhCl3'] additives=['acac', 'AcOH...",✔️ [True]
4,"Nonetheless, the molten salt flux assistance has room for optimiza...","[precursors=['SnO', 'Nb2O5'] additives=['SnCl2'] target=Target(tar...",The text describes the synthesis of SnNb2O6 using a molten salt fl...,"[precursors=['SnO', 'Nb2O5'] additives=['SnCl2'] target=Target(tar...",✔️ [True]
5,"The Sr1−xCexCo0.2Fe0.8O3−δ (x = 0.0, 0.10, 0.15 and 0.20) and La0....","[Reaction(precursors=['Sr(NO3)2', 'Co(NO3)2·6H2O', 'Fe(NO3)3·6H2O'...",The text describes the synthesis of two types of samples: Sr1−xCex...,"[Reaction(precursors=['Sr(NO3)2', 'Co(NO3)2·6H2O', 'Fe(NO3)3·6H2O'...",✔️ [True]
6,We used sources of thallium with a composition TlxBa2Ca2Cu3Oz (x=1...,"[precursors=['Tl2O3', 'BaCuO2', 'Ca2CuO3', 'Ba2Ca2Cu3O7'] additive...",The text describes the preparation of a thallium-based compound wi...,"[precursors=['BaO2', 'CuO', 'CaO'] additives=[] target=Target(targ...",✔️ [True]
7,Single crystals of Cd4YbO(BO3)3 were grown through high-temperatur...,"[precursors=['CdCO3', 'H3BO3', 'Yb2O3'] additives=[] target=Target...",The text describes the synthesis of single crystals of Cd4YbO(BO3)...,"[precursors=['CdCO3', 'H3BO3', 'Yb2O3'] additives=[] target=Target...",✔️ [True]
8,Single crystals of TbBaMn2O6 were obtained by floating zone (FZ) m...,"[precursors=['Tb4O7', 'MnCO3', 'BaCO3'] additives=[] target=Target...",The text describes the synthesis of TbBaMn2O6 using a two-step pro...,"[precursors=['Tb4O7', 'MnCO3', 'BaCO3'] additives=[] target=Target...",


77.78

In [29]:
compiled_direct(text='The BaCe0.7In0.15Ta0.05Y0.1O3-\u03b4 (BCITY) powder was prepared by combing solid state reaction and Pechini method to ensure homogeneity of mixed oxides. Firstly, the Ba(NO3)2, Ce(NO3)3\u22c56H2O, In(NO3)3\u22c54.5H2O, and Y(NO3)3\u22c56H2O with the stoichiometric amounts were dissolved in deionized water. Secondly, the citric acid, as a complexation agent, was added with the molar ratio (3:2) of citric acid to metal ions. After the pH value of the solution was adjusted to be 7 by adding appropriate ammonia water, the stoichiometric amount of Ta2O5 was added. Thirdly, the solution added with Ta2O5 was stirred continuously during heating to evaporate water until it was changed into viscous gel and finally ignited to flame, which resulted in the formation of some white ashes. Finally, the BCITY powder was obtained by calcining the white ashes at 1000\u202f\u00b0C for 5\u202fh in air. In addition, the BaCe0.7Y0.3O3-\u03b4 (BCY) powder was also prepared by Pechini method and calcined at 1000\u202f\u00b0C for 5\u202fh in air.')

Prediction(
    reasoning='The text describes the preparation of BaCe0.7In0.15Ta0.05Y0.1O3-δ (BCITY) and BaCe0.7Y0.3O3-δ (BCY) powders using a combination of solid-state reaction and the Pechini method. The process involves dissolving metal nitrates in water, adding citric acid as a complexation agent, adjusting the pH, and adding Ta2O5. The mixture is then heated to form a gel, which is ignited to form ashes. These ashes are calcined to obtain the final BCITY powder. The BCY powder is prepared similarly using the Pechini method. The Pechini method is a type of sol-gel process, which is categorized under "others" in the reaction type.',
    reactions=[Reaction(precursors=['Ba(NO3)2', 'Ce(NO3)3·6H2O', 'In(NO3)3·4.5H2O', 'Y(NO3)3·6H2O', 'Ta2O5'], additives=['citric acid', 'ammonia water'], target=Target(target_formula='BaCe0.7In0.15Ta0.05Y0.1O3-δ', amount_var={}, extra_description='BCITY powder'), reaction_type='others'), Reaction(precursors=['Ba(NO3)2', 'Ce(NO3)3·6H2O', 'Y(NO3)3·6H2O'],

In [9]:
test_text = "In 2004, Kudo and coworkers first reported the photocatalytic activities of SnNb2O6, Sn2Nb2O7, SnTa2O6 and Sn2Ta2O7 for hydrogen evolution, using methanol as the sacrificial reagent. The photocatalysts were synthesized using solid-state synthesis, and the photocatalytic activity was measured under visible irradiation for the tin niobate structures (λ > 420 nm) and under UV irradiation for the tin tantalate structures (λ > 300 nm). The two tin tantalate structures were shown to be less interesting due to their wide band gaps and low photocatalytic activity, as the hydrogen evolution activity of Sn2Ta2O7 was reported to be approximately 2.1 μmol h-1 and SnTa2O6 was found to be inactive. Interestingly, although the two tin niobate structures show similar band gaps of approximately 2.3 eV, the SnNb2O6 structure's optimized hydrogen evolution rate was found to be 18 μmol h-1, with a turnover number (TON) of 180 at 10 h. summarizes all the materials that have been used as photocatalysts for the hydrogen evolution reaction. Later in 2006, Kudo and coworkers reported the flux-assisted synthesis of SnNb2O6 starting from a Sr2Nb2O7 perovskite structure with SnCl2 as a molten salt. The synthesis proceeds through an ion exchange mechanism at a low temperature of 300 °C for a long time of 30 h, where Sn2+ replaces the Sr2+, resulting in a layered structure. Although the flux-assisted method resulted in a nanoplate material with a higher surface area in comparison to the bulk tin niobate synthesized by the solid-state method, it did not improve the activity, which may be attributed to the decreased crystallinity."
compiled_direct(text=test_text).reactions

[Reaction(precursors=['Sr2Nb2O7', 'SnCl2'], additives=[], target=Target(target_formula='SnNb2O6', amount_var={}, element_var={}), reaction_type='flux')]

In [19]:
lm.inspect_history(6)





[34m[2024-12-13T14:50:19.949828][0m

[31mSystem message:[0m

Your input fields are:
1. `gold` (str): the gold reactions which should be extracted
2. `extracted` (str): the extracted reactions

Your output fields are:
1. `reasoning` (str)
2. `answer` (Literal[yes, no]): Does the extracted reactions match the gold reactions? Tolerate some minor differences

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## gold ## ]]
{gold}

[[ ## extracted ## ]]
{extracted}

[[ ## reasoning ## ]]
{reasoning}

[[ ## answer ## ]]
{answer}        # note: the value you produce must be one of: yes; no

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        whether the extract reactions match the gold reactions


[31mUser message:[0m

[[ ## gold ## ]]
precursors: TiO2,SrCO3,Rh2O3, target: SrTiO3:Rh(2%), target amount variable: {}
precursors: TIPT,Sr(OAc)2,RhCl3, target: SrTiO3:Rh(2%), target amount variable: {}

[[ ## e

In [18]:
loaded = ExtractReactionWithType()
loaded.load('compiled_direct_with_element_var')

In [19]:
loaded(text=test_text).reactions

[Reaction(precursors=['Sr2Nb2O7', 'SnCl2'], additives=[], target=Target(target_formula='SnNb2O6', amount_var={}, element_var={}), reaction_type='flux')]

In [20]:
lm.inspect_history()





[34m[2024-12-11T16:56:14.903861][0m

[31mSystem message:[0m

Your input fields are:
1. `text` (str): a piece of text which may contains chemical reactions

Your output fields are:
1. `reasoning` (str)
2. `reactions` (Union[list[Reaction], NoneType]): the reactions extracted from the text, return null if no reaction found

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## text ## ]]
{text}

[[ ## reasoning ## ]]
{reasoning}

[[ ## reactions ## ]]
{reactions}        # note: the value you produce must be pareseable according to the following JSON schema: {"$defs": {"Reaction": {"type": "object", "properties": {"additives": {"type": "array", "items": {"type": "string"}, "title": "Additives"}, "precursors": {"type": "array", "description": "ensure it is a valid chemical formula", "items": {"type": "string"}, "title": "Precursors"}, "reaction_type": {"type": "string", "enum": ["solid-state", "sol-gel", "co-precipitation", "hydroth

In [4]:
compiled_program = ExtractReactionWithType()
compiled_program.load('compiled_direct_with_element_var_version_2')

In [13]:
compiled_program.predictor.extended_signature

StringSignature(text -> reasoning, reactions
    instructions='Identify and extract all chemical reactions from the text. For each reaction:\n\nInclude reactions where precursors and target products are explicitly mentioned.\nEnsure not to overlook reactions that are less prominently discussed but still have their precursors and target explicitly stated.\nAlso extract intermediate reactions and label them as such.'
    text = Field(annotation=str required=True json_schema_extra={'desc': 'a piece of text which may contains chemical reactions', '__dspy_field_type': 'input', 'prefix': 'Text:'})
    reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${reasoning}', '__dspy_field_type': 'output'})
    reactions = Field(annotation=Union[list[Reaction], NoneType] required=True json_schema_extra={'desc': 'the reactions extracted from the text, return null if no reaction found', '__dspy_field_type': 'output', 'p

In [10]:
text_for_test = """Polycrystalline BaGa2S4:Eu2+ sample were synthesised from BaS and Ga2S3 sulphide powders mixed in stoichiometric composition and annealed at 900 °C under a stream of H2S+Ar for 4 h. The doping ions were introduced in the form of EuF3. This preparation method is different from that reported by Peters et al. [11] and Davolos et al. [13] who had used barium carbonate (BaCO3), gallium oxide (Ga2O3) and europium oxide (Eu2O3) as starting materials and H2S gas as sulphuring agent. A third preparation method presented by Donohue et al. was based on solid-state reactions between Ba and Ga in elemental form and europium sulphide EuS [14]. The method presented here provides powder samples with good crystalline properties as shown by X-ray diffraction measurements. Powder samples with varying Eu2+ concentrations (1, 5 and 10%) were studied. The samples exhibit a brownish yellow colour."""

In [14]:
compiled_program(text=text_for_test).reactions

[Reaction(precursors=['BaS', 'Ga2S3', 'EuF3'], additives=[], target=Target(target_formula='BaGa2S4:Eu2+', amount_var={'Eu2+': [0.01, 0.05, 0.1]}, element_var={}), reaction_type='solid-state')]

In [8]:
# Rebinds `uncompiled_direct` to a fresh program and copies the instruction
# text from the loaded `compiled_program` onto it, so the two programs share
# the same instructions when compared on `text_for_test` in the next cell.
# NOTE(review): this assigns through `extended_signature.instructions` in
# place; confirm that the installed DSPy version honours this assignment.
uncompiled_direct = ExtractReactionWithType()
uncompiled_direct.predictor.extended_signature.instructions = compiled_program.predictor.extended_signature.instructions

In [11]:
uncompiled_direct(text=text_for_test).reactions

[Reaction(precursors=['BaS', 'Ga2S3'], additives=['EuF3', 'H2S', 'Ar'], target=Target(target_formula='BaGa2S4:Eu2+', amount_var={}, element_var={}), reaction_type='solid-state'),
 Reaction(precursors=['BaCO3', 'Ga2O3', 'Eu2O3'], additives=['H2S'], target=Target(target_formula='BaGa2S4:Eu2+', amount_var={}, element_var={}), reaction_type='solid-state'),
 Reaction(precursors=['Ba', 'Ga', 'EuS'], additives=[], target=Target(target_formula='BaGa2S4:Eu2+', amount_var={}, element_var={}), reaction_type='solid-state')]

In [22]:
lm.inspect_history(3)





[34m[2024-12-12T10:29:07.733580][0m

[31mSystem message:[0m

Your input fields are:
1. `text` (str): a piece of text which may contains chemical reactions

Your output fields are:
1. `reasoning` (str)
2. `reactions` (Union[list[Reaction], NoneType]): the reactions extracted from the text, return null if no reaction found

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## text ## ]]
{text}

[[ ## reasoning ## ]]
{reasoning}

[[ ## reactions ## ]]
{reactions}        # note: the value you produce must be pareseable according to the following JSON schema: {"$defs": {"Reaction": {"type": "object", "properties": {"additives": {"type": "array", "items": {"type": "string"}, "title": "Additives"}, "precursors": {"type": "array", "description": "ensure it is a valid chemical formula", "items": {"type": "string"}, "title": "Precursors"}, "reaction_type": {"type": "string", "enum": ["solid-state", "sol-gel", "co-precipitation", "hydroth

In [20]:
class SwappedChatAdapter(dspy.adapters.ChatAdapter):
    """ChatAdapter variant that moves the leading system message next to the query.

    The message list is built by the parent ``ChatAdapter.format``; this subclass
    only changes the position of the first message (assumed to be the system
    prompt, as shown by ``lm.inspect_history`` output for the default adapter).
    """

    def format(self, signature, demos, inputs):
        """Build the chat messages and relocate the first one before the final turn.

        Args:
            signature: Signature forwarded unchanged to ``ChatAdapter.format``.
            demos: Demonstrations forwarded unchanged to ``ChatAdapter.format``.
            inputs: Input values forwarded unchanged to ``ChatAdapter.format``.

        Returns:
            The parent's message list with its first message moved so that it sits
            immediately before the last message. Lists with fewer than two
            messages are returned as-is.
        """
        messages: list = super().format(signature, demos, inputs)

        # Nothing to reorder; also avoids IndexError from pop(0) on an empty list.
        if len(messages) < 2:
            return messages

        system_message = messages.pop(0)
        # list.insert(-1, x) places x directly before the last element, i.e. right
        # before the final query. The previous insert(-2, ...) landed one slot
        # earlier, between the last demo's user and assistant turns.
        messages.insert(-1, system_message)
        return messages

# Install the swapped adapter through dspy.configure.
# NOTE(review): this call is not scoped to the `with` block below, so the adapter
# presumably stays active for cells executed afterwards — confirm that is
# intended, or pass `adapter=` to dspy.context instead.
dspy.configure(adapter=SwappedChatAdapter())
# Use a separate gpt-4o LM for this run only; unlike the notebook-level `lm`,
# it is created without `cache=False`.
with dspy.context(lm=dspy.LM('openai/gpt-4o')):
    res = compiled_program(text=text_for_test)
    # Print the reasoning and the parsed reactions, separated by a newline.
    print(res.reasoning, '\n', res.reactions)

The text describes the synthesis of polycrystalline BaGa2S4:Eu2+ using a solid-state reaction method. The precursors used in this method are BaS, Ga2S3, and EuF3. The mixture is annealed at 900 °C under a stream of H2S+Ar gas. The text also mentions alternative methods used by other researchers, but the focus is on the method using BaS and Ga2S3. The target product is BaGa2S4 doped with Eu2+ ions, with varying concentrations of Eu2+ (1, 5, and 10%). 
 [Reaction(precursors=['BaS', 'Ga2S3', 'EuF3'], additives=[], target=Target(target_formula='BaGa2S4:Eu2+', amount_var={'Eu2+': [0.01, 0.05, 0.1]}, element_var={}), reaction_type='solid-state')]


In [31]:
# Run `compiled_program` on the first item of `dev` and display the whole
# prediction (reasoning and reactions).
compiled_program(text=dev[0])

Prediction(
    reasoning='The text describes the preparation of perovskite nanosheets through a series of chemical transformations. The starting material, KCa2Nb3O10, is first converted into a protonic form, HCa2Nb3O10 1.5H2O, using HNO3. This intermediate is then delaminated using tetrabutylammonium hydroxide (TBAOH) to produce Ca2Nb3O10 nanosheets. Additionally, other nanosheets such as Sr2Nb3O10, Ca2Ta3O10, and Sr2Ta3O10 are synthesized by delaminating their respective layered perovskites (KSr2Nb3O10, KCa2Ta3O10, and KSr2Ta3O10). Each step involves a transformation of a precursor into a target product, often with the aid of an additive.',
    reactions=[Reaction(precursors=['KCa2Nb3O10'], additives=['HNO3'], target=Target(target_formula='HCa2Nb3O10 1.5H2O', amount_var={}, element_var={}), reaction_type='others'), Reaction(precursors=['HCa2Nb3O10 1.5H2O'], additives=['TBAOH'], target=Target(target_formula='Ca2Nb3O10', amount_var={}, element_var={}), reaction_type='others'), Reaction

In [32]:
# Run the instruction-matched `uncompiled_direct` module on the same `dev` item
# so its prediction can be compared with the one from `compiled_program`.
uncompiled_direct(text=dev[0])

Prediction(
    reasoning='The text describes the preparation of perovskite nanosheets through a series of chemical reactions. The reactions involve the conversion of layered perovskites into their protonic forms and subsequent delamination to form nanosheets. The text explicitly mentions the starting materials, additives, and target products for each reaction, allowing for the extraction of detailed reaction information. The reactions are not categorized under typical reaction types like solid-state or sol-gel, so they are labeled as "others."',
    reactions=[Reaction(precursors=['KCa2Nb3O10'], additives=['HNO3'], target=Target(target_formula='HCa2Nb3O10 1.5H2O', amount_var={}, element_var={}), reaction_type='others'), Reaction(precursors=['HCa2Nb3O10 1.5H2O'], additives=['TBAOH'], target=Target(target_formula='Ca2Nb3O10', amount_var={}, element_var={}), reaction_type='others'), Reaction(precursors=['KSr2Nb3O10'], additives=[], target=Target(target_formula='Sr2Nb3O10', amount_var={},

In [33]:
# Dump recent calls recorded on the notebook-level `lm` (argument: 2) to check
# the message order actually sent for the two runs on `dev[0]`.
lm.inspect_history(2)





[34m[2024-12-12T11:08:35.720688][0m

[31mUser message:[0m

This is an example of the task, though some input or output fields are not supplied.

[[ ## text ## ]]
The BaCe0.7In0.15Ta0.05Y0.1O3-δ (BCITY) powder was prepared by combing solid state reaction and Pechini method to ensure homogeneity of mixed oxides. Firstly, the Ba(NO3)2, Ce(NO3)3⋅6H2O, In(NO3)3⋅4.5H2O, and Y(NO3)3⋅6H2O with the stoichiometric amounts were dissolved in deionized water. Secondly, the citric acid, as a complexation agent, was added with the molar ratio (3:2) of citric acid to metal ions. After the pH value of the solution was adjusted to be 7 by adding appropriate ammonia water, the stoichiometric amount of Ta2O5 was added. Thirdly, the solution added with Ta2O5 was stirred continuously during heating to evaporate water until it was changed into viscous gel and finally ignited to flame, which resulted in the formation of some white ashes. Finally, the BCITY powder was obtained by calcining the white as

In [None]:
# DOI strings kept for follow-up.
# NOTE(review): presumably papers on which the extraction misbehaved — confirm
# and record what went wrong for each.
bad_behaviour_dois = [
    '10.1002/adfm.201101123',
    '10.1039/c1cy00199j',
    '10.1016/j.jpcs.2005.02.003',
]