In [None]:
import re
import json
from typing import Optional, Literal
from contextvars import ContextVar

import dspy
from pydantic import BaseModel, Field, field_validator

from sisyphus.chain import Filter, Writer
from sisyphus.utils.helper_functions import get_plain_articledb, get_create_resultdb
from sisyphus.chain.customized_elements import customized_extractor
from sisyphus.utils.helper_functions import return_valid, load_from_curated_examples


# Context-local holder for extra arguments; it is not read anywhere in the visible cells.
# NOTE(review): the default is one dict object, so every context that never calls
# `.set()` gets (and can mutate) the same dict - confirm this sharing is intended.
additional_args = ContextVar('additional_args', default={})
# config
# Create the LM handle (constructed with cache=False) and register it as the
# default LM for the DSPy modules defined below.
lm = dspy.LM('openai/gpt-4o', cache=False)
dspy.configure(lm=lm)


# Signature for the gate that decides whether a paragraph is worth sending to
# the reaction extractor. The docstring and every `desc` string below are
# rendered into the LM prompt (see the inspect_history output in this notebook),
# so they are runtime text: changing a character changes the prompt.
class ClassifyReaction(dspy.Signature):
    """Giving the availability of extracting solid-state reaction formula from the text """
    # Inputs; left un-annotated, they show up as `str` in the rendered prompt.
    text = dspy.InputField(desc='a piece of text which may contains solid-state chemical reaction formula')
    solid_state_definition = dspy.InputField(desc='the definition of solid-state reaction')
    
    # Output flag; callers in this notebook use it directly as a truthy/falsy value.
    extraction_availability_solid_react: int = dspy.OutputField(desc='1 for yes, 0 for no')


class Classify_CoT(dspy.Module):
    """Chain-of-thought classifier built on the ``ClassifyReaction`` signature."""

    def __init__(self):
        # Run dspy.Module's own initialisation before attaching sub-modules;
        # the original skipped it, leaving the base class state unset.
        super().__init__()
        self.predictor = dspy.ChainOfThought(signature=ClassifyReaction)

    def forward(self, text, solid_state_definition):
        """Classify one piece of text.

        Args:
            text: the paragraph to inspect.
            solid_state_definition: the definition of a solid-state reaction
                that is handed to the LM alongside the text.

        Returns:
            The prediction produced by the chain-of-thought predictor; callers
            read ``extraction_availability_solid_react`` from it.
        """
        return self.predictor(text=text, solid_state_definition=solid_state_definition)
    

# Product of a reaction. The `description` strings are part of the pydantic
# field metadata (runtime text), so they are left exactly as written.
class Target(BaseModel):
    # Formula of the product; may contain variables such as `x` (see amount_var).
    target_formula: str
    # Maps each variable appearing in the formula to the list of values it takes.
    amount_var: dict[str, list] = Field(description='amount variable in the formula, e.g. {x: [1, 2]}')
    # Nullable, but no default is supplied, so the key itself is still required.
    extra_description: Optional[str] = Field(description='extra description other than the formula')

# One extracted synthesis reaction: starting materials, additives, product and
# a coarse method label. The `description` strings are runtime field metadata.
class Reaction(BaseModel):
    precursors: list[str] = Field(description='the precursors or starting material of reaction, ensure it is a valid chemical formula')
    additives: list[str] = Field(description='the additives of the reaction')
    target: Target = Field(description='the product of the reaction, make sure it is a valid chemical formula')
    # Closed vocabulary; anything outside the listed methods must be 'others'.
    reaction_type: Literal['solid-state', 'sol-gel', 'co-precipitation', 'hydrothermal', 'flux', 'others'] = Field(description='the type of the reaction')

    # Disabled hard validation of non-empty precursors. The same rule is applied
    # as a soft check by `at_least_precursors` further down in this file.
    # @field_validator('precursors')
    # @classmethod
    # def validate_precusors(cls, v):
    #     if not v:
    #         raise ValueError('precursors should not be empty')
    #     return v

def at_least_precursors(reactions):
    """Check that every reaction lists at least one precursor.

    Args:
        reactions: an iterable of reaction objects, or a falsy value
            (``None`` / empty list) when nothing was extracted.

    Returns:
        ``(True, None)`` when there is nothing to check or every reaction has
        precursors; otherwise ``(False, json)`` where ``json`` is the
        ``model_dump_json()`` of the first reaction with empty precursors.
    """
    # A falsy `reactions` is treated as an empty sequence, which passes trivially.
    offender = next(
        (candidate for candidate in (reactions or ()) if not candidate.precursors),
        None,
    )
    if offender is None:
        return True, None
    return False, offender.model_dump_json()
    
# Extraction signature: text in, structured reactions out. The docstring and
# `desc` strings are prompt text consumed at runtime and are kept verbatim.
class QA(dspy.Signature):
    """extract chmemical reactions consitituents from the text"""
    text: str = dspy.InputField(desc='a piece of text which may contains chemical reaction')
    # Optional: the LM is told to return null when the text holds no reaction,
    # so consumers must be prepared for `None` here.
    reactions: Optional[list[Reaction]] = dspy.OutputField(desc='the reactions extracted from the text, return null if no reaction found')

# Review-step signature: given the goal and the previous extraction, produce an
# instruction for a follow-up correction step (or null). In the visible cells
# it is referenced only from commented-out code in ExtractReactionWithType.
class GenInstruction(dspy.Signature):
    """based on the goal, check the correctness of previous results, considering possibly data loss or hallucination"""
    text: str = dspy.InputField(desc='a piece of text which may contains chemical reaction')
    goal = dspy.InputField()
    last_step_results: Optional[list[Reaction]] = dspy.InputField()
    action_instruction: Optional[str] = dspy.OutputField(default=None, desc='the instruction for the following step, return null if no instruction needed')

# Correction-step signature: applies an instruction to the previous extraction
# and returns the amended reactions. In the visible cells it is referenced only
# from commented-out code in ExtractReactionWithType.
class Rectify(dspy.Signature):
    """rectify the previous step results based on the instruction"""
    text: str = dspy.InputField(desc='a piece of text which may contains chemical reaction')
    last_step_results: Optional[list[Reaction]] = dspy.InputField(desc='the reactions extracted from the text')
    action_instruction: str = dspy.InputField(desc='the instruction of how to rectify the previous step results')
    rectified_results: Optional[list[Reaction]] = dspy.OutputField(desc='the rectified results, return null if no rectification needed')

# Unfinished placeholder: the instruction text is cut short, no fields are
# declared, and nothing in the visible cells uses this signature.
class ReCheck(dspy.Signature):
    """Based on"""

class ExtractReactionWithType(dspy.Module):
    """Extract reactions from text with a chain-of-thought ``QA`` predictor.

    After prediction, a ``dspy.Suggest`` check (targeting the predictor) asks
    that every extracted reaction has at least one precursor.
    """

    def __init__(self):
        # Run dspy.Module's own initialisation before attaching sub-modules;
        # the original skipped it, leaving the base class state unset.
        super().__init__()
        self.predictor = dspy.ChainOfThought(signature=QA)
        # self.checker = dspy.ChainOfThought(signature=GenInstruction)
        # self.rectifier = dspy.ChainOfThought(signature=Rectify)

    def forward(self, text):
        """Extract reactions from ``text``.

        Args:
            text: the passage (optionally with title/abstract context) to mine.

        Returns:
            ``dspy.Prediction`` whose ``reactions`` attribute is whatever the
            predictor returned: a list of ``Reaction`` or ``None``.
        """
        reactions = self.predictor(text=text).reactions
        # `reaction_repr` is the JSON of the first reaction without precursors,
        # or None when the check passes; it is only used in the feedback message.
        least_precursor, reaction_repr = at_least_precursors(reactions)
        dspy.Suggest(
            least_precursor,
            f'for {reaction_repr}: reaction must have at least one precursor, or there is no reaction at all',
            target_module=self.predictor
        )
        # instructions = self.checker(text=text, goal=QA.__doc__, last_step_results=reactions).action_instruction
        # if instructions:
        #     rectified_results = self.rectifier(text=text, last_step_results=reactions, action_instruction=instructions).rectified_results
        #     return dspy.Prediction(reactions=rectified_results)
        return dspy.Prediction(reactions=reactions)
        

# Judge signature used by `llm_metric`: compares a human-curated result with an
# extracted one and answers a yes/no question. Docstring and `desc` strings are
# prompt text consumed at runtime and are kept verbatim.
class Assess(dspy.Signature):
    """Assess the quality of the extracted reaction"""
    grounded_result = dspy.InputField(desc='the grounded result by a human expert in json format')
    extracted_result = dspy.InputField(desc='the extracted result by a NLP program in json format')
    question = dspy.InputField()
    answer = dspy.OutputField(desc='please answer yes/no')


def llm_metric(gold, pred, trace=None):
    """LLM-as-judge metric: does the extraction capture the curated reactions?

    Args:
        gold: example carrying the curated ``reactions``.
        pred: prediction carrying the extracted ``reactions``; may be ``None``
            because the ``QA`` signature lets the LM return null.
        trace: accepted for the DSPy metric calling convention; unused.

    Returns:
        ``True`` when the judge's answer is "yes" (ignoring case, surrounding
        whitespace and a trailing '.' or '!'), otherwise ``False``.
    """
    # Treat a missing reaction list as empty instead of crashing on iteration.
    grounded_result = [reaction.model_dump() for reaction in (gold.reactions or [])]
    extract_result = [reaction.model_dump() for reaction in (pred.reactions or [])]
    critic = dspy.ChainOfThoughtWithHint(Assess)(
        grounded_result=json.dumps(grounded_result, indent=2),
        extracted_result=json.dumps(extract_result, indent=2),
        question='based on the given grounded result, do you think that the extracted result successfully capture the reaction?'
    )
    # LMs frequently answer "Yes." or " yes"; normalise before comparing so a
    # correct verdict is not scored as a failure.
    verdict = (critic.answer or '').strip().strip('.!').strip().lower()
    return verdict == 'yes'



In [2]:
# Load the hand-curated examples, with `Reaction` passed as the model class.
# NOTE(review): presumably the first tuple names the example fields and the
# second names the input keys - confirm against load_from_curated_examples.
examples = load_from_curated_examples('curated_examples.json', ('text', 'reactions'), ('text',), Reaction)

In [3]:
from dspy.datasets.dataloader import DataLoader
dt = DataLoader()
# 60/40 split with a fixed seed; the 'test' portion is used here as the dev set.
splits = dt.train_test_split(examples, train_size=0.6, random_state=22)
train, dev = splits['train'], splits['test']

In [4]:
# Echo the training split so the notebook displays it.
train

[Example({'text': '(1−x)(Ca0.88Sr0.12)TiO3–x(Bi0.5Na0.5)TiO3 [hereafter referred to as (1−x)CST–xBNT] ceramics (x\u2009=\u20090.025, 0.050, 0.075, 0.10) were prepared by the conventional solid-state reaction method. The starting materials were high-purity grade powders (>\u200999%): CaCO3, SrCO3, Bi2O3, Na2CO3 and TiO2. CST and BNT powders were synthesized separately by mixing the starting materials according to the desired stoichiometry, and milled with ZrO2 balls and deionized water for 6 h in nylon jars. After drying, the powders were calcined at 1100 °C for 3 h and 850 °C for 3 h, respectively. The resulting powders were re-milled for 6 h. After that, the dried and re-milled mixtures were granulated with appropriate poly vinyl alcohol (PVA) as binder and uniaxial pressed at 300 MPa into cylinders with approximate size of 15 mm in diameter and 7.3 mm in thickness. The cylinders with x\u2009=\u20090.05, 0.075 and 0.1 were sintered in air at 1225–1300 °C for 3 h, and the cylinders wit

In [5]:
# Uncompiled extractor (no bootstrapped demos) with assertions activated so the
# dspy.Suggest check inside forward() takes effect; tried on the first training text.
z_shot_extractor = ExtractReactionWithType().activate_assertions()
z_shot_extractor(text=train[0]['text'])

Prediction(
    reactions=[Reaction(precursors=['CaCO3', 'SrCO3', 'Bi2O3', 'Na2CO3', 'TiO2'], additives=['ZrO2', 'H2O', 'PVA'], target=Target(target_formula='(1−x)(Ca0.88Sr0.12)TiO3–x(Bi0.5Na0.5)TiO3', amount_var={'x': [0.025, 0.05, 0.075, 0.1]}, extra_description='ceramics'), reaction_type='solid-state')]
)

In [28]:
# `_` is the interactive session's previous cell result; show its reactions.
_.reactions

[Reaction(precursors=['Sr(NO3)2', 'Co(NO3)2·6H2O', 'Fe(NO3)3·6H2O', 'Ce(NO3)3·6H2O'], additives=['EDTA', 'NH4OH', 'citric acid'], target=Target(target_formula='Sr1−xCexCo0.2Fe0.8O3−δ', amount_var={'x': [0.0, 0.1, 0.15, 0.2]}, extra_description='cubic perovskite phase'), reaction_type='others'),
 Reaction(precursors=['La(NO3)3·6H2O', 'Sr(NO3)2', 'Co(NO3)2·6H2O', 'Fe(NO3)3·6H2O'], additives=['EDTA', 'NH4OH', 'citric acid'], target=Target(target_formula='La0.6Sr0.4Co0.2Fe0.8O3−δ', amount_var={}, extra_description='cubic perovskite phase'), reaction_type='others')]

In [40]:
# Few-shot bootstrapper scored by the LLM-as-judge metric defined above.
compiler = dspy.BootstrapFewShot(metric=llm_metric)

In [43]:
# Student and teacher are two separate fresh instances, both with assertions
# activated; demos are bootstrapped from the training split.
compiled_program = compiler.compile(ExtractReactionWithType().activate_assertions(), teacher=ExtractReactionWithType().activate_assertions(), trainset=train)

 89%|████████▉ | 8/9 [01:02<00:07,  7.77s/it]

Bootstrapped 4 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.





In [44]:
# NOTE(review): imported only for its side effects, presumably a patch that
# save() relies on - confirm what sisyphus.patch.dspy_patch changes.
import sisyphus.patch.dspy_patch
# Persist the compiled program to disk.
compiled_program.save('compiled_program.json')

In [8]:
@return_valid
def extract(doc):
    """Extract reactions from one document paragraph.

    The paragraph is first passed through the module-level ``classifier``; when
    its flag is falsy nothing is extracted and ``None`` is returned. Otherwise
    the title, abstract, sub-titles (one per line, split on '/') and the
    paragraph are assembled into a single context string and handed to
    ``compiled_extractor``.

    NOTE(review): ``compiled_extractor`` is not assigned in the visible cells
    (the only assignment is commented out) - make sure it exists before running.

    Args:
        doc: object exposing ``page_content`` and a ``metadata`` mapping with
            'title', 'abstract' and 'sub_titles' keys.

    Returns:
        The ``reactions`` of the extractor's prediction, or ``None`` when the
        classifier rejects the paragraph.
    """
    gate = classifier(text=doc.page_content, solid_state_definition=definition)
    if not gate.extraction_availability_solid_react:
        return None

    paragraph = doc.page_content
    title = doc.metadata['title']
    abstract = doc.metadata['abstract']
    sub_titles = doc.metadata['sub_titles']
    # Sub-titles arrive '/'-separated; present them one per line.
    subtitle_lines = '\n'.join(sub_titles.split('/'))
    context = (
        f'title:\n{title}\n'
        f'abstract:\n{abstract}\n'
        'subtitles:\n'
        f'{subtitle_lines}'
        f'\nparagraph:\n{paragraph}'
    )
    return compiled_extractor(text=context).reactions


# Names of the source article database and of the result database.
ARTICLE = '40_with_good_title'
TARGET = 'result_inorgan_40'

# Whole-word, case-insensitive match for section headings that usually hold
# synthesis details (experiment*, synthesis*, preparation*, process*, method*).
exp_section_pattern = re.compile(
    r'\b(?:experiment(?:al|s|ing|ed)?|synthesis(?:es|ing|ed)?|preparation(?:s|al|ed|ing)?|process(?:es|ion|ing)?|method(?:s)?)\b',
    re.IGNORECASE,
)


def filter_with_kw(doc):
    """Return True when the document's sub-titles mention an experimental section.

    Args:
        doc: object whose ``metadata['sub_titles']`` is the sub-title string.

    Returns:
        ``True`` if ``exp_section_pattern`` matches anywhere in the sub-titles,
        ``False`` otherwise.
    """
    sub_titles = doc.metadata['sub_titles']
    return exp_section_pattern.search(sub_titles) is not None

# Source side: article database filtered down to documents whose sub-titles
# match the experimental-section keywords.
article_db = get_plain_articledb(ARTICLE)
article_getter = Filter(article_db, filter_func=filter_with_kw, with_abstract=True)
# Sink side: result database created for (or opened with) the Reaction model.
result_db = get_create_resultdb(TARGET, Reaction)
# NOTE(review): `extract` calls `compiled_extractor`, but its only assignment in
# the visible cells is the commented-out line below - define it before running the chain.
# compiled_extractor = compiled_program

# Module-level classifier and definition read by `extract`.
classifier = Classify_CoT()
definition = 'Solid-state reaction refers to a conventional method used in chemistry to synthesize various materials like ceramics and crystals by heating a mixture of raw materials in solid form.'

# Wrap `extract` as a chain element. NOTE(review): 'thread' and 5 look like the
# concurrency mode and worker count - confirm against customized_extractor.
my_extractor = customized_extractor(extract, 'thread', 5)
# Full pipeline: fetch filtered articles -> extract reactions -> write to result_db.
chain = article_getter + my_extractor + Writer(result_db=result_db)
# chain_with_out_writer = article_getter + my_extractor

# The actual runs are left commented out; nothing below executes the chain.
# result_db.clear_tables()
from sisyphus.chain.chain_elements import run_chains_with_extarction_history_multi_threads
# with dspy.context(lm=dspy.LM('openai/gpt-4o-mini')):
    # run_chains_with_extarction_history_multi_threads(chain, 'articles_processed', 10, 'inorgan_good_prompt')
# with dspy.context(lm=dspy.LM('openai/gpt-4o-mini', cache=False)):
#     docinfos = chain.compose('10.1021&sol;nn101453v.html')

In [9]:
# Load the results before opening the output file: opening in 'w' mode truncates
# it immediately, so a failure inside load_as_json would otherwise wipe any
# previously exported results.
results_with_doi = result_db.load_as_json(with_doi=True)
with open('extract_results_with_doi.json', 'w', encoding='utf8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(results_with_doi, f, indent=2, ensure_ascii=False)

In [10]:
text_ = """The high purity of samples were fabricated by a time-efficient method combining levitation melting and spark plasma sintering (SPS).31, 44 Alloys with nominal composition Hf0.65Zr0.35Ni1-zPtzSn0.98Sb0.02 (z = 0, 0.05, 0.1, 0.15) were first prepared by levitation melting of stoichiometric amounts of Hf (piece, 99.99%), Zr (piece, 99.99%), Ni (block, 99.999%), Pt (particles, 99.999%), Sn (particles, 99.999%), and Sb (block, 99.99%) under an argon atmosphere for 2 min, and the melt was quenched in a water-cooled copper crucible. The ingots were remelted twice to ensure homogeneity. Mechanical milling was carried out with normal butane protection at 200 rpm for 4 h. The powders were then sintered by SPS (SPS-1050, Sumitomo Coal Mining Co.) at 1175 K under 65 MPa in vacuum for 10 min. The as-sintered samples, of which the relative density was found to be ≈95%, were used for measurements of thermal conductivity and Hall data, and then cut into rectangular bars for Seebeck coefficient and electrical conductivity measurements."""
# Manual check of the gate on the sample paragraph held in `text_`.
# This rebinds `classifier` and `definition` with the same values as the
# pipeline cell, so the globals read by `extract` are replaced as well.
classifier = Classify_CoT()
definition = 'Solid-state reaction refers to a conventional method used in chemistry to synthesize various materials like ceramics and crystals by heating a mixture of raw materials in solid form.'
classifier(text=text_, solid_state_definition=definition).extraction_availability_solid_react

0

In [11]:
# Print the most recent prompt/response exchange recorded on the LM handle.
lm.inspect_history()





[34m[2024-12-05T21:37:45.307470][0m

[31mSystem message:[0m

Your input fields are:
1. `text` (str): a piece of text which may contains solid-state chemical reaction formula
2. `solid_state_definition` (str): the definition of solid-state reaction

Your output fields are:
1. `reasoning` (str)
2. `extraction_availability_solid_react` (int): 1 for yes, 0 for no

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## text ## ]]
{text}

[[ ## solid_state_definition ## ]]
{solid_state_definition}

[[ ## reasoning ## ]]
{reasoning}

[[ ## extraction_availability_solid_react ## ]]
{extraction_availability_solid_react}        # note: the value you produce must be a single int value

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Giving the availability of extracting solid-state reaction formula from the text


[31mUser message:[0m

[[ ## text ## ]]
The high purity of samples were fabricated by a time-