In [22]:
import re
import json
from typing import Optional, Literal
from contextvars import ContextVar

import dspy
from pydantic import BaseModel, Field, field_validator

from sisyphus.chain import Filter, Writer
from sisyphus.utils.helper_functions import get_plain_articledb, get_create_resultdb
from sisyphus.chain.customized_elements import customized_extractor
from sisyphus.utils.helper_functions import return_valid


# Context-local slot for extra arguments; none of the visible cells reads or sets it.
# NOTE(review): the default is one shared dict object, so mutating it in place
# would be visible in every context that has not called .set() — confirm usage.
additional_args = ContextVar('additional_args', default={})
# config
# Language model registered as dspy's default for the modules built in this notebook.
# NOTE(review): cache=False presumably forces a fresh API call on every request — confirm.
lm = dspy.LM('openai/gpt-4o-mini', cache=False)
dspy.configure(lm=lm)

def load_json(file_path, train_size=10):
    """Load labelled examples from a JSON file and split them into train/dev sets.

    The file must hold a top-level ``examples`` list of objects. Every object
    becomes a ``dspy.Example`` whose ``text`` field is marked as the input.

    Args:
        file_path: Path of the UTF-8 encoded JSON file.
        train_size: Number of leading examples that go to the training set.
            Defaults to 10; everything after that index forms the dev set.

    Returns:
        tuple[list, list]: ``(train_set, dev_set)``.

    Raises:
        ValueError: If ``train_size`` is negative (a negative slice index would
            silently split from the end of the list instead).
        KeyError: If the JSON document has no ``examples`` key.
    """
    if train_size < 0:
        raise ValueError('train_size must be non-negative')
    with open(file_path, 'r', encoding='utf8') as f:
        data = json.load(f)
    # Build every example once, then slice, instead of repeating the comprehension per split.
    examples = [dspy.Example(**example).with_inputs('text') for example in data['examples']]
    return examples[:train_size], examples[train_size:]


# Signature of the yes/no pre-filter: given a paragraph and a definition of
# "solid-state reaction", the LM reports whether a solid-state reaction formula
# can be extracted from the paragraph.
# NOTE(review): the docstring and every `desc` below are prompt text handed to the
# LM, so editing their wording (including the grammar slips) changes model input.
class ClassifyReaction(dspy.Signature):
    """Giving the availability of extracting solid-state reaction formula from the text """
    # Inputs: the paragraph to judge and the definition it is judged against.
    text = dspy.InputField(desc='a piece of text which may contains solid-state chemical reaction formula')
    solid_state_definition = dspy.InputField(desc='the definition of solid-state reaction')

    # Output: integer flag, 1 = extractable, 0 = not (see `desc`).
    extraction_availability_solid_react: int = dspy.OutputField(desc='1 for yes, 0 for no')


class Classify_CoT(dspy.Module):
    """Chain-of-thought classifier built on the ``ClassifyReaction`` signature.

    Calling the module returns the dspy prediction, whose
    ``extraction_availability_solid_react`` field is the 1/0 flag declared by
    the signature.
    """

    def __init__(self):
        # Initialise dspy.Module's own state explicitly instead of relying on
        # the framework to supply it for subclasses that skip this call.
        super().__init__()
        self.predictor = dspy.ChainOfThought(signature=ClassifyReaction)

    def forward(self, text, solid_state_definition):
        """Classify one paragraph.

        Args:
            text: Paragraph to judge.
            solid_state_definition: Definition of "solid-state reaction" given to the LM.

        Returns:
            The prediction produced by the chain-of-thought predictor.
        """
        return self.predictor(text=text, solid_state_definition=solid_state_definition)
    

# Product of a reaction: a (possibly parametrised) formula plus the values its
# placeholders take.
# NOTE(review): documented with comments rather than a class docstring because
# pydantic copies a model docstring into the generated JSON schema.
class Target(BaseModel):
    # Formula as written in the text, e.g. 'LiMn2-yMyO4'.
    target_formula: str
    # Element placeholders and the elements they stand for.
    element_var: dict[str, list] = Field(description='element variable in the formula, e.g. {X: [Li, Na]}')
    # Amount placeholders and the values/bounds they take.
    amount_var: dict[str, list] = Field(description='amount variable in the formula, e.g. {x: [1, 2]}')
    # NOTE(review): Optional without a default is still a *required* key in
    # pydantic v2 (None must be passed explicitly) — confirm that is intended.
    extra_description: Optional[str] = Field(description='extra description other than the formula')

# One extracted synthesis reaction: starting materials, additives, product and
# synthesis route. Documented with comments rather than a class docstring
# because pydantic copies a model docstring into the generated JSON schema.
class Reaction(BaseModel):
    precursors: list[str] = Field(description='the precursors or starting material of reaction, ensure it is a valid chemical formula')
    additives: list[str] = Field(description='the additives of the reaction')
    target: Target = Field(description='the product of the reaction, make sure it is a valid chemical formula')
    # 'co-precipitation' was previously misspelled 'co-preciptation'; the old
    # spelling is still accepted on input by normalize_reaction_type below.
    reaction_type: Literal['solid-state', 'sol-gel', 'co-precipitation', 'hydrothermal', 'flux', 'others'] = Field(description='the type of the reaction')

    @field_validator('reaction_type', mode='before')
    @classmethod
    def normalize_reaction_type(cls, v):
        # Runs before the Literal check so data carrying the legacy misspelling
        # keeps validating and is stored with the corrected spelling.
        if v == 'co-preciptation':
            return 'co-precipitation'
        return v

    @field_validator('precursors')
    @classmethod
    def validate_precusors(cls, v):
        # A reaction without any precursor is rejected outright.
        if not v:
            raise ValueError('precursors should not be empty')
        return v
    

# Signature of the first extraction pass: paragraph in, list of Reaction models out.
# NOTE(review): the docstring is the instruction sent to the LM and contains typos
# ('chmemical', 'consitituent'); correcting them changes the prompt, so do it deliberately.
class QA(dspy.Signature):
    """extract chmemical reactions consitituent from the text"""
    text: str = dspy.InputField(desc='a piece of text which may contains chemical reaction')
    # Optional: the LM is told to return null when the text holds no reaction.
    reactions: Optional[list[Reaction]] = dspy.OutputField(desc='the reactions extracted from the text, return null if no reaction found')

# Signature of the second (review) pass: the LM receives the source text together
# with the first-pass reactions and returns a corrected list.
# The docstring below is the instruction sent to the LM; treat it as prompt text.
class CheckerSig(dspy.Signature):
    """To check hallucination and missing data points of last step's results, then rectify"""
    text: str = dspy.InputField(desc='a piece of text which may contains chemical reaction')
    # First-pass output being reviewed; may be None when nothing was extracted.
    last_step_results: Optional[list[Reaction]] = dspy.InputField(desc='the reactions extracted from the text')
    # Corrected reactions; same type as the input list.
    rectified_results: Optional[list[Reaction]] = dspy.OutputField(desc='the rectified results')


class ExtractReactionWithType(dspy.Module):
    """Two-pass reaction extractor: extract with ``QA``, then review with ``CheckerSig``.

    The returned prediction is the checker's, so the final reactions are found
    under ``rectified_results`` (not ``reactions``).
    """

    def __init__(self):
        # Initialise dspy.Module's own state explicitly instead of relying on
        # the framework to supply it for subclasses that skip this call.
        super().__init__()
        self.predictor = dspy.ChainOfThought(signature=QA)
        self.checker = dspy.ChainOfThought(signature=CheckerSig)

    def forward(self, text):
        """Run both passes on ``text``.

        Args:
            text: Paragraph (optionally with surrounding context) to extract from.

        Returns:
            The checker's prediction, carrying ``rectified_results``.
        """
        first_pass = self.predictor(text=text)
        # The checker sees the same text plus the first-pass reactions (possibly None).
        return self.checker(text=text, last_step_results=first_pass.reactions)
        
    
# Signature used by the LM-as-judge metric: compares a human-labelled result with
# an extracted one and answers a yes/no question about them.
# The docstring below is the instruction sent to the LM; treat it as prompt text.
class Assess(dspy.Signature):
    """Assess the quality of the extracted reaction"""
    # Both results are passed in as JSON text (see the `desc` strings).
    grounded_result = dspy.InputField(desc='the grounded result by a human expert in json format')
    extracted_result = dspy.InputField(desc='the extracted result by a NLP program in json format')
    # Free-form question the judge must answer about the two results.
    question = dspy.InputField()
    answer = dspy.OutputField(desc='please answer yes/no')


def llm_metric(gold, pred, trace=None):
    """Ask an LM judge whether the predicted reactions match the labelled ones.

    Only the ``precursors`` and ``target`` of every reaction are compared.

    Args:
        gold: Labelled example exposing ``reactions``: a list of mappings with
            ``precursors`` and ``target`` keys, or ``None``.
        pred: Prediction exposing ``rectified_results`` (the output field of
            ``CheckerSig``) or ``reactions`` (the output field of ``QA``).
            Items may be pydantic models or mappings; ``None`` counts as an
            empty list.
        trace: Unused; kept so the function matches the
            ``metric(gold, pred, trace)`` call shape.

    Returns:
        bool: True when the judge's answer starts with "yes" (case-insensitive,
        surrounding whitespace ignored), False otherwise.
    """
    def _get(item, key):
        # Labelled data are plain dicts, predictions are model instances.
        return item[key] if isinstance(item, dict) else getattr(item, key)

    def _jsonable(value):
        # json.dumps cannot serialise pydantic models (e.g. Target) directly.
        return value.model_dump() if hasattr(value, 'model_dump') else value

    def _restrict(reactions):
        # Keep only the two compared fields; a missing list means "no reactions".
        return [
            {'precursors': _get(r, 'precursors'), 'target': _jsonable(_get(r, 'target'))}
            for r in (reactions or [])
        ]

    # Prefer the checker's output and fall back to the first-pass field.
    predicted = getattr(pred, 'rectified_results', None)
    if predicted is None:
        predicted = getattr(pred, 'reactions', None)

    # No hint is supplied, so plain ChainOfThought is sufficient here.
    critic = dspy.ChainOfThought(Assess)(
        grounded_result=json.dumps(_restrict(gold.reactions), indent=2),
        extracted_result=json.dumps(_restrict(predicted), indent=2),
        question='based on the given grounded result, do you think that the extracted result successfully capture the reaction?'
    )
    # Tolerate answers such as "Yes." or "yes, because ...".
    return critic.answer.strip().lower().startswith('yes')



In [None]:
# Inspection cell: build an extractor and list the Predict objects it contains,
# to check the signatures (instructions and fields) dspy derived for both passes.
predictor = ExtractReactionWithType()
predictor.predictors()

[Predict(StringSignature(text -> reasoning, reactions
     instructions='extract reaction consitituent from the text'
     text = Field(annotation=str required=True json_schema_extra={'desc': 'a piece of text which may contains chemical reaction', '__dspy_field_type': 'input', 'prefix': 'Text:'})
     reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${reasoning}', '__dspy_field_type': 'output'})
     reactions = Field(annotation=Union[list[Reaction], NoneType] required=True json_schema_extra={'desc': 'the reactions extracted from the text, return null if no reaction found', '__dspy_field_type': 'output', 'prefix': 'Reactions:'})
 )),
 Predict(StringSignature(text, last_step_results -> reasoning, rectified_results
     instructions="To check hallucination and missing data points of last step's results, then rectify"
     text = Field(annotation=str required=True json_schema_extra={'desc': 'a piece of 

In [23]:
@return_valid
def extract(doc):
    """Extract reactions from one paragraph document.

    The paragraph is first screened by the module-level ``classifier``; when the
    flag it returns is falsy, the function returns None. Otherwise the paragraph
    is wrapped with the article title, abstract and sub-titles (one per line,
    split on '/') and handed to ``compiled_extractor``.

    Args:
        doc: Object exposing ``page_content`` and a ``metadata`` mapping with
            ``title``, ``abstract`` and ``sub_titles`` keys.

    Returns:
        The extractor's ``rectified_results``, or None when screened out.
    """
    paragraph = doc.page_content
    verdict = classifier(text=paragraph, solid_state_definition=definition)
    if not verdict.extraction_availability_solid_react:
        return None

    metadata = doc.metadata
    title = metadata['title']
    abstract = metadata['abstract']
    subtitle_lines = '\n'.join(metadata['sub_titles'].split('/'))
    context = (
        f'title:\n{title}\n'
        f'abstract:\n{abstract}\n'
        'subtitles:\n'
        f'{subtitle_lines}'
        f'\nparagraph:\n{paragraph}'
    )
    return compiled_extractor(text=context).rectified_results


# Names passed to the article-db and result-db helpers.
ARTICLE = '40_with_good_title'
TARGET = 'new_type_db'

# Case-insensitive whole-word match for section titles that describe
# experimental / synthesis / preparation / process / method content.
exp_section_pattern = re.compile(
    r'\b(?:experiment(?:al|s|ing|ed)?|synthesis(?:es|ing|ed)?|preparation(?:s|al|ed|ing)?|process(?:es|ion|ing)?|method(?:s)?)\b',
    re.IGNORECASE,
)


def filter_with_kw(doc):
    """Return True when the document's ``sub_titles`` metadata matches ``exp_section_pattern``."""
    sub_titles = doc.metadata['sub_titles']
    return exp_section_pattern.search(sub_titles) is not None

# --- Data sources -----------------------------------------------------------
# Article store named by ARTICLE, wrapped in a Filter that applies filter_with_kw.
# NOTE(review): with_abstract=True presumably attaches the abstract to each doc's
# metadata (extract() reads metadata['abstract']) — confirm against Filter.
article_db = get_plain_articledb(ARTICLE)
article_getter = Filter(article_db, filter_func=filter_with_kw, with_abstract=True)
# Result store named by TARGET, created/opened for the Reaction model.
result_db = get_create_resultdb(TARGET, Reaction)

# --- Extractor --------------------------------------------------------------
# extract() looks up `compiled_extractor`; for now it is the un-optimised module.
# Re-enable the load line to use a previously saved program instead.
cot = ExtractReactionWithType()
# cot.load('compiled_extractor.json')
compiled_extractor = cot

# --- Pre-filter -------------------------------------------------------------
# Both names are read as globals inside extract().
classifier = Classify_CoT()
definition = 'Solid-state reaction refers to a conventional method used in chemistry to synthesize various materials like ceramics and crystals by heating a mixture of raw materials in solid form.'

# --- Chain assembly ---------------------------------------------------------
# NOTE(review): ('thread', 5) presumably selects thread-based execution with 5
# workers — confirm against customized_extractor.
my_extractor = customized_extractor(extract, 'thread', 5)
chain = article_getter + my_extractor + Writer(result_db=result_db)
# chain_with_out_writer = article_getter + my_extractor

# Manual run snippets, kept disabled so executing this cell only builds the chain:
# from sisyphus.chain.chain_elements import run_chains_with_extarction_history_multi_threads
# run_chains_with_extarction_history_multi_threads(chain, 'articles_processed', 10, 'reaction_extraction_test', extract_nums=10)
# result_db.clear_tables()
# with dspy.context(lm=dspy.LM('openai/gpt-4o-mini')):
# docinfos = chain.compose('10.1021&sol;nn101453v.html')



In [24]:
# Zero-shot (un-optimised) extractor for manual spot checks.
zero_shot = ExtractReactionWithType()

# Sample paragraphs. Each one has its own name so that switching samples is a
# one-line change below, rather than one `text = ...` silently overwriting another.
text_levitation_melting = """The high purity of samples were fabricated by a time-efficient method combining levitation melting and spark plasma sintering (SPS).31, 44 Alloys with nominal composition Hf0.65Zr0.35Ni1-zPtzSn0.98Sb0.02 (z = 0, 0.05, 0.1, 0.15) were first prepared by levitation melting of stoichiometric amounts of Hf (piece, 99.99%), Zr (piece, 99.99%), Ni (block, 99.999%), Pt (particles, 99.999%), Sn (particles, 99.999%), and Sb (block, 99.99%) under an argon atmosphere for 2 min, and the melt was quenched in a water-cooled copper crucible. The ingots were remelted twice to ensure homogeneity. Mechanical milling was carried out with normal butane protection at 200 rpm for 4 h. The powders were then sintered by SPS (SPS-1050, Sumitomo Coal Mining Co.) at 1175 K under 65 MPa in vacuum for 10 min. The as-sintered samples, of which the relative density was found to be ≈95%, were used for measurements of thermal conductivity and Hall data, and then cut into rectangular bars for Seebeck coefficient and electrical conductivity measurements."""
text_solid_state = """The LiMn2-yMyO4 (M = Li, Al, Ti, Co, and Ni and 0 ≤ y ≤ 0.2) and LiMn2-y-zMyLizO4 (M = Ti, Co, and Ni, 0 ≤ y ≤ 0.1, and 0 ≤ z ≤ 0.1) samples were synthesized by solid-state reactions of Li2CO3 and Mn2O3 with Al2O3, TiO2, Co3O4, or NiO at 800 °C for 48 h in air. All samples were characterized by X-ray powder diffraction to be single-phase materials, and the lattice parameters are given in Table 1. The lithium contents in the samples determined by atomic absorption spectroscopy were found to be similar to those in the starting reaction mixtures indicating negligible volatilization of lithium during the firing process. The electrochemical performance of the LiMn2-yMyO4 and LiMn2-y-zMyLizO4 cathodes were evaluated with CR2032 coin cells using metallic lithium anode and 1 M LiPF6 in ethylene carbonate (EC) and diethyl carbonate (DEC) electrolyte. The cathodes were fabricated by mixing 75 wt % LiMn2-y-zMyLizO4 with 20 wt % acetylene black and 5 wt % of poly(tetrafluoroethylene) (PTFE) binder, rolling the mixture into thin sheets of about 0.2 mm thick, and cutting into circular electrodes of 0.65 cm2 area. Electrochemical data were collected between 4.3 and 3.5 V at various rates between C/10 and 20C at room temperature and 60 °C. Cyclic voltammogram (CV) plots were recorded between 3.6 and 4.3 V at a scan rate of 50 μV/s."""

# Paragraph sent to the extractor; the bare call stays the cell's last
# expression so the prediction is displayed as the cell output.
text = text_solid_state
zero_shot(text=text)

Prediction(
    reasoning='The last step results contain two reactions extracted from the text. However, there are some issues with the representation of the variable ranges in the `amount_var` and `element_var` fields. Specifically, the `y` and `z` variables in the `element_var` should not be empty arrays, as they are part of the target formulas. Additionally, the `amount_var` for `y` and `z` should be represented correctly to reflect their ranges. The rectified results will include these corrections.',
    rectified_results=[Reaction(precursors=['Li2CO3', 'Mn2O3'], additives=['Al2O3', 'TiO2', 'Co3O4', 'NiO'], target=Target(target_formula='LiMn2-yMyO4', element_var={'M': ['Li', 'Al', 'Ti', 'Co', 'Ni'], 'y': [0, 0.2]}, amount_var={'y': [0, 0.2]}, extra_description=None), reaction_type='solid-state'), Reaction(precursors=['Li2CO3', 'Mn2O3'], additives=['TiO2', 'Co3O4', 'NiO'], target=Target(target_formula='LiMn2-y-zMyLizO4', element_var={'M': ['Ti', 'Co', 'Ni'], 'y': [0, 0.1], 'z': [0, 

In [25]:
# NOTE(review): `_` is IPython's "last displayed output" variable, so this line
# only picks up the extractor's prediction when run immediately after the cell
# that displayed it. Binding that prediction to a name would make the notebook
# survive Restart & Run All.
results = _.rectified_results

In [26]:
# Display the rectified reactions (a list of Reaction models) taken from the prediction.
results

[Reaction(precursors=['Li2CO3', 'Mn2O3'], additives=['Al2O3', 'TiO2', 'Co3O4', 'NiO'], target=Target(target_formula='LiMn2-yMyO4', element_var={'M': ['Li', 'Al', 'Ti', 'Co', 'Ni'], 'y': [0, 0.2]}, amount_var={'y': [0, 0.2]}, extra_description=None), reaction_type='solid-state'),
 Reaction(precursors=['Li2CO3', 'Mn2O3'], additives=['TiO2', 'Co3O4', 'NiO'], target=Target(target_formula='LiMn2-y-zMyLizO4', element_var={'M': ['Ti', 'Co', 'Ni'], 'y': [0, 0.1], 'z': [0, 0.1]}, amount_var={'y': [0, 0.1], 'z': [0, 0.1]}, extra_description=None), reaction_type='solid-state')]