In [1]:
import dspy
import json
from typing import Optional
from pydantic import BaseModel, Field

# config: build the default language-model client and hand it to dspy.configure,
# so modules that are called outside a `dspy.context(...)` block use this model.
lm = dspy.LM('openai/gpt-4o-mini')
dspy.configure(lm=lm)

def load_json(file_path, train_size=10):
    """Load curated examples from a JSON file and split them into train/dev lists.

    The file must contain a top-level ``"examples"`` list of JSON objects. Each
    object is turned into a ``dspy.Example`` whose only input field is ``text``;
    every other key is kept as a label on the example.

    Args:
        file_path: Path to the UTF-8 encoded JSON file.
        train_size: Number of leading examples that go to the training set
            (default 10, the previously hard-coded split point). The remaining
            examples form the dev set.

    Returns:
        A ``(train_set, dev_set)`` tuple of lists of ``dspy.Example``.
    """
    with open(file_path, 'r', encoding='utf8') as f:
        data = json.load(f)
    # Build every example once, then slice; the split point is now a parameter.
    examples = [dspy.Example(**example).with_inputs('text') for example in data['examples']]
    return examples[:train_size], examples[train_size:]


# Signature for the yes/no gate that runs before extraction.
# NOTE: the class docstring and every `desc` string below are runtime text that
# DSPy uses when building the prompt, so rewording them changes model behavior.
class ClassifyReaction(dspy.Signature):
    """Giving the availability of extracting solid-state reaction formula from the text """
    # Inputs: the passage to judge and a free-text definition of "solid-state reaction".
    text = dspy.InputField(desc='a piece of text which may contains solid-state chemical reaction formula')
    solid_state_definition = dspy.InputField(desc='the definition of solid-state reaction')
    # Output: declared as int; callers in this notebook test it for truthiness (1 = proceed, 0 = skip).
    extraction_availability_solid_react: int = dspy.OutputField(desc='1 for yes, 0 for no')


class Classify_CoT(dspy.Module):
    """Chain-of-thought wrapper around the ``ClassifyReaction`` signature.

    Decides whether a solid-state reaction can be extracted from a passage.
    """

    def __init__(self):
        # Run the base-class initializer, as the DSPy module convention requires.
        super().__init__()
        self.predictor = dspy.ChainOfThought(signature=ClassifyReaction)

    def forward(self, text, hint=None, solid_state_definition=None):
        """Classify ``text``.

        Args:
            text: Passage to classify.
            hint: Optional extra guidance. Kept as the second positional
                parameter for backward compatibility; forwarded to the
                predictor only when given.
            solid_state_definition: Definition of a solid-state reaction, the
                second input field declared by ``ClassifyReaction``. Previously
                this keyword was rejected by ``forward`` and never reached the
                predictor.

        Returns:
            The predictor's prediction, which carries
            ``extraction_availability_solid_react``.
        """
        inputs = {'text': text}
        if solid_state_definition is not None:
            inputs['solid_state_definition'] = solid_state_definition
        if hint is not None:
            inputs['hint'] = hint
        return self.predictor(**inputs)
    
    
# Structured record for one extracted reaction; also passed to get_create_resultdb elsewhere in this notebook.
# NOTE: the Field descriptions are runtime text (they are part of the model's JSON schema).
# Prefer `#` comments over a class docstring here: pydantic copies a class docstring
# into the schema description, which would alter what the language model is shown.
# NOTE(review): 'co-preciptation' in the reaction_type description is misspelled; left as-is
# because changing it changes the text the model sees and possibly the values it returns.
class Reaction(BaseModel):
    precursors: list[str] = Field(description='the precursors or starting material of reaction')
    additives: list[str] = Field(description='the additives of the reaction')
    target: str = Field(description='the product of the reaction, make sure it is a valid chemical formula')
    reaction_type: str = Field(description='the type of the reaction, choose from [solid-state, sol-gel, co-preciptation, hydrothermal, flux, others]')


# Signature for the extraction step: one text input, an optional list of Reaction records as output.
# NOTE: the docstring and `desc` strings are runtime prompt text; the typo "consitituent"
# is left untouched because editing it changes what the model is asked.
class QA(dspy.Signature):
    """extract reaction consitituent from the text"""
    text: str = dspy.InputField(desc='a piece of text which may contains chemical reaction')
    # Optional: the field may come back as None, so consumers must handle a missing list.
    reactions: Optional[list[Reaction]] = dspy.OutputField(desc='the reactions extracted from the text, return null if no reaction found')


class ExtractReactionWithType(dspy.Module):
    """Chain-of-thought extractor that applies the ``QA`` signature to a passage."""

    def __init__(self):
        # Run the base-class initializer, as the DSPy module convention requires.
        super().__init__()
        self.predictor = dspy.ChainOfThought(signature=QA)

    def forward(self, text):
        """Extract reactions from ``text``.

        Args:
            text: Passage that may describe one or more chemical reactions.

        Returns:
            The predictor's prediction; its ``reactions`` field is declared as
            an optional list of ``Reaction`` and may therefore be ``None``.
        """
        return self.predictor(text=text)
        
    
# Signature for the LLM judge used by llm_metric: compares a reference result with an extracted one.
# NOTE: the docstring and `desc` strings are runtime prompt text; do not reword casually.
class Assess(dspy.Signature):
    """Assess the quality of the extracted reaction"""
    # Both results are supplied by llm_metric as JSON strings.
    grounded_result = dspy.InputField(desc='the grounded result by a human expert in json format')
    extracted_result = dspy.InputField(desc='the extracted result by a NLP program in json format')
    # The concrete question to answer about the two results.
    question = dspy.InputField()
    # Free-text answer; llm_metric interprets it as yes/no.
    answer = dspy.OutputField(desc='please answer yes/no')


def _reaction_field(reaction, name):
    """Read ``name`` from a reaction given either as a dict or as an attribute-style object."""
    if isinstance(reaction, dict):
        return reaction[name]
    return getattr(reaction, name)


def llm_metric(gold, pred, trace=None):
    """LLM-judged metric: does the prediction capture the reference reactions?

    Only ``precursors`` and ``target`` of each reaction are compared; additives
    and reaction type are deliberately left out of the judged payload.

    Args:
        gold: Reference example exposing ``reactions`` (dicts or objects).
        pred: Prediction exposing ``reactions`` (dicts or objects); the field
            is optional and may be ``None``.
        trace: Unused; accepted because DSPy optimizers pass it to metrics.

    Returns:
        ``True`` when the judge answers yes, ``False`` otherwise. When either
        side has no reactions the judge is not called: the result is ``True``
        only if both sides are empty.
    """
    gold_reactions = getattr(gold, 'reactions', None) or []
    pred_reactions = getattr(pred, 'reactions', None) or []
    # `reactions` is Optional in the QA signature: avoid iterating over None and
    # skip the LLM call when there is nothing to compare.
    if not gold_reactions or not pred_reactions:
        return not gold_reactions and not pred_reactions

    grounded_result_restrict = [
        {'precursors': _reaction_field(r, 'precursors'), 'target': _reaction_field(r, 'target')}
        for r in gold_reactions
    ]
    extract_result_restrict = [
        {'precursors': _reaction_field(r, 'precursors'), 'target': _reaction_field(r, 'target')}
        for r in pred_reactions
    ]
    # No `hint` is supplied to the judge here.
    critic = dspy.ChainOfThoughtWithHint(Assess)(
        grounded_result=json.dumps(grounded_result_restrict, indent=2),
        extracted_result=json.dumps(extract_result_restrict, indent=2),
        question='based on the given grounded result, do you think that the extracted result successfully capture the reaction?'
    )
    # Tolerate surrounding whitespace, capitalization and trailing text ("Yes.", " yes").
    return str(critic.answer).strip().lower().startswith('yes')



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import re
from concurrent.futures import ThreadPoolExecutor

from sisyphus.chain import Filter, Writer
from sisyphus.utils.helper_functions import get_plain_articledb, get_create_resultdb


# Names handed to the sisyphus helpers below: ARTICLE to get_plain_articledb, TARGET to get_create_resultdb.
ARTICLE = 'inorganic_dspy'
TARGET = 'temp_test'
# Case-insensitive, whole-word match for headings such as "experimental", "synthesis",
# "preparation", "process"/"processing" or "method(s)"; used by customized_filter on titles.
exp_section_pattern = re.compile(r'\b(?:experiment(?:al|s|ing|ed)?|synthesis(?:es|ing|ed)?|preparation(?:s|al|ed|ing)?|process(?:es|ion|ing)?|method(?:s)?)\b', re.I)

article_db = get_plain_articledb(ARTICLE)
article_getter = Filter(article_db)
# Reaction is passed as the second argument — presumably the record schema for stored results; verify in sisyphus.
result_db = get_create_resultdb(TARGET, Reaction)

def return_valid(func):
    """Decorator that normalizes any falsy return value of ``func`` to ``None``.

    Truthy results are passed through unchanged; empty lists, ``0``, ``''``,
    ``None`` and other falsy values all become ``None``.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same call signature that keeps ``func``'s name and
        docstring (via ``functools.wraps``).
    """
    from functools import wraps  # local import keeps this cell self-contained

    @wraps(func)
    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)
        return result if result else None
    return wrapper

@return_valid
def customized_filter(documents):
    """Select the documents whose titles match ``exp_section_pattern``.

    A document is kept when its ``content_title`` metadata matches, or failing
    that, when its ``section_title`` metadata matches. Because of the
    ``return_valid`` decorator an empty selection is returned as ``None``.
    """
    title_keys = ('content_title', 'section_title')
    return [
        doc
        for doc in documents
        # any() stops at the first matching title, like the original `or`.
        if any(exp_section_pattern.search(doc.metadata[key]) for key in title_keys)
    ]



In [6]:
from random import Random, shuffle  # `shuffle` stays imported in case other cells rely on it

# Fixed seed so the order of training examples is the same on every Restart & Run All.
SHUFFLE_SEED = 0

compiler = dspy.BootstrapFewShot(metric=llm_metric, max_labeled_demos=6)
train, dev = load_json('curated_examples.json')
# Private RNG instance: shuffles `train` in place without touching the global random state.
Random(SHUFFLE_SEED).shuffle(train)
# Bootstrapping runs with gpt-4o for this block only; the default LM is restored afterwards.
with dspy.context(lm=dspy.LM('openai/gpt-4o')):
    compiled_extractor = compiler.compile(ExtractReactionWithType(), trainset=train)

 60%|██████    | 6/10 [00:45<00:30,  7.56s/it]

Bootstrapped 4 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.





In [7]:
# Score the compiled extractor on the dev examples using the LLM-judged metric.
evaluator = dspy.Evaluate(
    devset=dev,
    metric=llm_metric,
    num_threads=2,
    display_progress=True,
    display_table=True,
)
evaluator(compiled_extractor)

Average Metric: 2 / 2  (100.0): 100%|██████████| 2/2 [00:09<00:00,  4.63s/it]
2024/11/06 15:09:48 INFO dspy.evaluate.evaluate: Average Metric: 2 / 2 (100.0%)


Unnamed: 0,id,text,example_reactions,reasoning,pred_reactions,llm_metric
0,11,"Perovskite nanosheets were prepared by delaminating layered perovskites according to previously described procedures. (-22, 23) The starting material KCa2Nb3O10, prepared by a solid-state reaction, was...","[{'precursors': ['KCa2Nb3O10'], 'additives': ['HNO3'], 'target': 'HCa2Nb3O10 1.5H2O', 'reaction_type': 'others'}, {'precursors': ['HCa2Nb3O10 1.5H2O'], 'additives': ['TBAOH'], 'target': 'Ca2Nb3O10', 'reaction_type': 'others'}, {'precursors': ['KSr2Nb3O10'], 'additives': ['TBAOH'], 'target': 'Sr2Nb3O10', 'reaction_type':...","The text describes the preparation of perovskite nanosheets through a delamination process starting from layered perovskites. The initial material, KCa2Nb3O10, is converted into its protonic...","[Reaction(precursors=['KCa2Nb3O10'], additives=['HNO3'], target='HCa2Nb3O10 1.5H2O', reaction_type='solid-state'), Reaction(precursors=['HCa2Nb3O10 1.5H2O'], additives=['TBAOH'], target='Ca2Nb3O10', reaction_type='others'), Reaction(precursors=['KSr2Nb3O10'], additives=['TBAOH'], target='Sr2Nb3O10', reaction_type='others'), Reaction(precursors=['KCa2Ta3O10'], additives=['TBAOH'], target='Ca2Ta3O10', reaction_type='others'), Reaction(precursors=['KSr2Ta3O10'], additives=['TBAOH'], target='Sr2Ta3O10', reaction_type='others')]",✔️ [True]
1,12,"Polycrystalline LiFeP2O7 was synthesized by solid-state methods. Stoichiometric amounts of LiH2PO4 (Alfa Aesar, 97%), Fe2O3 (Fisher Scientific, Certified), and NH4H2PO4 (Alfa Aesar, 98.0%) were ground...","[{'precursors': ['LiH2PO4', 'Fe2O3', 'NH4H2PO4'], 'additives': [], 'target': 'LiFeP2O7', 'reaction_type': 'solid-state'}]","The text describes the synthesis of polycrystalline LiFeP2O7 using solid-state methods. The precursors involved in the reaction are LiH2PO4, Fe2O3, and NH4H2PO4, which are mixed...","[Reaction(precursors=['LiH2PO4', 'Fe2O3', 'NH4H2PO4'], additives=[], target='LiFeP2O7', reaction_type='solid-state')]",✔️ [True]


100.0

In [8]:
#patch
import ujson

def custom_save(self, path, save_field_meta=False):
    """Write the module state to ``path`` as indented JSON.

    The state returned by ``self.dump_state(save_field_meta)`` is first walked
    recursively so that pydantic models and ``dspy.Example`` objects nested in
    dicts and lists are replaced by plain dictionaries before serialization.

    Args:
        path: Destination file path; the file is overwritten.
        save_field_meta: Forwarded unchanged to ``dump_state``.
    """
    def convert_to_dict(value):
        # Guard clauses, checked in the same order as the original if/elif chain.
        if isinstance(value, BaseModel):
            return value.model_dump()
        if isinstance(value, dict):
            return {key: convert_to_dict(item) for key, item in value.items()}
        if isinstance(value, list):
            return [convert_to_dict(item) for item in value]
        if isinstance(value, dspy.Example):
            return convert_to_dict(dict(value))
        return value

    serializable_state = convert_to_dict(self.dump_state(save_field_meta))
    with open(path, "w") as out_file:
        out_file.write(ujson.dumps(serializable_state, indent=2))

# Monkeypatch: replace `save` on dspy.Module itself, so every module in this session
# uses the pydantic-aware serializer; then persist the compiled extractor.
dspy.Module.save = custom_save
compiled_extractor.save('compiled_extractor.json')

In [9]:
# Build a fresh, uncompiled extractor and load the state saved in 'compiled_extractor.json' into it.
# NOTE(review): `cot` is not referenced by the later visible cells — `extract` calls
# `compiled_extractor` instead; confirm which object the pipeline is meant to use.
cot = ExtractReactionWithType()
cot.load('compiled_extractor.json')

In [5]:
from sisyphus.chain.chain_elements import DocInfo
# Gate model used by `extract` to decide whether a passage is worth running through the extractor.
classifier = Classify_CoT()
# NOTE(review): `hint` is not passed to anything in the visible cells; `extract` supplies only
# `text` and `solid_state_definition` — confirm whether it is still needed.
hint = 'whether the solid state reaction can infer from the text, 1 for yes, 0 for no'
# Definition text supplied as the `solid_state_definition` keyword in `extract`.
definition = 'Solid-state reaction refers to a conventional method used in chemistry to synthesize various materials like ceramics and crystals by heating a mixture of raw materials in solid form.'


@return_valid
def extract(doc):
    """Run the classifier gate on one document and, if it passes, extract its reactions.

    Returns the ``reactions`` field of the extractor's prediction when the
    classifier's ``extraction_availability_solid_react`` is truthy; otherwise
    ``None``. The ``return_valid`` decorator also maps an empty result to ``None``.
    """
    verdict = classifier(text=doc.page_content, solid_state_definition=definition)
    # Guard clause: skip the extractor call when the gate says no.
    if not verdict.extraction_availability_solid_react:
        return None
    return compiled_extractor(text=doc.page_content).reactions

@return_valid
def customized_extractor(docs):
    with ThreadPoolExecutor(max_workers=5) as executor:
        results = executor.map(extract, docs)
    zipped_results = filter(lambda x: x[1], zip(docs, results))
    doc_infos = [DocInfo(doc=doc, info=result) for doc, result in zipped_results]
    return doc_infos

In [6]:
# Compose the pipeline with `+`, in order: article lookup, section filter, reaction extraction, result writer.
# Presumably each stage's output feeds the next; the exact semantics are defined by sisyphus.chain.
chain = article_getter + customized_filter + customized_extractor + Writer(result_db=result_db)

In [12]:
import os
from concurrent.futures import as_completed

files = os.listdir('articles_processed')
# NOTE(review): os.listdir returns entries in arbitrary order, so which five files
# are skipped here is not guaranteed to be stable between runs or machines.
files = files[5:]
# Sampling temperature is set on the LM client; previously it was passed to
# dspy.context, which is not where LM sampling parameters are configured.
# NOTE(review): whether worker threads inherit dspy.context overrides depends on
# the DSPy version — confirm for the installed release.
with dspy.context(lm=dspy.LM('openai/gpt-4o-mini', temperature=0.1)):
    with ThreadPoolExecutor(max_workers=5) as executor:
        pending = {executor.submit(chain.compose, name): name for name in files}
        # Collect every future so failures are reported instead of silently dropped;
        # one bad article does not stop the remaining ones.
        for future in as_completed(pending):
            try:
                future.result()
            except Exception as exc:
                print(f'failed to process {pending[future]}: {exc!r}')