In [None]:
import dspy
import json
from typing import Optional
from pydantic import BaseModel, Field

# config
lm = dspy.LM('openai/gpt-4o')
dspy.configure(lm=lm)

def load_json(file_path):
    with open(file_path, 'r', encoding='utf8') as f:
        data = json.load(f)
    train_set = [dspy.Example(**example).with_inputs('text') for example in data['examples'][:10]]
    dev_set = [dspy.Example(**example).with_inputs('text') for example in data['examples'][10:]]
    return train_set, dev_set


class ClassifyReaction(dspy.Signature):
    """Giving the availability of extracting solid-state reaction formula from the text """
    text = dspy.InputField(desc='a piece of text which may contains solid-state chemical reaction formula')
    hint = dspy.InputField(desc='user hint')
    
    extraction_availability_solid_react: int = dspy.OutputField(desc='1 for yes, 0 for no')


class Classify_CoT(dspy.Module):
    def __init__(self):
        self.predictor = dspy.ChainOfThought(signature=ClassifyReaction)

    def forward(self, text, hint):
        prediction = self.predictor(text=text, hint=hint)
        return prediction
    
    
class Reaction(BaseModel):
    precursors: list[str] = Field(description='the precursors or starting material of reaction')
    additives: list[str] = Field(description='the additives of the reaction')
    target: str = Field(description='the product of the reaction, make sure it is a valid chemical formula')
    reaction_type: str = Field(description='the type of the reaction, choose from [solid-state, sol-gel, co-preciptation, hydrothermal, flux, others]')


class QA(dspy.Signature):
    """extract reaction consitituent from the text"""
    text: str = dspy.InputField(desc='a piece of text which may contains chemical reaction')
    reactions: Optional[list[Reaction]] = dspy.OutputField(desc='the reactions extracted from the text, return null if no reaction found')


class ExtractReactionWithType(dspy.Module):
    def __init__(self):
        self.predictor = dspy.ChainOfThought(signature=QA)

    def forward(self, text):
        prediction = self.predictor(text=text)
        return prediction
        
    
class Assess(dspy.Signature):
    """Assess the quality of the extracted reaction"""
    grounded_result = dspy.InputField(desc='the grounded result by a human expert in json format')
    extracted_result = dspy.InputField(desc='the extracted result by a NLP program in json format')
    question = dspy.InputField()
    answer = dspy.OutputField(desc='please answer yes/no')


def llm_metric(gold, pred, trace=None):
    grounded_result_restrict = [{'precursors': r['precursors'], 'target': r['target']} for r in gold.reactions]
    extract_result_restrict = [{'precursors': r.precursors, 'target': r.target} for r in pred.reactions]
    critic = dspy.ChainOfThoughtWithHint(Assess)(
        grounded_result=json.dumps(grounded_result_restrict, indent=2),
        extracted_result=json.dumps(extract_result_restrict, indent=2),
        question='based on the given grounded result, do you think that the extracted result successfully capture the reaction?'
    )
    return critic.answer.lower() == 'yes'



  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import re
from concurrent.futures import ThreadPoolExecutor

from sisyphus.chain import Filter, Writer
from sisyphus.utils.helper_functions import get_plain_articledb, get_create_resultdb


ARTICLE = 'inorganic_dspy'
TARGET = 'temp_test'
exp_section_pattern = re.compile(r'\b(?:experiment(?:al|s|ing|ed)?|synthesis(?:es|ing|ed)?|preparation(?:s|al|ed|ing)?|process(?:es|ion|ing)?|method(?:s)?)\b', re.I)

article_db = get_plain_articledb(ARTICLE)
article_getter = Filter(article_db)
result_db = get_create_resultdb(TARGET, Reaction)

def customized_filter(documents):
    docs = []
    for doc in documents:
        if exp_section_pattern.search(doc.metadata['content_title']) or exp_section_pattern.search(doc.metadata['section_title']):
            docs.append(doc)
    return docs if docs else None



In [6]:
from random import shuffle
compiler = dspy.BootstrapFewShot(metric=llm_metric, max_labeled_demos=6)
train, dev = load_json('curated_examples.json')
shuffle(train)
compiled_extractor = compiler.compile(ExtractReactionWithType(), trainset=train)

 90%|█████████ | 9/10 [00:46<00:05,  5.21s/it]

Bootstrapped 4 full traces after 9 examples for up to 1 rounds, amounting to 9 attempts.





In [7]:
evaluator = dspy.Evaluate(devset=dev, metric=llm_metric, num_threads=2, display_progress=True, display_table=True)
evaluator(compiled_extractor)

Average Metric: 1 / 2  (50.0): 100%|██████████| 2/2 [00:09<00:00,  4.94s/it] 
2024/11/04 20:39:58 INFO dspy.evaluate.evaluate: Average Metric: 1 / 2 (50.0%)


Unnamed: 0,id,text,example_reactions,reasoning,pred_reactions,llm_metric
0,11,"Perovskite nanosheets were prepared by delaminating layered perovskites according to previously described procedures. (-22, 23) The starting material KCa2Nb3O10, prepared by a solid-state reaction, was...","[{'precursors': ['KCa2Nb3O10'], 'additives': ['HNO3'], 'target': 'HCa2Nb3O10 1.5H2O', 'reaction_type': 'others'}, {'precursors': ['HCa2Nb3O10 1.5H2O'], 'additives': ['TBAOH'], 'target': 'Ca2Nb3O10', 'reaction_type': 'others'}, {'precursors': ['KSr2Nb3O10'], 'additives': ['TBAOH'], 'target': 'Sr2Nb3O10', 'reaction_type':...","The text describes the preparation of perovskite nanosheets through a delamination process. The starting material, KCa2Nb3O10, is prepared by a solid-state reaction and then converted...","[Reaction(precursors=['K2CO3', 'CaCO3', 'Nb2O5'], additives=[], target='KCa2Nb3O10', reaction_type='solid-state')]",
1,12,"Polycrystalline LiFeP2O7 was synthesized by solid-state methods. Stoichiometric amounts of LiH2PO4 (Alfa Aesar, 97%), Fe2O3 (Fisher Scientific, Certified), and NH4H2PO4 (Alfa Aesar, 98.0%) were ground...","[{'precursors': ['LiH2PO4', 'Fe2O3', 'NH4H2PO4'], 'additives': [], 'target': 'LiFeP2O7', 'reaction_type': 'solid-state'}]","The text describes the synthesis of polycrystalline LiFeP2O7 using a solid-state method. The precursors used in the reaction are LiH2PO4, Fe2O3, and NH4H2PO4. The process...","[Reaction(precursors=['LiH2PO4', 'Fe2O3', 'NH4H2PO4'], additives=[], target='LiFeP2O7', reaction_type='solid-state')]",✔️ [True]


50.0

In [None]:
lm.inspect_history(4)

In [11]:
from sisyphus.chain.chain_elements import DocInfo
classifier = Classify_CoT()
hint = 'whether the solid state reaction can infer from the text, 1 for yes, 0 for no'
def extract(doc):
    reactions = None
    if classifier(text=doc.page_content, hint=hint).extraction_availability_solid_react:
        prediction = compiled_extractor(text=doc.page_content)
        reactions = prediction.reactions
    return DocInfo(doc=doc, info=reactions) if reactions else None

def customized_extractor(docs):
    with ThreadPoolExecutor(max_workers=5) as executor:
        results = executor.map(extract, docs)
    results = filter(None, results)
    return results if results else None

In [13]:
chain = article_getter + customized_filter + customized_extractor + Writer(result_db=result_db)

In [1]:
import os
files = os.listdir('articles_processed')
files = files[5:]
with ThreadPoolExecutor(max_workers=5) as executor:
    executor.map(chain.compose, files)

NameError: name 'ThreadPoolExecutor' is not defined