In [1]:
from threading import Lock
from typing import Optional, Literal

import dspy
from dspy.utils.callback import BaseCallback
from pydantic import BaseModel, Field, field_validator


class SwappedChatAdapter(dspy.adapters.ChatAdapter):
    """Chat adapter that relocates the first formatted message.

    The message list is built by ``ChatAdapter.format``; the leading entry
    (named as the system message in the original code) is removed from the
    front and re-inserted immediately before the final message.
    """

    def format(self, signature, demos, inputs):
        """Return the parent's formatted messages with the first one moved.

        The list returned by the parent class is modified in place and that
        same list object is returned.
        """
        formatted: list = super().format(signature, demos, inputs)
        leading_message = formatted.pop(0)
        # An empty slice just before the last element: assigning to it inserts
        # there (or at index 0 when nothing is left after the pop).
        formatted[-1:-1] = [leading_message]
        return formatted

# Language-model client for 'openai/gpt-4o', created with cache=False.
lm = dspy.LM('openai/gpt-4o', cache=False)
# Register the LM and a SwappedChatAdapter instance through dspy.configure.
dspy.configure(lm=lm, adapter=SwappedChatAdapter())

# Pydantic model describing the product of a reaction.
# NOTE(review): documented with `#` comments on purpose; a class docstring
# would presumably be picked up as the model's schema description — confirm
# before adding one.
class Target(BaseModel):
    # Formula of the target product, as a string.
    target_formula: str = Field(description='make sure it is a valid chemical formula')
    # Maps a variable name used for an amount in the formula to its list of numeric values.
    amount_var: dict[str, list[float]] = Field(description='the amount variable in the formula, e.g. AxBC, {x: [1, 2]}')
    # Maps a variable name used for an element in the formula to its list of string values.
    element_var: dict[str, list[str]] = Field(description='the element variable in the formula')


# Pydantic model for one extracted reaction: its inputs, its target and its type.
# NOTE(review): documented with `#` comments on purpose; a class docstring
# would presumably be picked up as the model's schema description — confirm
# before adding one.
class Reaction(BaseModel):
    # Precursor formulas, one string per precursor.
    precursors: list[str] = Field(description='ensure it is a valid chemical formula')
    # Additives, one string each; no description is attached to this field.
    additives: list[str]
    # The product of the reaction, as a nested Target model.
    target: Target
    # Reaction category, restricted to exactly these six literal values.
    reaction_type: Literal['solid-state', 'sol-gel', 'co-precipitation', 'hydrothermal', 'flux', 'others']

    # Disabled validator that would reject an empty `precursors` list.
    # It is commented out, so empty lists are currently accepted.
    # @field_validator('precursors')
    # @classmethod
    # def validate_precusors(cls, v):
    #     if not v:
    #         raise ValueError('precursors should not be empty')
    #     return v
    

# DSPy signature for the extraction task: one text input, one list-of-Reaction output.
# The class docstring below is the instruction text of the signature (it shows up
# as `instructions=` when the signature is displayed), so it is runtime text and
# must not be edited as if it were ordinary documentation.
class QA(dspy.Signature):
    """Your objective is to extract all chemical reactions described in the text, regardless of prominence. Each reaction should include:
    1. Explicitly mentioned precursors and target products.
    2. Reaction type, as classified into solid-state, sol-gel, hydrothermal, co-precipitation, flux, or others.

    Ensure reasoning includes all reactions described, mentioning each reaction process briefly.

    Examples are provided for reference."""
    # Input: the text to extract reactions from.
    text: str = dspy.InputField(desc='a piece of text which may contains chemical reactions')
    # Output: the extracted reactions; the type is Optional, and the field description asks for null when none are found.
    reactions: Optional[list[Reaction]] = dspy.OutputField(desc='the reactions extracted from the text, return null if no reaction found')

class ExtractReactionWithType(dspy.Module):
    """DSPy module that extracts reactions from text with a chain-of-thought predictor.

    The module wraps a single ``dspy.ChainOfThought`` built on the ``QA``
    signature and exposes it as ``self.predictor``.
    """

    def __init__(self):
        # Let dspy.Module set up its own internal state before sub-modules are
        # attached; skipping this call leaves the base class uninitialised.
        super().__init__()
        self.predictor = dspy.ChainOfThought(signature=QA)

    def forward(self, text):
        """Run the predictor on ``text`` and return its prediction unchanged.

        Args:
            text: The text passed to the predictor as the ``text`` input field.

        Returns:
            The prediction object produced by ``self.predictor``.
        """
        return self.predictor(text=text)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create a fresh module, then load previously saved state into it from the
# path 'compiled_direct_with_element_var_version_2' (relative to the working directory).
compiled_program = ExtractReactionWithType()
compiled_program.load('compiled_direct_with_element_var_version_2')

In [3]:
# Keep only a hand-picked subset of the loaded demos: positions 0, 2 and 3.
demos = compiled_program.predictor.demos
keeped_demos = [demos[position] for position in (0, 2, 3)]
# The demos are read through the predictor but written on its inner `_predict` object.
compiled_program.predictor._predict.demos = keeped_demos
# Display the demos as seen through the predictor after the replacement.
compiled_program.predictor.demos

[{'augmented': True,
  'text': '(1-x)(Ca0.88Sr0.12)TiO3–x(Bi0.5Na0.5)TiO3 [hereafter referred to as (1−x)CST–xBNT] ceramics (x\u2009=\u20090.025, 0.050, 0.075, 0.10) were prepared by the conventional solid-state reaction method. The starting materials were high-purity grade powders (>\u200999%): CaCO3, SrCO3, Bi2O3, Na2CO3 and TiO2. CST and BNT powders were synthesized separately by mixing the starting materials according to the desired stoichiometry, and milled with ZrO2 balls and deionized water for 6 h in nylon jars. After drying, the powders were calcined at 1100 °C for 3 h and 850 °C for 3 h, respectively. The resulting powders were re-milled for 6 h. After that, the dried and re-milled mixtures were granulated with appropriate poly vinyl alcohol (PVA) as binder and uniaxial pressed at 300 MPa into cylinders with approximate size of 15 mm in diameter and 7.3 mm in thickness. The cylinders with x\u2009=\u20090.05, 0.075 and 0.1 were sintered in air at 1225–1300 °C for 3 h, and the 

In [4]:
# The signature stored with the compiled program is not compatible with the
# current version, so its instructions are overwritten here with the current
# instruction text.
compiled_program.predictor.extended_signature.instructions = """Your objective is to extract all chemical reactions described in the text, regardless of prominence. Each reaction should include:
1. Explicitly mentioned precursors and target products.
2. Reaction type, as classified into solid-state, sol-gel, hydrothermal, co-precipitation, flux, or others.

Ensure reasoning includes all reactions described, mentioning each reaction process briefly.

Examples are provided for reference."""

In [5]:
# Display the predictor's extended signature to check the instructions and fields.
compiled_program.predictor.extended_signature

StringSignature(text -> reasoning, reactions
    instructions='Your objective is to extract all chemical reactions described in the text, regardless of prominence. Each reaction should include:\n1. Explicitly mentioned precursors and target products.\n2. Reaction type, as classified into solid-state, sol-gel, hydrothermal, co-precipitation, flux, or others.\n\nEnsure reasoning includes all reactions described, mentioning each reaction process briefly.\n\nExamples are provided for reference.'
    text = Field(annotation=str required=True json_schema_extra={'desc': 'a piece of text which may contains chemical reactions', '__dspy_field_type': 'input', 'prefix': 'Text:'})
    reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${reasoning}', '__dspy_field_type': 'output'})
    reactions = Field(annotation=Union[list[Reaction], NoneType] required=True json_schema_extra={'desc': 'the reactions extracted from 

In [6]:
import re
from sisyphus.utils.tenacity_retry_utils import pydantic_validate_retry_wraps
from sisyphus.utils.helper_functions import return_valid


# Case-insensitive, whole-word match for headings about experiments,
# synthesis, preparation, processes or methods.
exp_section_pattern = re.compile(
    r'\b(?:experiment(?:al|s|ing|ed)?|synthesis(?:es|ing|ed)?|preparation(?:s|al|ed|ing)?|process(?:es|ion|ing)?|method(?:s)?)\b',
    flags=re.IGNORECASE,
)


def filter_with_kw(doc):
    """Tell whether a document's sub-titles contain one of the section keywords.

    Args:
        doc: Object whose ``metadata['sub_titles']`` is the string to search.

    Returns:
        True when ``exp_section_pattern`` matches anywhere in the sub-titles,
        False otherwise.
    """
    heading_text = doc.metadata['sub_titles']
    return exp_section_pattern.search(heading_text) is not None

@return_valid
@pydantic_validate_retry_wraps
def extract(doc, extract_program):
    """Build a prompt context from a document and return the extracted reactions.

    The context consists of the title, the sub-titles (one per line, split on
    '/') and the paragraph text, in that order.

    Args:
        doc: Object providing ``page_content`` and a ``metadata`` mapping with
            'title' and 'sub_titles' entries.
        extract_program: Callable invoked as ``extract_program(text=...)``;
            its result must expose a ``reactions`` attribute.

    Returns:
        The ``reactions`` attribute of the program's prediction.
        NOTE(review): the two decorators are defined outside this file and may
        post-process or retry this call — confirm their behaviour there.
    """
    paragraph = doc.page_content
    heading = doc.metadata['title']
    subtitle_lines = '\n'.join(doc.metadata['sub_titles'].split('/'))
    context = f'title:\n{heading}\nsubtitles:\n{subtitle_lines}\nparagraph:\n{paragraph}'
    return extract_program(text=context).reactions

In [None]:
# Work-in-progress DSPy signature: it declares no input or output fields yet.
# NOTE(review): the docstring is the signature's instruction text and is cut off
# mid-sentence ("...check if the "); it also reads "You objective" — finish and
# correct it before this signature is used.
class UnidentifiedTerm(dspy.Signature):
    """You are an expert chemist. You objective is to check if the """

def resolve(doc, extract_program):
    """Placeholder that is not implemented yet.

    Accepts ``doc`` and ``extract_program`` but ignores both and returns None.
    """
    pass

In [7]:
# Name handed to get_plain_articledb() to select the article database to read from.
ARTICLE = '40_with_good_title'
# Name handed to get_create_resultdb() to select the result database to write to.
TARGET = 'inorganic_with_validation'

In [8]:
from functools import partial
from sisyphus.chain import Filter, Writer
from sisyphus.chain.customized_elements import customized_extractor
from sisyphus.utils.helper_functions import get_plain_articledb, get_create_resultdb
from sisyphus.chain.chain_elements import run_chains_with_extarction_history_multi_threads

# Source database of articles, selected by the ARTICLE name.
article_db = get_plain_articledb(ARTICLE)
# Document source restricted by filter_with_kw (keyword match on the sub-titles).
# NOTE(review): with_abstract=True presumably also lets abstracts through — confirm in sisyphus.
article_getter = Filter(article_db, filter_func=filter_with_kw, with_abstract=True)
# Result database selected by the TARGET name, created with the Reaction model.
result_db = get_create_resultdb(TARGET, Reaction)
# Bind the loaded program so the extractor only needs the document argument.
compiled_extractor = partial(extract, extract_program=compiled_program)
# NOTE(review): 'thread' and 5 presumably select thread-based execution with 5 workers — confirm in sisyphus.
compiled_extract_el = customized_extractor(compiled_extractor, 'thread', 5)



In [9]:
# Compose the pipeline with `+`: filtered articles -> extractor -> writer to result_db.
# Nothing in the visible cells runs the chain; it is only constructed here.
compiled_extract_chain = article_getter + compiled_extract_el
chain = compiled_extract_chain + Writer(result_db)

In [None]:
# NOTE(review): stray, never-executed cell. `d` is not defined in any visible
# cell, so running it would raise NameError — delete this cell or replace it
# with the intended statement.
d