In [1]:
import dspy

# Route every DSPy predictor in this notebook through GPT-4o.
# cache=False is passed to dspy.LM so repeated calls are not answered from DSPy's cache.
lm = dspy.LM('openai/gpt-4o', cache=False)
dspy.configure(lm=lm)

In [2]:
from sisyphus.chain.chain_elements import Filter, Writer, run_chains_with_extarction_history_multi_threads
from sisyphus.utils.helper_functions import get_create_resultdb, get_plain_articledb

In [3]:
import re

# Section titles containing "result" (any case) are eligible.
pattern = re.compile(r'result', re.I)
# "band gap" / "bandgap(s)" with either capitalisation, or the symbol "Eg".
bg_pattern = re.compile(r'\b([Bb]and\s*[Gg]aps?|Eg)\b')
# The unit "eV" as a standalone token.
unit_pattern = re.compile(r'\beV\b')


def filter_by_section_bg(doc):
    """Decide whether a paragraph should be sent to band-gap extraction.

    A document passes when both of the following hold:
      * its ``metadata['sub_titles']`` matches ``pattern``, equals ``'table'``,
        or is the empty string;
      * its ``page_content`` mentions a band gap (``bg_pattern``) and the unit
        eV (``unit_pattern``).

    Args:
        doc: object exposing ``metadata`` (a mapping with a ``'sub_titles'``
            string) and ``page_content`` (a string).

    Returns:
        bool: True when the paragraph is kept, False otherwise.
    """
    section = doc.metadata['sub_titles']
    section_ok = (
        pattern.search(section) is not None
        or section == 'table'
        or section == ''
    )
    if not section_ok:
        return False

    body = doc.page_content
    mentions_band_gap = bg_pattern.search(body) is not None
    return mentions_band_gap and unit_pattern.search(body) is not None

In [4]:
# Open the 'bg_411' article database (presumably the parsed-paper store — confirm against sisyphus)
# and build a Filter over it that applies the section/keyword test defined above.
# NOTE(review): `getter` is not referenced again in the visible cells; the chains below use `retrieve`.
articledb = get_plain_articledb('bg_411')
getter = Filter(articledb, filter_func=filter_by_section_bg)

In [5]:
import os
# os.listdir() returns entries in arbitrary order, so sort before slicing to make
# the 20-file sample identical on every run and on every machine.
files = sorted(os.listdir('articles_processed'))[:20]

In [6]:
from pydantic import BaseModel, Field
from typing import Literal

# Schema of one extracted band-gap record.
# The Field descriptions below are runtime text: they end up in the schema/prompt given to the LLM,
# so rewording them changes extraction behaviour.
# Documented with `#` comments on purpose: a class docstring would be emitted as the
# `description` of the pydantic JSON schema and therefore alter what the model sees.
class BandGap(BaseModel):
    # Material the value belongs to (formula, with phase label when available).
    material: str = Field(description='The material name, please make sure it is a valid chemical formula rather than an acronym. Keep the phase information if it is available, e.g. α-[formula], or [formula]-II')
    # Kept as a string because it may be a number, a range or an inequality.
    band_gap: str = Field(description='The band gap value in electron volts (eV). It can be a specific number, a range (lower to upper bound), or an inequality (greater than or less than). Examples: 5, 2-3, >5, or <2.')
    # Closed vocabulary; 'unknown' is the fallback when the text does not say.
    direct_or_indirect: Literal['direct', 'indirect', 'unknown'] = Field(description='Whether the band gap type is direct or indirect.') 
    # Closed vocabulary; 'unknown' is the fallback when the text does not say.
    optical_or_electronic: Literal['optical', 'electronic', 'unknown'] = Field(description='Whether the band gap type is optical or electronic. If it was calculated by DFT method, it should be electronic. Choose unknown if it is not clear.')

In [7]:
from dspy import Example
# Few-shot examples: each `text_N` is a paper title followed by its abstract, and
# `informations_N` is the list of BandGap records expected to be extracted from it.
# The texts are used verbatim as prompt demos, so extraction artefacts (missing spaces etc.) are kept as-is.
text_1 = """Magnetic and Optical Properties of Mn1-xZnxFe2O4 Nanoparticles
Mn1-xZnxFe2O4 (x=0.0-1.0) NPs (MZF NPs) were synthesized byacitric acid assisted sol−gel process. MZF NPs show super paramagnetic characteristics at room temperature. Saturation magnetization (Ms) of MnFe2O4 NPs is 70.52 emu/g isvery close tothe bulk saturation magnetization value of 80 emu/g. The observed Ms = 35.90 emu/g value forZnFe 2O4 particles ismuch greater than the bulk Ms value of 5 emu/g. This case isattributed tocation distribution change from normal spinel tomixed structure. The small Mr/Ms ratios (the maximum 0.147) specify uniaxial anisotropy in the Mn1-xZnxFe2O4 NPs. The average crystallite diameter (Dmag)was evaluated from magnetic analyses. The obtained Dmag values are between 27.67 and 33.60 nm and this range isingreat accordance with theresults calculated from XRD measurements. Among the NPs, the samples with more zinc content show higher diffuse reflectance. The optical direct band gap of MZF NPs is found to decrease from 2.1 to 1.90 eV as the zinc content rises."""
# One record for the whole composition series; the reported 2.1 -> 1.90 eV span is encoded as a range.
informations_1 = [BandGap(material='Mn1-xZnxFe2O4 (x = 0.0 - 1.0)', band_gap='1.9-2.1', direct_or_indirect='direct', optical_or_electronic='optical')]
# Second few-shot text: the band gap is stated only as a lower bound ("> 2.70 eV") for a family of selenides.
text_2 = """Toward the Rational Design of Mid‐Infrared Nonlinear Optical Materials with Targeted Properties via a Multi‐Level Data‐Driven Approach
Design and exploratory synthesis of new mid-infrared (mid-IR) nonlinear optical (NLO) materials are urgently needed for modern laser science and technology because the widely used IR NLO crystals still suffer from their inextricable drawbacks. Herein, a multi-level data-driven approach to realize fast and efficient structure prediction for the exploration of promising mid-IR NLO materials is proposed. Techniques based on machine learning, crystal structure prediction, high-throughput calculation and screening, database building, and experimental verification are tightly combined for creating pathways from chemical compositions, crystal structures to rational synthesis. Through this data-driven approach, not only are all known structures successfully predicted but also five thermodynamically stable and 50 metastable new selenides in AIBIIISe2 systems (AI = Li, Na, K, Rb, and Cs; BIII = Al and Ga) are found, among which eight outstanding compounds with wide bandgaps (> 2.70 eV) and large SHG responses ( > 10 pm V-1) are suggested. Moreover, the predicted compounds I 4 ¯ 2d-LiGaSe2 and I4/mcm-KAlSe2 are successfully obtained experimentally. In particular, LiGaSe2 exhibits a robust SHG response ( ≈ 2 × AGS) and long IR absorption edge that can cover two atmospheric windows (3-5, 8-12 µm). Simultaneously, this new research paradigm is also applicative for discovering new materials in other fields."""
# The abstract reports these band gaps only as a lower bound ("> 2.70 eV"), so the label uses the
# inequality form requested by the BandGap.band_gap description instead of a bare number.
informations_2 = [BandGap(material='AIBIIISe2 (AI = Li, Na, K, Rb, Cs; BIII = Al, Ga)', band_gap='>2.70', direct_or_indirect='unknown', optical_or_electronic='unknown')]
# text_3 = """Mg-Si-As: An Unexplored System with Promising Nonlinear Optical Properties
# Two new non-centrosymmetric ternary compounds, MgSiAs2 and Mg3Si6As8, are discovered via metal flux and solid-state synthetic methods. MgSiAs2 belongs to the well-known II-IV-V2 family, which is extensively studied experimentally and computationally for their optical properties. MgSiAs2 is computationally predicted but not experimentally known prior to this work. Mg3Si6As8 crystallizes in a new non-centrosymmetric cubic chiral structure type with the Pearson symbol cP68. The syntheses, crystal structure, thermal and chemical stabilities, electronic structures, and optical properties of these two new compounds are investigated in this work. Optical absorption measurements and electronic structure calculations reveal the two compounds to be direct or pseudo-direct bandgap semiconductors (1.8-2 eV). The crystal structures of both compounds are non-centrosymmetric, though Mg3Si6As8 belongs to the 432 chiral crystal class, which is optically active but does not exhibit second harmonic generation (SHG) behavior. The SHG response of MgSiAs2 is 60% of that for AgGaS2, but MgSiAs2 exhibits a higher laser damage threshold than AgGaS2 at 33.2 MW cm-2."""
# informations_3 = [BandGap(material='MgSiAs2', band_gap='1.8-2', direct_or_indirect='direct', optical_or_electronic='optical'), BandGap(material='Mg3Si6As8', band_gap='1.8-2', direct_or_indirect='direct', optical_or_electronic='optical')]
# examples = [
#     Example(text=text_1, informations=informations_1).with_inputs('text'),
#     Example(text=text_2, informations=informations_2).with_inputs('text'),
#     Example(text=text_3, informations=informations_3).with_inputs('text')
# ]

In [8]:
# Third few-shot example: a body paragraph mixing calculated and experimental values,
# including values quoted for a reference material (LBO).
text = """To better understand the origin of the outstanding optical properties, the electronic structures of CBBO and δ-BBO were investigated using DFT calculations based on their experimental crystal structures (Supporting Information, Figure S12). The calculated band gap of CBBO (8.25 eV) aligns closely with its experimental value (8.05 eV), showing a similarity to LBO (calc. 8.22 eV, expt. 8.37 eV). Meanwhile, the band gap of δ-BBO was determined to be 8.61 eV through calculations."""

# Calculated values are labelled 'electronic'; experimental ones are left 'unknown'.
information = [
    BandGap(material='CBBO', band_gap='8.25', direct_or_indirect='unknown', optical_or_electronic='electronic'),
    BandGap(material='CBBO', band_gap='8.05', direct_or_indirect='unknown', optical_or_electronic='unknown'),
    BandGap(material='δ-BBO', band_gap='8.61', direct_or_indirect='unknown', optical_or_electronic='electronic'),
    BandGap(material='LBO', band_gap='8.22', direct_or_indirect='unknown', optical_or_electronic='electronic'),
    BandGap(material='LBO', band_gap='8.37', direct_or_indirect='unknown', optical_or_electronic='unknown'),
]

# Training set for the few-shot compiler: `text` is the only input field of each example.
examples = [
    Example(text=sample_text, informations=expected).with_inputs('text')
    for sample_text, expected in (
        (text_1, informations_1),
        (text_2, informations_2),
        (text, information),
    )
]

In [9]:
# DSPy signature for band-gap extraction: one `text` input, a list of BandGap records as output.
# The docstring is runtime text, not documentation: it serves as the task instruction for the model
# and is also passed verbatim as the `instruction` of the LangChain prompt via `ExtractBandGap.__doc__`.
# Editing it (even fixing spelling) changes the prompt.
class ExtractBandGap(dspy.Signature):
    """Extract band gap information from given text or table, including refered bandgaps.
    Note: 
    - Record both **electronic** and **optical** band gaps if they are mentioned.
    - If multiple **calculated** band gap values are provided for the same material, record only the **highest** value.
    - Include both **experimental** and **calculated** band gap values if they are mentioned.
    - Additionally, extract any **referenced band gaps** that the author cites for comparison.
    """
    # Paragraph or table text to extract from (no explicit type annotation is given).
    text = dspy.InputField()
    # Every band gap found in `text`, one BandGap per (material, value) pair.
    informations: list[BandGap] = dspy.OutputField()
# Compile a plain Predict module with the labelled `examples` attached as few-shot demonstrations.
lm_extractor_3_shots = dspy.LabeledFewShot().compile(dspy.Predict(ExtractBandGap), trainset=examples)
# lm_extractor_1_shot = dspy.LabeledFewShot().compile(dspy.Predict(ExtractBandGap), trainset=example)

In [10]:
def extract(doc):
    """Run the few-shot DSPy extractor on one document.

    Args:
        doc: object whose ``page_content`` holds the text to extract from.

    Returns:
        The ``informations`` field of the prediction (declared as
        ``list[BandGap]`` on the signature).
    """
    return lm_extractor_3_shots(text=doc.page_content).informations

def cem_collector(model):
    """Collect the chemical-entity mentions carried by one record.

    Args:
        model: record exposing a ``material`` attribute.

    Returns:
        list: a one-element list containing ``model.material``.
    """
    material_name = model.material
    return [material_name]

def cem_updater(model, acro_dict, drops):
    """Turn one extracted record into the dict to keep, or ``None`` to discard it.

    Args:
        model: record exposing ``material``, ``band_gap`` and ``model_dump()``
            (a ``BandGap`` in this notebook).
        acro_dict: mapping from a material mention to its replacement name, or
            ``None`` when no mapping is available.
        drops: collection of material mentions to discard, or ``None`` when
            nothing is to be dropped.

    Returns:
        ``None`` when the material is listed in ``drops`` or the band gap is the
        literal string ``'unknown'``; otherwise the dumped record, with
        ``material`` replaced by its ``acro_dict`` entry when one exists.
    """
    # Guard `drops` the same way `acro_dict` is guarded: a missing drop list
    # must not raise TypeError on the membership test.
    if drops is not None and model.material in drops:
        return None
    if model.band_gap == 'unknown':
        return None

    record = model.model_dump()
    if acro_dict is not None and model.material in acro_dict:
        record['material'] = acro_dict[model.material]
    return record

In [11]:
from sisyphus.chain.adapters import ParagraphExtractionAdapter, ParagraphSelectionAdapter, ParagraphCemValidationAdapter

# Pipeline stages. NOTE(review): the sisyphus adapters are not visible here; the comments below
# describe only how they are wired together — confirm argument semantics against sisyphus.
retrieve = Filter(articledb)  # no filter_func, unlike `getter` defined earlier
selection = ParagraphSelectionAdapter(filter_by_section_bg, True, 'Band gaps tables:')  # reuses the section/keyword test
extraction = ParagraphExtractionAdapter(extract, 'thread', 5)  # presumably 5 worker threads — TODO confirm
write = Writer(get_create_resultdb('bg_20_results_with_standard'))

# Natural-language rules handed to the CEM validation adapter. They are prompt text
# (the drop rule appears as the objective of the LM call); rewording them changes model behaviour.
drop_rule = """Please drop chemicals that exist solely as anions (e.g., OH⁻, Cl⁻, SO₄²⁻)"""
standard_rule = """You are provided with chemical formulas which is type of inorganic materials. You are required to standardize those formulas as follow rules and examples:
remove literal descriptional part of each formula which is not part of the formula: e.g. "calculated", "obtained"
formula with phase information should be standardized as follow: [phase]-[formula] or [formula]-[phase]. e.g. α-BaTiO3
Note:
- If the mention is a valid chemical formula or acronym, keep it as is.
- Keep the part which describe the formula represents a group of materials, e.g. "Mn1-xZnxFe2O4 (x=0.0-1.0) NPs or AIBIIISe2 (AI = Li, Na, K, Rb, Cs; BIII = Al, Ga)"
"""
# cem_collector supplies the mentions to validate; cem_updater applies the outcome to each record.
validation = ParagraphCemValidationAdapter(cem_collector, cem_updater, drop_rule=drop_rule, standard_rule=standard_rule)
# The full chain ends with the Writer; the `_test` variant stops after validation so results can be inspected.
adapted_chain = retrieve + selection + extraction + validation + write
adapted_chain_test = retrieve + selection + extraction + validation 



In [16]:
# Bind the chain output to a name instead of leaving it reachable only through IPython's `_`,
# then display it as the cell result.
results = adapted_chain_test.compose('10.1002&sol;anie.201700540.html')
results

[DocInfo(doc=Document(metadata={'source': '10.1002&sol;anie.201700540.html', 'doi': '10.1002/anie.201700540', 'sub_titles': '', 'title': 'Fluorooxoborates: Beryllium‐Free Deep‐Ultraviolet Nonlinear Optical Materials without Layered Growth'}, page_content='Fluorooxoborates: Beryllium‐Free Deep‐Ultraviolet Nonlinear Optical Materials without Layered Growth\n\nAbstract\nDeep-ultraviolet nonlinear optical (DUV NLO) crystals are the key materials to extend the output range of solid-state lasers to below 200 nm. The only practical material KBe2BO3F2 suffers high toxicity through beryllium and strong layered growth. Herein, we propose a beryllium-free material design and synthesis strategy for DUV NLO materials. Introducing the (BO3F)4-, (BO2F2)3-, and (BOF3)2- groups in borates could break through the fixed 3D B-O network that would produce a larger birefringence without layering and simultaneously keep a short cutoff edge down to DUV. The theoretical and experimental studies on a series of 

In [17]:
# NOTE(review): `_` is IPython's "last displayed output" variable, so this only captures the chain
# result when the previously executed cell was the `compose(...)` call; any other cell output in
# between silently binds `results` to something else.
results = _

In [18]:
# Print the extracted records (`.info`) of every item returned by the chain, one item per line.
for result in results:
    print(result.info)

[{'material': 'LiB6O9F', 'band_gap': '>8.0', 'direct_or_indirect': 'unknown', 'optical_or_electronic': 'electronic'}, {'material': 'Li2B6O9F2', 'band_gap': '>8.0', 'direct_or_indirect': 'unknown', 'optical_or_electronic': 'electronic'}, {'material': 'Li2B3O4F3', 'band_gap': '>8.0', 'direct_or_indirect': 'unknown', 'optical_or_electronic': 'electronic'}, {'material': 'Na2B6O9F2', 'band_gap': '>8.0', 'direct_or_indirect': 'unknown', 'optical_or_electronic': 'electronic'}, {'material': 'LiB3O5', 'band_gap': '8.28', 'direct_or_indirect': 'unknown', 'optical_or_electronic': 'electronic'}, {'material': 'LiB3O5', 'band_gap': '8.00', 'direct_or_indirect': 'unknown', 'optical_or_electronic': 'unknown'}]
[{'material': 'Li2B6O9F2', 'band_gap': '8.05', 'direct_or_indirect': 'unknown', 'optical_or_electronic': 'electronic'}, {'material': 'LiB6O9F', 'band_gap': '8.37', 'direct_or_indirect': 'unknown', 'optical_or_electronic': 'electronic'}, {'material': 'Li2B3O4F3', 'band_gap': '8.425', 'direct_or_i

In [19]:
# Dump the 4 most recent LM interactions (prompts and responses) to check what was actually sent.
lm.inspect_history(4)





[34m[2025-02-18T10:46:22.773348][0m

[31mSystem message:[0m

Your input fields are:
1. `cems` (list[str])

Your output fields are:
1. `reasoning` (str)
2. `dropped` (list[str])

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## cems ## ]]
{cems}

[[ ## reasoning ## ]]
{reasoning}

[[ ## dropped ## ]]
{dropped}        # note: the value you produce must be pareseable according to the following JSON schema: {"type": "array", "items": {"type": "string"}}

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Please drop chemicals that exist solely as anions (e.g., OH⁻, Cl⁻, SO₄²⁻)


[31mUser message:[0m

[[ ## cems ## ]]
["(BOF3)2-", "Na2B6O9F2", "(BO3)3-", "Li2B3O4F3", "(BO2F2)3-", "Li2B6O9F2", "LiB6O9F", "(BO4)5-", "LBO", "(BO3F)4-"]

Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## dropped ## ]]` (must be formatted as a valid Python list[s

In [13]:
# Run the full DSPy chain (including the Writer) over the selected `files` from 'articles_processed'.
# NOTE(review): the meaning of the positional arguments (5, 'bg_20_results_with_standard_re', None)
# is defined in sisyphus and not visible here — presumably worker count and a run/history name; confirm.
run_chains_with_extarction_history_multi_threads(adapted_chain, 'articles_processed', 5, 'bg_20_results_with_standard_re', None, files)

100%|██████████| 20/20 [00:58<00:00,  2.92s/it]


### Sometimes, when there are too many results to return, DSPy fails to parse the output, so we switch to LangChain with the `json_schema` response format instead.

In [14]:
import json

result_db = get_create_resultdb('bg_20_results_with_standard')
# Load the records BEFORE opening the output file: open(..., 'w') truncates it immediately,
# so a failure inside load_as_json would otherwise leave an empty JSON file behind.
records = result_db.load_as_json('4o', '', 'bg_20_results_with_standard', with_doi=True)
with open('bg_20_results_with_standard.json', 'w') as f:
    json.dump(records, f, indent=2)

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from typing import Optional
# LangChain alternative to the DSPy extractor: GPT-4o at temperature 0 with a structured (json_schema) response.
model = ChatOpenAI(model='gpt-4o', temperature=0.0)
# Wrapper so the structured output is a single object holding the list of BandGap records.
# `band_gaps` has no default, so the field must be present but may be null.
# Documented with `#` comments because a class docstring would be added to the generated JSON schema.
class BandGaps(BaseModel):
    band_gaps: Optional[list[BandGap]]

# Prompt: fixed system role, then the paper text wrapped in explicit markers followed by the instruction.
extract_template = ChatPromptTemplate(
    [
        ('system', 'You are a domain expert of nonlinear optical materials. Please extract band gap information from the following paper.'),
        ('user', '[START OF PAPER]{paper}[END OF PAPER]\n\nInstruction:\n{instruction}')
    ]
)
# Prompt piped into the model, which is constrained to return a BandGaps object.
extraction_chain = extract_template | model.with_structured_output(BandGaps, method='json_schema')

def langchain_extract(doc):
    """Extract band gaps from one document through the LangChain structured-output chain.

    The DSPy signature's docstring is reused as the ``instruction`` so both
    extractors are driven by the same task description.

    Args:
        doc: object whose ``page_content`` holds the text to extract from.

    Returns:
        The ``band_gaps`` field of the structured response; the schema
        declares it as ``Optional[list[BandGap]]``, so it may be ``None``.
    """
    prompt_inputs = {
        'paper': doc.page_content,
        'instruction': ExtractBandGap.__doc__,
    }
    return extraction_chain.invoke(prompt_inputs).band_gaps

# Same chain as `adapted_chain`, with the DSPy extraction stage swapped for the LangChain one;
# retrieval, selection, validation and the Writer are the objects created earlier.
extraction_lc = ParagraphExtractionAdapter(langchain_extract, 'thread', 5)
chain_lc = retrieve + selection + extraction_lc + validation + write

In [29]:
# Run the LangChain-based chain over 'articles_processed'; no explicit file list is passed here (last argument is None).
# NOTE(review): argument semantics (5, 'bg_extraction', None, None) are defined in sisyphus — confirm before changing.
run_chains_with_extarction_history_multi_threads(chain_lc, 'articles_processed', 5, 'bg_extraction', None, None)

100%|██████████| 1/1 [00:39<00:00, 39.44s/it]
