##### Assumes you have already created the database from the test files

## Labeling

In [None]:
import re

from sisyphus.utils.helper_functions import get_plain_articledb, get_create_resultdb
from sisyphus.chain.label import BaseLabeler, Labeling, save_labeled_paras_wrapper
from sisyphus.chain.chain_elements import Filter, Writer, run_chains_with_extarction_history_multi_threads


class BandGapLabler(BaseLabeler):
    # Keyword-based labeler configuration for the 'band_gap' property.
    # NOTE(review): how BaseLabeler applies `regex_pattern` (search vs. match)
    # is not visible in this notebook — confirm in sisyphus.chain.label.
    property = 'band_gap'
    # Case-insensitive match for "band gap" / "band-gap" / "bandgap" and
    # "energy gap" / "energy-gap", each with an optional plural "s".
    # The plural is needed on both alternatives: the trailing \b would otherwise
    # reject "energy gaps" while still accepting "band gaps".
    regex_pattern = re.compile(r'\b(band[- ]?gaps?|energy[- ]?gaps?)\b', re.I)
    # You may need to define functions for semantic_label and llm_label if needed

# Build the labeling stage and register the band-gap labeler on it.
labeler = Labeling()
labeler.add_labeler(BandGapLabler())

# Articles are read from the 'nlo' article database.
database = get_plain_articledb('nlo')
loader = Filter(database)


# NOTE(review): presumably stores the labeled paragraphs under 'nlo_labeled' —
# confirm in sisyphus.chain.label.
save_labeled_paras = save_labeled_paras_wrapper('nlo_labeled')

# Chain elements are composed with `+`: load -> label -> save.
chain = loader + labeler + save_labeled_paras

# NOTE(review): the positional arguments below are not documented in this
# notebook; check the helper's signature for what 'test_file', 10 and
# 'nlo_labeled' mean before changing them.
run_chains_with_extarction_history_multi_threads(
    chain,
    'test_file',
    10,
    'nlo_labeled'
)


100%|██████████| 1/1 [00:00<00:00, 309.59it/s]


## Extraction

In [None]:
from dotenv import load_dotenv
_ = load_dotenv()

from langchain_openai import ChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import List, Optional, Literal

from sisyphus.chain.paragraph import Paragraph
from sisyphus.chain.extract import BaseExtractor, Extraction

# your models
# Result schema for a single band-gap record; all three fields are annotated
# Optional[...].
# NOTE(review): the class docstring and the Field descriptions are presumably
# forwarded to the LLM as part of the output schema — treat them as prompt text.
class Bandgap(BaseModel):
    """Extract bandgap information from scientific papers."""
    bandgap: Optional[str] = Field(description="Bandgap value with unit, e.g., '1.5 eV'")
    bandgap_type: Optional[Literal['direct', 'indirect']] = Field(description="Type of bandgap: direct or indirect")
    measurement_method: Optional[str] = Field(description="Method used to measure the bandgap, e.g., 'UV-Vis spectroscopy'")

# Top-level result model: a list of Bandgap records. BGExtractor attaches this
# model to each paragraph via set_pydantic_model.
class Records(BaseModel):
    records: List[Bandgap]

# your prompt; it must contain a `{text}` field
# Two-message chat prompt. The user message exposes two template variables:
# `{text}` (wrapped in START/END OF PAPER markers) and `{instruction}`.
nlo_prompt = ChatPromptTemplate(
    [
        ('system', 'you are a helpful assistant that extracts specific information from scientific papers.'),
        ('user', '[START OF PAPER]\n{text}\n[END OF PAPER]\n\nInstruction:\n{instruction}')
    ]
)

class BGExtractor(BaseExtractor):
    # Extractor configuration for the 'band_gap' property, using GPT-4.1 at temperature 0.
    target_properties = ['band_gap']
    model = ChatOpenAI(model_name='gpt-4.1', temperature=0)

    def create_model_prompt(self, paragraphs): # note the paragraphs are merged paragraphs
        """Attach the ``Records`` result model and the band-gap prompt to each paragraph.

        Every paragraph receives ``nlo_prompt`` together with its own fresh
        ``{'instruction': ...}`` mapping.
        """
        for merged_paragraph in paragraphs:
            instruction_text = 'Extract bandgap information including bandgap value with unit, bandgap type (direct or indirect), and measurement method from the given scientific paper paragraph. Provide the results in a structured format.'
            merged_paragraph.set_pydantic_model(Records)
            merged_paragraph.set_prompt(nlo_prompt, {'instruction': instruction_text})

def load_from_labeled_db(docs):
    """Convert labeled documents into ``Paragraph`` objects.

    Each document is passed to ``Paragraph.from_labeled_document`` together
    with its zero-based position in *docs*, which is used as the id argument.
    """
    loaded = []
    for position, labeled_doc in enumerate(docs):
        loaded.append(Paragraph.from_labeled_document(labeled_doc, position))
    return loaded

# Build the extraction stage and register the band-gap extractor on it.
extractor = Extraction()
bg_extractor = BGExtractor()
extractor.add_extractor(bg_extractor)

from sisyphus.chain import Writer, Filter
from sisyphus.utils.helper_functions import get_create_resultdb, get_plain_articledb


# Input: the 'nlo_labeled' database; output: the 'nlo_results' result database.
db = get_plain_articledb('nlo_labeled')
loader = Filter(db)
result_db = get_create_resultdb('nlo_results')
writer = Writer(result_db)

# load -> convert documents to Paragraph objects -> extract -> write
chain = loader + load_from_labeled_db + extractor + writer
chain.compose('10.1002&sol;adfm.201801589.html') # example file name




#### Case view 1: multiple properties extracted in parallel

In [None]:
# Just add multiple labelers to the Labeling instance and multiple extractors to the Extraction instance. The pipeline was built to support this kind of parallelism.

#### Case view 2: multiple properties merged extraction
- We use regex search, semantic search and an LLM to find the target paragraphs
- The targets are then merged into a broad context chunk and sent to the LLM. The result model is adjusted according to which properties are present

In [None]:
# some helper functions for labeling
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
import dspy


# Configure dspy to use GPT-4.1 (max 3000 tokens) as its language model.
lm = dspy.LM('openai/gpt-4.1', max_tokens=3000)
dspy.configure(lm=lm)
# Embedding model and Chroma collection used by the semantic_label_* helpers.
embeddings = OpenAIEmbeddings(model='text-embedding-3-large')
chroma_db = Chroma(collection_name='synthesis_embedding', embedding_function=embeddings)
# Sources whose paragraphs have already been added to chroma_db, so each
# article is embedded only once.
has_embedded = []

def retrieve(vector_store, source, query, sub_titles, k=3):
    """Run a similarity search for *query* restricted to one *source*.

    When *sub_titles* is non-empty the metadata filter additionally requires
    the ``sub_titles`` field to be one of the given titles; otherwise only the
    ``source`` field is constrained. Returns whatever
    ``vector_store.similarity_search`` returns for the top *k* hits.
    """
    if sub_titles:
        metadata_filter = {"$and": [{"sub_titles": {"$in": sub_titles}}, {"source": source}]}
    else:
        metadata_filter = {"source": source}
    hits = vector_store.similarity_search(query, k=k, filter=metadata_filter)
    return hits

def match_subtitles(docs, pattern):
    """Return the distinct sub-titles of *docs* that *pattern* matches.

    Args:
        docs: objects exposing ``metadata["sub_titles"]``.
        pattern: a compiled regular expression; ``pattern.search`` is applied
            to every distinct sub-title.

    Returns:
        A list of matching sub-titles without duplicates, in the order in
        which they first appear in *docs*.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; iterating a
    # set of strings would yield an order that can change between runs.
    distinct_titles = dict.fromkeys(doc.metadata["sub_titles"] for doc in docs)
    return [title for title in distinct_titles if pattern.search(title)]

def find_candidates(paragraphs, doc_candidates):
    """Map retrieved documents back to the paragraphs that own them.

    For each entry of *doc_candidates*, in order, the first paragraph whose
    ``document`` attribute compares equal is collected. Candidates without a
    matching paragraph are skipped.
    """
    not_found = object()
    matched = []
    for wanted in doc_candidates:
        owner = next((p for p in paragraphs if p.document == wanted), not_found)
        if owner is not not_found:
            matched.append(owner)
    return matched

def _semantic_label_in_sections(paragraphs, query, k, title_pattern):
    """Shared implementation behind the ``semantic_label_*`` helpers.

    Embeds the article's paragraphs into ``chroma_db`` the first time its
    source is seen, runs a similarity search for *query* limited to sections
    whose sub-title matches *title_pattern* (or to the whole source when no
    sub-title matches), and maps the hits back to paragraphs.

    Returns an empty list when *paragraphs* is empty.
    """
    if not paragraphs:
        # Nothing to label; also avoids an IndexError on paragraphs[0].
        return []
    source = paragraphs[0].metadata['source']
    section_titles = match_subtitles(paragraphs, title_pattern)
    if source not in has_embedded:
        chroma_db.add_documents([paragraph.document for paragraph in paragraphs])
        # Mark the source only after indexing succeeded, so a failed
        # add_documents call is retried on the next invocation.
        has_embedded.append(source)
    similar_docs = retrieve(chroma_db, source, query, section_titles, k)
    return find_candidates(paragraphs, similar_docs)


def semantic_label_property(paragraphs, query, k):
    """Label property paragraphs, searching sections whose title contains "result"."""
    return _semantic_label_in_sections(paragraphs, query, k, re.compile(r'result', re.I))


def semantic_label_synthesis(paragraphs, query, k):
    """Label synthesis paragraphs, searching experiment/preparation/method sections."""
    return _semantic_label_in_sections(
        paragraphs, query, k, re.compile(r'(experiment)|(preparation)|(method)', re.I)
    )

In [None]:
import re
from typing import Literal

import dspy
from langchain_openai import OpenAIEmbeddings

from sisyphus.heas.tabel import LabelTablesStrength
from sisyphus.utils.helper_functions import get_plain_articledb, get_create_resultdb
from sisyphus.chain.label import BaseLabeler, Labeling, save_labeled_paras_wrapper
from sisyphus.chain.chain_elements import Filter, Writer, run_chains_with_extarction_history_multi_threads


# Labeler configuration for the 'strength' property (text paragraphs).
class StrengthLabeler(BaseLabeler):
    property = 'strength'
    # Matches the stand-alone units MPa/GPa, or a number (optionally decimal)
    # followed by a percent sign. Case-sensitive: no re.I flag is given.
    regex_pattern = re.compile(r'(\b(MPa|GPa)\b|\d+(\.\d+)?\s*%)')
    # Query text used for the semantic (embedding) search below.
    query = "The stress-strain curve of alloy, describes yield strength (ys), tensile strength (uts) and elongation properties, for example, CoCuFeMnNi shows tensile strength of 1300 MPa and total elongation of 20%."

    def semantic_label(self, paragraphs):
        # Delegates to semantic_label_property, asking for the top 5 hits.
        return semantic_label_property(paragraphs, self.query, k=5)

# dspy signature with one input field (`table`) and one boolean output field
# (`contains`). The docstring below is the instruction text of the signature,
# so it is kept verbatim.
# NOTE(review): this definition shadows the `LabelTablesStrength` imported from
# sisyphus.heas.tabel at the top of this cell — confirm which one is intended.
class LabelTablesStrength(dspy.Signature):
    """You are an expert in materials science and mechanical testing. Given the following CSV table from a scientific paper on high entropy alloys, determine whether it contains at least one tensile or compressive test property.

    Relevant Properties for Classification:
    A table should be classified as relevant if it contains at least one of the following:

    Yield Strength (YS) (MPa)
    Ultimate Tensile Strength (UTS) (MPa)
    Compressive Strength (MPa)
    Strain (percentage or as a ratio, e.g., true strain or elongation)
    Exclusions:
    Do not classify table as relevant if they only mention:

    Fracture strength
    Hardness (e.g., Vickers, Brinell, Rockwell)
    Fatigue strength
    Shear strength"""
    table: str = dspy.InputField()
    contains: bool = dspy.OutputField()


class StrengthTableLabler(BaseLabeler):
    # LLM-based labeler for the 'strength' property that only considers tables.
    property = 'strength'
    llm_labeler = dspy.ChainOfThought(LabelTablesStrength)

    def llm_label(self, paragraph):
        """Return the LLM's ``contains`` verdict for a table paragraph.

        Paragraphs for which ``is_table()`` is false are rejected without
        calling the LLM.
        """
        if paragraph.is_table():
            verdict = self.llm_labeler(table=paragraph.page_content)
            return verdict.contains
        return False

class PhaseLabeler(BaseLabeler):
    # Labeler configuration for the 'phase' property.
    property = 'phase'
    # Case-insensitive keyword match for common phase names.
    # The dotted abbreviations have their dots escaped and the final dot made
    # optional. An unescaped "." matches any character, and a mandatory final
    # dot followed by \b can never match "f.c.c. " before whitespace, because
    # there is no word boundary between "." and a space.
    regex_pattern = re.compile(r'\b(FCC|BCC|HCP|L12|B2|Laves|f\.c\.c\.?|b\.c\.c\.?|h\.c\.p\.?|face-centered cubic|body-centered cubic|hexagonal close-packed|intermetallic|IM)\b', re.I)
    # Query text used for the semantic (embedding) search below.
    query = "Microstructure characterization of alloys (common phases include FCC, BCC, HCP, L12, B2 etc.), usually through technique like XRD or TEM. Describe about phase and grain size and boundaries"

    def semantic_label(self, paragraphs):
        """Return the paragraphs picked by semantic search (top 5 hits)."""
        return semantic_label_property(paragraphs, self.query, k=5)

# dspy signature with one input field (`paragraph`) and one output field
# (`topic`) restricted to three literal values. The docstring is the
# instruction text of the signature, so it is kept verbatim.
class ClassifySyn(dspy.Signature):
    """assign topic to paragraphs of HEAs(high entropy alloys) papers. The topics include synthesis, characterization, and others.
    Note: a qualified synthesis paragraph should include the synthesis and processing of materials, including methods such as melting, casting, rolling, annealing, mechnical processes or additive manufacturing. be very strict about your decision."""
    paragraph: str = dspy.InputField()
    topic: Literal['synthesis', 'characterization', 'others'] = dspy.OutputField()

class ExperimentalLabeler(BaseLabeler):
    # Labeler configuration for the 'synthesis' property: semantic search plus
    # an LLM topic classifier.
    property = 'synthesis'
    query = "Experimental procedures describing the synthesis and processing of HEAs materials, including methods such as melting, casting, rolling, annealing, heat treatment, or other fabrication techniques. Details often include specific temperatures (e.g., °C), durations (e.g., hours, minutes), atmospheric conditions (e.g., argon, vacuum), mechanical deformation (e.g., rolling reduction)."
    llm_labeler = dspy.ChainOfThought(ClassifySyn)

    def semantic_label(self, paragraphs):
        """Return the paragraphs picked by semantic search (top 3 hits)."""
        candidates = semantic_label_synthesis(paragraphs, self.query, k=3)
        return candidates

    def llm_label(self, paragraph):
        """Return True when the LLM classifies the paragraph's topic as 'synthesis'."""
        prediction = self.llm_labeler(paragraph=paragraph.page_content)
        return prediction.topic == 'synthesis'

# Register all four labelers on a single Labeling stage.
labeler = Labeling()
labeler.add_labeler(StrengthLabeler())
labeler.add_labeler(StrengthTableLabler())
labeler.add_labeler(PhaseLabeler())
labeler.add_labeler(ExperimentalLabeler())

# Articles are read from the 'heas_1531' article database.
database = get_plain_articledb('heas_1531')
loader = Filter(database)


# NOTE(review): presumably stores the labeled paragraphs under 'heas_labeled' —
# confirm in sisyphus.chain.label.
save_labeled_paras = save_labeled_paras_wrapper('heas_labeled')

# Chain elements are composed with `+`: load -> label -> save.
chain = loader + labeler + save_labeled_paras

# Run the chain for a single example article.
chain.compose('10.1002&sol;adem.201600726.html')

In [12]:
# models, instructions
from ast import literal_eval
import re

from pydantic import BaseModel, Field, ConfigDict, field_validator
from typing import List, Optional, Literal
import json


# Schema for one phase observation.
# NOTE(review): the Field descriptions are presumably sent to the LLM as schema
# text — treat them as prompt wording.
class Phase(BaseModel):
    phases: List[str] = Field(description='list of phases present in the material')
    test_env: Optional[str] = Field(description='other test parameters if avilable (not test method), be succinct, e.g., under plastic deformation')
    # Unknown keys are rejected (extra="forbid").
    model_config = ConfigDict(extra="forbid")

# Schema for one tensile/compressive test result. All measurements are kept as
# strings (value plus unit); only `test_type` is a non-optional literal.
class Strength(BaseModel):
    ys: Optional[str] = Field(description="Yield strength with unit")
    uts: Optional[str] = Field(description="Ultimate tensile/compressive strength with unit")
    strain: Optional[str] = Field(description="Fracture strain. If in percentage form, please add '%' sign, else return as decimal form. Example: 0.5 or 50%")
    temperature: Optional[str] = Field(description="Test temperature with unit, if not specified, return 'room temperature'")
    strain_rate: Optional[str] = Field(description="Strain rate with unit, e.g., 1e-3 s^-1")
    test_env: Optional[str] = Field(description="Other tensile/compressive test environments if available, be succinct. e.g., sample geometry, salt environment")
    test_type: Literal['tensile', 'compressive']
    # Unknown keys are rejected (extra="forbid").
    model_config = ConfigDict(extra="forbid")

# Schema for record-level metadata; currently only the nominal composition
# string, whose expected format is spelled out in the Field description.
class MetaData(BaseModel):
    composition: str = Field(description='nominal material composition with basis marker, e.g., "AlCoCrFeNi2.5@at", "AlCoCrFeNi2.1@wt", "AlCoCrFeNi@at+Al2O3@wt[5%]"')
    # Unknown keys are rejected (extra="forbid").
    model_config = ConfigDict(extra="forbid")

class Synthesis(BaseModel):
    """Synthesis information for a HEAs material.
    Note:
    - Use article synthesis section as only source of information.
    - Do not contain process related to test (e.g., tensile/compression test temperature) or characterization (e.g., XRD, SEM).
    """
    steps: str = Field(
        description=(
            "List of processing steps in chronological order as JSON objects. "
            "Example: ["
            '{"induction melting": {"power": "50 kW", "coil frequency": "10 kHz", "atmosphere": "argon", '
            '"pressure": "1 atm", "crucible material": "graphite", "liquid mixing time": "5 min", "number of remelts": "2"}}, '
            '{"annealing": {"temperature": "800 K", "duration": "1 h", "atmosphere": "argon"}}'
            "]. Return [] if no processing info is found; use empty strings for unknown values."
        )
    )
    # Forbid additional properties so JSON Schema has additionalProperties=false
    model_config = ConfigDict(extra="forbid")

    @field_validator('steps', mode='after')
    @classmethod
    def load(cls, value):
        """Best-effort decoding of the raw ``steps`` string into Python objects.

        The string is stripped of ASCII control characters, then parsed as JSON;
        if that fails it is parsed as a Python literal. When neither parser
        accepts it, the cleaned string itself is returned, so this validator
        never raises for unparsable text.

        Note that on success the field holds a parsed object (e.g. a list)
        rather than the declared ``str``.
        """
        # One pass removes every ASCII control character, NUL included.
        cleaned = re.sub(r'[\x00-\x1F\x7F]', '', value)

        try:
            return json.loads(cleaned)
        except (ValueError, RecursionError):
            # json.JSONDecodeError subclasses ValueError. Not valid JSON, so
            # fall through and try Python literal syntax (e.g. single quotes).
            pass

        try:
            # literal_eval only evaluates literals, so it is safe on LLM output.
            return literal_eval(cleaned)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Deliberate best effort: keep the cleaned text when unparsable.
            pass

        return cleaned

In [10]:
from pydantic import create_model


def create_result_model_dynamic(properties: List[Literal['strength', 'phase', 'grain_size']], has_synthesis: bool):
    """Build the ``Records`` result model for the properties found in a paragraph.

    The inner ``Record`` model always has a ``metadata`` field. ``strength``
    and ``phase`` list fields are added when the corresponding name is in
    *properties*, and a ``synthesis`` field when *has_synthesis* is true.
    The returned ``Records`` model wraps a list of ``Record``.
    """
    record_fields = {'metadata': (MetaData, ...)}
    if 'strength' in properties:
        strength_field = Field(..., description='Create multiple strength items in list if multiple testing conditions (temperature, strain rate, environment) are reported and every condition has correspond strength value. Each entry should correspond to a unique set of testing conditions. Only using range if no other information is available.')
        record_fields['strength'] = (List[Strength], strength_field)
    if 'phase' in properties:
        record_fields['phase'] = (List[Phase], Field(..., description='phase information'))
    if has_synthesis:
        record_fields['synthesis'] = (Synthesis, Field(..., description='synthesis information'))

    record_model = create_model('Record', __base__=BaseModel, **record_fields)
    # ensure dynamic models forbid extra properties so the LM JSON schema includes additionalProperties=false
    # NOTE(review): the config is assigned after the model class already exists;
    # confirm against the pydantic docs that validation (not only the generated
    # JSON schema) honours it.
    record_model.model_config = ConfigDict(extra="forbid")

    records_model = create_model(
        'Records',
        __base__=BaseModel,
        records=(List[record_model], Field(..., description='list of extracted records')),
        __doc__="List of extracted records, encourage to split into multiple records if processing parameters varied",
    )
    records_model.model_config = ConfigDict(extra="forbid")
    return records_model

from langchain_core.prompts.chat import ChatPromptTemplate

# Two-message chat prompt shared by the HEA extractor. The user message exposes
# two template variables: `{text}` (wrapped in START/END OF PAPER markers) and
# `{instruction}`.
prompt_template = ChatPromptTemplate(
    [
        ('system', 'you are a helpful assistant that extracts specific information from scientific papers.'),
        ('user', '[START OF PAPER]\n{text}\n[END OF PAPER]\n\nInstruction:\n{instruction}')
    ]
)
# Instruction text for phase extraction; create_instruction() adds it when
# 'phase' is among the paragraph's properties.
# The section heading reads "Phase types:" (it was previously truncated to
# "hase types:").
phase_instruction = """Extract phase information from the text
Phase types:
FCC, BCC, HCP, B2, intermetallic compounds (e.g., TiNi, Ti₂Ni, γ' precipitates, silicides, aluminides, sigma (σ) phases), carbides (e.g., WC), oxides (e.g., SiO₂), amorphous phases.
The dendrites themselves are made OF a phase (could be FCC, BCC, etc.), but "dendritic" itself is not a phase.
Other Similar Terms to Exclude from Phase:
Dendritic, equiaxed, columnar (grain morphology)
Lamellar, eutectic (phase arrangement)
Fine-grained, coarse-grained (grain size)

Guideline for phase extraction:
- If the author mentions ordered/disordered, include it in the phase information.
- Same phase can be present multiple times, e.g., "FCC, FCC" extract as "FCC, FCC".
- Main phase should be listed first, followed by secondary phases, and so on."""

# Instruction text for strength extraction; create_instruction() adds it when
# 'strength' is among the paragraph's properties. This is prompt text sent to
# the model, so its wording is kept exactly as is.
strength_instruction = """Extract mechanical property relevant to ys, uts and strain from the text
Follow these rules:
- If tested under different setting (temperature, strain rate, etc), extract separately
- Only collect tensile/compressive strength and strain data, exclude all other mechanical properties.
- Strain should refer to fracture strain only.
- Prioritize table values over text if there is a conflict. 
- If the value provided is a range, for example, "from 200 MPa to 300 MPa", extract it as "200-300 MPa".
- If the value is given as "greater than" or "less than", for example, "greater than 400 MPa", extract it as ">400 MPa".
- If the value is given as "approximately" or "around", for example, "approximately 250 MPa", extract it as "~250 MPa".
- Otherwise, extract the value as it is."""

def create_instruction(properties):
    """Assemble the extraction instruction for the given property names.

    The phase instruction comes first and the strength instruction second,
    separated by a blank line. Returns an empty string when neither 'phase'
    nor 'strength' is requested.
    """
    wants_phase = 'phase' in properties
    wants_strength = 'strength' in properties
    if wants_phase and wants_strength:
        return phase_instruction + '\n\n' + strength_instruction
    if wants_phase:
        return phase_instruction
    if wants_strength:
        return strength_instruction
    return ''

In [14]:
from langchain_openai import ChatOpenAI
from sisyphus.chain.extract import BaseExtractor, Extraction
from sisyphus.chain.paragraph import Paragraph
import warnings

warnings.filterwarnings('ignore', category=UserWarning, module='pydantic') # avoid the pydantic warning that is triggered when we convert a JSON string into a Python object

class Extractor(BaseExtractor):
    # Extractor for the merged HEA workflow: one LLM call covers strength,
    # phase and synthesis, using GPT-4.1 at temperature 0.
    target_properties = ['strength', 'phase', 'synthesis']
    model = ChatOpenAI(model_name='gpt-4.1', temperature=0)

    def create_model_prompt(self, paragraphs):
        """Attach a per-paragraph result model and instruction.

        Both the pydantic model and the instruction text are derived from the
        paragraph's own ``property_types`` (plus whether it carries the
        'synthesis' property), so the schema only contains what was labeled.
        """
        for merged in paragraphs:
            includes_synthesis = merged.has_property('synthesis')
            result_model = create_result_model_dynamic(merged.property_types, includes_synthesis)
            merged.set_pydantic_model(result_model)
            merged.set_prompt(
                prompt_template,
                {'instruction': create_instruction(merged.property_types)},
            )


def load_from_labeled_db(docs):
    """Convert labeled documents into ``Paragraph`` objects.

    Each document is passed to ``Paragraph.from_labeled_document`` together
    with its zero-based position in *docs*, which is used as the id argument.
    """
    loaded = []
    for position, labeled_doc in enumerate(docs):
        loaded.append(Paragraph.from_labeled_document(labeled_doc, position))
    return loaded

# Build the extraction stage and register the HEA extractor on it.
extractor = Extraction()
hea_extractor = Extractor()
extractor.add_extractor(hea_extractor)

from sisyphus.chain import Writer, Filter
from sisyphus.utils.helper_functions import get_create_resultdb, get_plain_articledb


# Input: the 'heas_labeled' database; output: the 'heas_results' result database.
db = get_plain_articledb('heas_labeled')
loader = Filter(db)
result_db = get_create_resultdb('heas_results')
writer = Writer(result_db)

# load -> convert documents to Paragraph objects -> extract -> write
chain = loader + load_from_labeled_db + extractor + writer
chain.compose('10.1002&sol;adem.201600726.html') # example file name
    

