## Labeling

In [1]:
import re

from sisyphus.utils.helper_functions import get_plain_articledb, get_create_resultdb
from sisyphus.chain.label import BaseLabeler, Labling, save_labeled_paras_wrapper
from sisyphus.chain.chain_elements import Filter, Writer, run_chains_with_extarction_history_multi_threads


class BandGapLabler(BaseLabeler):
    """Labeler that tags text mentioning a band gap / energy gap.

    ``property`` is the label name and ``regex_pattern`` is the
    case-insensitive pattern declared for it; how ``BaseLabeler`` consumes
    the two attributes is defined outside this file.
    """
    property = 'band_gap'
    # Matches "band gap", "band-gap", "bandgap", "energy gap", "energy-gap",
    # "energygap" and the plural of each. The earlier pattern accepted plural
    # "band gaps" but rejected plural "energy gaps" (the trailing \b cannot
    # match between "gap" and "s"); both stems now share one optional "s".
    # Group 1 still captures the whole matched term.
    regex_pattern = re.compile(r'\b((?:band|energy)[- ]?gaps?)\b', re.I)
    # Optionally define semantic_label and llm_label functions here as well.

# Build the labeling stage and register the band-gap labeler on it.
labeler = Labling()
labeler.add_labeler(BandGapLabler())

# Wrap the 'nlo' article database in a Filter; it is the first stage of the chain.
database = get_plain_articledb('nlo')
loader = Filter(database)


# Last stage of the chain; presumably persists labeled paragraphs under the
# name 'nlo_labeled' -- confirm against save_labeled_paras_wrapper.
save_labeled_paras = save_labeled_paras_wrapper('nlo_labeled')

# Stages are combined with `+`; presumably they run left to right (TODO confirm).
chain = loader + labeler + save_labeled_paras

# NOTE(review): the meaning of the positional arguments is not visible here.
# Presumably 'test_file' selects the input, 10 is the thread count and
# 'nlo_labeled' names the extraction-history store -- verify against the
# function's signature.
run_chains_with_extarction_history_multi_threads(
    chain,
    'test_file',
    10,
    'nlo_labeled'
)


100%|██████████| 1/1 [00:00<00:00, 309.59it/s]


## Extraction

In [2]:
from dotenv import load_dotenv
_ = load_dotenv()

from langchain_openai import ChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import List, Optional, Literal

from sisyphus.chain.paragraph import Paragraph
from sisyphus.chain.extract import BaseExtractor, Extraction

# your models
class Bandgap(BaseModel):
    """Extract bandgap information from scientific papers."""

    # NOTE(review): each field is typed Optional[...] but declares no default.
    # Under pydantic v2 that means "required, may be None" -- confirm this is
    # the intended contract for the model output.

    # Free-text value so that the unit stays attached to the number.
    bandgap: Optional[str] = Field(
        description="Bandgap value with unit, e.g., '1.5 eV'",
    )
    # Restricted to the two literal values 'direct' / 'indirect' (or None).
    bandgap_type: Optional[Literal['direct', 'indirect']] = Field(
        description="Type of bandgap: direct or indirect",
    )
    # Free-text name of the technique reported for the value.
    measurement_method: Optional[str] = Field(
        description="Method used to measure the bandgap, e.g., 'UV-Vis spectroscopy'",
    )

# Container model: a list of Bandgap entries. BGExtractor hands this class to
# paragraph.set_pydantic_model, so it is the top-level shape requested for
# each paragraph.
class Records(BaseModel):
    records: List[Bandgap]

# Your prompt; it must contain a {text} placeholder.
# The {instruction} placeholder is supplied by BGExtractor.create_model_prompt
# through paragraph.set_prompt.
nlo_prompt = ChatPromptTemplate(
    [
        ('system', 'you are a helpful assistant that extracts specific information from scientific papers.'),
        ('user', '[START OF PAPER]\n{text}\n[END OF PAPER]\n\nInstruction:\n{instruction}')
    ]
)

class BGExtractor(BaseExtractor):
    """Extractor configured for the ``band_gap`` property.

    ``model`` is instantiated once, when the class body runs, and is therefore
    shared by every instance. ``target_properties`` presumably selects which
    labeled paragraphs this extractor receives -- confirm in BaseExtractor.
    """
    target_properties = ['band_gap']
    model = ChatOpenAI(model_name='gpt-4.1', temperature=0)

    def create_model_prompt(self, paragraphs):
        """Attach the ``Records`` schema and the extraction prompt to each paragraph.

        Note: the paragraphs received here are merged paragraphs.
        """
        # Adjacent literals concatenate to the exact instruction sent as the
        # {instruction} variable of nlo_prompt.
        instruction = (
            'Extract bandgap information including bandgap value with unit, '
            'bandgap type (direct or indirect), and measurement method from '
            'the given scientific paper paragraph. '
            'Provide the results in a structured format.'
        )
        for para in paragraphs:
            para.set_pydantic_model(Records)
            # Build a new variables dict on every iteration so paragraphs never share one.
            para.set_prompt(nlo_prompt, {'instruction': instruction})

def load_from_labeled_db(docs):
    """Convert labeled documents into a list of ``Paragraph`` objects.

    Each document is passed to ``Paragraph.from_labeled_document`` together
    with its zero-based position in ``docs`` as the second argument.
    """
    paragraphs = []
    for position, document in enumerate(docs):
        paragraphs.append(Paragraph.from_labeled_document(document, position))
    return paragraphs

# Build the extraction stage and register the band-gap extractor on it.
extractor = Extraction()
bg_extractor = BGExtractor()
extractor.add_extractors(bg_extractor)

from sisyphus.chain import Writer, Filter
from sisyphus.utils.helper_functions import get_create_resultdb, get_plain_articledb


# Read from 'nlo_labeled', the same name the labeling cell passed to
# save_labeled_paras_wrapper. `loader` and `chain` below rebind the names
# used by the labeling cell.
db = get_plain_articledb('nlo_labeled')
loader = Filter(db)
# Results go to the 'nlo_results' database through a Writer stage.
result_db = get_create_resultdb('nlo_results')
writer = Writer(result_db)

# The plain function load_from_labeled_db is used as a chain stage between
# the loader and the extractor.
chain = loader + load_from_labeled_db + extractor + writer
# NOTE(review): '&sol;' is presumably the HTML entity for '/' kept in the
# stored file name of the DOI -- confirm it matches the database entry.
chain.compose('10.1002&sol;adfm.201801589.html') # example file name


