In [1]:
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())
# Must precede any llm module imports

from langtrace_python_sdk import langtrace
import os

# Read the Langtrace API key from the environment (the load_dotenv call above can
# populate it from a .env file) instead of embedding the secret in the notebook.
# A missing variable fails fast with a KeyError rather than silently tracing nowhere.
langtrace.init(
    api_key=os.environ['LANGTRACE_API_KEY'],
    disable_tracing_for_functions={
        # Do not record spans for embedding requests.
        "open_ai": [
            'openai.embeddings.create',
        ]
    },
    # Turn off every vendor instrumentation except the openai one.
    disable_instrumentations={"all_except": ['openai']},
)

[32mInitializing Langtrace SDK..[39m
[37m⭐ Leave our github a star to stay on top of our updates - https://github.com/Scale3-Labs/langtrace[39m
Skipping openai due to error while instrumenting: No module named 'openai.resources.responses'
[34mExporting spans to Extraction..[39m
[34mLangtrace Project URL: https://app.langtrace.ai/project/cmf58kthl000f5i55fmxndtpw/traces[39m


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import json
from ast import literal_eval
from typing import Literal, Optional
from functools import partial

import dspy
from pydantic import BaseModel, Field, field_validator
from langchain_openai import ChatOpenAI

from sisyphus.heas.label import label_paras
from sisyphus.chain.paragraph import Paragraph, ParagraphExtend
from sisyphus.chain import Filter, Writer
from sisyphus.strategy.run_strategy import ExtractStrategy
from sisyphus.strategy.pydantic_models_general import Processing, Material, MaterialDescriptionBase
from sisyphus.strategy.utils import get_paras_with_props, get_synthesis_paras
from sisyphus.heas.prompt import *
from sisyphus.utils.helper_functions import get_plain_articledb, get_create_resultdb
from sisyphus.heas.synthesis import get_synthesis_prompt
from sisyphus.strategy.llm_models import categorize_agent


# The same OpenAI model backs both frameworks used in this notebook:
# a LangChain chat model (later passed to ExtractStrategy.build) and a DSPy LM
# that is installed as DSPy's default via dspy.configure.
chat_model = ChatOpenAI(model='gpt-4.1-mini')
lm = dspy.LM('openai/gpt-4.1-mini')
dspy.configure(lm=lm)

class Strength(BaseModel):
    """Tensile/Compressive test results"""
    # Extraction schema for one mechanical test record.
    # NOTE(review): the class docstring and the Field descriptions are presumably
    # forwarded to the LLM as part of the structured-output schema, so treat them
    # as prompt text rather than as ordinary documentation.
    #
    # Every Optional field is declared without a default: under pydantic v2 the key
    # must be present in the input, but its value may be None.
    ys: Optional[str] = Field(description="Yield strength with unit")
    uts: Optional[str] = Field(description="Ultimate tensile/compressive strength with unit")
    strain: Optional[str] = Field(description="Fracture strain. If in percentage form, please add '%' sign, else return as decimal form. Example: 0.5 or 50%")
    temperature: Optional[str] = Field(description="Test temperature with unit, if not specified, return 'room temperature'")
    strain_rate: Optional[str] = Field(description="Strain rate with unit")
    # Required discriminator; only the two listed literals are accepted.
    # It now carries a description like its sibling fields.
    test_type: Literal['tensile', 'compressive'] = Field(description="Type of mechanical test the reported values come from")


class Phase(BaseModel):
    """Phase information"""
    # Extraction schema for the phase target (registered under 'phase' in models_d).
    # Required field: a list of strings, one entry per reported phase.
    phases: list[str] = Field(description="List of phases present in the material")

class Processing(BaseModel):
    """Processing route for a material
    Return field steps as '[]' if you cannot find any. For fields with unknown value, filled with empty string"""
    # NOTE: this definition shadows the `Processing` imported from
    # sisyphus.strategy.pydantic_models_general earlier in this cell; models_d
    # below therefore refers to this class.
    # `steps` is declared as str because the model is asked to emit a serialized
    # list; the validator below replaces the string with the parsed object.
    steps: str = Field(description="""List of processing steps in chronological order, form them as json object. For example: [{"induction melting": {"temperature": "1500 K"}}, {"annealed": {"temperature": "800 K", "duration": "1h"}}]""")

    @field_validator('steps', mode='after')
    @classmethod
    def load(cls, value: str):
        """Parse the serialized ``steps`` string into Python objects.

        The string is first decoded as JSON. If that fails it is parsed as a
        Python literal, which tolerates e.g. single-quoted strings.

        Args:
            value: The already str-validated content of ``steps``.

        Returns:
            The decoded object (a list of step dicts when the model followed
            the requested format).

        Raises:
            ValueError: If the text is neither valid JSON nor a Python literal.
                Raising ValueError lets pydantic report the problem as a
                validation error instead of leaking a raw SyntaxError.
        """
        try:
            return json.loads(value)
        except json.JSONDecodeError:
            # Not strict JSON; fall through to the more lenient literal parser.
            pass
        try:
            return literal_eval(value)
        except (ValueError, SyntaxError, TypeError) as exc:
            raise ValueError(
                f"steps is neither valid JSON nor a Python literal: {value!r}"
            ) from exc

# Prompt pairs per extraction mode and per target.
# NOTE(review): each tuple is presumably (system prompt, task prompt); confirm
# against ExtractStrategy.build.
# The two modes share the strength and phase prompts and differ only in the
# second element of the 'synthesis' pair.
prompt_config = {
    mode: {
        'strength': (EXTRACT_PROPERTY_SYS_GENERIC_PROMPT, STRENGTH_PROMPT),
        'phase': (EXTRACT_PROPERTY_SYS_GENERIC_PROMPT, PHASE_PROMPT),
        'synthesis': (EXTRACT_PROCESS_SYS_GENERIC_PROMPT, synthesis_prompt),
    }
    for mode, synthesis_prompt in (
        ('contextualized', PROCESS_PROMPT),
        ('isolated', PROCESS_ISOLATED_PROMPT),
    )
}

def reconstr_c(paragraphs):
    """Group paragraphs per extraction target, with synthesis context included.

    Every bundle starts with the paragraphs selected by ``get_synthesis_paras``
    and then appends the target-specific ones:

    * ``"strength"``: paragraphs with the 'strength' or 'strain_rate' property
    * ``"phase"``: paragraphs with the 'phase' property
    * ``"synthesis"``: paragraphs with the 'composition' property

    Args:
        paragraphs: The labelled paragraphs of one document.

    Returns:
        A dict mapping each target name to the object produced by
        ``ParagraphExtend.from_paragraphs`` for that bundle.
    """
    return {
        "strength": ParagraphExtend.from_paragraphs(
            get_synthesis_paras(paragraphs)
            + get_paras_with_props(paragraphs, 'strength')
            + get_paras_with_props(paragraphs, 'strain_rate'),
            type='strength',
        ),
        "phase": ParagraphExtend.from_paragraphs(
            get_synthesis_paras(paragraphs)
            + get_paras_with_props(paragraphs, 'phase'),
            type='phase',
        ),
        "synthesis": ParagraphExtend.from_paragraphs(
            get_synthesis_paras(paragraphs)
            + get_paras_with_props(paragraphs, 'composition'),
            type='synthesis',
        ),
    }

def reconstr_i(paragraphs):
    """Group paragraphs per extraction target without shared synthesis context.

    Each bundle holds only the paragraphs relevant to its own target:

    * ``"strength"``: paragraphs with the 'strength' or 'strain_rate' property
    * ``"phase"``: paragraphs with the 'phase' property
    * ``"synthesis"``: the paragraphs selected by ``get_synthesis_paras``

    Args:
        paragraphs: The labelled paragraphs of one document.

    Returns:
        A dict mapping each target name to the object produced by
        ``ParagraphExtend.from_paragraphs`` for that bundle.
    """
    return {
        "strength": ParagraphExtend.from_paragraphs(
            get_paras_with_props(paragraphs, 'strength')
            + get_paras_with_props(paragraphs, 'strain_rate'),
            type='strength',
        ),
        "phase": ParagraphExtend.from_paragraphs(
            get_paras_with_props(paragraphs, 'phase'),
            type='phase',
        ),
        "synthesis": ParagraphExtend.from_paragraphs(
            get_synthesis_paras(paragraphs),
            type='synthesis',
        ),
    }

# Output schema for each extraction target; the keys match those used in
# prompt_config and in the dicts returned by reconstr_c / reconstr_i.
# `Processing` here is the class defined in this notebook, which shadows the
# one imported from sisyphus.strategy.pydantic_models_general.
models_d = dict(strength=Strength, phase=Phase, synthesis=Processing)
 


In [3]:
from typing import Literal
from sisyphus.utils.helper_functions import get_title_abs, render_docs

class ClassifyPaper(dspy.Signature):
    """assign label to HEAs (high entropy alloys) paper based on their title and abstract."""
    # DSPy signature used to screen papers before extraction.
    # NOTE(review): the docstring above and the `desc` strings below are presumably
    # sent to the LLM as instructions, so edits to them change model behaviour.

    # Input: rendered title + abstract text (built with render_docs in paper_filter).
    context: str = dspy.InputField(desc='Title and abstract of the paper')
    # Output: one of three fixed categories; paper_filter keeps only 'hea_experimental'.
    label: Literal['hea_experimental', 'hea_theoretical', 'irrelevant'] = dspy.OutputField(desc="Pay attention to keywords such as 'molecular dynamics' or 'machine learning,' which should be labeled as hea_theoretical. Label keywords related to fabrication processes as hea_experimental.")
    # Output: boolean flag; paper_filter additionally requires it to be truthy.
    mechanical_relevancy: bool = dspy.OutputField(desc='whether this paper describe the mechanical properties such as tensile or compressive')
# Chain-of-thought predictor built from the signature; invoked by paper_filter.
classifier_paper = dspy.ChainOfThought(signature=ClassifyPaper)

def paper_filter(docs):
    """Pass a document through only if it is an experimental, mechanical HEA paper.

    The title and abstract are extracted from ``docs``, rendered into one
    context string and classified with ``classifier_paper``.

    Args:
        docs: The document object(s) for one paper, as accepted by
            ``get_title_abs``.

    Returns:
        ``docs`` unchanged when the predicted label is 'hea_experimental' and
        ``mechanical_relevancy`` is truthy; otherwise ``None``.
    """
    title, abstract = get_title_abs(docs)
    # NOTE(review): arguments are passed as (abstract, title); confirm this
    # matches the parameter order of render_docs.
    verdict = classifier_paper(context=render_docs(abstract, title))
    keep = verdict.label == 'hea_experimental' and verdict.mechanical_relevancy
    if not keep:
        return None
    return docs

In [4]:
# Assemble the extraction strategy from the pieces defined in the earlier cells.
es = ExtractStrategy(
    reconstruct_paragraph_context_func=reconstr_c,  # bundles that include synthesis paragraphs
    reconstruct_paragraph_isolate_func=reconstr_i,  # bundles restricted to each target's own paragraphs
    formatted_func=get_synthesis_prompt,
    categorize_agent=categorize_agent,
    pydantic_models_dict=models_d,  # target name -> output schema
    save_to='op.jsonl'  # presumably the file the strategy writes its results to; verify in ExtractStrategy
)
# Bind the prompt pairs and the LangChain chat model to the strategy.
es.build(prompt_config=prompt_config, chat_model=chat_model)
# Source side of the pipeline: article database 'heas_1531' wrapped in a Filter.
db = get_plain_articledb('heas_1531')
getter = Filter(db)
# Sink side of the pipeline: result database 'context_isolated' wrapped in a Writer.
# NOTE(review): the helper name suggests the database is created when missing; confirm.
result_db = get_create_resultdb('context_isolated')
writer = Writer(result_db)



In [5]:
from sisyphus.chain.chain_elements import run_chains_with_extarction_history_multi_threads
# Compose the pipeline stages with `+`, in execution order:
# fetch -> paper-level filter -> paragraph labelling -> extraction -> write results.
chain = getter + paper_filter + label_paras + es + writer
# NOTE(review): the meaning of the positional arguments ('heas_test', 5,
# 'context_isolated', 5) is not visible in this notebook; confirm them against the
# function signature. Also note that 'heas_test' differs from the 'heas_1531'
# database opened in the previous cell; verify this is intended.
# NOTE(review): the last recorded run stopped after 2 of 5 items with
# "ValueError: result must be a dict or a pydantic model"; the cause is not
# visible in this notebook.
run_chains_with_extarction_history_multi_threads(chain, 'heas_test', 5, 'context_isolated', 5)
# Single-document run, kept disabled:
# chain.compose('10.1002&sol;adem.201900587.html')

 40%|████      | 2/5 [00:33<00:50, 16.99s/it]


ValueError: result must be a dict or a pydantic model