In [1]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from tqdm import tqdm
import dspy
import re
from sisyphus.utils.helper_functions import get_plain_articledb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Embedding model handed to the Chroma collection below as its embedding function.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
# Chroma collection that receives the embedded documents. No persist_directory is
# passed here — NOTE(review): presumably the collection then lives only in this
# kernel's memory; confirm against langchain_chroma before relying on it.
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
)
# Article database loaded through the project helper; later cells call
# `article_db.get(article)` to obtain the documents of one article.
article_db = get_plain_articledb('300_heas')

def embed_one(article):
    """Embed every document of one article into the shared vector store.

    The article is looked up in ``article_db`` and one
    ``vector_store.add_documents`` call per document is submitted to a
    20-thread pool. Each call is then awaited in submission order, so the
    first failing call re-raises its exception to the caller.

    Args:
        article: Key passed to ``article_db.get``.
    """
    article_docs = article_db.get(article)
    with ThreadPoolExecutor(max_workers=20) as pool:
        pending = []
        for single_doc in article_docs:
            pending.append(pool.submit(vector_store.add_documents, [single_doc]))
        # .result() blocks until the call finishes and propagates worker exceptions.
        for task in pending:
            task.result()

# Number of hits requested from the vector store per similarity search (used as `k=K` in retrieve).
K = 3
# Query texts for three target topics: synthesis/processing, mechanical properties,
# and phase characterization. They are not referenced elsewhere in the visible
# cells — presumably they are passed as the `query` argument of retrieve /
# get_target_para; verify against the calling cells.
QUERY_SYN = """Experimental procedures describing the synthesis and processing of materials, including methods such as melting, casting, rolling, annealing, heat treatment, or other fabrication techniques. Details often include specific temperatures (e.g., °C), durations (e.g., hours, minutes), atmospheric conditions (e.g., argon, vacuum), mechanical deformation (e.g., rolling reduction), and microstructural characterization steps. Mentions of material compositions, purity levels, and equipment used are common indicators."""
QUERY_MECHANICAL = """Mechanical properties of high entropy alloys, stress-strain curves, yield strength, ultimate tensile strength, tensile strain, elongation, alloy composition, alloying effects on strength, ductility, engineering stress-strain behavior."""
QUERY_PHASE = """Phase characterization of high entropy alloys, microstructure analysis, crystal structures, phase transitions, XRD patterns, lattice parameters, grain morphology, recrystallization, secondary phases, alloying effects on phases, defect structures, and phase stability."""

def match_subtitles(docs, pattern: re.Pattern) -> list:
    """Return the distinct section titles of ``docs`` that match ``pattern``.

    Args:
        docs: Iterable of documents; each must expose ``metadata["sub_titles"]``
            holding a hashable title (a ``KeyError`` is raised otherwise).
        pattern: Compiled regular expression, applied with ``pattern.search``.

    Returns:
        The matching titles, de-duplicated, in the order they first appear in
        ``docs``. An empty list when nothing matches or ``docs`` is empty.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; de-duplicating
    # through set() yields an order that changes between interpreter runs.
    distinct_titles = dict.fromkeys(doc.metadata["sub_titles"] for doc in docs)
    return [title for title in distinct_titles if pattern.search(title)]

# Case-insensitive section-title matchers — presumably passed as `pattern` to
# match_subtitles / get_target_para; verify against the calling cells.
syn_pattern = re.compile(r'(experiment)|(preparation)|(method)', flags=re.IGNORECASE)
res_pattern = re.compile(r'result', flags=re.IGNORECASE)
test_pattern = re.compile(r'strain\srate', flags=re.IGNORECASE)
def retrieve(vector_store, article, query, sub_titles, k=None):
    """Run a similarity search restricted to one article.

    Args:
        vector_store: Object exposing ``similarity_search(query, k=..., filter=...)``.
        article: Value the ``source`` metadata field must equal.
        query: Text to search for.
        sub_titles: Section titles to restrict the search to. ``None`` or an
            empty collection means "no section restriction".
        k: Number of hits to request. Defaults to the module-level ``K``
            (looked up at call time) when omitted.

    Returns:
        Whatever ``vector_store.similarity_search`` returns for the built filter.
    """
    source_filter = {"source": article}
    if sub_titles:
        filter_ = {"$and": [
            {"sub_titles": {"$in": sub_titles}},
            source_filter,
        ]}
    else:
        # An empty ``$in`` list is not a usable filter, so fall back to
        # searching the whole article exactly as for ``None``.
        filter_ = source_filter
    return vector_store.similarity_search(
        query,
        k=K if k is None else k,
        filter=filter_,
    )


In [None]:
import os

# os.listdir returns entries in arbitrary order, so sort before slicing to make
# the 10-article sample identical on every run and machine.
articles = sorted(os.listdir('articles_processed'))[:10]
total = len(articles)
with ThreadPoolExecutor(max_workers=10) as worker:
    # Draining the map iterator waits for every embed_one call and re-raises the
    # first exception one of them raised; tqdm only reports progress.
    for res in tqdm(worker.map(embed_one, articles), total=total):
        pass

  0%|          | 0/10 [01:26<?, ?it/s]


In [6]:
from typing import Literal

# Language model handle registered with dspy via dspy.configure below.
# cache=False is passed to dspy.LM — NOTE(review): presumably this disables
# response caching so every call reaches the API; confirm in the dspy docs.
lm = dspy.LM('openai/gpt-4o-mini', cache=False)
dspy.configure(lm=lm)

def get_target_para(article, query, pattern, classifier, class_):
    """Retrieve candidate paragraphs of an article and keep those of one topic.

    Steps: collect the article's section titles matching ``pattern``, run a
    similarity search restricted to those sections (or to the whole article
    when none match), classify every hit concurrently, and keep the hits whose
    predicted ``topic`` equals ``class_``.

    Args:
        article: Key for ``article_db.get`` and value of the ``source`` filter.
        query: Text used for the similarity search.
        pattern: Compiled regex selecting section titles.
        classifier: Callable invoked as ``classifier(paragraph=<text>)`` whose
            result exposes a ``topic`` attribute.
        class_: Topic label to keep.

    Returns:
        List of retained documents, in the order the similarity search
        returned them.

    Raises:
        Any exception raised by a classifier call is re-raised here.
    """
    docs = article_db.get(article)
    # An empty match list becomes None so retrieve() searches the whole article.
    sub_titles = match_subtitles(docs, pattern) or None
    candidates = retrieve(vector_store, article, query, sub_titles)
    if not candidates:
        return []
    with ThreadPoolExecutor(5) as worker:
        futures = [
            worker.submit(classifier, paragraph=candidate.page_content)
            for candidate in candidates
        ]
        # Read results in submission order rather than completion order: the
        # calls still run concurrently, but the output keeps the retrieval
        # ranking and no longer depends on thread timing.
        return [
            candidate
            for candidate, future in zip(candidates, futures)
            if future.result().topic == class_
        ]

# dspy signature: classify one paragraph as 'synthesis', 'characterization' or 'others'.
# NOTE(review): the class docstring is presumably sent to the model as the task
# instructions, so it is runtime text — confirm before editing its wording.
class ClassifySyn(dspy.Signature):
    """assign topic to paragraphs of HEAs(high entropy alloys) papers. The topics include synthesis, characterization, and others.
    Note: a qualified synthesis paragraph should include the synthesis and processing of materials, including methods such as melting, casting, rolling, annealing, heat treatment.be very strict about your decision."""
    # Input field: the paragraph text to classify.
    paragraph: str = dspy.InputField()
    # Output field: constrained to the three literal labels below.
    topic: Literal['synthesis', 'characterization', 'others'] = dspy.OutputField()

# Chain-of-thought module built on ClassifySyn. The variable is spelled
# `classfier_syn` (no second "i"), unlike `classifier_mech` / `classifier_pha`.
classfier_syn = dspy.ChainOfThought(signature=ClassifySyn)

# dspy signature: classify one paragraph by mechanical-test content, separating
# tensile/compressive paragraphs that report a value from those that do not.
# NOTE(review): the class docstring is presumably sent to the model as the task
# instructions, so it is runtime text — confirm before editing its wording.
class ClassifyMech(dspy.Signature):
    """assign topic to paragraphs of HEAs(high entropy alloys) papers. The topics include tensile/compressive with value, tensile/compressive without value, characterization or others.
    Note: a qualified tensile/compressive with value paragraph should explicitly mention at least one value related to yield strength, ultimate yield strength, elongation, or compressive strain, otherwise it should be classified as tensile/compressive without value."""
    # Input field: the paragraph text to classify.
    paragraph: str = dspy.InputField()
    # Output field: constrained to the four literal labels below.
    topic: Literal['tensile/compressive with value', 'tensile/compressive without value', 'characterization', 'others'] = dspy.OutputField()
# Chain-of-thought module built on ClassifyMech.
classifier_mech = dspy.ChainOfThought(signature=ClassifyMech)

# dspy signature: classify one paragraph as phase characterization, other
# characterization, or others.
# NOTE(review): the class docstring is presumably sent to the model as the task
# instructions, so it is runtime text — confirm before editing its wording.
class ClassifyPha(dspy.Signature):
    """assign topic to paragraphs of HEAs(high entropy alloys) papers. The topics include characterization_phase, characterization_others, or others.
    Note: a qualified phase charaterization paragraph should include the descripion of XRD patterns indicating the crystal structures such as FCC, BCC, HCP or other structures."""
    # Input field: the paragraph text to classify.
    paragraph: str = dspy.InputField()
    # Output field: constrained to the three literal labels below.
    topic: Literal['characterization_phase', 'characterization_others', 'others'] = dspy.OutputField()
# Chain-of-thought module built on ClassifyPha.
classifier_pha = dspy.ChainOfThought(signature=ClassifyPha)

In [None]:
# dspy signature: turn an experimental-section paragraph into a plain-text list
# of synthesis steps following the template written in the docstring.
# NOTE(review): the class docstring is presumably sent to the model as the task
# instructions, so it is runtime text — confirm before editing its wording.
class ExtractSteps(dspy.Signature):
    """Extract synthesis steps from a HEAs experimental section. Formated as below:
    Material: [material]
    Synthesis methods
    Fabrication: [fabrication] (methods like induction melting, additive manufacturing etc.)
    Thermo-mechanical processings (if any)
    [processing_1]: [processing_1_parameters] (e.g. 900 °C, 6 h)
    [processing_2]: [processing_2_parameters]
    ...

    Note:
    Only extract the synthesis steps, do not include the characterization steps.
    For those only have melting-casting processings, do not include any thermo-mechanical processings.
    If the author indicates different material results from different processing conditions, include the corresponding material after each condition"""
    # Input field: the paragraph text to extract from.
    paragraph: str = dspy.InputField()
    # Output field: the extracted steps as a single free-form string.
    steps: str = dspy.OutputField()
# Plain dspy.Predict module (no chain-of-thought wrapper) built on ExtractSteps.
steps_extractor = dspy.Predict(signature=ExtractSteps)

# Two hand-written demonstration paragraphs for the steps extractor: the first
# includes thermo-mechanical processing, the second only fabrication.
one_shot_example_1 = """Ingots of Co60Ni40 alloy and Co20Cr40Ni40 MEA were fabricated by vacuum arc-melting of pure metals (purity > 99.9 wt.%) under an inert gas (high-purity argon) atmosphere. After melting, they were cooled in a water-cooled copper mold and flipped and re-melted five times to improve compositional homogeneity. Subsequently, the ingots were cold-rolled to a 30% reduction in thickness and homogenized at 1100 °C for 24 h. Then, the homogenized plate of the Co60Ni40 alloy and the Co20Cr40Ni40 MEA were further cold-rolled to a 92% reduction in thickness and annealed at 750 °C for 120 s and 850 °C for 3.6 ks, respectively. These processes yielded fully-recrystallized microstructures of FCC single phase having similar mean grain sizes of about 3 μm (including annealing twins) in the two alloys."""
one_shot_example_2 = """The master alloy of AlCoCrFeNi2.1 was prepared from commercially pure elements (Al, Co, Ni: 99.8 wt %; Cr, Fe: 99.5-99.5 wt %). The experimented material was received in vacuum arc remelted condition, the chemical composition of which is listed in Table 1 ."""
# Expected `steps` output for each example paragraph above, in the same order.
answer = ["""Material: Co60Ni40 alloy and Co20Cr40Ni40 MEA
Synthesis methods
Fabrication: Vacuum arc-melting
Thermo-mechanical processings
Cold-rolling: 30% reduction in thickness
Homogenization: 1100 °C, 24 h
Cold-rolling: 92% reduction in thickness
Annealing:
- 750 °C, 120 s (Co60Ni40 alloy)
- 850 °C, 3.6 ks (Co20Cr40Ni40 MEA)""",
"Material: AlCoCrFeNi2.1\nSynthesis methods\nFabrication: Vacuum arc remelting"]
# Pair each paragraph with its answer; `paragraph` is marked as the only input field.
examples = [dspy.Example(paragraph=para, steps=s).with_inputs('paragraph') for para, s in zip([one_shot_example_1, one_shot_example_2], answer)]
# Compile steps_extractor with the two labeled examples as its training set.
# NOTE(review): LabeledFewShot presumably attaches them as few-shot demos — confirm in the dspy docs.
compiler = dspy.LabeledFewShot()
two_shot_steps_extractor = compiler.compile(steps_extractor, trainset=examples)

# dspy signature: list the microstructure phases reported for each material in a
# paragraph, following the template written in the docstring.
# NOTE(review): the class docstring is presumably sent to the model as the task
# instructions, so it is runtime text — confirm before editing its wording.
class ExtractPhases(dspy.Signature):
    """Extract HEAs material microstructure phases from text. Formated as below:
    [material]: [phases] (choose from FCC, BCC, HCP, B2, Laves, L12 or others)
    ...(if multiple materials)
    Note: if there are multiple phases, separate them with commas"""
    # Input field: the paragraph text to extract from.
    paragraph: str = dspy.InputField()
    # Output field: the extracted phases as a single free-form string.
    phases: str = dspy.OutputField()
# Plain dspy.Predict module built on ExtractPhases.
phases_extractor = dspy.Predict(signature=ExtractPhases)

In [None]:
from pydantic import BaseModel, Field
from typing import Literal, List
from typing import Optional
# Schema of one extracted alloy record: composition, phase, strength/ductility
# values, processing route and test conditions. The Field descriptions are
# runtime strings that become part of the model's schema, so they are kept as-is.
# NOTE(review): the Optional[...] fields declare no default. Under pydantic v2
# such a field may be None but must still be supplied — confirm that is intended.
class AlloyRecord(BaseModel):
    # Identification of the alloy.
    composition: str = Field(description='The nominal composition of the alloy')
    phase: Optional[str] = Field(description='The phase of the alloy, such as FCC, BCC, HCP etc. If there are multiple phases, separate them with commas')
    # Mechanical property values; the descriptions ask for MPa and percent.
    ys: Optional[float] = Field(description='the value of yield strength, convert to MPa if the unit is not MPa, e.g. 1 GPa -> 1000 MPa')
    uts: Optional[float] = Field(description='the value of ultimate tensile strength, convert to MPa if the unit is not MPa, e.g. 1GPa -> 1000 MPa')
    elongation: Optional[float] = Field(description='the value of elongation, convert to percentage if the unit is not percentage, e.g. 1%')
    # Processing route: fabrication method plus optional "|"-separated post-processing steps.
    fabrication: str = Field(description='The fabrication method of the alloy, e.g. vacuum arc-melting')
    thermal_mechanical_processings: Optional[str]  = Field(description='The sequential post-processing steps of the alloy separated by vertical bar "|", be briefly, eg., annealed at 900 °C for 4 h | homogenized at 1200 °C for 2 h')

    # Test conditions; test_type is declared without Field(), so it carries no description.
    test_type: Literal['tensile', 'compressive']
    test_temperature: Optional[str] = Field(description='The temperature at which the mechanical properties were tested, e.g. 25 °C')

# Container model wrapping a list of AlloyRecord entries; the list itself is
# typed Optional, so `records` may be None.
class Records(BaseModel):
    records: Optional[List[AlloyRecord]] = Field(description='The records of the alloy properties')