In [1]:
import logging

import dspy
from langchain.globals import set_verbose, set_debug

from sisyphus.heas.label import label_paras, label_only_syn_paras
from sisyphus.heas.extract_lc import extract, extract_full_docs
from sisyphus.utils.helper_functions import get_plain_articledb, get_create_resultdb
from sisyphus.chain.chain_elements import Filter, Writer, run_chains_with_extarction_history_multi_threads

# LangChain global tracing switches (imported above); left disabled. Uncomment to debug.
# set_debug(True)
# set_verbose(True)
# Build the dspy language-model handle and register it as dspy's configured LM.
# `lm` is also reused further down as `synthesis_extract_model` for `extract_mini`.
lm = dspy.LM('openai/gpt-4o-mini', max_tokens=3000)
dspy.configure(lm=lm)
# Notebook-level logger: messages are written to heas.log as bare text at DEBUG level.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
ft = logging.Formatter('%(message)s')
# logging.getLogger returns the same logger object on every call, so re-running this
# cell must reuse an already-attached FileHandler instead of adding another one;
# otherwise every message is written to heas.log once per re-run of the cell.
fh = next((h for h in logger.handlers if isinstance(h, logging.FileHandler)), None)
if fh is None:
    fh = logging.FileHandler('heas.log')
    logger.addHandler(fh)
fh.setLevel(logging.DEBUG)
fh.setFormatter(ft)

In [2]:
# filter out paper first
from typing import Literal
from sisyphus.utils.helper_functions import get_title_abs, render_docs

# dspy signature for the paper-level pre-filter: one text input (`context`) and two
# typed outputs (`label`, `mechanical_relevancy`).
# NOTE(review): dspy presumably feeds the class docstring and each field's `desc` into the
# LM prompt, so treat those strings as runtime text rather than documentation — confirm
# against the dspy docs before rewording any of them.
class ClassifyPaper(dspy.Signature):
    """assign label to HEAs (high entropy alloys) paper based on their title and abstract."""
    # Input: paper_filter passes render_docs(abstract, title) here.
    context: str = dspy.InputField(desc='Title and abstract of the paper')
    # Output: constrained to one of the three literal labels.
    label: Literal['hea_experimental', 'hea_theoretical', 'irrelevant'] = dspy.OutputField(desc="Pay attention to keywords such as 'molecular dynamics' or 'machine learning,' which should be labeled as hea_theoretical. Label keywords related to fabrication processes as hea_experimental.")
    # Output: boolean flag; paper_filter keeps a paper only when this is truthy
    # and label == 'hea_experimental'.
    mechanical_relevancy: bool = dspy.OutputField(desc='whether this paper describe the mechanical properties such as tensile or compressive')
# Chain-of-thought predictor built from the signature above; called by paper_filter.
classifier_paper = dspy.ChainOfThought(signature=ClassifyPaper)

def paper_filter(docs):
    """Pass `docs` through only for experimental HEA papers with mechanical data.

    The title and abstract are taken from `docs` with `get_title_abs`, rendered
    with `render_docs`, and classified by `classifier_paper`.

    Returns:
        `docs` unchanged when the predicted label is 'hea_experimental' and
        `mechanical_relevancy` is truthy. Otherwise the `source` metadata of the
        first document is logged and None is returned.
    """
    title, abstract = get_title_abs(docs)
    prediction = classifier_paper(context=render_docs(abstract, title))
    keep_paper = prediction.label == 'hea_experimental' and prediction.mechanical_relevancy
    if keep_paper:
        return docs
    # Rejected: record which paper was dropped, then fall through to None.
    logger.info('irrelevant paper: %s', docs[0].metadata['source'])
    return None

In [None]:
from functools import partial
# Source article DB, its Filter stage, and the result DB for the "traditional" test run.
db = get_plain_articledb('heas_1531')
getter = Filter(db)
result_db = get_create_resultdb('heas_test_traditional')
# `writer` was used below without being defined in this cell, so the cell only worked
# when another cell had already left a `writer` (bound to a different result DB) in the kernel.
writer = Writer(result_db)

# Full pipeline: fetch -> paper-level filter -> paragraph labelling -> extraction -> write.
chain = getter + paper_filter + label_paras + extract + writer
# Same stages without the final writer stage.
chain_t = getter + paper_filter + label_paras + extract

#### FULL TEXT

In [4]:
# from functools import partial
# db = get_plain_articledb('heas_1531')
# getter = Filter(db)
# result_db = get_create_resultdb('heas_comp_full')
# writer = Writer(result_db)

# chain = getter + paper_filter + label_only_syn_paras + extract_full_docs + writer

#### PARTIAL TEXT

In [None]:
from functools import partial
# Source article DB and the result DB used by the partial-text runs.
db = get_plain_articledb('heas_1531')
getter = Filter(db)
result_db = get_create_resultdb('heas_comp_select_mini')
writer = Writer(result_db)

# change model to gpt-4o-mini for faster processing
from langchain_openai import ChatOpenAI
mini_chat_model = ChatOpenAI(model='gpt-4o-mini', temperature=0.0, max_tokens=3000)
# `extract` with both of its model arguments pre-bound: the dspy LM handle for
# synthesis extraction and the ChatOpenAI model for property extraction.
extract_mini = partial(
    extract,
    synthesis_extract_model=lm,
    extraction_model=mini_chat_model,
)

# Default pipeline, using `extract` as imported.
chain = getter + paper_filter + label_paras + extract + writer
# Variant that labels with label_only_syn_paras and extracts with the pre-bound extract_mini.
chain_mini = getter + paper_filter + label_only_syn_paras + extract_mini + writer



In [6]:
# Run `chain` through the multi-threaded runner; the recorded output shows 1 item processed.
# NOTE(review): the meaning of the positional args ('heas_test', 5, 'partial') is defined in
# sisyphus.chain.chain_elements and not visible here — presumably a run/batch name, a thread
# count and an extraction mode; confirm. `chain` is whichever setup cell was executed last.
run_chains_with_extarction_history_multi_threads(chain, 'heas_test', 5, 'partial')

100%|██████████| 1/1 [00:23<00:00, 23.57s/it]


In [7]:
# Run `chain` through the multi-threaded runner; the recorded output shows 10 items processed.
# NOTE(review): the positional args ('test_another_10', 5, 'partial') are presumably a
# run/batch name, a thread count and an extraction mode — confirm against the runner's signature.
run_chains_with_extarction_history_multi_threads(chain, 'test_another_10', 5, 'partial')

100%|██████████| 10/10 [00:39<00:00,  3.96s/it]


In [8]:
# Run `chain` through the multi-threaded runner; the recorded output shows 20 items processed.
# NOTE(review): the positional args ('test_another_20', 5, 'partial') are presumably a
# run/batch name, a thread count and an extraction mode — confirm against the runner's signature.
run_chains_with_extarction_history_multi_threads(chain, 'test_another_20', 5, 'partial')

100%|██████████| 20/20 [00:55<00:00,  2.76s/it]


In [7]:
# Show the `info` attribute of the first result (a list of StrengthRecord objects in the recorded output).
# NOTE(review): `results` is not assigned in any cell visible in this notebook (the runner calls
# above do not keep their return value), so this would raise NameError under Restart & Run All
# unless it is defined elsewhere — restore the cell that creates it.
results[0].info

[StrengthRecord(composition='V10Cr15Mn5Fe35Co10Ni25', composition_type='atomic', phase=['FCC'], ys=430.0, uts=720.0, strain=48.1, processes=['induction melting: atmosphere: vacuum', 'homogenized: temperature: 1100 °C, duration: 6 h, atmosphere: Ar', 'quenching: medium: water', 'cold rolled: reduction: 79%, temperature: room temperature', 'annealed: temperature: 900 °C, duration: 10 min, atmosphere: air'], test_type='tensile', test_temperature='25 °C'),
 StrengthRecord(composition='V10Cr15Mn5Fe35Co10Ni25', composition_type='atomic', phase=['FCC'], ys=230.0, uts=532.0, strain=57.6, processes=['induction melting: atmosphere: vacuum', 'homogenized: temperature: 1100 °C, duration: 6 h, atmosphere: Ar', 'quenching: medium: water', 'cold rolled: reduction: 79%, temperature: room temperature', 'annealed: temperature: 1100 °C, duration: 60 min, atmosphere: air'], test_type='tensile', test_temperature='25 °C'),
 StrengthRecord(composition='V10Cr15Mn5Fe35Co10Ni25', composition_type='atomic', phas

In [13]:
# Print the text (`doc.page_content`) attached to the first entry of `results_full`.
# NOTE(review): `results_full` is not assigned in any cell visible in this notebook, so this
# would raise NameError under Restart & Run All unless it is defined elsewhere.
print(results_full[0].doc.page_content)

Effect of Initial Grain Size on Deformation Mechanism during High‐Pressure Torsion in V10Cr15Mn5Fe35Co10Ni25 High‐Entropy Alloy

Abstract
The transition of the deformation mechanism from the dislocation slip-mediated mechanism to the twin-mediated mechanism with increasing grain size is a well-observed phenomenon in materials with low stacking fault energy during compression/tensile tests. To understand this effect further at large strains, a V10Cr15Mn5Fe35Co10Ni25 (at%) high-entropy alloy with two initial average grain sizes is processed by high-pressure torsion (HPT) at different numbers of turns. The results indicate that initial grain size plays a significant role in the deformation mechanism during the HPT process. The fine-grained (FG) sample exhibits only a tangled dislocation structure, whereas mechanical twins are observed along with the formation of dislocations in the coarse-grained (CG) sample after the one-fourth turn. High dislocation density is observed in the CG sample 

In [4]:
# Run `chain` through the multi-threaded runner; the recorded output shows 2 items in about 5 minutes.
# NOTE(review): this call passes two extra positional args (300, None) compared with the other
# runner calls in this notebook; their meaning is not visible here — confirm against the
# runner's signature in sisyphus.chain.chain_elements.
run_chains_with_extarction_history_multi_threads(chain, 'articles_processed', 5, 'vintage', 300, None)

100%|██████████| 2/2 [05:07<00:00, 153.67s/it]


In [8]:
import json

# Load before opening the output file: open(..., 'w') truncates immediately, so loading
# inside the `with` block would leave an empty heas_test_res.json if load_as_json raised.
# NOTE(review): `result_db` is whichever result DB the most recently executed setup cell
# created, and the meaning of the four load_as_json arguments is not visible here — confirm.
heas_results = result_db.load_as_json('4o', 'too long', 'heas_1531', True)
with open('heas_test_res.json', 'w', encoding='utf8') as f:
    # ensure_ascii=False keeps non-ASCII characters (e.g. '°') readable in the file.
    json.dump(heas_results, f, ensure_ascii=False, indent=2)

In [3]:
paper = 'The excellent ductility is mainly attributed to the formation of twinning during plastic deformation, as shown in Fig. 6b and e. As reported by Laplanche et al. [42], the critical resolved shear stress (CRSS) for twinning in CoCrNi was 260 ± 30 MPa, which was further verified by the modified Peierls-Nabarro (P-N) twinning nucleation model [55]. Furthermore, the true tensile stress for twinning onset was estimated to be ~790 MPa for the recrystallized CoCrNi [42]. Regardless of the effect of grain size, the twinning onset true strain for LAAMed CoCrNi in the present study is estimated to be ~7.6 % at a true stress of 790 MPa in the true stress-strain curve (Fig. 4b), which is lower than that for recrystallized one ( ~ 9.7 % [42]). Hence, twinning is able to be activated earlier in the LAAMed CoCrNi compared with the recrystallized one. Along with the formation of twinning, the deformation process is accompanied by the interactions between the heterogeneous structures mentioned earlier and twinning boundaries. For example, the interactions between dislocations (in slip bands) and twinning boundaries is clearly revealed in Fig. 6e. Moreover, the twinning boundaries penetrating across the cellular boundaries (Fig. 6e, g, h) are expected to subdivide the cellular structures, further promoting the interactions between the initial dislocations with twinning and cellular boundaries. In this case, steady WHR (Fig. 4b) is guaranteed by the dynamic Hall-Petch effect as that in TWIP steels [51]. Hence, non-uniform deformation (necking) is postponed with better ductility obtained. In addition, some other deformation mechanisms, such as the regional deformation induced FCC to HCP transformation [56], also can play important roles in the excellent mechanical behavior, which deserves further investigation for the LAAMed CoCrNi MEA in future work.'
instruction = """Extract all HEAs' tensile and compressive properties along with their synthesis methods from the text.  
- **Composition Format:** (e.g., `Hf0.5Mo0.5NbTiZrC0.3`).  
- **Handling Unknown Compositions:** If a nominal composition is missing (e.g., due to doping), use a **descriptive name** (e.g., `W-Co0.5Cr0.3FeMnNi`).  
- **Acronyms Prohibited:** Do **not** use labels like `HEA-1` or `Sample A`.  
---
### **General Guidelines:**  
- **Extract data for all materials** with reported mechanical properties, even if they are:  
  - Mentioned in **comparisons** with other samples.  
  - Referenced from a **previous study** but include numerical properties in the current text.  
  - Not the main focus of the paragraph.
- If multiple materials are described under different conditions, **each must be recorded separately** with its corresponding processing conditions.  

### **Mechanical Properties to Extract:**  
- **Tensile properties:** yield strength, ultimate tensile strength, elongation
- **Compressive properties:** Compressive yield strength, ultimate compressive strength, compressive strain
- **Synthesis method** If present, extract the synthesis information from text

### **Properties to Explicitly EXCLUDE:**  
- **Shear strength/stress** (do NOT misinterpret this as yield strength)
- **Critical resolved shear stress (CRSS)**
- **Fracture strength**
- **Hardness** (e.g., Vickers, Brinell, Rockwell)
- **Fatigue strength**
- ** Young's modulus**

### **Extraction Rules:**  
- If properties are reported as "mean ± standard deviation," use the **mean value** (e.g., **700 ± 30 → 700**).  
- If properties are reported as a range, use the **lower bound** (e.g., **500–600 → 500**).  
- **Prioritize table values** over text if there is a conflict. 
- **If a material is mentioned in comparison to another material, still extract its properties.**  
- **If a material from a previous study is mentioned with numerical values, extract it as well.**

### **Ensuring Comprehensive Extraction:**  
- **Materials with mechanical properties mentioned in comparisons must be extracted.**  
- **Materials referenced from past studies must be extracted if numerical values are given.**
- **Materials which are not the main focus of the paper should also be extracted if they have mechanical properties.**
"""

from sisyphus.heas.extract_lc import model, template, Records
# Ad-hoc single-paragraph check: pipe the prompt template into the model with output
# constrained to the `Records` schema.
# NOTE(review): this rebinds `chain`, the same name the pipeline cells pass to
# run_chains_with_extarction_history_multi_threads; re-running those cells after this one
# would hand the runner this prompt|model chain instead of the getter/filter/extract pipeline.
chain = template | model.with_structured_output(Records, method='json_schema')
# Invoke once on the sample paragraph and keep only the `.records` attribute of the result.
records = chain.invoke({'paper': paper, 'instruction': instruction}).records

In [4]:
# Display the records extracted from the sample paragraph.
# NOTE(review): the recorded output has uts=790.0 and strain=7.6 for CoCrNi. In the paragraph
# those figures are the twinning-onset true stress (~790 MPa) and true strain (~7.6 %), not an
# ultimate tensile strength or elongation — this looks like a mis-extraction worth checking.
records

[StrengthRecord(composition='CoCrNi', composition_type='atomic', phase=None, ys=None, uts=790.0, strain=7.6, processes=None, test_type='tensile', test_temperature='25 °C')]