In [None]:
import langfun as lf
from IPython import display
from pathlib import Path
import base64
import os

In [None]:
from dotenv import load_dotenv
# Pull environment variables from a local .env file so the API key does not
# have to be written into the notebook.
load_dotenv()  # reads .env in project root directory

# os.getenv returns None when the variable is unset; nothing is validated here.
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")

# Load Helper functions

In [2]:
#Replace some utility functions to enable pdf encoding in textgrad

def is_pdf(data):
    """
    Tell whether a byte payload begins with the PDF magic marker ``%PDF-``.

    :param data: bytes object holding the raw file content
    :return: True when the payload starts with the PDF marker, False otherwise
    """
    return data.startswith(b'%PDF-')

from textgrad.engine_experimental.engine_utils import *
import textgrad.engine_experimental.engine_utils
from typing import List, Union

def new_get_image_type_from_bytes(data):
    """
    Identify the file type of a raw byte payload from its leading signature.

    Checks jpeg first, then png, then pdf, so that PDF payloads are recognised
    in addition to the two image formats.

    :param data: bytes object holding the raw file content
    :return: one of "jpeg", "png" or "pdf"
    :raises ValueError: if the payload matches none of the three signatures
    """
    if is_jpeg(data):
        return "jpeg"
    if is_png(data):
        return "png"
    if is_pdf(data):
        return "pdf"
    # pdf is accepted above, so the message lists it alongside jpeg and png.
    raise ValueError("Image type not supported, only jpeg, png and pdf supported.")

def format_content_anthropic(content: List[Union[str, bytes]]) -> List[dict]:
    """Format a mixed list of strings and bytes as Anthropic message content blocks.

    Each ``str`` becomes a ``text`` block. Each ``bytes`` item is base64-encoded
    and becomes a ``document`` block when it is a PDF, or an ``image`` block
    otherwise (the type is decided by ``new_get_image_type_from_bytes``).

    :param content: items to send, in order; each is either ``str`` or ``bytes``
    :return: list of content-block dictionaries in the same order as ``content``
    :raises ValueError: if an item is neither ``str`` nor ``bytes``, or if a
        ``bytes`` item has an unsupported file type
    """
    formatted_content = []
    for item in content:
        if isinstance(item, bytes):
            # Bytes are either an image (jpeg/png) or a PDF document.
            file_type = new_get_image_type_from_bytes(item)
            encoded = base64.b64encode(item).decode('utf-8')
            if file_type == 'pdf':
                formatted_content.append({
                    "type": "document",
                    "source": {
                        "type": "base64",
                        "media_type": "application/pdf",
                        "data": encoded,
                    },
                })
            else:
                formatted_content.append({
                    "type": "image",
                    "source": {
                        "type": "base64",
                        # The API expects a full MIME type ("image/png"),
                        # not the bare subtype ("png").
                        "media_type": f"image/{file_type}",
                        "data": encoded,
                    },
                })
        elif isinstance(item, str):
            formatted_content.append({
                "type": "text",
                "text": item
            })
        else:
            raise ValueError(f"Unsupported input type: {type(item)}")
    return formatted_content

# textgrad.engine_experimental.engine_utils.open_ai_like_formatting = _format_content_new

In [3]:
#Custom loss function for PDF files
from textgrad.engine import EngineLM, get_engine
from textgrad.variable import Variable
from typing import List, Union
from textgrad.autograd import LLMCall, FormattedLLMCall, OrderedFieldsMultimodalLLMCall
from textgrad.autograd import Module
from textgrad.config import SingletonBackwardEngine

class PdfQAEvaluation(Module):
    """LLM-judged loss comparing an extraction answer with a ground truth.

    The judge call is built from five ordered fields: the evaluation
    instruction, the question, the document (passed under the "Image" field),
    the answer and the ground truth.
    """

    def __init__(self,
                 evaluation_instruction: str,
                 engine: Union[EngineLM, str] = None,
                 system_prompt: Variable = None):
        """
        :param evaluation_instruction: text telling the judge how to grade
        :param engine: engine instance or engine name; when None, the engine
            held by SingletonBackwardEngine is used
        :param system_prompt: optional system prompt Variable; a default
            evaluation prompt is created when it is not supplied
        :raises Exception: if no engine is passed and no backward engine is set
        """
        super().__init__()
        self.evaluation_instruction = Variable(
            evaluation_instruction,
            role_description="evaluation instruction",
            requires_grad=False,
        )

        # Fall back to the shared backward engine when none was passed in.
        if engine is None:
            engine = SingletonBackwardEngine().get_engine()
            if engine is None:
                raise Exception("No engine provided. Either provide an engine as the argument to this call, or use `textgrad.set_backward_engine(engine)` to set the backward engine.")
        # An engine given by name is resolved to an engine object.
        if isinstance(engine, str):
            engine = get_engine(engine)
        self.engine = engine

        if not system_prompt:
            system_prompt = Variable(
                "You are an evaluation system that evaluates the correctness of knowledge extraction tasks from scientific papers.",
                requires_grad=False,
                role_description="system prompt for the evaluation",
            )
        self.system_prompt = system_prompt

        field_order = ["Evaluation Instruction", "Question", "Image", "Answer", "Ground Truth"]
        self.multimodal_llm_call = OrderedFieldsMultimodalLLMCall(
            engine=self.engine,
            system_prompt=self.system_prompt,
            fields=field_order,
        )

    def forward(self, image: Variable, question: Variable, response: Variable, groundtruth: Variable) -> Variable:
        """Run the judge on one sample and return its evaluation Variable.

        :param image: Variable passed under the "Image" field
        :param question: Variable passed under the "Question" field
        :param response: Variable passed under the "Answer" field
        :param groundtruth: Variable passed under the "Ground Truth" field
        :return: the Variable produced by the multimodal LLM call
        """
        role = f"evaluation of the {response.get_role_description()}"
        field_values = {
            "Evaluation Instruction": self.evaluation_instruction,
            "Question": question,
            "Image": image,
            "Answer": response,
            "Ground Truth": groundtruth,
        }
        return self.multimodal_llm_call(inputs=field_values,
                                        response_role_description=role)





# Prompt Optimization

 **Get labeled data from files and wrap it into HEA instances as defined in langfun**

In [None]:
#Read labeled csv/excel files
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
df = pd.read_csv("./papers/annotation/small_train.csv") #Replace with your actual file path

In [6]:
df

Unnamed: 0,Paperid,HEAid,HEAname,nominal_compositions,measured_compositions,Phase,Processing_Method,RT_Lattice_Constant_A,Ref
0,1,Al1.0Co1.0Fe1.0Ni1.0,AlCoFeNi,Al1.0Co1.0Fe1.0Ni1.0,Al28.42Co23.92Fe24.38Ni23.28,BCC,As-Cast,2.8824,10.1016/j.jmmm.2014.07.023
1,1,Al0.25Co1.0Fe1.0Ni1.0,Al0.25CoFeNi,Al0.25Co1.0Fe1.0Ni1.0,Al9.63Co30.73Fe30.45Ni29.19,FCC,As-Cast,3.6004,10.1016/j.jmmm.2014.07.023
2,1,Co1.0Fe1.0Ni1.0,CoFeNi,Co1.0Fe1.0Ni1.0,Co31.810Fe35.510Ni32.680,FCC,As-Cast,3.599,10.1016/j.jmmm.2014.07.023
3,1,Co1.0Fe1.0Ni1.0Si0.25,CoFeNiSi0.25,Co1.0Fe1.0Ni1.0Si0.25,Co30.52Fe30.89Ni30.27Si8.31,FCC,As-Cast,3.5815,10.1016/j.jmmm.2014.07.023
4,2,Al1.0Co1.0Cr1.0Fe1.0Mo0.1Ni1.0,AlCoCrFeMo0.1Ni,Al1.0Co1.0Cr1.0Fe1.0Mo0.1Ni1.0,Al19.8Co19.4Cr21.2Fe19.2Mo2.0Ni18.5,BCC,As-Cast,2.886,10.1016/j.msea.2010.07.028
5,2,Al1.0Co1.0Cr1.0Fe1.0Ni1.0,AlCoCrFeNi,Al1.0Co1.0Cr1.0Fe1.0Ni1.0,Al16.7Co21.1Cr21.2Fe21.9Ni19.1,BCC,As-Cast,2.878,10.1016/j.msea.2010.07.028
6,3,Al1.0Co1.0Cr1.0Fe1.0Ni1.0,AlCoCrFeNi,Al1.0Co1.0Cr1.0Fe1.0Ni1.0,,BCC,As-Cast,2.875,10.1016/j.msea.2011.10.110
7,3,Al1.0Co1.0Cr1.0Fe1.0Nb0.1Ni1.0,AlCoCrFeNb0.1Ni,Al1.0Co1.0Cr1.0Fe1.0Nb0.1Ni1.0,,BCC,As-Cast,2.887,10.1016/j.msea.2011.10.110
8,4,Al1.0Co1.0Cr1.0Fe1.0Ni1.0,AlCoCrFeNi,Al1.0Co1.0Cr1.0Fe1.0Ni1.0,Al16.7Co21.1Cr21.2Fe21.9Ni19.1,BCC,As-Cast,2.878,10.1016/j.msea.2010.07.049
9,4,Al1.0Co1.0Cr1.0Fe1.0Ni1.0Si0.2,AlCoCrFeNiSi0.2,Al1.0Co1.0Cr1.0Fe1.0Ni1.0Si0.2,Al17.6Co20.1Cr20.2Fe19.9Ni18.5Si3.7,BCC,As-Cast,2.875,10.1016/j.msea.2010.07.049


In [7]:
set(df['Paperid'])

{1, 2, 3, 4, 5, 6, 10}

In [None]:
import json

# Define the JSON structure, Replace with your actual data structure
def row_to_json(row):
    """Convert one annotation row into the record layout used as ground truth.

    :param row: mapping-like object indexed by the annotation column names
    :return: dict with the keys name, nominal_composition,
        measured_composition, lattice_constant, phases, alloy_condition, doi
    """
    # (output key, source column) pairs, in the order the keys should appear.
    column_for_field = (
        ("name", "HEAname"),
        ("nominal_composition", "nominal_compositions"),
        ("measured_composition", "measured_compositions"),
        ("lattice_constant", "RT_Lattice_Constant_A"),
        ("phases", "Phase"),
        ("alloy_condition", "Processing_Method"),
        ("doi", "Ref"),
    )
    return {field: row[column] for field, column in column_for_field}


In [None]:
# Build the training set: one (pdf_path, [alloy records]) tuple per paper.
training_set = []
df = df.replace({np.nan: None})  # missing cells become None instead of NaN
grouped = df.groupby('Paperid')
# Relative path next to the annotation CSV, so the notebook is not tied to one
# machine's home directory.
paper_path = "./papers/annotation/"  # Replace with your actual file path
# Iterating the groupby directly visits every paper id exactly once, in sorted
# order, so no per-id lookup can fail. Errors raised by row_to_json (for
# example a missing column) now surface instead of being printed and skipped.
for paper_id, paper_rows in grouped:
    training_path = paper_path + str(paper_id) + '.pdf'
    hea_list = paper_rows.apply(row_to_json, axis=1).to_list()
    training_set.append((training_path, hea_list))

In [10]:
training_set

[('./papers/annotation/1.pdf',
  [{'name': 'AlCoFeNi',
    'nominal_composition': 'Al1.0Co1.0Fe1.0Ni1.0',
    'measured_composition': 'Al28.42Co23.92Fe24.38Ni23.28',
    'lattice_constant': 2.8824,
    'phases': 'BCC',
    'alloy_condition': 'As-Cast',
    'doi': '10.1016/j.jmmm.2014.07.023'},
   {'name': 'Al0.25CoFeNi',
    'nominal_composition': 'Al0.25Co1.0Fe1.0Ni1.0',
    'measured_composition': 'Al9.63Co30.73Fe30.45Ni29.19',
    'lattice_constant': 3.6004,
    'phases': 'FCC',
    'alloy_condition': 'As-Cast',
    'doi': '10.1016/j.jmmm.2014.07.023'},
   {'name': 'CoFeNi',
    'nominal_composition': 'Co1.0Fe1.0Ni1.0',
    'measured_composition': 'Co31.810Fe35.510Ni32.680',
    'lattice_constant': 3.599,
    'phases': 'FCC',
    'alloy_condition': 'As-Cast',
    'doi': '10.1016/j.jmmm.2014.07.023'},
   {'name': 'CoFeNiSi0.25',
    'nominal_composition': 'Co1.0Fe1.0Ni1.0Si0.25',
    'measured_composition': 'Co30.52Fe30.89Ni30.27Si8.31',
    'lattice_constant': 3.5815,
    'phases': 

In [None]:
# Initial extraction prompt that the optimizer starts from. The schema example
# must itself be valid JSON (a comma was missing after the "name" entry).
start_prompt_json="""
Read the PDF file and extract required parameters.
You will ONLY report parameters of single phase high entropy alloy. 
Lattice constant should be in the unit of angstrom, if lattice constant does not present, only report *None*.
The output MUST be in json format, follow the schema example:
{
  "name": "Al0.25NbTiMoV",
  "nominal_composition": "Al0.25Nb1.0Ti1.0Mo1.0V1.0",
  "measured_composition": "Al5.4Ti22.0V22.1Nb25.5Mo25.0",
  "lattice_constant": 3.206,
  "phases": "BCC",
  "alloy_condition": "As-cast",
  "doi": "10.3390/e16020870"
}
"""

Load LLM engines
Note: If you see errors complaining that the LLM is not multimodal, first check whether the LLM actually supports PDFs. If you are sure the LLM accepts PDFs but the error persists, you need to go into textgrad's source code and manually add your LLM to the multimodal LLM list.  

In [None]:
from textgrad.engine_experimental.litellm import LiteLLMEngine
import textgrad as tg

# llm_engine = LiteLLMEngine("gemini/gemini-1.5-flash", cache=False,is_multimodal=True)
# llm_backward_engine = LiteLLMEngine("gemini/gemini-1.5-flash", cache=False,is_multimodal=True)
# tg.set_backward_engine(llm_backward_engine, override=True)

# The same Claude model is used for the forward engine and the backward engine.
llm_engine = get_engine("claude-3-5-sonnet-20240620")
llm_backward_engine = get_engine("claude-3-5-sonnet-20240620")

# Replace each engine's content formatter with the PDF-aware version defined
# earlier in this notebook, so bytes that are PDFs are sent as document blocks.
llm_engine._format_content = format_content_anthropic
llm_backward_engine._format_content = format_content_anthropic

# Register the backward engine with textgrad.
# NOTE(review): override=True presumably replaces an engine set earlier in the
# session, which would make this cell safe to re-run - confirm.
tg.set_backward_engine(llm_backward_engine, override=True)

In [15]:
from textgrad.tasks import DataLoader
from tqdm import tqdm
# Serve the (pdf_path, records) pairs in batches of 3, with shuffling requested.
train_loader = DataLoader(training_set, batch_size=3, shuffle=True)

In [22]:
evaluation_instruction = """Below is the output from a data extraction task for a given PDF file, the PDF data, 
and the result from human of this task.
You need to evaluate the output and the result from human. 
ONLY compare presented entries, do not perform task yourself. 
Is the output matches with human results? i.e., Does the reported parameter match with the PDF data and the human?
i.e., does the number of material matches with PDF and human annotation? Does the lattice constant matches with the PDF and human annotation?
Say match if it does, otherwise say it doesn't match
Be super consice.
"""
# Non-trainable Variable wrapping the instruction text.
# NOTE(review): eval_instruction is not referenced again in this cell; eval_fn
# below receives the raw string instead.
eval_instruction = tg.Variable(evaluation_instruction, requires_grad=False, role_description="evaluation instruction for the task")
# Loss function: an LLM judge built on the forward engine.
eval_fn = PdfQAEvaluation(evaluation_instruction=evaluation_instruction,engine=llm_engine)

# question = tg.Variable(start_prompt,requires_grad=True,role_description="question input for the PDF file")
# The extraction prompt is the only Variable created with requires_grad=True,
# i.e. it is the text the optimizer is asked to update.
question = tg.Variable(start_prompt_json,requires_grad=True,role_description="question input for the PDF file")
# NOTE(review): the training loop has its `model(x)` call commented out, so
# `model` appears to be unused there.
model = tg.BlackboxLLM(llm_engine, question)
# Textual gradient descent over the prompt, driven by the backward engine.
optimizer = tg.TGD(parameters=[question],engine=llm_backward_engine)

In [23]:
# Optimize the extraction prompt for 3 passes over the training loader. After
# every batch the updated prompt is printed together with its TF-IDF cosine
# similarity to the prompt from before the step.
for epoch in range(3):
    for steps, (batch_x, batch_y) in enumerate((pbar := tqdm(train_loader, position=0))):
        pbar.set_description(f"Training step {steps}. Epoch {epoch}")
        optimizer.zero_grad()
        losses = []
        # Snapshot of the prompt before this step, compared with the updated one below.
        current_question = question.value
        for (x, y) in zip(batch_x, batch_y):
            # presumably x is a PDF path and y the list of annotated records
            # from training_set - verify against the DataLoader batching.
            pdf_to_read_lf = lf.PDF(x)
            pdf_bytes = pdf_to_read_lf.to_bytes()
            # From here on x and y are rebound to textgrad Variables.
            x = tg.Variable(pdf_bytes, requires_grad=False, role_description="query to the language model")
            y = tg.Variable(str(y), requires_grad=False, role_description="The ground truth answer")
            # The extraction is run through langfun using the current prompt
            # text, with the PDF appended via the {{PDF}} template slot.
            response = lf.query(prompt=question.value + '{{PDF}}', PDF=pdf_to_read_lf,schema=None, lm=lf.llms.Claude35Sonnet())
            # response = model(x)
            # print(response)
            # The response is wrapped as a plain, non-trainable Variable; the
            # prompt Variable itself is handed to the judge as `question`.
            response_tg_var = tg.Variable(str(response),requires_grad=False, role_description="Result of a data extraction task, which returns as a list of python class called HEA")
            eval_output_variable = eval_fn(question=question, image = x, response=response_tg_var, groundtruth=y)
            losses.append(eval_output_variable)
        # Combine the per-sample evaluations, backpropagate, and update the prompt.
        total_loss = tg.sum(losses)
        total_loss.backward()
        optimizer.step()
        new_question = question.value
        print(new_question)
        # Rough measure of how much the prompt text changed in this step.
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform([current_question,new_question])
        cos_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
        print(f"TF-IDF Cosine Similarity: {cos_sim[0][0]:.4f}")
        #if steps == 3:
         #   break

Training step 1. Epoch 0: : 1it [01:25, 85.58s/it]

Read the PDF file and extract the following parameters for each single-phase high-entropy alloy: name, nominal_composition, measured_composition (if available), lattice_constant (in angstroms, or None if absent), phases, alloy_condition, and doi (if available). Extract the parameters primarily from the text of the PDF. Figures and tables may be used to support the extraction, but the primary source of information should be the text. For multi-phase alloys, report that the alloy is multi-phase and list the constituent phases, but do not include them in the main output. The output should be in JSON format for each single-phase alloy identified.
TF-IDF Cosine Similarity: 0.4733


Training step 2. Epoch 0: : 2it [03:11, 97.69s/it]

Read the PDF file and extract the following parameters for each high-entropy alloy: name, nominal_composition, measured_composition (if available), lattice_constant (in angstroms, or None if absent), phases, alloy_condition, and doi (if available). Extract the parameters from both the text of the PDF and supporting figures and tables. If conflicting information exists, clearly indicate the source and explain the discrepancy.

Consider an alloy single-phase if a single phase constitutes at least 95% of its composition as reported in the paper. For multi-phase alloys, list all phases with their relative abundance and include them in the output with a flag indicating multi-phase status.

Represent nominal and measured compositions as molar ratios (e.g., Al1.0Co1.0Cr1.0Fe1.0Ni1.0). If a parameter cannot be determined with confidence, indicate 'Not Available' and state the reason (e.g., 'information not provided in the text or figures').

The output should be in the following JSON format fo

Training step 2. Epoch 0: : 3it [04:02, 80.78s/it]


Read the PDF file and extract the following parameters for each high-entropy alloy mentioned in the paper:

1. name
2. nominal_composition (in atomic percentages)
3. measured_composition (in atomic percentages, or "Not Available" if absent)
4. lattice_constant (in angstroms, rounded to 3 decimal places, or "Not Available" if absent)
5. phases (list all identified phases)
6. phase_abundances (for each phase, report percentage if available and >5%, otherwise "Not Available")
7. alloy_condition
8. doi (if available, otherwise "Not Available")
9. is_single_phase (true if one phase constitutes ≥95% of composition, false otherwise)

Extract parameters from both text and supporting figures/tables. If conflicting information exists between text and figures/tables, prioritize text data and note the discrepancy.

Report all numerical values to 2 decimal places unless otherwise specified. If a parameter cannot be determined with confidence, use "Not Available" and briefly explain why.

The output

Training step 1. Epoch 1: : 1it [02:34, 154.55s/it]

Read the PDF file and extract the following parameters for each high-entropy alloy mentioned in the paper, using the specified formats:

1. name (string, e.g., "AlNbTiMoV")
2. nominal_composition (string, molar ratios, e.g., "Al1.0Nb1.0Ti1.0Mo1.0V1.0")
3. measured_composition (string, atomic percentages or "Not Available" if absent, e.g., "Al: 20.00%, Nb: 20.00%, Ti: 20.00%, Mo: 20.00%, V: 20.00%")
4. lattice_constant (float, in angstroms, rounded to 3 decimal places, or "Not Available" if absent)
5. phases (list of strings, e.g., ["BCC"])
6. alloy_condition (string, e.g., "As-Cast")
7. doi (string, or "Not Available" if absent)

Extract parameters from both text and supporting figures/tables. If conflicting information exists between text and figures/tables, prioritize text data and note the discrepancy. If a parameter cannot be determined with certainty, use "Not Available". Do not attempt to infer or estimate missing values.

Report all numerical values to 3 decimal places unless ot

Training step 2. Epoch 1: : 2it [04:26, 129.68s/it]

Read the PDF file and extract the following parameters for each high-entropy alloy mentioned in the paper, using the specified formats:

1. name (string, e.g., "AlNbTiMoV")
2. nominal_composition (string, molar ratios, e.g., "Al1.0Nb1.0Ti1.0Mo1.0V1.0")
3. measured_composition (string, atomic percentages without % sign, e.g., "Al20.00Nb20.00Ti20.00Mo20.00V20.00")
4. lattice_constant (float, in angstroms, rounded to 3 decimal places)
5. phases (string, e.g., "BCC")
6. alloy_condition (string, e.g., "As-Cast")
7. doi (string)

Extract parameters from both text and supporting figures/tables. If conflicting information exists between text and figures/tables, prioritize text data and note the discrepancy in a separate "data_conflict" field (string). If a parameter cannot be determined with high confidence (>90% certainty), use "Not Available". Do not attempt to infer or estimate missing values.

Report all numerical values to 3 decimal places unless otherwise specified.

The output should be

Training step 2. Epoch 1: : 3it [05:04, 101.37s/it]


Read the PDF file and extract the following parameters for the AlCoCrFeNi and AlCoCrFeNiMo0.1 alloys only:

1. name (string, e.g., "AlNbTiMoV")
2. nominal_composition (string, molar ratios to one decimal place, e.g., "Al1.0Nb1.0Ti1.0Mo1.0V1.0")
3. measured_composition (string, atomic percentages to one decimal place without % sign, e.g., "Al20.0Nb20.0Ti20.0Mo20.0V20.0")
4. lattice_constant (float, in angstroms, rounded to 3 decimal places)
5. phases (string, e.g., "BCC")
6. alloy_condition (string, e.g., "As-Cast")
7. doi (string)

Extract parameters primarily from the text. Use data from figures/tables only if the text data is incomplete and this incompleteness is explicitly stated. If a parameter cannot be determined with high confidence (>90% certainty), omit it from the output rather than using a placeholder.

The output should be a list of JSON objects, one for each of the two specified alloys, in the following format:

[
    {
        "name": "AlloyName",
        "nominal_composi

Training step 1. Epoch 2: : 1it [01:32, 92.20s/it]

Read the PDF file and extract the following parameters for the AlCoCrFeNbxNi alloy series, focusing specifically on the compositions where x = 0 (AlCoCrFeNi) and x = 0.1 (AlCoCrFeNb0.1Ni):

1. name (string, e.g., "AlCoCrFeNi")
2. nominal_composition (string, molar ratios to one decimal place, e.g., "Al1.0Co1.0Cr1.0Fe1.0Ni1.0")
3. measured_composition (string, atomic percentages to one decimal place without % sign, e.g., "Al20.0Co20.0Cr20.0Fe20.0Ni20.0")
4. lattice_constant (float, in angstroms, rounded to 3 decimal places)
5. phases (string, e.g., "BCC")
6. alloy_condition (string, e.g., "As-Cast")
7. doi (string)

Extract parameters primarily from the text. Use data from figures/tables only if the text data is incomplete and this incompleteness is explicitly stated. If a parameter cannot be determined with high confidence (>90% certainty), omit it from the output rather than using a placeholder.

The output should be a list of JSON objects, one for each of the two specified alloys, in

Training step 2. Epoch 2: : 2it [03:12, 96.82s/it]

Read the PDF file and extract the following parameters for all high-entropy alloys discussed in the results section:

1. name (string, e.g., "AlCoCrFeNi")
2. nominal_composition (string, exactly as written in the paper)
3. measured_composition (string, exactly as written in the paper)
4. lattice_constant (float, in angstroms, rounded to 3 decimal places)
5. phases (string, e.g., "BCC")
6. alloy_condition (string, e.g., "As-Cast")
7. doi (string)

Extract parameters primarily from the text. Use data from tables only if the text data is incomplete. Use figures as a last resort. If a parameter is truly missing from the PDF for a given alloy, explicitly report it as "Not found" rather than omitting it.

For each parameter, include a confidence score (0-100) indicating your certainty in the extracted information. Consider a score of 90 or above as high confidence.

The output should be a list of JSON objects, one for each alloy discussed in the paper, in the following format:

[
    {
     

Training step 2. Epoch 2: : 3it [04:26, 88.72s/it]

Read the PDF file and extract the following parameters for all high-entropy alloys discussed in the results section:

1. name (string, e.g., "AlCoCrFeNi")
2. nominal_composition (string, representing the stoichiometric ratio of each element, e.g., "Al1.0Co1.0Cr1.0Fe1.0Ni1.0". If an element's ratio is not explicitly stated, assume 1.0)
3. measured_composition (string, exactly as written in the paper)
4. lattice_constant (float, in angstroms, rounded to 3 decimal places)
5. phases (string, e.g., "BCC")
6. alloy_condition (string, e.g., "As-Cast")
7. doi (string)

Extract parameters primarily from the text. Use data from tables only if the text data is incomplete. Use figures as a last resort. If a parameter is truly missing from the PDF for a given alloy, explicitly report it as "Not found" rather than omitting it.

For each parameter, include a confidence score (0-100) indicating your certainty in the extracted information. Consider a score of 90 or above as high confidence.

The output




Now you have an optimized prompt. The next step is to copy the prompt into "large_scale_extraction.py".