In [None]:
import langfun as lf
import pyglove as pg
from pathlib import Path
import base64
import os
from IPython import display

In [None]:
from dotenv import load_dotenv
# Populate os.environ from a local .env file so the API key never has to be
# hardcoded in the notebook.
load_dotenv()  # reads .env in project root directory

# os.getenv returns None when the variable is unset, so a missing key is not
# reported here. NOTE(review): this name is not referenced again in the visible
# cells — presumably the engine reads the key from the environment itself; confirm.
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")

# Helper functions

In [2]:
#Replace some utility functions to enable pdf encoding in textgrad

def is_pdf(data):
    """
    Tell whether a byte string begins with the PDF magic header ``%PDF-``.

    Only the leading bytes are inspected; nothing else about the payload is
    validated.

    :param data: bytes object holding the raw file content
    :return: True when the content starts with ``%PDF-``, False otherwise
    """
    return data.startswith(b'%PDF-')

from textgrad.engine_experimental.engine_utils import *
import textgrad.engine_experimental.engine_utils
from typing import List, Union

def new_get_image_type_from_bytes(data):
    """
    Classify raw file bytes as one of the attachment kinds this notebook can send.

    The checks are tried in order: JPEG, PNG, then PDF. ``is_jpeg`` and ``is_png``
    come from the ``textgrad.engine_experimental.engine_utils`` star import;
    ``is_pdf`` is defined in this notebook.

    :param data: bytes object holding the raw file content
    :return: ``"jpeg"``, ``"png"`` or ``"pdf"``
    :raises ValueError: if the content matches none of the supported signatures
    """
    if is_jpeg(data):
        return "jpeg"
    elif is_png(data):
        return "png"
    elif is_pdf(data):
        return "pdf"
    else:
        # The message lists pdf too, since PDFs are accepted by the branch above.
        raise ValueError("File type not supported, only jpeg, png and pdf supported.")

def format_content_anthropic(content: List[Union[str, bytes]]) -> List[dict]:
    """Turn a mixed list of text and raw file bytes into Anthropic message content blocks.

    Each item is mapped independently and the output keeps the input order:

    * ``str``   -> ``{"type": "text", "text": ...}``
    * PDF bytes -> ``{"type": "document", ...}`` with media type ``application/pdf``
    * JPEG/PNG bytes -> ``{"type": "image", ...}`` with media type ``image/jpeg``
      or ``image/png``

    Binary payloads are base64-encoded. The kind of a bytes item is decided by
    ``new_get_image_type_from_bytes``.

    :param content: list of ``str`` and/or ``bytes`` items
    :return: list of content-block dictionaries (empty for empty input)
    :raises ValueError: if an item is neither ``str`` nor ``bytes``, or if
        ``new_get_image_type_from_bytes`` rejects a bytes item
    """
    formatted_content = []
    for item in content:
        if isinstance(item, bytes):
            # Bytes are either an image (jpeg/png) or a PDF document.
            file_type = new_get_image_type_from_bytes(item)
            encoded = base64.b64encode(item).decode('utf-8')
            if file_type == 'pdf':
                formatted_content.append({
                    "type": "document",
                    "source": {
                        "type": "base64",
                        "media_type": "application/pdf",
                        "data": encoded,
                    },
                })
            else:
                formatted_content.append({
                    "type": "image",
                    "source": {
                        "type": "base64",
                        # The API expects a full MIME type such as "image/png";
                        # the bare "png"/"jpeg" returned by the detector is not one.
                        "media_type": f"image/{file_type}",
                        "data": encoded,
                    },
                })
        elif isinstance(item, str):
            formatted_content.append({
                "type": "text",
                "text": item
            })
        else:
            raise ValueError(f"Unsupported input type: {type(item)}")
    return formatted_content

# textgrad.engine_experimental.engine_utils.open_ai_like_formatting = _format_content_new

# Output Evaluation

In [4]:
# Extraction prompt describing the task whose output is being evaluated: it
# lists the seven fields to extract per high-entropy alloy, the source priority
# (text, then tables, then figures), the per-field confidence scores, and the
# expected JSON list layout with two worked examples.
# It is wrapped in a tg.Variable named `question` in a later cell. The literal
# keeps its leading newline and trailing indentation exactly as written, since
# the text is sent to the model verbatim.
prompt = """
Read the PDF file and extract the following parameters for all high-entropy alloys discussed in the results section:

1. name (string, e.g., "AlCoCrFeNi")
2. nominal_composition (string, representing the stoichiometric ratio of each element, e.g., "Al1.0Co1.0Cr1.0Fe1.0Ni1.0". If an element's ratio is not explicitly stated, assume 1.0)
3. measured_composition (string, exactly as written in the paper)
4. lattice_constant (float, in angstroms, rounded to 3 decimal places)
5. phases (string, e.g., "BCC")
6. alloy_condition (string, e.g., "As-Cast")
7. doi (string)

Extract parameters primarily from the text. Use data from tables only if the text data is incomplete. Use figures as a last resort. If a parameter is truly missing from the PDF for a given alloy, explicitly report it as "Not found" rather than omitting it.

For each parameter, include a confidence score (0-100) indicating your certainty in the extracted information. Consider a score of 90 or above as high confidence.

The output should be a list of JSON objects, one for each alloy discussed in the paper, in the following format:

[
    {
        "name": "AlloyName",
        "nominal_composition": "Element11.0Element21.0...",
        "measured_composition": "Composition as written",
        "lattice_constant": X.XXX,
        "phases": "Phase1,Phase2,...",
        "alloy_condition": "condition",
        "doi": "DOI",
        "confidence_scores": {
            "name": XX,
            "nominal_composition": XX,
            "measured_composition": XX,
            "lattice_constant": XX,
            "phases": XX,
            "alloy_condition": XX,
            "doi": XX
        }
    },
    ...
]

Include an alloy in the output only if it is explicitly discussed in the results section. Ensure that the output format and data closely match the provided schema. If information for a specific parameter is not available, use "Not found" and assign a low confidence score.

Example of correct output:
[
    {
        "name": "HfNbTaTiZr",
        "nominal_composition": "Hf1.0Nb1.0Ta1.0Ti1.0Zr1.0",
        "measured_composition": "Hf20.8Nb18.9Ta20.2Ti20.2Zr19.9",
        "lattice_constant": 3.414,
        "phases": "BCC",
        "alloy_condition": "As-Cast",
        "doi": "10.1016/j.jallcom.2014.11.064",
        "confidence_scores": {
            "name": 100,
            "nominal_composition": 90,
            "measured_composition": 95,
            "lattice_constant": 100,
            "phases": 100,
            "alloy_condition": 95,
            "doi": 100
        }
    }
]

Example of output with missing data:
[
    {
        "name": "AlCoCrFeNi",
        "nominal_composition": "Al1.0Co1.0Cr1.0Fe1.0Ni1.0",
        "measured_composition": "Not found",
        "lattice_constant": 3.567,
        "phases": "FCC",
        "alloy_condition": "Not found",
        "doi": "10.1016/j.example.2023.01.001",
        "confidence_scores": {
            "name": 100,
            "nominal_composition": 90,
            "measured_composition": 0,
            "lattice_constant": 95,
            "phases": 100,
            "alloy_condition": 0,
            "doi": 100
        }
    }
]

    """

In [None]:
#Custom loss function for PDF files
from textgrad.engine import EngineLM, get_engine
from textgrad.variable import Variable
from typing import List, Union
from textgrad.autograd import LLMCall, FormattedLLMCall, OrderedFieldsMultimodalLLMCall
from textgrad.autograd import Module
from textgrad.config import SingletonBackwardEngine

class EvaluatePdfOutputLoss(Module):
    """Loss module that asks an LLM to judge an extraction answer against a PDF.

    The module bundles a fixed evaluation instruction and a system prompt, and
    forwards them together with the question, the PDF payload and the answer to
    an ``OrderedFieldsMultimodalLLMCall`` using the field names
    "Evaluation Instruction", "Question", "Image" and "Answer".
    """

    def __init__(self,
                 evaluation_instruction: str,
                 engine: Union[EngineLM, str] = None,
                 system_prompt: Variable = None):
        """Set up the evaluation instruction, the engine and the system prompt.

        :param evaluation_instruction: text telling the evaluator what to check;
            stored as a non-trainable Variable
        :param engine: engine instance or engine name; when omitted, the engine
            registered as the global backward engine is used
        :param system_prompt: optional Variable used as the evaluator's system
            prompt; a default prompt is created when it is not supplied
        :raises Exception: if no engine is given and no backward engine is set
        """
        super().__init__()
        self.evaluation_instruction = Variable(
            evaluation_instruction,
            role_description="evaluation instruction",
            requires_grad=False,
        )

        # Resolve the engine: explicit argument first, global backward engine second.
        if engine is None:
            fallback_engine = SingletonBackwardEngine().get_engine()
            if fallback_engine is None:
                raise Exception("No engine provided. Either provide an engine as the argument to this call, or use `textgrad.set_backward_engine(engine)` to set the backward engine.")
            engine = fallback_engine
        # A string is treated as an engine name and resolved through get_engine.
        self.engine = get_engine(engine) if isinstance(engine, str) else engine

        if not system_prompt:
            system_prompt = Variable(
                "You are an evaluation system that evaluates the correctness of knowledge extraction tasks from scientific papers.",
                requires_grad=False,
                role_description="system prompt for the evaluation",
            )
        self.system_prompt = system_prompt

        self.multimodal_llm_call = OrderedFieldsMultimodalLLMCall(
            engine=self.engine,
            system_prompt=self.system_prompt,
            fields=["Evaluation Instruction", "Question", "Image", "Answer"],
        )

    def forward(self, image: Variable, question: Variable, response: Variable) -> Variable:
        """Run the evaluator on one (PDF, question, answer) triple.

        :param image: Variable carrying the file payload; passed under the
            "Image" field even when the payload is a PDF
        :param question: Variable holding the extraction prompt
        :param response: Variable holding the answer being evaluated
        :return: the Variable produced by the multimodal LLM call, whose role
            description is "evaluation of the <response role description>"
        """
        field_values = {
            "Evaluation Instruction": self.evaluation_instruction,
            "Question": question,
            "Image": image,
            "Answer": response,
        }
        result_role = f"evaluation of the {response.get_role_description()}"
        return self.multimodal_llm_call(inputs=field_values,
                                        response_role_description=result_role)





In [6]:
from textgrad.engine_experimental.litellm import LiteLLMEngine
import textgrad as tg

# llm_engine = LiteLLMEngine("gemini/gemini-1.5-flash", cache=False,is_multimodal=True)
# llm_backward_engine = LiteLLMEngine("gemini/gemini-1.5-flash", cache=False,is_multimodal=True)
# tg.set_backward_engine(llm_backward_engine, override=True)

# Two engines on the same Claude model: one is handed to the evaluator
# (forward pass), the other is registered as textgrad's backward engine and is
# also given to the optimizer in the processing loop.
llm_engine = get_engine("claude-3-5-sonnet-20240620")
llm_backward_engine = get_engine("claude-3-5-sonnet-20240620")

# Replace each engine's content formatter with the PDF-aware version defined
# above. The function is assigned on the instance, so it receives only the
# content list (no `self`), which matches format_content_anthropic's signature.
# NOTE(review): relies on the engine calling `self._format_content(content)` —
# confirm against the installed textgrad version.
llm_engine._format_content = format_content_anthropic
llm_backward_engine._format_content = format_content_anthropic
# tg.set_backward_engine(llm_engine, override=True)
# override=True replaces any backward engine set by an earlier run of this cell.
tg.set_backward_engine(llm_backward_engine, override=True)

In [None]:
# Checklist given to the evaluator LLM: nine yes/no questions covering format,
# material type, composition, lattice constant, phases, alloy condition, DOI
# and completeness. The text is sent to the model verbatim.
evaluation_instruction = """Below is the output from a data extraction task for a given PDF file, and the PDF itself.
You need to evaluate the output according to the provided PDF for each extracted material, be super concise:
Does the output matches the format provided in the question? Answer yes or no.
Are the extract material high-entropy alloys? Answer yes or no.
Are the alloy composition in the output correct and reflect the material? Answer yes or no.
Are the lattice constants value in the output formatted in angstrom? Answer yes or no.
Are the lattice constants in the output truly the lattice constant of the material not others? Answer yes or no.
Are the phase information in the output correct and reflect the material? Answer yes or no.
Are the alloy condition in the output correct and reflect the material? Answer yes or no.
Is the DOI in the output matches with the original paper? Answer yes or no.
Is there any HEA high-entropy alloys missed in the provided ouput? yes or no.
"""
# NOTE(review): eval_instruction is not referenced again in the visible cells;
# EvaluatePdfOutputLoss receives the raw string below and wraps it in its own
# Variable.
eval_instruction = tg.Variable(evaluation_instruction, requires_grad=False, role_description="evaluation instruction for the task")
# Loss function: evaluates an answer against the PDF using the forward engine.
eval_fn = EvaluatePdfOutputLoss(evaluation_instruction=evaluation_instruction,engine=llm_engine)

# question = tg.Variable(start_prompt,requires_grad=True,role_description="question input for the PDF file")
# The extraction prompt is held fixed (requires_grad=False); only the answer is
# optimised in the processing loop.
question = tg.Variable(prompt,requires_grad=False,role_description="question input for the PDF file")

# optimizer = tg.TGD(parameters=[question],engine=llm_backward_engine)

In [15]:
from pathlib import Path
import json
import re

def load_json_and_pdf(json_path: Path):
    """Read a JSON file as raw text and open the PDF that shares its base name.

    The PDF is looked up next to the JSON file, with the suffix swapped to
    ``.pdf``.

    :param json_path: path of the JSON file
    :return: ``(json_text, pdf_doc)`` where ``pdf_doc`` is an ``lf.PDF`` built
        from the PDF path, or ``None`` (after printing a warning) when the PDF
        does not exist
    """
    with open(json_path, encoding="utf-8") as handle:
        raw_json = handle.read()

    companion_pdf = json_path.with_suffix(".pdf")
    if companion_pdf.exists():
        return raw_json, lf.PDF(str(companion_pdf))

    print(f"⚠️ PDF missing for {json_path.name}")
    return None

In [16]:
import re, json
from pathlib import Path
from json.decoder import JSONDecodeError

# ---------- fast, safe scanners ----------
def _scan_balanced(text: str, opener: str, closer: str, start_idx: int = 0):
    """
    Return (start,end) of the first balanced JSON chunk from the first opener found
    at or after start_idx. Handles strings and escapes. Returns None if not found.
    """
    i = text.find(opener, start_idx)
    if i == -1:
        return None
    depth = 0
    in_str = False
    esc = False
    quote = None
    for j in range(i, len(text)):
        ch = text[j]
        if in_str:
            if esc:
                esc = False
            elif ch == "\\":
                esc = True
            elif ch == quote:
                in_str = False
            # stay inside string
            continue
        else:
            if ch in ('"', "'"):
                in_str = True
                quote = ch
            elif ch == opener:
                depth += 1
            elif ch == closer:
                depth -= 1
                if depth == 0:
                    return (i, j + 1)
    return None  # unbalanced

def _find_all_top_level(text: str, opener: str, closer: str, max_items: int = 2):
    """Yield up to max_items balanced chunks (substrings) for the given bracket type."""
    pos = 0
    found = 0
    while found < max_items:
        rng = _scan_balanced(text, opener, closer, start_idx=pos)
        if rng is None:
            break
        s, e = rng
        yield text[s:e]
        found += 1
        pos = e

# ---------- candidate extraction ----------
_FENCE_JSON_RE = re.compile(r"```json\s*(.*?)```", re.DOTALL | re.IGNORECASE)
_FENCE_ANY_RE  = re.compile(r"```(?!json)(?:[a-zA-Z0-9_+-]*)\s*(.*?)```", re.DOTALL)

def _extract_json_candidates(text: str, max_candidates: int = 6):
    # 1) ```json fenced blocks (fast + accurate)
    for m in _FENCE_JSON_RE.finditer(text):
        yield m.group(1).strip()
    # 2) other fenced blocks (sometimes people put JSON in ```text)
    for m in _FENCE_ANY_RE.finditer(text):
        yield m.group(1).strip()
    # 3) top-level arrays and objects via balanced scan (no regex backtracking)
    for chunk in _find_all_top_level(text, "[", "]", max_items=2):
        yield chunk.strip()
    for chunk in _find_all_top_level(text, "{", "}", max_items=2):
        yield chunk.strip()

def _light_fix_json(text: str) -> str:
    # very light fixes: remove trailing commas; swap single->double quotes in simple cases
    import re
    fixed = re.sub(r",(\s*[}\]])", r"\1", text)
    fixed = re.sub(r"(?P<prefix>{|\[|,)\s*'(?P<key>[^'\\]*?)'\s*:", lambda m: f'{m.group("prefix")} "{m.group("key")}":', fixed)
    fixed = re.sub(r":\s*'(?P<val>[^'\\]*?)'(\s*[,}\]])", lambda m: f': "{m.group("val")}"{m.group(2)}', fixed)
    return fixed

def response_to_json(response: str, filename: str):
    """
    Pull the first parseable JSON value out of `response` and write it to disk.

    Candidates come from `_extract_json_candidates`. Each one is parsed as-is
    first and, if that fails, again after `_light_fix_json`. The first success is
    pretty-printed to `<stem>.optimized.json` (stem taken from `filename`) and
    returned. If no candidate parses, the whole response is written to
    `<stem>_raw.txt` and None is returned.
    """
    out_base = Path(filename)
    out_json = out_base.with_suffix("").with_name(out_base.stem + ".optimized.json")
    out_raw  = out_base.with_suffix("").with_name(out_base.stem + "_raw.txt")

    def _try_loads(payload):
        # The success flag is returned separately because a valid parse can be None.
        try:
            return True, json.loads(payload)
        except JSONDecodeError:
            return False, None

    def _save(parsed):
        out_json.write_text(json.dumps(parsed, indent=2, ensure_ascii=False), encoding="utf-8")

    tried = 0
    for tried, cand in enumerate(_extract_json_candidates(response), start=1):
        # First attempt: the candidate exactly as extracted.
        parsed_ok, obj = _try_loads(cand)
        if parsed_ok:
            _save(obj)
            print(f"✅ Parsed candidate #{tried} (strict). Saved to {out_json}")
            return obj

        # Second attempt: same candidate after the light textual repairs.
        parsed_ok, obj = _try_loads(_light_fix_json(cand))
        if parsed_ok:
            _save(obj)
            print(f"✅ Parsed candidate #{tried} after light fix. Saved to {out_json}")
            return obj

    # Every candidate failed: keep the raw response for manual inspection.
    out_raw.write_text(response, encoding="utf-8")
    print(f"⚠️ No valid JSON found after {tried} candidates. Raw saved to {out_raw}")
    return None


In [30]:
# set your folder path
import time
folder = Path("./")
# response_to_json writes "<stem>.optimized.json" files that also match *.json.
# Exclude them so a re-run does not treat earlier outputs as new inputs.
# Sorting makes the processing order reproducible.
json_files = sorted(
    p for p in folder.glob("*.json") if not p.name.endswith(".optimized.json")
)
print(f"Found {len(json_files)} JSON files")

for json_path in json_files:
    loaded = load_json_and_pdf(json_path)
    if loaded is None:
        # load_json_and_pdf has already printed a warning; without the PDF
        # there is nothing to evaluate, so move on instead of failing on unpack.
        continue
    json_text, pdf_to_read_lf = loaded

    # The extracted JSON is the trainable variable; the PDF is fixed context.
    response = tg.Variable(json_text, requires_grad=True, role_description="solution to the data extraction task")
    pdf_bytes = pdf_to_read_lf.to_bytes()
    pdf_image = tg.Variable(pdf_bytes, requires_grad=False, role_description="PDF file of a research paper")
    optimizer = tg.TGD(parameters=[response],engine=llm_backward_engine)

    # Forward pass: the evaluator's textual verdict acts as the loss.
    loss = eval_fn(question=question, image = pdf_image, response=response)
    print(loss.value)
    # Keep the evaluation text next to the input as <name>.txt.
    loss_file = json_path.with_suffix(".txt")
    with open(loss_file, "w", encoding="utf-8") as f:
        f.write(str(loss.value))

    # Backward pass and one optimisation step rewrite response.value.
    loss.backward()
    optimizer.step()
    response_to_json(response=response.value,filename=json_path.stem)
    optimizer.zero_grad()
    # Uncomment to throttle requests if the API rate-limits:
    # time.sleep(30)

Found 1 JSON files
I'll evaluate the output for each extracted material according to the provided criteria:

1. Does the output match the format provided in the question?
Yes. The output is a list of JSON objects with the required fields.

2. Are the extracted materials high-entropy alloys?
Yes. All extracted alloys are high-entropy alloys from the CoFeNiMnTixAl1-x system.

3. Are the alloy compositions in the output correct and reflect the material?
Yes. The compositions match those given in the paper for each alloy.

4. Are the lattice constants value in the output formatted in angstrom?
Yes. The lattice constants are given in angstroms.

5. Are the lattice constants in the output truly the lattice constant of the material not others?
Yes. The lattice constants match those reported in the paper for the BCC phase.

6. Are the phase information in the output correct and reflect the material?
Yes. The phase information matches what is reported in the paper for each composition.

7. Are 