In [1]:
example_records = [
[{'composition': 'V10Cr15Mn5Fe35Co10Ni25',
  'label': 'FG, annealed',
  'processing_kw': ['homogenized at 1100C 6h Ar',
   'cold rolled 79%',
   'annealed 900C 10min']},
 {'composition': 'V10Cr15Mn5Fe35Co10Ni25',
  'label': 'CG, annealed',
  'processing_kw': ['homogenized at 1100C 6h Ar',
   'cold rolled 79%',
   'annealed 1100C 60min']},
 {'composition': 'V10Cr15Mn5Fe35Co10Ni25',
  'label': 'FG, HPT 1/4 turn',
  'processing_kw': ['homogenized at 1100C 6h Ar',
   'cold rolled 79%',
   'annealed 900C 10min',
   'HPT 1/4 turn 6GPa 1rpm']}],
   [{'composition': 'V10Cr15Mn5Fe35Co10Ni25',
  'label': 'FG annealed 900C 10min',
  'processing_kw': ['vacuum induction melting',
   'homogenized 1100C 6h Ar',
   'water quenched',
   'cold rolled 79%',
   'EDM disk',
   'annealed 900C 10min']},
 {'composition': 'V10Cr15Mn5Fe35Co10Ni25',
  'label': 'CG annealed 1100C 60min',
  'processing_kw': ['vacuum induction melting',
   'homogenized 1100C 6h Ar',
   'water quenched',
   'cold rolled 79%',
   'EDM disk',
   'annealed 1100C 60min']}]
]

In [4]:
import json
from sisyphus.urgent import entity_resolution_utils as er
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# 1) Prepare your per-source groups (phase_list, strength_list, ...)

record_groups = example_records  # lists of dicts (original metadata only)

# 2) Flatten and build normalized metadata (you send only minimal fields to LLM)
flat_records, group_indices = er.flatten_record_groups(record_groups)
normalized = er.build_normalized_metadata(flat_records)

# Build payload that is safe & concise for the LLM:
payload = {
    "records": [
        {
            "index": i,
            "group": group_indices[i],
            "composition_norm": normalized[i]["composition_norm"],
            "label_norm": normalized[i]["label_norm"],
            "processing_kw_norm": normalized[i]["processing_kw_norm"],
        }
        for i in range(len(flat_records))
    ]
}
def escape_unescaped_braces(text: str) -> str:
    """
    Return a copy of text where every single '{' or '}' that is not already part of
    a '{{' or '}}' pair is replaced by '{{' or '}}' respectively.
    """
    out = []
    i = 0
    n = len(text)
    while i < n:
        # preserve already-escaped double braces
        if i + 1 < n and text[i] == '{' and text[i + 1] == '{':
            out.append('{{')
            i += 2
        elif i + 1 < n and text[i] == '}' and text[i + 1] == '}':
            out.append('}}')
            i += 2
        # escape single braces
        elif text[i] == '{':
            out.append('{{')
            i += 1
        elif text[i] == '}':
            out.append('}}')
            i += 1
        else:
            out.append(text[i])
            i += 1
    return ''.join(out)

# 3) Create prompt that includes the payload. IMPORTANT: keep temperature=0.
user_text = er.LLM_PROMPT_IDS + "\n\nHere is the normalized records list (index+group+normalized metadata):\n\n"
user_text += json.dumps(payload, indent=2)

prompt = ChatPromptTemplate.from_messages([('system', 'You are a JSON-only assistant.'), ('user', escape_unescaped_braces(user_text))])
model = ChatOpenAI(model='gpt-4.1', temperature=0)

# 4) Ask model to return the partition indices only using structured output
#    Use the Pydantic LLMPartitionOutput if your wrapper supports it:
chain = prompt | model.with_structured_output(er.LLMPartitionOutput, method='json_schema')
result = chain.invoke({})   # your prompt already includes the payload
# result.partitions is a list of lists of ints
partitions_indices = result.partitions

# 5) Final deterministic merge locally (computes canonical metadata and merges identical ones)
final = er.finalize_merge_from_partitions(flat_records, partitions_indices)

# final is a PartitionsOutput; you can inspect final.partitions
print(final.json(indent=2))

TypeError: `dumps_kwargs` keyword arguments are no longer supported.

In [7]:
print(final.model_dump_json(indent=2))

{
  "partitions": [
    {
      "canonical_metadata": {
        "composition": "V10Cr15Mn5Fe35Co10Ni25",
        "label": "fg, annealed",
        "processing_kw": [
          "homogenized at 1100C 6h Ar",
          "cold rolled 79%",
          "annealed 900C 10min"
        ]
      },
      "members": [
        0,
        3
      ],
      "records": [
        {
          "composition": "V10Cr15Mn5Fe35Co10Ni25",
          "label": "FG, annealed",
          "processing_kw": [
            "homogenized at 1100C 6h Ar",
            "cold rolled 79%",
            "annealed 900C 10min"
          ]
        },
        {
          "composition": "V10Cr15Mn5Fe35Co10Ni25",
          "label": "FG annealed 900C 10min",
          "processing_kw": [
            "vacuum induction melting",
            "homogenized 1100C 6h Ar",
            "water quenched",
            "cold rolled 79%",
            "EDM disk",
            "annealed 900C 10min"
          ]
        }
      ],
      "confidence": "high"
 