In [None]:
import os
import json
from extraction_schemas import OpportunitySchema

os.environ["GROQ_API_KEY"] = "YOUR_GROQ_API_KEY"  # Replace with your actual Groq API key

from langchain_groq import ChatGroq

llm = ChatGroq(
    model="llama-3.1-70b-versatile",
    temperature=0.5,
    max_retries=4,
    disable_streaming=True,
)

In [None]:
with open("test.json", encoding="utf-8") as f:
    d = json.load(f)

In [2]:
import uuid
from typing import List, TypedDict

from langchain_core.messages import (
    AIMessage,
    BaseMessage,
    HumanMessage,
    ToolMessage,
)
from pydantic import BaseModel

class Example(TypedDict):
    """A representation of an example consisting of text input and expected tool calls.

    For extraction, the tool calls are represented as instances of pydantic model.
    """

    input: str  # This is the example text
    tool_calls: List[BaseModel]  # Instances of pydantic model that should be extracted


def tool_example_to_messages(example: Example) -> List[BaseMessage]:
    """Convert an example into a list of messages that can be fed into an LLM.

    This code is an adapter that converts our example to a list of messages
    that can be fed into a chat model.

    The list of messages per example corresponds to:

    1) HumanMessage: contains the content from which content should be extracted.
    2) AIMessage: contains the extracted information from the model
    3) ToolMessage: contains confirmation to the model that the model requested a tool correctly.

    The ToolMessage is required because some of the chat models are hyper-optimized for agents
    rather than for an extraction use case.
    """
    messages: List[BaseMessage] = [HumanMessage(content=example["input"])]
    openai_tool_calls = []
    for tool_call in example["tool_calls"]:
        openai_tool_calls.append(
            {
                "id": str(uuid.uuid4()),
                "type": "function",
                "function": {
                    # The name of the function right now corresponds
                    # to the name of the pydantic model
                    # This is implicit in the API right now,
                    # and will be improved over time.
                    "name": tool_call.__class__.__name__,
                    "arguments": tool_call.json(),
                },
            }
        )
    messages.append(
        AIMessage(content="", additional_kwargs={"tool_calls": openai_tool_calls})
    )
    tool_outputs = example.get("tool_outputs") or [
        "You have correctly called this tool."
    ] * len(openai_tool_calls)
    for output, tool_call in zip(tool_outputs, openai_tool_calls):
        messages.append(ToolMessage(content=output, tool_call_id=tool_call["id"]))
    return messages

In [3]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
import json


prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm. Your task is to extract only the relevant information "
            "from the provided text. For each attribute, extract the value based on the information available "
            "in the text. If an attribute's value is not explicitly mentioned or cannot be determined, return "
            "null or empty list (depends on attribut description). Ensure all dates and times are formatted as dd.mm.yy hh:mm and that "
            "extracted information is written in **perfect Ukrainian** without any spelling or grammatical errors. "
            "Proper names, such as names of people, organizations, and locations, should be extracted as-is and not translated. "
            "Only provide the requested fields.",
        ),
        MessagesPlaceholder("examples"),
        ("human", "{text}"),
    ]
)

In [4]:
examples = []
with open('examples.json', 'r', encoding='utf-8') as f:
    examples_json = json.load(f)
    for ex in examples_json["examples"]:
        examples.append((ex["input"], OpportunitySchema.model_validate(ex["output"])))

messages = []

for text, tool_call in examples:
    messages.extend(
        tool_example_to_messages({"input": text, "tool_calls": [tool_call]})
    )

In [5]:
runnable = prompt | llm.with_structured_output(schema=OpportunitySchema, include_raw=True)

In [6]:
from IPython.display import JSON, display

with open("test.json", encoding="utf-8") as f:
    d = json.load(f)


responses = []
for i, msg in enumerate(d["messages"]):
    try:
        print(f"Processing message {i}")
        resp = runnable.invoke({"text": msg["text"], "examples": messages})
        responses.append(resp)
        display(JSON(resp["parsed"].dict()))
    except Exception as e:
        print(f"Error processing message {i}: {e}")

Processing message 0
Error processing message 0: 'text'
Processing message 1
Error processing message 1: 'text'
Processing message 2
Error processing message 2: 'text'
Processing message 3
Error processing message 3: 'text'
Processing message 4
Error processing message 4: 'text'
Processing message 5
Error processing message 5: 'text'
Processing message 6
Error processing message 6: 'text'
Processing message 7
Error processing message 7: 'text'
Processing message 8
Error processing message 8: 'text'
Processing message 9
Error processing message 9: 'text'
