In [5]:
from openai import OpenAI, RateLimitError, APITimeoutError
from tqdm import tqdm
import tiktoken
import concurrent
import time
import json
from pprint import pprint
# Read the API key through a context manager so the file handle is closed
# right away instead of being left open until garbage collection.
with open("api_key", encoding="utf-8") as _api_key_file:
    openai_client = OpenAI(api_key=_api_key_file.read().strip())
def save_json(data, filename):
    """Write ``data`` to ``filename`` as indented JSON.

    The file is opened as UTF-8 explicitly: ``ensure_ascii=False`` emits
    non-ASCII characters verbatim, which would otherwise fail (or be written
    in a locale-dependent encoding) on platforms whose default is not UTF-8.

    Args:
        data: Any JSON-serializable object.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

def request_gpt(
    client, messages, model="gpt-4o-mini", temperature=0.5, format=None, seed=None
):
    """Send one chat-completion request and return the reply text, retrying on failure.

    Each attempt is appended to ``request_log.txt``. The call is retried after a
    5-second pause when the API raises ``RateLimitError`` or ``APITimeoutError``,
    or when ``format == "json"`` and the reply is not parseable JSON (in that
    case the retry uses ``temperature=1.0``). Retries keep the caller's ``seed``
    and run in a loop rather than by recursion, so a long outage cannot exhaust
    the interpreter's recursion limit.

    Args:
        client: OpenAI-style client exposing ``chat.completions.create``.
        messages: Chat messages to send.
        model: Model name.
        temperature: Sampling temperature for the first attempt.
        format: ``"json"`` to request a JSON object and validate the reply;
            anything else returns the reply unchecked.
        seed: Optional sampling seed forwarded on every attempt.

    Returns:
        The content string of the first choice.
    """
    while True:
        # Log every attempt, including retries, with the parameters actually used.
        with open("request_log.txt", "a", encoding="utf-8") as f:
            f.write(f"model: {model}, temperature: {temperature}, format: {format}\n")
            f.write(json.dumps(messages, ensure_ascii=False) + "\n")
            f.write("=====================================\n")
        try:
            if format == "json":
                response = client.chat.completions.create(
                    model=model,
                    messages=messages,
                    response_format={"type": "json_object"},
                    temperature=temperature,
                    seed=seed,
                )
            else:
                response = client.chat.completions.create(
                    model=model, messages=messages, temperature=temperature, seed=seed
                )
        except RateLimitError as e:
            print("RateLimitError")
            print(e)
            time.sleep(5)
            continue
        except APITimeoutError:
            print("APITimeoutError")
            print(messages)
            time.sleep(5)
            continue

        content = response.choices[0].message.content
        if format != "json":
            return content

        try:
            json.loads(content)
            return content
        except json.JSONDecodeError as e:
            print("JSON Decode Error")
            print(e)
            time.sleep(5)
            # Resample with a higher temperature to get a different completion.
            temperature = 1.0

def multithread_prompts(
    client,
    prompts,
    model="gpt-4o-mini",
    temperature=0.5,
    response_format=None,
    seed=None,
    max_workers=100,
):
    """Run ``request_gpt`` for every prompt on a thread pool and collect the replies.

    Args:
        client: Client forwarded to ``request_gpt``.
        prompts: Sequence of message lists, one per request.
        model: Model name forwarded to ``request_gpt``.
        temperature: Sampling temperature forwarded to ``request_gpt``.
        response_format: Forwarded as ``request_gpt``'s ``format`` argument.
        seed: Sampling seed forwarded to ``request_gpt``.
        max_workers: Size of the thread pool (defaults to the previous fixed value).

    Returns:
        Replies in the same order as ``prompts``.
    """
    # Imported here because the file only does `import concurrent`, which does not
    # by itself guarantee that the `concurrent.futures` submodule is loaded.
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # The context manager shuts the pool down (and joins its threads) on exit,
    # including when a worker raises.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        with tqdm(total=len(prompts)) as pbar:
            futures = [
                executor.submit(
                    request_gpt, client, prompt, model, temperature, response_format, seed
                )
                for prompt in prompts
            ]
            # Advance the progress bar as requests finish, in completion order.
            for _ in as_completed(futures):
                pbar.update(1)
    # Results are read back in submission order, not completion order.
    return [future.result() for future in futures]

In [6]:
# Hand-written semantic task plan for building a knowledge graph from Wikipedia documents.
# Each entry lists its prerequisites twice: by label in "depend_on" and by id in "parentIds".
semantic_task = [
    {
        "label": "Document Preprocessing",
        "description": "Clean and preprocess the text from the Wikipedia documents.",
        "explanation": "Preprocessing is essential to remove noise and irrelevant information, which will enhance the quality of the extraction process and ensure consistency in the analysis.",
        "depend_on": [],
        "id": "0",
        "parentIds": []
    },
    {
        "label": "Entity Recognition",
        "description": "Identify key entities (like people, places, organizations) within the preprocessed documents.",
        "explanation": "Entity recognition is crucial as it helps in identifying the main subjects or nodes that will form the foundation of the knowledge graph.",
        "depend_on": [
            "Document Preprocessing"
        ],
        "id": "1",
        "parentIds": [
            "0"
        ]
    },
    {
        "label": "Relationship Extraction",
        "description": "Extract relationships between the identified entities.",
        "explanation": "Understanding how entities relate to each other is key to building the connections in the knowledge graph, which provides context and meaning.",
        "depend_on": [
            "Entity Recognition"
        ],
        "id": "2",
        "parentIds": [
            "1"
        ]
    },
    {
        "label": "Ontology Definition",
        "description": "Define the structure and schema of the knowledge graph, including classes and properties.",
        "explanation": "Creating an ontology is needed to formalize the relationships and hierarchy of the entities, ensuring the knowledge graph is organized and interpretable.",
        "depend_on": [
            "Entity Recognition",
            "Relationship Extraction"
        ],
        "id": "3",
        "parentIds": [
            "1",
            "2"
        ]
    },
    {
        "label": "Graph Construction",
        "description": "Construct the knowledge graph using the identified entities, relationships, and defined ontology.",
        "explanation": "This step is where the actual graph is built, combining all the previous outputs into a structured format that can be utilized for querying and analysis.",
        "depend_on": [
            "Ontology Definition"
        ],
        "id": "4",
        "parentIds": [
            "3"
        ]
    },
    {
        "label": "Validation and Refinement",
        "description": "Validate the knowledge graph for accuracy and completeness, and refine it as necessary.",
        "explanation": "Validation ensures the knowledge graph accurately represents the information extracted from the documents; refinement helps improve its quality and usability.",
        "depend_on": [
            "Graph Construction"
        ],
        "id": "5",
        "parentIds": [
            "4"
        ]
    }
]
# Serialized once; this string is what gets sent as the user message of the decomposition prompt.
semantic_tasks_str = json.dumps(semantic_task)

In [11]:
# read defs
# A context manager closes the definitions file instead of leaking the handle.
with open("elementary_task_defs.json", encoding="utf-8") as defs_file:
    elementary_task_defs = json.load(defs_file)
# Render each definition as an <elementary_task> element with one child tag per field.
elementary_task_defs_str = ""
for elementary_task in elementary_task_defs:
    elementary_task_defs_str += "<elementary_task>\n"
    for key, value in elementary_task.items():
        elementary_task_defs_str += f"<{key}>{value}</{key}>\n"
    elementary_task_defs_str += "</elementary_task>\n"
# Decompose a tree of semantic tasks into elementary tasks
prompts = [
    {
        "role": "system",
        "content": """
        ** Context **
        You are a Natural Language Processing (NLP) assistant. You are given a list of elementary NLP tasks that could be used.
        Here is the list of elementary NLP tasks:
        {elementary_task_defs}
        ** Task **
        The user will describe a series of real-world tasks for you. First, for each of the task, decide if it can be formulated as an NLP task. If yes, you need to find the proper elementary NLP tasks and arrange them to accomplish user's goal. 
        You can ignore the need to handle formats or evaluate outputs.
        ** Requirements **
        The labels of the elementary task must match exactly as those provided above. Reply with the following JSON format: 
        {{ "elementary_tasks": [ 
                {{ 
                    "id": int,
                    "label": (string) (one of the above)
                    "description": (string, describe implementation procedure)
                    "explanation": (string, explain why this task is needed)
                    "depend_on": (string[], the tasks task this step depends on)
                    "input": string,
                    "output": string,
                    "example_output": string,
                }}, 
                {{ 
                    "id": int,
                    "label": (string) (one of the above)
                    "description": (string, describe implementation procedure)
                    "explanation": (string, explain why this task is needed)
                    "depend_on": (int[], the task ids that this task depends on)
                    "input": string,
                    "output": string,
                    "example_output": string,
                }}, 
                ... 
            ] 
        }}
        # Fill the placeholder with the rendered text built above; the raw list would
        # be inserted as a Python repr rather than as the tagged definitions.
        """.format(elementary_task_defs=elementary_task_defs_str)
    },
    {
        "role": "user",
        "content": f"{semantic_tasks_str}"
    }
]
response = request_gpt(openai_client, prompts, model="gpt-4o-mini", temperature=0.0, format="json")

In [None]:
# Show the raw reply string returned by request_gpt for the decomposition prompt.
print(response)

In [4]:
# print(response)
import json
# Hard-coded sample reply in the {"elementary_tasks": [...]} shape; it is parsed at the
# end of this cell so that `elementary_tasks` holds the list of task dicts.
test_response = """
{
  "elementary_tasks": [
    {
      "label": "Information Extraction",
      "description": "Identify and extract key entities (e.g., people, places, organizations) from the documents using defined techniques.",
      "explanation": "Entity extraction is the first step in building a knowledge graph, identifying primary components that will form the nodes of the graph.",
      "depend_on": [],
      "input": "Documents",
      "output": "Extracted Entities",
      "example_output": "{'Person': 'Barack Obama', 'Location': 'Honolulu', 'Organization': 'United States'}"
    },
    {
      "label": "Information Extraction",
      "description": "Determine the relationships between the extracted entities based on the context of the documents.",
      "explanation": "Understanding the relationships allows us to connect the entities and define the edges of the knowledge graph, which is crucial for representing the semantic structure.",
      "depend_on": ["Extract Entities"],
      "input": "Extracted Entities and Context",
      "output": "Identified Relationships",
      "example_output": "{'Entity A': 'Barack Obama', 'Entity B': 'United States', 'Relationship': 'Presidency'}"
    },
    {
      "label": "Define Ontology",
      "description": "Establish a schema or ontology that categorizes the types of entities and relationships defined from the previous steps.",
      "explanation": "An ontology helps in standardizing how entities and relationships are represented, making the knowledge graph more structured and interpretable.",
      "depend_on": ["Extract Entities", "Identify Relationships"],
      "input": "Entity Types and Relationships",
      "output": "Defined Ontology",
      "example_output": "{'Entity Types': ['Person', 'Organization', 'Location'], 'Relationships': ['Works At', 'Lives In']}"
    },
    {
      "label": "Construct Knowledge Graph",
      "description": "Create the knowledge graph by integrating the extracted entities and their relationships according to the defined ontology.",
      "explanation": "This step involves the actual creation of the graph structure that can be visualized or queried, which is the end goal of the task.",
      "depend_on": ["Define Ontology", "Identify Relationships"],
      "input": "Extracted Entities and Relationships",
      "output": "Knowledge Graph",
      "example_output": "{'Nodes': ['Barack Obama', 'United States'], 'Edges': ['Barack Obama - Presidency -> United States']}"
    },
    {
      "label": "Iterate and Refine",
      "description": "Review and refine the knowledge graph to improve accuracy and completeness.",
      "explanation": "It’s essential to validate the graph to ensure it accurately represents the information from the documents and to address any gaps or errors.",
      "depend_on": ["Construct Knowledge Graph"],
      "input": "Initial Knowledge Graph",
      "output": "Refined Knowledge Graph",
      "example_output": "{'Refined Nodes': ['Barack Obama', 'United States', 'Democrat'], 'Refined Edges': ['Barack Obama - Presidency -> United States']}"
    }
  ]
}
"""
# Keep only the list stored under the "elementary_tasks" key.
elementary_tasks = json.loads(test_response)["elementary_tasks"]

In [None]:
# Inspect the tag-formatted rendering of the elementary task definitions.
print(elementary_task_defs_str)

In [None]:
# Sample decomposition output used to try out the parentIds/children normalization below.
test = [{'id': '1', 'label': 'Extract Entities', 'description': 'Identify and extract key entities (people, places, organizations, etc.) from the documents.', 'explanation': 'This step is crucial for populating the knowledge graph with relevant and meaningful nodes, which represent different concepts and facts from the documents.', 'depend_on': [], 'parentIds': [], 'children': ['2', '3']}, {'id': '2', 'label': 'Identify Relationships', 'description': 'Determine relationships between the extracted entities based on the context of the documents.', 'explanation': 'Identifying relationships helps in defining how entities are connected in the knowledge graph, enabling a better representation of the information landscape.', 'depend_on': ['1'], 'parentIds': ['1'], 'children': ['3', '4']}, {'id': '3', 'label': 'Define Schema', 'description': 'Create a schema that outlines the types of entities and their possible relationships.', 'explanation': 'A well-defined schema is necessary to ensure that the knowledge graph is structured, understandable, and queryable, facilitating accurate representation of the data.', 'depend_on': ['1', '2'], 'parentIds': ['1', '2'], 'children': ['4']}, {'id': '4', 'label': 'Construct Graph', 'description': 'Assemble the entities and relationships into a graphical representation based on the schema.', 'explanation': 'Constructing the graph is the final step in visually organizing the extracted entities and relationships into a coherent structure that can be analyzed and queried.', 'depend_on': ['2', '3'], 'parentIds': ['2', '3'], 'children': []}]

# Build new task dicts whose parentIds mirror depend_on and whose children start empty.
# `dict.update` returns None, so mapping it over the list would yield only Nones;
# merging into a fresh dict gives the real tasks and leaves `test` untouched, which
# makes the cell safe to re-run.
decomposed_semantic_tasks = [
    {**task, "parentIds": list(task["depend_on"]), "children": []}
    for task in test
]
decomposed_semantic_tasks

In [18]:
import csv
# Export the label/definition/input/output fields of every elementary task to CSV.
csv_columns = ["label", "definition", "input", "output"]
with open("elementary_task_defs.json", encoding="utf-8") as defs_file:
    elementary_task_defs = json.load(defs_file)
all_rows = [
    {column: task_def[column] for column in csv_columns}
    for task_def in elementary_task_defs
]

# newline="" is what the csv module asks for, so that it controls line endings itself
# (otherwise extra blank rows can appear on Windows). The header comes from the fixed
# column list, so an empty definitions file yields a header-only CSV instead of an IndexError.
with open('elementary_task_defs.csv', 'w', newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(all_rows)



In [None]:
import json

from autogen_core import CancellationToken
from autogen_ext.models.openai import OpenAIChatCompletionClient
from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.messages import TextMessage
from pydantic import BaseModel

# Pydantic model describing one task node: identity and text fields, dependency links,
# nested child nodes, and two numeric scores.
class Node(BaseModel):
    # Identifier and human-readable fields of the task.
    id: str
    label: str
    description: str
    explanation: str
    # Dependency links; presumably ids or labels of prerequisite tasks — TODO confirm.
    depend_on: list[str]
    parentIds: list[str]
    # Nested child nodes; the quoted "Node" is a forward reference to this same class.
    children: list["Node"] = []
    # Numeric scores; their scale is not established anywhere in this cell.
    confidence: float
    complexity: float

    class Config:
        # NOTE(review): `orm_mode` is the pydantic v1 name for this option (v2 calls it
        # `from_attributes`) — confirm which pydantic version this notebook targets.
        orm_mode = True


async def run_prompt_generation_agent(task: Node, existing_keys: str, model: str, api_key: str):
    """Ask an LLM agent to write an analysis prompt for ``task``.

    Args:
        task: The task to write a prompt for. Either a ``Node`` instance or a plain
            dict with ``label``, ``description`` and ``explanation`` entries (the
            shape produced by ``json.load``); both are accepted.
        existing_keys: Description of the data keys already present in the dataset;
            interpolated into the system message.
        model: Model name passed to ``OpenAIChatCompletionClient``.
        api_key: API key passed to ``OpenAIChatCompletionClient``.

    Returns:
        The value stored under ``"prompt"`` in the agent's JSON reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        KeyError: If the reply has no ``"prompt"`` key.
    """

    def _task_field(name):
        # Plain subscripting fails on a pydantic model, so branch on the container type.
        return task[name] if isinstance(task, dict) else getattr(task, name)

    model_client = OpenAIChatCompletionClient(
        model=model,
        api_key=api_key,
        temperature=0.0,
    )
    prompt_generation_agent = AssistantAgent(
        name="prompt_generation_agent",
        model_client=model_client,
        system_message= """
        ** Context **
        You are an expert in writing prompts for Large Language Models, especially for generating prompts that analyze a given piece of text.
        ** Task **
        The user will describe the task and provide a piece of text. You need to generate a prompt that can be used to analyze the text for the given task.
        ** Requirements **
        First, decide what data in each document is needed to complete the task. Then, generate a prompt that instructs an LLM to analyze the text for the given task.
        Organize the prompt into these three sections:
        1. Input Keys: List the keys that are required from the document to complete the task.
        2. Context: Give instructions on what the user is trying to do.
        3. Task: Give instructions on how to analyze the text.
        4. Requirements: Provide any specific requirements or constraints for the prompt.
        In addition, give a key name suitable to store the result of the prompt, and define a JSON format for the output.
        Here are the data already exists in the dataset: {existing_keys}
        They do not need to be included in the JSON format.
        Reply with this JSON format:
            {{
               "prompt": {{
                    "Context": str,
                    "Task": str,
                    "Requirements": str
                    "JSON_format": str
                }}
            }}
        """.format(existing_keys=existing_keys),
    )
    task_message = f"""
        <task_name> {_task_field('label')} </task_name>
        <description> {_task_field('description')} </description>
        <explanation> {_task_field('explanation')} </explanation>
    """
    response = await prompt_generation_agent.on_messages(
        [TextMessage(content=task_message, source="user")],
        cancellation_token=CancellationToken(),
    )
    return json.loads(response.chat_message.content)["prompt"]

async def run_input_key_generation_agent(task: Node, existing_keys: str, model: str, api_key: str):
    """Ask an LLM agent which of the existing dataset keys ``task`` needs as input.

    The user message carries both the task (label, description, explanation) and the
    available keys. Without the task text the agent would have nothing to base its
    selection on, even though the system message promises that the user will describe
    a task.

    Args:
        task: The task to analyse. Either a ``Node`` instance or a plain dict with
            ``label``, ``description`` and ``explanation`` entries.
        existing_keys: The keys currently available in the dataset.
        model: Model name passed to ``OpenAIChatCompletionClient``.
        api_key: API key passed to ``OpenAIChatCompletionClient``.

    Returns:
        The value stored under ``"required keys"`` in the agent's JSON reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        KeyError: If the reply has no ``"required keys"`` key.
    """

    def _task_field(name):
        # Plain subscripting fails on a pydantic model, so branch on the container type.
        return task[name] if isinstance(task, dict) else getattr(task, name)

    model_client = OpenAIChatCompletionClient(
        model=model,
        api_key=api_key,
        temperature=0.0,
    )
    input_key_generation_agent = AssistantAgent(
        name="input_key_generation_agent",
        model_client=model_client,
        # This string is not passed through str.format, so the braces are written
        # singly; doubled braces would reach the model literally.
        system_message= """
        ** Context **
        You are an expert in text analytics.
        ** Task **
        The user will describe a task for you, and what data is available in the dataset. Your task is to pick the keys that are required from the dataset to complete the task.
        ** Requirements **
        Reply with this JSON format:
            {
               "required keys": str[]
            }
        """,
    )
    task_message = f"""
        <task_name> {_task_field('label')} </task_name>
        <description> {_task_field('description')} </description>
        <explanation> {_task_field('explanation')} </explanation>
        existing keys: {existing_keys}
    """
    response = await input_key_generation_agent.on_messages(
        [TextMessage(content=task_message, source="user")],
        cancellation_token=CancellationToken(),
    )
    return json.loads(response.chat_message.content)["required keys"]

api_key = open("api_key").read().strip()
primitive_tasks = json.load(open("../test_primitive_tasks.json"))
test_task, existing_keys = primitive_tasks[0], ["content"]
test_task, existing_keys = primitive_tasks[1], ["content", "entities"]
input_keys = await run_input_key_generation_agent(test_task, existing_keys, "gpt-4o-mini", api_key)
print(input_keys)
prompt = await run_prompt_generation_agent(test_task, input_keys, "gpt-4o-mini", api_key)
prompt


In [None]:
"{content}"
# Build one "<key>: {<key>}" template line per dataset key; the tripled braces leave a
# literal {key} placeholder in the resulting text.
keys = ["content", "entities"]
"\n".join(f"{key}: {{{key}}}" for key in keys)

In [None]:
# Sample structured prompt, as produced by the prompt-generation agent.
prompt_structured = {'Input Keys': 'content', 'Context': 'You are tasked with extracting structured information from a given text document. The goal is to identify key entities that can be used as nodes in a knowledge graph.', 'Task': 'Analyze the provided text and extract relevant entities such as names, organizations, locations, dates, and other significant terms. Ensure that the extracted entities are categorized appropriately.', 'Requirements': 'The extracted entities should be presented in a structured format, clearly indicating their types. Avoid including any irrelevant information or context from the original text.', 'JSON_format': "{ 'entities': [ { 'name': 'entity_name', 'type': 'entity_type' } ] }"}
# The schema uses single quotes; swap them for double quotes so it parses as JSON,
# then list its top-level keys.
json_format_double_quoted = str(prompt_structured['JSON_format']).replace("'", '"')
print(json_format_double_quoted)
list(json.loads(json_format_double_quoted))

In [None]:
import re
format_schema = "{ 'relationships': [ { 'name': 'entity_name', 'type': 'entity_type' }, ... ] }"
pattern = r"\{\s*'(\w+)'\s*:\s*\["

# Find the first single-quoted key that is immediately followed by a list.
match = re.search(pattern, format_schema)

# Print the captured key name (for this schema that is "relationships"), or a
# fallback message when nothing matched.
print(match.group(1) if match else "No match found")

In [None]:
# NOTE(review): this literal is replaced by the next statement before it is ever read —
# confirm whether it, rather than prompt_structured["JSON_format"], was meant to be escaped.
format_schema = "{ 'relationships': [ { 'entity1': 'string', 'entity2': 'string', 'relationship': 'string', 'context': 'string' } ] }"
# Double every opening brace of the stored schema (closing braces are left as they are).
format_schema = prompt_structured["JSON_format"].replace("{", "{{")
format_schema

In [None]:
format_schema = {'JSON_format': "{'entities': [{'name': 'string', 'category': 'string'}...]}"}
raw_json_format = format_schema['JSON_format']
# Check which type the stored schema has (it is a str here, not a dict).
print(isinstance(raw_json_format, dict))
print(isinstance(raw_json_format, str))
pattern = r'\{\s*"(\w+)"\s*:\s*\['
print(raw_json_format)
# Switch to double quotes so the double-quote based pattern can match the key.
format_schema['JSON_format'] = raw_json_format.replace("'", '"')
match = re.search(pattern, format_schema['JSON_format'])
match.group(1)

In [None]:

async def run_json_schema_correction_agent(schema: str, model: str, api_key: str):
    """Ask an LLM agent to repair an invalid JSON schema string.

    Args:
        schema: The invalid schema text to correct.
        model: Model name passed to ``OpenAIChatCompletionClient``.
        api_key: API key passed to ``OpenAIChatCompletionClient``.

    Returns:
        The value stored under ``"corrected_schema"`` in the agent's JSON reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        KeyError: If the reply has no ``"corrected_schema"`` key.
    """
    model_client = OpenAIChatCompletionClient(
        model=model,
        api_key=api_key,
        temperature=0.0,
    )
    json_schema_correction_agent = AssistantAgent(
        name="json_schema_correction_agent",
        model_client=model_client,
        # Not passed through str.format, so braces are written singly.
        system_message= """
        ** Context **
        You are an expert in JSON schema
        ** Task **
        The user will give you an invalid JSON schema. Your task is to correct the JSON schema to make it valid.
        ** Requirements **
        Reply with this JSON format:
            {
               "corrected_schema": str
            }
        """,
    )
    # Send the schema received as an argument; the function must not depend on
    # notebook globals left over from other cells.
    task_message = f"invalid JSON schema: {schema}"
    response = await json_schema_correction_agent.on_messages(
        [TextMessage(content=task_message, source="user")],
        cancellation_token=CancellationToken(),
    )
    # Read the key that the system message asks the model to reply with.
    return json.loads(response.chat_message.content)["corrected_schema"]


## Beam search


In [57]:
import json
from autogen_core import CancellationToken
from autogen_ext.models.openai import OpenAIChatCompletionClient
from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.messages import TextMessage
import concurrent
from tqdm import tqdm
import asyncio


async def run_goal_decomposition_agent_stepped(goal: str, previous_steps: list, model: str, api_key: str, n=1):
    """Ask the planning agent for the next step towards ``goal``.

    Args:
        goal: The user's goal, in free text.
        previous_steps: Steps taken so far; each needs ``label`` and ``description``.
        model: Model name passed to ``OpenAIChatCompletionClient``.
        api_key: API key passed to ``OpenAIChatCompletionClient``.
        n: Number of proposals to request.

    Returns:
        The parsed ``"next_step"`` object when ``n == 1``; otherwise a list of such
        objects, one per reply gathered through ``parallel_call_agents``.
    """
    capabilities = {
        "vision": False,
        "function_calling": False,
        "json_output": True,
    }
    model_client = OpenAIChatCompletionClient(
        model=model,
        api_key=api_key,
        temperature=0.0,
        model_capabilities=capabilities,
    )
    goal_decomposition_agent = AssistantAgent(
        name="goal_decomposition_agent",
        model_client=model_client,
        system_message="""
        ** Context **
        You are a text analytics task planner. 
        Users have collected a dataset of documents. The user will describe a goal to achieve through some text analytics, and what they have done already.
        ** Task **
        Your task is to provide a single next step based on what the user have done so far.
        ** Requirements **
        Please specify the logical next step to take.
        Ignore the practical steps such as data collection, cleaning or visualization.
        Focus on the conceptual next step. If no further steps are needed, label the next step with "END".
        Reply with this JSON format:
            {
                "next_step": {
                        "id": (int),
                        "label": (string) or "END"
                        "description": (string)
                        "explanation": (string, explain why this step is needed)
                        "depend_on": (int[], ids of the steps that this step depends on)
                    },
            }  """,
    )

    # Compose the user turn: the goal first, then one "label: description" line per finished step.
    user_message = f"My goal is: {goal}\n"
    if previous_steps:
        finished_lines = [f"{step['label']}: {step['description']}" for step in previous_steps]
        user_message += "Here are the steps that I have done so far: \n" + "\n".join(finished_lines)

    if n != 1:
        replies = await parallel_call_agents(n, goal_decomposition_agent, user_message)
        return [json.loads(reply.chat_message.content)["next_step"] for reply in replies]

    reply = await goal_decomposition_agent.on_messages(
        [TextMessage(content=user_message, source="user")],
        cancellation_token=CancellationToken(),
    )
    return json.loads(reply.chat_message.content)["next_step"]

async def run_decomposition_self_evaluation_agent(goal: str, previous_steps: list, next_step: str, model: str, api_key: str, n=1):
    """Ask the evaluation agent to score a proposed next step from 0 to 5.

    Args:
        goal: The user's goal, in free text.
        previous_steps: Steps taken so far; each needs ``label`` and ``description``.
        next_step: The proposal to score; it is interpolated into the message as text.
        model: Model name passed to ``OpenAIChatCompletionClient``.
        api_key: API key passed to ``OpenAIChatCompletionClient``.
        n: Number of evaluations to request for the same message.

    Returns:
        The parsed ``"evaluation_score"`` when ``n == 1``; otherwise a list of scores,
        one per reply gathered through ``parallel_call_agents``.
    """
    capabilities = {
        "vision": False,
        "function_calling": False,
        "json_output": True,
    }
    model_client = OpenAIChatCompletionClient(
        model=model,
        api_key=api_key,
        temperature=0.0,
        model_capabilities=capabilities,
    )
    decomposition_self_evaluation_agent = AssistantAgent(
        name="decomposition_self_evaluation_agent",
        model_client=model_client,
        system_message="""
        ** Context **
        You are a text analytics expert.
        Users will describe a text analytics goal and the steps they have taken to achieve it.
        ** Task **
        Your task is to evaluate the correctness of the next step provided by the user.
        ** Requirements **
        Give a score from 0 to 5, where 0 is completely incorrect and 5 is perfect.
        Reply with this JSON format:
            {
                "evaluation_score": (int) 0-5
            }  """,
    )

    # Compose the user turn: goal, optional history, then the proposal to be judged.
    user_message = f"My goal is: {goal}\n"
    if previous_steps:
        finished_lines = [f"{step['label']}: {step['description']}" for step in previous_steps]
        user_message += "Here are the steps that I have done so far: \n" + "\n".join(finished_lines)
    user_message += f"\nHere is the next step that I think I should take: {next_step}"

    if n != 1:
        replies = await parallel_call_agents(n, decomposition_self_evaluation_agent, user_message)
        return [json.loads(reply.chat_message.content)["evaluation_score"] for reply in replies]

    reply = await decomposition_self_evaluation_agent.on_messages(
        [TextMessage(content=user_message, source="user")],
        cancellation_token=CancellationToken(),
    )
    return json.loads(reply.chat_message.content)["evaluation_score"]

async def call_agent(agent, user_message):
    """Send ``user_message`` to ``agent`` as a single user turn and return its response."""
    user_turn = TextMessage(content=user_message, source="user")
    return await agent.on_messages(
        [user_turn],
        cancellation_token=CancellationToken(),
    )

async def parallel_call_agents(n, agent, user_message):
    """Send the same message to ``agent`` ``n`` times concurrently and collect the responses.

    Responses are returned in the order they complete. A progress bar advances once
    per finished call.

    NOTE(review): every call goes through the same ``agent`` instance — confirm that
    the agent class supports concurrent ``on_messages`` calls.
    """
    pending_calls = [call_agent(agent, user_message) for _ in range(n)]

    collected = []
    with tqdm(total=n) as progress:
        for finished_call in asyncio.as_completed(pending_calls):
            collected.append(await finished_call)
            progress.update(1)

    return collected


In [58]:
# Running history of accepted steps for the single-step experiment.
all_steps = []
# Read the key through a context manager so the file handle is closed immediately.
with open("api_key", encoding="utf-8") as key_file:
    api_key = key_file.read().strip()
model = "gpt-4o-mini"
goal = "I need to construct a knowledge graph from a collection of documents from wikipedia."

In [None]:
# Request one next step given the history in all_steps, score that proposal against the
# same history, and only then append it — so each run of this cell extends all_steps by one.
next_step = await run_goal_decomposition_agent_stepped(goal, all_steps, model, api_key)
evaluation_score = await run_decomposition_self_evaluation_agent(goal, all_steps, next_step, model, api_key)
all_steps.append(next_step)
print(evaluation_score)
next_step

In [94]:
import copy
async def decode_n_samples(goal: str, previous_steps: list, model: str, api_key: str, n: int=2):
    """Sample ``n`` candidate next steps and score each one individually.

    Every candidate is evaluated on its own, so the score paired with a candidate
    is actually about that candidate. Passing the whole candidate list to the
    evaluator would yield ``n`` scores of the same combined message, which cannot
    rank the candidates against each other.

    Args:
        goal: The user's goal, in free text.
        previous_steps: The beam (list of step dicts) to extend.
        model: Model name forwarded to the agents.
        api_key: API key forwarded to the agents.
        n: Number of candidate next steps to sample.

    Returns:
        A list of ``(beam, score)`` tuples, where each beam is a deep copy of
        ``previous_steps`` with one candidate appended.
    """
    import asyncio  # local import keeps this cell independent of other cells' imports

    next_steps = await run_goal_decomposition_agent_stepped(goal, previous_steps, model, api_key, n)
    if n == 1:
        # With n == 1 the agent helper returns a bare step rather than a list.
        next_steps = [next_steps]

    # One evaluation per candidate, run concurrently; gather keeps the input order,
    # so scores line up with next_steps.
    evaluation_scores = await asyncio.gather(
        *(
            run_decomposition_self_evaluation_agent(goal, previous_steps, candidate, model, api_key)
            for candidate in next_steps
        )
    )

    candidate_steps = []
    for next_step, evaluation_score in zip(next_steps, evaluation_scores):
        # Deep-copy so beams never share (and mutate) each other's step dicts.
        new_beam = copy.deepcopy(previous_steps)
        new_beam.append(next_step)
        candidate_steps.append((new_beam, evaluation_score))
    return candidate_steps

async def beam_search_decomposition(goal: str, model: str, api_key: str, k: int = 2, n: int = 2, max_iterations: int | None = None):
    """Run beam search over step-by-step decompositions of ``goal``.

    Starts from ``n`` sampled first steps, then expands repeatedly with
    ``beam_search_decomposition_step`` until every surviving beam ends with a step
    labelled ``"END"``.

    Args:
        goal: The user's goal, in free text.
        model: Model name forwarded to the agents.
        api_key: API key forwarded to the agents.
        k: Beam width (number of candidates kept after each expansion).
        n: Number of samples drawn per beam at each expansion.
        max_iterations: Optional cap on the number of expansion rounds. ``None``
            (the default) keeps the search unbounded.

    Returns:
        The final list of ``(beam, score)`` tuples.
    """
    iteration = 0
    candidate_steps = list(await decode_n_samples(goal, [], model, api_key, n))
    print(f"candidate steps after iteration {iteration}: \n{candidate_steps}")
    iteration += 1
    while max_iterations is None or iteration <= max_iterations:
        candidate_steps = await beam_search_decomposition_step(goal, candidate_steps, model, api_key, k, n)
        print("iteration: ", iteration)
        print([beam[-1]['label'] for beam, _ in candidate_steps])
        iteration += 1
        # An empty beam can never satisfy the END check; stop instead of looping forever.
        if not candidate_steps:
            break
        # check if all beams are END
        if all(beam[-1]["label"] == "END" for beam, _ in candidate_steps):
            break
    return candidate_steps

async def beam_search_decomposition_step(goal: str, candidate_steps: list, model: str, api_key: str, k: int = 2, n: int = 2):
    """Expand every unfinished beam once and keep the ``k`` best candidates.

    Beams whose last step is labelled ``"END"`` are complete: they are not expanded
    but are carried over with their existing score, so a finished decomposition
    stays in the running rather than being dropped while other beams are still
    growing.

    Args:
        goal: The user's goal, in free text.
        candidate_steps: Current ``(beam, score)`` tuples.
        model: Model name forwarded to the agents.
        api_key: API key forwarded to the agents.
        k: Number of candidates to keep.
        n: Number of samples drawn for each unfinished beam.

    Returns:
        Up to ``k`` ``(beam, score)`` tuples, highest score first.
    """
    new_candidates = []
    for beam, score in candidate_steps:
        if beam[-1]["label"] == "END":
            # Finished beam: keep it as a candidate, unchanged.
            new_candidates.append((beam, score))
            continue
        new_candidates += await decode_n_samples(goal, beam, model, api_key, n)
    # Sort by score (descending; the sort is stable for ties) and keep the top k.
    return sorted(new_candidates, key=lambda candidate: candidate[1], reverse=True)[:k]

# api_key = open("api_key").read().strip()
# goal = "I need to construct a knowledge graph from a collection of documents from wikipedia."
# model = "gpt-4o-mini"
# top_k_decomposition = await beam_search_decomposition(goal, model, api_key, k=2, n=2)

In [153]:
import time
from fastapi.responses import StreamingResponse

# Read the key through a context manager so the file handle is closed immediately.
with open("api_key", encoding="utf-8") as key_file:
    api_key = key_file.read().strip()
goal = "I need to construct a knowledge graph from a collection of documents from wikipedia."
model = "gpt-4o-mini"
# Beam width (k) and samples per beam (n) read by iter_response below.
k=2
n=2
async def iter_response(candidate_steps):
    """Yield the beam after each expansion round until every beam ends with "END".

    Reads ``goal``, ``model``, ``api_key``, ``k`` and ``n`` from the notebook's
    global scope.
    """
    all_finished = False
    while not all_finished:
        candidate_steps = await beam_search_decomposition_step(goal, candidate_steps, model, api_key, k=k, n=n)
        yield candidate_steps
        # Stop only for a non-empty beam whose candidates all end with "END".
        all_finished = len(candidate_steps) > 0 and all(
            steps[-1]["label"] == "END" for steps, _score in candidate_steps
        )

candidate_steps = []
candidate_steps += await decode_n_samples(goal, [], model, api_key, n)
stream = iter_response(candidate_steps)

100%|██████████| 2/2 [00:01<00:00,  1.09it/s]
100%|██████████| 2/2 [00:00<00:00,  2.30it/s]


In [157]:
# Print every candidate beam with its evaluation score, separated by a divider line.
for i, candidate in enumerate(candidate_steps):
    semantic_tasks, eval_score = candidate
    print(f"iteration {i}", eval_score)
    print(semantic_tasks, eval_score)
    print("==================================")
# candidate_steps = await anext(stream)
# async for steps in stream:
#     print(steps[0][-1], steps[1][-1])
#     print("==================================")

iteration 0 4
[{'id': 1, 'label': 'Entity Extraction', 'description': 'Extract entities and their relationships from the documents.', 'explanation': 'This step is crucial as it allows you to identify the key components (entities) and how they are related to each other, which is foundational for constructing a knowledge graph.', 'depend_on': []}] 4
iteration 1 5
[{'id': 1, 'label': 'Entity Extraction', 'description': 'Extract entities and their relationships from the documents.', 'explanation': 'Entity extraction is crucial for constructing a knowledge graph as it identifies the key components (entities) and their interconnections (relationships) within the text, which will form the nodes and edges of the graph.', 'depend_on': []}] 5


In [140]:
# Display the current beam: a list of (steps, score) tuples.
candidate_steps

[([{'id': -1,
    'label': 'Entity Extraction',
    'description': 'Extract entities and their relationships from the documents.',
    'explanation': 'This step is crucial as it allows you to identify the key components (entities) and how they are related to each other, which is foundational for constructing a knowledge graph.',
    'depend_on': []},
   {'id': 2,
    'label': 'Relation Mapping',
    'description': 'Map the extracted entities to their relationships to form a structured representation.',
    'explanation': 'This step is crucial as it will help in organizing the extracted entities into a coherent structure that reflects their interconnections, which is essential for constructing a knowledge graph.',
    'depend_on': [1]}],
  5),
 ([{'id': -1,
    'label': 'Entity Extraction',
    'description': 'Extract entities and their relationships from the documents.',
    'explanation': 'This step is crucial as it allows you to identify the key components (entities) and how they are

## MCTS

In [2]:
from pydantic import BaseModel
from typing import Optional
import math
import random
import os
import sys
sys.path.append(os.path.abspath(".")) 
sys.path.append(os.path.abspath("../")) 
sys.path.append(os.path.abspath("../../")) 

from server.AutoGenUtils import query
class MCT_Node(BaseModel):
    """One node of the Monte Carlo search tree, wrapping a single decomposition step.

    Nodes are kept in a flat dict keyed by ``MCT_id`` and refer to their parent
    and children by id rather than by object reference.
    """
    # Task properties (filled from the step dict returned by the decomposition agent)
    id: str  # step identifier as given by the agent (e.g. "step5", "END")
    label: str  # short step name; the literal "END" marks a terminal node
    description: str
    explanation: str
    parentIds: list[str]    
    # MCT properties
    print_label: str = "N/A"  # text shown by visualize_tree: "<label> (<value>/<visits>)"
    MCT_id: str  # key of this node in node_dict, built as "<parent MCT_id>/<child index>"
    MCT_parent_id: Optional[str]  # None for the root node
    MCT_children_ids: list[str] = []  # MCT_ids of the children appended by expand()
    visits: int = 0  # number of back-propagations that passed through this node
    value: float = 0.0  # sum of the rewards back-propagated through this node

async def MCTS_step(root: MCT_Node, node_dict: dict, goal: str, model: str, api_key: str) -> MCT_Node:
    """Run one MCTS iteration: select a leaf, expand it, score it, back-propagate.

    Args:
        root: Node the selection starts from.
        node_dict: Mapping ``MCT_id -> MCT_Node``; new nodes are added to it in place.
        goal: Forwarded to ``expand``.
        model: Forwarded to ``expand``.
        api_key: Forwarded to ``expand``.

    Returns:
        The ``root`` object that was passed in (the tree is mutated in place).
    """
    leaf = select(root, node_dict)
    expanded = await expand(leaf, node_dict, goal, model, api_key)
    score = await reward(expanded)
    backpropagate(expanded, score, node_dict)
    return root

def UCT(node: MCT_Node, parent_node: MCT_Node | None, exploration_weight=1.41) -> float:
    """Score ``node`` with Upper Confidence bounds applied to Trees (UCT).

    Args:
        node: Candidate child being scored.
        parent_node: Node whose visit count feeds the exploration term; when
            ``None`` a visit count of 1 is used.
        exploration_weight: Multiplier applied to the exploration term.

    Returns:
        ``+inf`` for a node with zero visits, ``-inf`` for a visited node
        labelled "END", otherwise mean value plus the weighted exploration term.
    """
    # The zero-visit check runs first, so an unvisited "END" node still scores +inf.
    if node.visits == 0:
        return float('inf')
    if node.label == "END":
        return float('-inf')

    parent_visits = 1 if parent_node is None else parent_node.visits
    exploitation = node.value / node.visits
    exploration = math.sqrt(math.log(parent_visits) / node.visits)
    return exploitation + exploration_weight * exploration

def select(node: MCT_Node, node_dict: dict) -> MCT_Node:
    """Descend from ``node`` to a leaf, always following the child with the highest UCT score.

    Args:
        node: Node to start the descent from (normally the root).
        node_dict: Mapping ``MCT_id -> MCT_Node`` used to resolve child ids.

    Returns:
        The first node reached that has no children.
    """
    while node.MCT_children_ids:
        # UCT's exploration term needs the visit count of the node whose
        # children are being compared, i.e. the current node itself (not its
        # own parent). Bind it before `node` is reassigned below.
        parent_node = node
        children = [node_dict[child_id] for child_id in node.MCT_children_ids]
        node = max(children, key=lambda child: UCT(child, parent_node))
    return node

async def expand(parent_node: MCT_Node, node_dict: dict, goal: str, model: str, api_key: str, n=2) -> MCT_Node:
    """Expand ``parent_node`` with agent-generated children and return one of them.

    Args:
        parent_node: Node to expand.
        node_dict: Mapping ``MCT_id -> MCT_Node``; new children are registered in it.
        goal: Passed to the decomposition agent.
        model: Passed to the decomposition agent.
        api_key: Passed to the decomposition agent.
        n: Passed to the decomposition agent as ``n`` (presumably the number of
            candidate next steps requested; confirm against the query module).

    Returns:
        A randomly chosen child of ``parent_node``, or ``parent_node`` itself
        when it is an "END" node or when the agent produced no children.
    """
    if is_END(parent_node):
        return parent_node  # No expansion if node is terminal

    previous_steps = get_previous_steps(parent_node, node_dict)
    children = await query.run_goal_decomposition_agent_stepped(goal, previous_steps, model=model, api_key=api_key, temperature=1.0, n=n)

    # Continue numbering after any existing children so a repeated expansion
    # cannot overwrite an already-registered MCT_id (offset is 0 for a leaf).
    first_index = len(parent_node.MCT_children_ids)
    for index, child_node in enumerate(children, start=first_index):
        print(child_node)
        child_as_MCT_node = MCT_Node(**child_node,
                                     MCT_id=f"{parent_node.MCT_id}/{index}",
                                     print_label=f"{child_node['label']} (0/0)",
                                     MCT_parent_id=parent_node.MCT_id
                                    )
        node_dict[child_as_MCT_node.MCT_id] = child_as_MCT_node
        parent_node.MCT_children_ids.append(child_as_MCT_node.MCT_id)

    # random.choice raises IndexError on an empty sequence; with nothing to
    # pick from, fall back to the unexpanded node like the terminal case.
    if not parent_node.MCT_children_ids:
        return parent_node
    return node_dict[random.choice(parent_node.MCT_children_ids)]

async def reward(node: MCT_Node) -> float:
    """Return the reward for ``node``.

    Placeholder implementation: ``node`` is not inspected and the result is a
    value drawn from ``random.random()``.
    """
    # TODO: replace the random placeholder with the self-evaluation agent:
    # evaluation_score = await run_decomposition_self_evaluation_agent(goal, all_steps, node, model, api_key)
    # return evaluation_score
    return random.random()

def backpropagate(node: MCT_Node, reward: float, node_dict: dict) -> None:
    """Add ``reward`` and one visit to ``node`` and to every ancestor up to the root.

    Each touched node also gets its ``print_label`` refreshed to
    ``"<label> (<value>/<visits>)"``.
    """
    current = node
    while current is not None:
        current.visits = current.visits + 1
        # Open question from the original author: normalize here to avoid inflation?
        current.value = current.value + reward
        current.print_label = f"{current.label} ({current.value}/{current.visits})"

        parent_id = current.MCT_parent_id
        if not parent_id:
            break  # reached a node without a parent id
        current = node_dict[parent_id]

# def best_child(node: MCT_Node, node_dict: dict) -> MCT_Node:
#     return max(list(map(lambda id: node_dict[id], node.MCT_children_ids)), key=lambda c: c.visits) # most visits or highest value?

def get_previous_steps(node: MCT_Node, node_dict: dict) -> list[dict]:
    """Collect ``node`` and its ancestors as dicts, excluding the parentless root.

    The result starts with ``node`` itself and walks upward, so it is ordered
    leaf-first.
    NOTE(review): confirm the decomposition agent expects leaf-first rather than
    chronological order.
    """
    steps_leaf_first: list[dict] = []
    current = node
    parent_id = current.MCT_parent_id
    while parent_id:
        steps_leaf_first.append(dict(current))
        current = node_dict[parent_id]
        parent_id = current.MCT_parent_id
    return steps_leaf_first

def is_END(node: MCT_Node):
    """Tell whether ``node`` is terminal, i.e. its label is exactly "END"."""
    terminal_label = "END"
    return node.label == terminal_label
    
from treelib import Node, Tree
def visualize_tree(root: MCT_Node, node_dict: dict):
    """Print a text rendering of the search tree rooted at ``root``.

    Nodes are added to a treelib ``Tree`` in breadth-first order, using
    ``print_label`` as the displayed text and ``MCT_id`` as the identifier.
    """
    tree = Tree()
    # Breadth-first traversal, processed one depth level at a time.
    level = [root]
    while level:
        next_level = []
        for current in level:
            tree.create_node(current.print_label, current.MCT_id, parent=current.MCT_parent_id)
            next_level.extend(node_dict[child_id] for child_id in current.MCT_children_ids)
        level = next_level
    rendered = tree.show(stdout=False)
    print(rendered)

In [3]:
# Read the API key from the local "api_key" file; the context manager closes
# the file handle instead of leaving it open.
with open("api_key") as api_key_file:
    api_key = api_key_file.read().strip()
model = "gpt-4o-mini"
goal = "I need to construct a knowledge graph from a collection of documents from wikipedia."
# The root is a placeholder node with no parent; node_dict maps MCT_id -> node.
root = MCT_Node(
    id="root",
    label="Root",
    MCT_id="-1",
    print_label="Root",
    description="Root node",
    explanation="Root node",
    parentIds=[],
    MCT_parent_id=None,
)
node_dict = {root.MCT_id: root}

In [10]:
root = await MCTS_step(root, node_dict, goal, model, api_key)
visualize_tree(root, node_dict)

100%|██████████| 2/2 [00:06<00:00,  3.18s/it]

{
    "next_step": {
        "id": "END",
        "label": "END",
        "description": "You have successfully completed the necessary steps to construct a knowledge graph from the Wikipedia documents.",
        "explanation": "Since you have extracted entities and relationships, constructed the knowledge graph, validated it, and analyzed it for insights, all essential tasks for achieving your goal are completed.",
        "parentIds": []
    }
}
{
    "next_step": {
        "id": "step5",
        "label": "Update the knowledge graph",
        "description": "Incorporate new information, corrections, or additional relationships as needed and iteratively refine the knowledge graph.",
        "explanation": "This step is needed to ensure that the knowledge graph remains accurate and relevant over time as new data becomes available or as existing data is validated.",
        "parentIds": ["step4", "step3"]
    }
}
{'id': 'END', 'label': 'END', 'description': 'You have successfully comple




In [143]:
def all_END(node: MCT_Node, node_dict: dict):
    """Depth-first check that every leaf reachable from ``node`` is an "END" node."""
    child_ids = node.MCT_children_ids
    if child_ids:
        # Stop at the first subtree that contains a non-END leaf.
        for child_id in child_ids:
            if not all_END(node_dict[child_id], node_dict):
                return False
        return True
    # Leaf: the path is complete only if the leaf itself is terminal.
    return is_END(node)
all_END(root, node_dict)

True

In [15]:
import json
# Serialize the root node to a JSON string, nested under an "id" key;
# model_dump(mode="json") converts the model to JSON-compatible values first.
json.dumps({"id": root.model_dump(mode="json")})

'{"id": {"id": "root", "label": "Root", "print_label": "Root (2.999588340678221/7)", "description": "Root node", "explanation": "Root node", "parentIds": [], "MCT_id": "-1", "MCT_parent_id": null, "MCT_children_ids": ["-1/0", "-1/1"], "visits": 7, "value": 2.999588340678221}}'