In [5]:
from openai import OpenAI, RateLimitError, APITimeoutError
from tqdm import tqdm
import tiktoken
import concurrent
import time
import json
from pprint import pprint
openai_client = OpenAI(api_key=open("api_key").read().strip())
def save_json(data, filename):
    with open (filename, 'w') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

def request_gpt(
    client, messages, model="gpt-4o-mini", temperature=0.5, format=None, seed=None
):
    with open("request_log.txt", "a", encoding="utf-8") as f:
        f.write(f"model: {model}, temperature: {temperature}, format: {format}\n")
        f.write(json.dumps(messages, ensure_ascii=False) + "\n")
        f.write("=====================================\n")
    try:
        if format == "json":
            response = client.chat.completions.create(
                    model=model,
                    messages=messages,
                    response_format={"type": "json_object"},
                    temperature=temperature,
                    seed=seed,
                )
            
            try:
                json.loads(response.choices[0].message.content)
                return response.choices[0].message.content
            except json.JSONDecodeError as e:
                print("JSON Decode Error")
                print(e)
                time.sleep(5)
                return request_gpt(client, messages, model, temperature=1.0, format=format)

        else:
            response = client.chat.completions.create(
                model=model, messages=messages, temperature=temperature, seed=seed
            )
        return response.choices[0].message.content
    except RateLimitError as e:
        print("RateLimitError")
        print(e)
        time.sleep(5)
        return request_gpt(client, messages, model, temperature, format)
    except APITimeoutError as e:
        print("APITimeoutError")
        print(messages)
        time.sleep(5)
        return request_gpt(client, messages, model, temperature, format)

def multithread_prompts(
    client,
    prompts,
    model="gpt-4o-mini",
    temperature=0.5,
    response_format=None,
    seed=None,
):
    l = len(prompts)
    # results = np.zeros(l)
    with tqdm(total=l) as pbar:
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=100)
        futures = [
            executor.submit(
                request_gpt, client, prompt, model, temperature, response_format, seed
            )
            for prompt in prompts
        ]
        for _ in concurrent.futures.as_completed(futures):
            pbar.update(1)
    concurrent.futures.wait(futures)
    return [future.result() for future in futures]

In [6]:
semantic_task = [
    {
        "label": "Document Preprocessing",
        "description": "Clean and preprocess the text from the Wikipedia documents.",
        "explanation": "Preprocessing is essential to remove noise and irrelevant information, which will enhance the quality of the extraction process and ensure consistency in the analysis.",
        "depend_on": [],
        "id": "0",
        "parentIds": []
    },
    {
        "label": "Entity Recognition",
        "description": "Identify key entities (like people, places, organizations) within the preprocessed documents.",
        "explanation": "Entity recognition is crucial as it helps in identifying the main subjects or nodes that will form the foundation of the knowledge graph.",
        "depend_on": [
            "Document Preprocessing"
        ],
        "id": "1",
        "parentIds": [
            "0"
        ]
    },
    {
        "label": "Relationship Extraction",
        "description": "Extract relationships between the identified entities.",
        "explanation": "Understanding how entities relate to each other is key to building the connections in the knowledge graph, which provides context and meaning.",
        "depend_on": [
            "Entity Recognition"
        ],
        "id": "2",
        "parentIds": [
            "1"
        ]
    },
    {
        "label": "Ontology Definition",
        "description": "Define the structure and schema of the knowledge graph, including classes and properties.",
        "explanation": "Creating an ontology is needed to formalize the relationships and hierarchy of the entities, ensuring the knowledge graph is organized and interpretable.",
        "depend_on": [
            "Entity Recognition",
            "Relationship Extraction"
        ],
        "id": "3",
        "parentIds": [
            "1",
            "2"
        ]
    },
    {
        "label": "Graph Construction",
        "description": "Construct the knowledge graph using the identified entities, relationships, and defined ontology.",
        "explanation": "This step is where the actual graph is built, combining all the previous outputs into a structured format that can be utilized for querying and analysis.",
        "depend_on": [
            "Ontology Definition"
        ],
        "id": "4",
        "parentIds": [
            "3"
        ]
    },
    {
        "label": "Validation and Refinement",
        "description": "Validate the knowledge graph for accuracy and completeness, and refine it as necessary.",
        "explanation": "Validation ensures the knowledge graph accurately represents the information extracted from the documents; refinement helps improve its quality and usability.",
        "depend_on": [
            "Graph Construction"
        ],
        "id": "5",
        "parentIds": [
            "4"
        ]
    }
]
semantic_tasks_str = json.dumps(semantic_task)

In [11]:
# read defs
elementary_task_defs = json.load(open("elementary_task_defs.json"))
elementary_task_defs_str = ""
for elementary_task in elementary_task_defs:
    elementary_task_defs_str += "<elementary_task>\n"
    for key, value in elementary_task.items():
        elementary_task_defs_str += f"<{key}>{value}</{key}>\n"
    elementary_task_defs_str += "</elementary_task>\n"
# Decompose a tree of semantic tasks into elementary tasks
prompts = [
    {
        "role": "system",
        "content": """
        ** Context **
        You are a Natural Language Processing (NLP) assistant. You are given a list of elementary NLP tasks that could be used.
        Here is the list of elementary NLP tasks:
        {elementary_task_defs}
        ** Task **
        The user will describe a series of real-world tasks for you. First, for each of the task, decide if it can be formulated as an NLP task. If yes, you need to find the proper elementary NLP tasks and arrange them to accomplish user's goal. 
        You can ignore the need to handle formats or evaluate outputs.
        ** Requirements **
        The labels of the elementary task must match exactly as those provided above. Reply with the following JSON format: 
        {{ "elementary_tasks": [ 
                {{ 
                    "id": int,
                    "label": (string) (one of the above)
                    "description": (string, describe implementation procedure)
                    "explanation": (string, explain why this task is needed)
                    "depend_on": (string[], the tasks task this step depends on)
                    "input": string,
                    "output": string,
                    "example_output": string,
                }}, 
                {{ 
                    "id": int,
                    "label": (string) (one of the above)
                    "description": (string, describe implementation procedure)
                    "explanation": (string, explain why this task is needed)
                    "depend_on": (int[], the task ids that this task depends on)
                    "input": string,
                    "output": string,
                    "example_output": string,
                }}, 
                ... 
            ] 
        }}
        """.format(elementary_task_defs=elementary_task_defs)
    },
    {
        "role": "user",
        "content": f"{semantic_tasks_str}"
    }
]
response = request_gpt(openai_client, prompts, model="gpt-4o-mini", temperature=0.0, format="json")

In [12]:
print(response)

{
    "elementary_tasks": [
        {
            "id": 0,
            "label": "Text Mining",
            "description": "Analyze the Wikipedia documents to discover patterns and insights, focusing on cleaning and preprocessing the text.",
            "explanation": "Preprocessing is essential to remove noise and irrelevant information, which will enhance the quality of the extraction process and ensure consistency in the analysis.",
            "depend_on": [],
            "input": "Wikipedia Documents",
            "output": "Cleaned Text",
            "example_output": "The text has been cleaned of irrelevant information and noise."
        },
        {
            "id": 1,
            "label": "Information Extraction",
            "description": "Identify key entities (like people, places, organizations) within the preprocessed documents.",
            "explanation": "Entity recognition is crucial as it helps in identifying the main subjects or nodes that will form the foundation 

In [4]:
# print(response)
import json
test_response = """
{
  "elementary_tasks": [
    {
      "label": "Information Extraction",
      "description": "Identify and extract key entities (e.g., people, places, organizations) from the documents using defined techniques.",
      "explanation": "Entity extraction is the first step in building a knowledge graph, identifying primary components that will form the nodes of the graph.",
      "depend_on": [],
      "input": "Documents",
      "output": "Extracted Entities",
      "example_output": "{'Person': 'Barack Obama', 'Location': 'Honolulu', 'Organization': 'United States'}"
    },
    {
      "label": "Information Extraction",
      "description": "Determine the relationships between the extracted entities based on the context of the documents.",
      "explanation": "Understanding the relationships allows us to connect the entities and define the edges of the knowledge graph, which is crucial for representing the semantic structure.",
      "depend_on": ["Extract Entities"],
      "input": "Extracted Entities and Context",
      "output": "Identified Relationships",
      "example_output": "{'Entity A': 'Barack Obama', 'Entity B': 'United States', 'Relationship': 'Presidency'}"
    },
    {
      "label": "Define Ontology",
      "description": "Establish a schema or ontology that categorizes the types of entities and relationships defined from the previous steps.",
      "explanation": "An ontology helps in standardizing how entities and relationships are represented, making the knowledge graph more structured and interpretable.",
      "depend_on": ["Extract Entities", "Identify Relationships"],
      "input": "Entity Types and Relationships",
      "output": "Defined Ontology",
      "example_output": "{'Entity Types': ['Person', 'Organization', 'Location'], 'Relationships': ['Works At', 'Lives In']}"
    },
    {
      "label": "Construct Knowledge Graph",
      "description": "Create the knowledge graph by integrating the extracted entities and their relationships according to the defined ontology.",
      "explanation": "This step involves the actual creation of the graph structure that can be visualized or queried, which is the end goal of the task.",
      "depend_on": ["Define Ontology", "Identify Relationships"],
      "input": "Extracted Entities and Relationships",
      "output": "Knowledge Graph",
      "example_output": "{'Nodes': ['Barack Obama', 'United States'], 'Edges': ['Barack Obama - Presidency -> United States']}"
    },
    {
      "label": "Iterate and Refine",
      "description": "Review and refine the knowledge graph to improve accuracy and completeness.",
      "explanation": "It’s essential to validate the graph to ensure it accurately represents the information from the documents and to address any gaps or errors.",
      "depend_on": ["Construct Knowledge Graph"],
      "input": "Initial Knowledge Graph",
      "output": "Refined Knowledge Graph",
      "example_output": "{'Refined Nodes': ['Barack Obama', 'United States', 'Democrat'], 'Refined Edges': ['Barack Obama - Presidency -> United States']}"
    }
  ]
}
"""
elementary_tasks = json.loads(test_response)["elementary_tasks"]

In [None]:
print(elementary_task_defs_str)

In [17]:
test = [{'id': '1', 'label': 'Extract Entities', 'description': 'Identify and extract key entities (people, places, organizations, etc.) from the documents.', 'explanation': 'This step is crucial for populating the knowledge graph with relevant and meaningful nodes, which represent different concepts and facts from the documents.', 'depend_on': [], 'parentIds': [], 'children': ['2', '3']}, {'id': '2', 'label': 'Identify Relationships', 'description': 'Determine relationships between the extracted entities based on the context of the documents.', 'explanation': 'Identifying relationships helps in defining how entities are connected in the knowledge graph, enabling a better representation of the information landscape.', 'depend_on': ['1'], 'parentIds': ['1'], 'children': ['3', '4']}, {'id': '3', 'label': 'Define Schema', 'description': 'Create a schema that outlines the types of entities and their possible relationships.', 'explanation': 'A well-defined schema is necessary to ensure that the knowledge graph is structured, understandable, and queryable, facilitating accurate representation of the data.', 'depend_on': ['1', '2'], 'parentIds': ['1', '2'], 'children': ['4']}, {'id': '4', 'label': 'Construct Graph', 'description': 'Assemble the entities and relationships into a graphical representation based on the schema.', 'explanation': 'Constructing the graph is the final step in visually organizing the extracted entities and relationships into a coherent structure that can be analyzed and queried.', 'depend_on': ['2', '3'], 'parentIds': ['2', '3'], 'children': []}]

decomposed_semantic_tasks = list(
    map(
        lambda t: t.update({"parentIds": t["depend_on"], "children": []}),
        test,
    )
)
decomposed_semantic_tasks

[None, None, None, None]

In [18]:
import csv
all_rows = []
elementary_task_defs = json.load(open("elementary_task_defs.json"))
for task_def in elementary_task_defs:
    all_rows.append({
        "label": task_def['label'],
        "definition": task_def['definition'],
        "input": task_def['input'],
        "output": task_def['output'],
    })

with open('elementary_task_defs.csv', 'w',) as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(list(all_rows[0].keys()))
    for row in all_rows:
        writer.writerow(list(row.values()))
