In [51]:
from openai import OpenAI, RateLimitError, APITimeoutError
from tqdm import tqdm
import tiktoken
import concurrent
import time
import json
from pprint import pprint
openai_client = OpenAI(api_key=open("api_key").read().strip())
def save_json(data, filename):
    with open (filename, 'w') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)


In [10]:
def request_gpt(
    client, messages, model="gpt-4o-mini", temperature=0.5, format=None, seed=None
):
    with open("request_log.txt", "a", encoding="utf-8") as f:
        f.write(f"model: {model}, temperature: {temperature}, format: {format}\n")
        f.write(json.dumps(messages, ensure_ascii=False) + "\n")
        f.write("=====================================\n")
    try:
        if format == "json":
            response = (
                client.chat.completions.create(
                    model=model,
                    messages=messages,
                    response_format={"type": "json_object"},
                    temperature=temperature,
                    seed=seed,
                ),
            )

        else:
            response = client.chat.completions.create(
                model=model, messages=messages, temperature=temperature, seed=seed
            )
        return response.choices[0].message.content
    except RateLimitError as e:
        print("RateLimitError")
        print(e)
        time.sleep(5)
        return request_gpt(client, messages, model, temperature, format)
    except APITimeoutError as e:
        print("APITimeoutError")
        print(messages)
        time.sleep(5)
        return request_gpt(client, messages, model, temperature, format)

def multithread_prompts(
    client,
    prompts,
    model="gpt-4o-mini",
    temperature=0.5,
    response_format=None,
    seed=None,
):
    l = len(prompts)
    # results = np.zeros(l)
    with tqdm(total=l) as pbar:
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=100)
        futures = [
            executor.submit(
                request_gpt, client, prompt, model, temperature, response_format, seed
            )
            for prompt in prompts
        ]
        for _ in concurrent.futures.as_completed(futures):
            pbar.update(1)
    concurrent.futures.wait(futures)
    return [future.result() for future in futures]



## Decompose a goal into semantic steps

In [52]:
system_prompt = """
You are a text analytics task planner. 
Users have collected their own dataset, and they need help with text analytics tasks.
Users will describe a task to you, and you need to help them decompose the task into subtasks, and then provide a plan to complete the task. 
You can ask questions to clarify the task, and you can also ask for more information if needed. 
You can also provide examples to help the user understand the task better.
Reply with this JSON format:
{
    "steps": [
        {
            "label": (string)
            "description": (string)
            "explanation": (string, explain why this step is needed)
        },
        {
            "label": (string)
            "description": (string)
            "explanation": (string, explain why this step is needed)
        },
        ...
    ]
}
"""
user_goal_kgc = """I need to construct a knowledge graph from a collection of documents from wikipedia."""
user_goal_bsa = """I need to perform business management analysis using the SWOT strategy on a dataset of company financial reports."""
prompt = [
    {
        "role": "system",
        "content": system_prompt,
    },
    {
        "role": "user",
        "content": user_goal_kgc,
    }
]
response = request_gpt(openai_client, prompt, model="gpt-4o-mini", temperature=0.5)
print(response)
steps = json.loads(response)["steps"]
save_json(steps, "kgc_semantic.json")

{
    "steps": [
        {
            "label": "Data Collection",
            "description": "Gather the relevant Wikipedia documents that you want to use for constructing the knowledge graph.",
            "explanation": "This step is essential to ensure that you have all the necessary data at hand before starting the knowledge graph construction process."
        },
        {
            "label": "Text Preprocessing",
            "description": "Clean and preprocess the text data by removing unnecessary elements such as HTML tags, special characters, and stop words.",
            "explanation": "Preprocessing helps improve the quality of the data by making it easier to extract meaningful information and relationships from the text."
        },
        {
            "label": "Entity Recognition",
            "description": "Use Named Entity Recognition (NER) techniques to identify and extract entities (such as people, places, organizations) from the text.",
            "explanation":

## Decompose a semantic step into more semantic steps

In [None]:
semantic_steps = json.load(open("kgc_semantic.json"))
test_index = 0
test_target_step = semantic_steps[test_index]['label'] + ": " + semantic_steps[test_index]['description']
system_prompt = """You are a text analytics task planner. 
The use will describe a task to you, you need to help them decompose the task into subtasks, and then provide a plan to complete the task. 
You can also provide examples to help the user understand the task better. 
Reply with this JSON format: 
{ "steps": [ 
        { 
            "label": (string) 
            "description": (string) 
            "explanation": (string, explain why this step is needed)
        }, 
        { 
            "label": (string) 
            "description": (string) 
            "explanation": (string, explain why this step is needed)
        }, 
        ... 
    ] 
}"""
prompt = [
    {
        "role": "system",
        "content": system_prompt,
    },
    {
        "role": "user",
        "content": test_target_step,
    }
]
response = request_gpt(openai_client, prompt, model="gpt-4o-mini", temperature=0.5)
print(response)
more_steps = json.loads(response)["steps"]
save_json(more_steps, "kgc_semantic_step_0.json")
# merge into original steps
original_steps = json.load(open("kgc_semantic.json"))
original_steps[test_index] = more_steps
# flatten
original_steps = [step for steps in original_steps for step in steps]
save_json(original_steps, "kgc_semantic_decomposed.json")
# merge into original steps
original_steps = json.load(open("kgc_semantic.json"))
original_steps[test_index:test_index + 1] = more_steps
# flatten
save_json(original_steps, "kgc_semantic_decomposed.json")

{
  "steps": [
    {
      "label": "Text Preprocessing",
      "description": "Clean and preprocess the text data to remove noise and irrelevant information.",
      "explanation": "This step is essential to ensure that the text is in a suitable format for analysis. Preprocessing may include lowercasing, removing punctuation, and tokenization, which helps improve the accuracy of subsequent NER, RE, and EE processes."
    },
    {
      "label": "Named Entity Recognition (NER)",
      "description": "Apply NER techniques to identify and classify entities in the text into predefined categories such as persons, organizations, locations, dates, etc.",
      "explanation": "NER helps in pinpointing the key entities present in the text, which is the first step in understanding the content and context of the data. Identifying these entities is crucial for further analysis and relationship extraction."
    },
    {
      "label": "Relation Extraction (RE)",
      "description": "Analyze the i

## Add Flags to semantic steps

In [68]:
semantic_steps = json.load(open("kgc_semantic_decomposed.json"))
system_prompt = """You are a text analytics task helper. 
The user will describe a list practical tasks for you, you need to determine if each of the tasks can be done with typical text analytics tasks, and give an explanation.
You can also provide examples to help the user understand the task better. 
Reply with this JSON format: 
{ "steps": [ 
        { 
            "label": (string) 
            "analytical": (boolean, true if the task can be done with text analytics tasks)
            "explanation": (string, explain why this step is needed) 
        },
        , ... 
    ] 
}"""
prompt = [
    {
        "role": "system",
        "content": system_prompt,
    },
    {
        "role": "user",
        "content": json.dumps(semantic_steps),
    }
]
formulate_flags = request_gpt(openai_client, prompt, model="gpt-4o-mini", temperature=0.5)
print(formulate_flags)
save_json(json.loads(formulate_flags)["steps"], "kgc_flags.json")

{
  "steps": [
    {
      "label": "Text Preprocessing",
      "analytical": true,
      "explanation": "Text preprocessing is a fundamental step in text analytics, as it prepares the raw text data for analysis by removing noise and irrelevant information."
    },
    {
      "label": "Named Entity Recognition (NER)",
      "analytical": true,
      "explanation": "NER is a core text analytics task that identifies and classifies entities within the text, allowing for a better understanding of the content."
    },
    {
      "label": "Relation Extraction (RE)",
      "analytical": true,
      "explanation": "RE is a critical aspect of text analytics that focuses on determining relationships between identified entities, contributing to deeper insights from the text."
    },
    {
      "label": "Event Extraction (EE)",
      "analytical": true,
      "explanation": "EE is an important text analytics task that identifies events within the text, capturing dynamic information essential fo

## An attempt to add previous and next steps when decomposing a semantic task

In [None]:
previous_steps = steps[0]["label"] + ", " + steps[1]['label'] + ", "
step_3 = steps[2]
user_prompt = user_goal_kgc + """I have already done the following steps: """ + previous_steps + """ Now I need to """ + step_3["description"] + "which means " + step_3["explanation"]
prompts = [
    {
        "role": "system",
        "content": system_prompt,
    },
    {
        "role": "user",
        "content": user_prompt
    }
]
subtask_response = request_gpt(openai_client, prompts, model="gpt-4o-mini", temperature=0.5)
print(user_prompt)
print(subtask_response)

I need to perform business management analysis using the SWOT strategy.I have already done the following steps: Define the Objective, Gather Relevant Data,  Now I need to List the internal strengths of the business, such as resources, capabilities, and competitive advantages.which means Identifying strengths allows you to understand what the business excels at and can leverage for growth or competitive advantage.
{
    "steps": [
        {
            "label": "Identify Resources",
            "description": "List all tangible and intangible resources available to the business, such as financial resources, human capital, technology, and physical assets.",
            "explanation": "Identifying resources is crucial because it provides a foundation for understanding what the business has at its disposal to achieve its objectives and compete effectively in the market."
        },
        {
            "label": "Assess Capabilities",
            "description": "Evaluate the skills, compet

## Convert a semantic task into analytical tasks

In [69]:
semantic_steps = json.load(open("kgc_flags.json"))
semantic_analytics_tasks = list(filter(lambda x: x['analytical'] is True, semantic_steps))
assert(all(x['label'] in list(map(lambda x: x['label'], steps))) for x in semantic_analytics_tasks)
pprint(semantic_analytics_tasks)
system_prompt = """You are a text analytics task helper. 
The user will give you a practical task, you need to decompose it into subtasks that can be done with text analytics techniques.
Here are some examples of text analytics techniques: Named Entity Recognition, Annotation, Text Classification, Summarization, Sentiment Analysis, etc.
You can add more techniques as needed, but the techniques need to be strictly defined from the perspective of natural language processing.
Reply with the following JSON format:
{
    "subtasks": [
        {
            "name": (string, the name of the technique), 
            "description": (string, the description of the technique),
            "explanation": (string, explain why this technique is needed)
            "input": (string, the input needed for this technique)
            "output": (string, the output of this technique)
        },
        ...
    ]
}
"""
step_index = 1
assert(step_index > 0 and step_index < len(steps) - 1)
user_prompt = f"""{user_goal_kgc}. I have done {steps[step_index-1]['label']}. I need to {steps[step_index]['label']}, which means {steps[step_index]['description']}, which prepares me for the next step {steps[step_index + 1]['label']}."""
prompt = [
    {
        "role": "system",
        "content": system_prompt,
    },
    {
        "role": "user",
        "content": user_prompt
    }
]
analytics_tasks = request_gpt(openai_client, prompt, model="gpt-4o-mini", temperature=0.5)
print(user_prompt)
print(analytics_tasks)
save_json(json.loads(analytics_tasks)["subtasks"], f"kgc_analytical_tasks_step_{step_index}.json")

[{'analytical': True,
  'explanation': 'Text preprocessing is a fundamental step in text analytics, '
                 'as it prepares the raw text data for analysis by removing '
                 'noise and irrelevant information.',
  'label': 'Text Preprocessing'},
 {'analytical': True,
  'explanation': 'NER is a core text analytics task that identifies and '
                 'classifies entities within the text, allowing for a better '
                 'understanding of the content.',
  'label': 'Named Entity Recognition (NER)'},
 {'analytical': True,
  'explanation': 'RE is a critical aspect of text analytics that focuses on '
                 'determining relationships between identified entities, '
                 'contributing to deeper insights from the text.',
  'label': 'Relation Extraction (RE)'},
 {'analytical': True,
  'explanation': 'EE is an important text analytics task that identifies '
                 'events within the text, capturing dynamic information '
        

## Implement an analytical task

In [80]:
step_index = 1
analytical_tasks = json.load(open(f"kgc_analytical_tasks_step_{step_index}.json"))
print(analytical_tasks[0])
system_prompt = """You are a text analytics task code generator. 
The user will give you a text analytics task, you need to generate code snippet for the task.
Write a function that takes the input and returns the output. You don't need to include examples or main function.
Add comments in the code to explain the codes.
Reply with the following JSON format:
{
    "code": (string, in python)
}"""
for analytics_task in analytical_tasks:
    task_name = analytics_task['name']
    description = analytics_task['description']
    input = analytics_task['input']
    output = analytics_task['output']
    user_prompt = f"""I want to implement {task_name}: {description}. The input is {input}, and the output is {output}."""
    prompt = [
        {
            "role": "system",
            "content": system_prompt,
        },
        {
            "role": "user",
            "content": user_prompt
        }
    ]
    code = request_gpt(openai_client, prompt, model="gpt-4o-mini", temperature=0.5)
    print(user_prompt)
    print(code)
    analytics_task['code'] = json.loads(code)["code"]
save_json(analytical_tasks, f"kgc_analytical_tasks_step_{step_index}_codes.json")

{'name': 'Named Entity Recognition (NER)', 'description': 'A technique used to identify and classify key entities in the text into predefined categories such as people, organizations, locations, etc.', 'explanation': 'NER is needed to extract relevant entities from the cleaned text, which will serve as the foundational components for constructing the knowledge graph.', 'input': 'Cleaned text data obtained from the text preprocessing step.', 'output': 'A list of identified entities along with their types and associated metadata.'}
I want to implement Named Entity Recognition (NER): A technique used to identify and classify key entities in the text into predefined categories such as people, organizations, locations, etc.. The input is Cleaned text data obtained from the text preprocessing step., and the output is A list of identified entities along with their types and associated metadata..
{
    "code": "import spacy\n\n# Load the pre-trained spaCy model for NER\nnlp = spacy.load('en_co

In [82]:
import black

# String of Python code
code_string = analytical_tasks[0]['code']
# Specify the file name
file_name = "test_output.py"

# Step 1: Write the code string to a file
with open(file_name, "w") as file:
    file.write(code_string)

# Step 2: Format the file using black
try:
    black.format_file_in_place(
        src=file_name,
        fast=False,
        mode=black.FileMode(),
        write_back=black.WriteBack.YES,
    )
    print(f"Code has been formatted and saved to {file_name}.")
except Exception as e:
    print(f"Formatting failed: {e}")


Formatting failed: 'str' object has no attribute 'suffix'
