In [12]:
import os
import json
source_dir = "../source_code"
prompt_dir = "./prompt"
answer_dir = "./answer"
os.makedirs(prompt_dir, exist_ok=True)
os.makedirs(answer_dir, exist_ok=True)

with open("../source_code/210.py", "r") as f:
    example_code = f.read()
with open("./static_ast/210.py.json", "r") as f:
    example_ast = json.load(f)

for file in os.listdir(source_dir):
    file_path = os.path.join(source_dir, file)
    with open(file_path, "r") as f:
        code = f.read()

    
    prompt = f"""You are an Abstract Syntax Tree (AST) parser. I will give you a code file. You give me its AST in Json. Each AST node only has three attributes, children, type and value. Don't use any tool.

The input file is 
```python
{code}
```
    """

    with open(os.path.join(prompt_dir, file.replace(".py", ".txt")), "w") as f:
        f.write(prompt)

    with open(os.path.join(answer_dir, file.replace(".py", ".json")), "w") as f:
        f.write("")


In [5]:
from llm import get_llm_answers
import json
import os

answer_dir = "./gpt-4o_answer"
os.makedirs(answer_dir, exist_ok=True)
system_prompt = """Convert the given code file into an Abstract Syntax Tree (AST) in JSON format with specific attributes.

- Parse the provided code to identify its structure.
- Construct an AST where each node contains exactly three attributes: 'children', 'type', and 'value'.
- Do not use any external tools or libraries to assist in this conversion.

# Steps

1. **Analyze Code Structure**: Break down the code into its syntactic components (e.g., expressions, statements, declarations, etc.).
2. **Node Construction**: For each syntactic component:
   - Determine the 'type' of component (e.g., "function_declaration", "expression", "if_statement").
   - Extract the 'value', if applicable (e.g., operator in an expression).
   - Identify 'children' nodes that represent sub-structures or components within the larger structure.
3. **Build JSON**: Create a JSON object for each node including the attributes 'children', 'type', and 'value'.
4. **Recursion**: Recursively process sub-components, nesting them as 'children' within their parent node.

# Output Format

- The output should be a JSON representation of the AST.
- Each node should include only 'children', 'type', and 'value' attributes.
- Maintain the hierarchical structure of nodes as they appear in the code.

# Examples

## Example Input:
```python
def add(a, b):
    return a + b
```

## Example Output:
```json
{
  "type": "function_definition",
  "value": "add",
  "children": [
    {
      "type": "parameter_list",
      "value": "",
      "children": [
        {"type": "parameter", "value": "a", "children": []},
        {"type": "parameter", "value": "b", "children": []}
      ]
    },
    {
      "type": "return_statement",
      "value": "",
      "children": [
        {
          "type": "binary_expression",
          "value": "+",
          "children": [
            {"type": "identifier", "value": "a", "children": []},
            {"type": "identifier", "value": "b", "children": []}
          ]
        }
      ]
    }
  ]
}
```

# Notes

- Ensure that each node's 'value' attribute is populated only when relevant, leaving it empty for nodes that do not directly carry a value.
- Carefully maintain the hierarchy and order of nodes as it appears in the code to correctly reflect the structure.
"""

def process_ast(code):
    messages = [
        {
            "role": "system",
            "content": system_prompt,
        },
        {
            "role": "user",
            "content": code,
        },
    ]
    llm_output = json.loads(get_llm_answers(messages, "gpt-4o", require_json=True, temperature=0))
    return llm_output

from concurrent.futures import ThreadPoolExecutor

def process_file(file_path):
    print(f"Processing {file_path}")
    with open(file_path, "r") as f:
        code = f.read()
    ast = process_ast(code)
    with open(os.path.join(answer_dir, file_path.replace(".txt", ".json")), "w") as f:
        json.dump(ast, f, indent=4)
    print(f"Processed {file_path}")

from tqdm import tqdm

source_code_dir = "../source_code"
with ThreadPoolExecutor() as executor:
    files = os.listdir(source_code_dir)[:10]
    files = [os.path.join(source_code_dir, file) for file in files]
    list(tqdm(executor.map(process_file, files), total=len(files), desc="Processing files"))


Processing ../source_code/11.py
Processing ../source_code/60.py
Processing ../source_code/180.py
Processing ../source_code/195.py
Processing ../source_code/202.py
Processing ../source_code/208.py
Processing ../source_code/176.py
Processing ../source_code/151.py
Processing ../source_code/98.py
Processing ../source_code/13.py


Processing files:   0%|          | 0/10 [00:00<?, ?it/s]

In [13]:
META_PROMPT = """
Given a task description or existing prompt, produce a detailed system prompt to guide a language model in completing the task effectively.

# Guidelines

- Understand the Task: Grasp the main objective, goals, requirements, constraints, and expected output.
- Minimal Changes: If an existing prompt is provided, improve it only if it's simple. For complex prompts, enhance clarity and add missing elements without altering the original structure.
- Reasoning Before Conclusions**: Encourage reasoning steps before any conclusions are reached. ATTENTION! If the user provides examples where the reasoning happens afterward, REVERSE the order! NEVER START EXAMPLES WITH CONCLUSIONS!
    - Reasoning Order: Call out reasoning portions of the prompt and conclusion parts (specific fields by name). For each, determine the ORDER in which this is done, and whether it needs to be reversed.
    - Conclusion, classifications, or results should ALWAYS appear last.
- Examples: Include high-quality examples if helpful, using placeholders [in brackets] for complex elements.
   - What kinds of examples may need to be included, how many, and whether they are complex enough to benefit from placeholders.
- Clarity and Conciseness: Use clear, specific language. Avoid unnecessary instructions or bland statements.
- Formatting: Use markdown features for readability. DO NOT USE ``` CODE BLOCKS UNLESS SPECIFICALLY REQUESTED.
- Preserve User Content: If the input task or prompt includes extensive guidelines or examples, preserve them entirely, or as closely as possible. If they are vague, consider breaking down into sub-steps. Keep any details, guidelines, examples, variables, or placeholders provided by the user.
- Constants: DO include constants in the prompt, as they are not susceptible to prompt injection. Such as guides, rubrics, and examples.
- Output Format: Explicitly the most appropriate output format, in detail. This should include length and syntax (e.g. short sentence, paragraph, JSON, etc.)
    - For tasks outputting well-defined or structured data (classification, JSON, etc.) bias toward outputting a JSON.
    - JSON should never be wrapped in code blocks (```) unless explicitly requested.

The final prompt you output should adhere to the following structure below. Do not include any additional commentary, only output the completed system prompt. SPECIFICALLY, do not include any additional messages at the start or end of the prompt. (e.g. no "---")

[Concise instruction describing the task - this should be the first line in the prompt, no section header]

[Additional details as needed.]

[Optional sections with headings or bullet points for detailed steps.]

# Steps [optional]

[optional: a detailed breakdown of the steps necessary to accomplish the task]

# Output Format

[Specifically call out how the output should be formatted, be it response length, structure e.g. JSON, markdown, etc]

# Examples [optional]

[Optional: 1-3 well-defined examples with placeholders if necessary. Clearly mark where examples start and end, and what the input and output are. User placeholders as necessary.]
[If the examples are shorter than what a realistic example is expected to be, make a reference with () explaining how real examples should be longer / shorter / different. AND USE PLACEHOLDERS! ]

# Notes [optional]

[optional: edge cases, details, and an area to call or repeat out specific important considerations]
""".strip()

def generate_prompt(task_or_prompt: str):
    messages = [
        {
            "role": "system",
            "content": META_PROMPT,
        },
        {
            "role": "user",
            "content": "Task, Goal, or Current Prompt:\n" + task_or_prompt,
        },
    ]
    prompt = get_llm_answers(messages, "gpt-4o", temperature=0)
    return prompt

prompt =generate_prompt("You are an Abstract Syntax Tree (AST) parser. I will give you a code file. You give me its AST in Json. Each AST node only has three attributes, children, type and value. Don't use any tool.")
print(prompt)

Convert the given code file into an Abstract Syntax Tree (AST) in JSON format with specific attributes.

- Parse the provided code to identify its structure.
- Construct an AST where each node contains exactly three attributes: 'children', 'type', and 'value'.
- Do not use any external tools or libraries to assist in this conversion.

# Steps

1. **Analyze Code Structure**: Break down the code into its syntactic components (e.g., expressions, statements, declarations, etc.).
2. **Node Construction**: For each syntactic component:
   - Determine the 'type' of component (e.g., "function_declaration", "expression", "if_statement").
   - Extract the 'value', if applicable (e.g., operator in an expression).
   - Identify 'children' nodes that represent sub-structures or components within the larger structure.
3. **Build JSON**: Create a JSON object for each node including the attributes 'children', 'type', and 'value'.
4. **Recursion**: Recursively process sub-components, nesting them as 'c