A jupyter notebook to test the code generation module

In [77]:
import os
import re
import string
import json
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv

In [78]:
# Read the CSV file into a Pandas DataFrame
# print(os.getcwd())
calendar_data = pd.read_csv("../data/calendar_data.csv")
print(f"The number of calendar data is {len(calendar_data)}.")

with open("../data/question.json") as json_file:
    json_data = json.load(json_file)
    # print(json_data[question_index])

print(f"The number of the question is {len(json_data)}.")
print(f"Selected question is: {json_data[0]['question']}.")
print(f"Selected answer is: {json_data[0]['answer']}.")

The number of calendar data is 20.
The number of the question is 47.
Selected question is: How many meetings do I have attended in total?.
Selected answer is: 18.


In [79]:

# MODEL = "gpt-4"
MODEL = "gpt-3.5-turbo"

question_index = 9 # for question test

In [80]:
def get_completion(MODEL, PROMPT):
    if MODEL == "gpt-4":
        client = OpenAI()
    else:
        client = OpenAI(
            api_key=os.environ.get("OPENAI_API_KEY"),
        )

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": PROMPT,
            }
        ],
        model=MODEL, 
    )

    # verify the output
    return chat_completion.choices[0].message.content

In [81]:
def get_prompt(question):
    
    PROMPT = f"""You are provided a calendar. This calendar is a Pandas dataframe named calendar_data, columns = [ID, status, summary, start, end , duration, attendees].
    This DataFrame calendar_data includes all of your meeting schedule.
    Your task is generate python function to query this dataframe and answer the question. Output a python code by enclosing it in triple backticks. 

    The input have following columns:
    - ID: meeting ID;
    - status: meeting status, including the following status: cancelled, confirmed, tentative; cancelled means that meeting is cancelled.
    - summary: meeting or event topic;
    - start: the start date of meeting, date format: YYYY-MM-DD hh:mm:ss.fff-zz:xx. for example "2024-02-05 12:00:00-00:00";
    - end: the start date of meeting, date format: YYYY-MM-DD hh:mm:ss.fff-zz:xx, for example "2024-02-05 13:00:00-00:00";
    - duration: meeting duration (second);
    - attendees: people who attend the meeting delimited by the line terminator within 1 sentence.

    The output should be a markdown python code snippet formatted in the following schema, including the leading and trailing "```python" and "```":
    ```python
    <your code here>
    ```

    The input of python code is a Pandas dataframe named calendar_data, and the answer is saved in variable answer.

    For example, the output have the following format:
    ```
    import pandas as pd 
    def query(calendar_data):
        return calendar_data[0]
    answer = query(calendar_data)
    ```
    
    Today's date is '2024-04-02 09:02:30', date format: %Y-%m-%d %H:%M:%S. Today is Tuesday.

    Question to be resolved: {question} 
    """
    return PROMPT



In [82]:
def get_prompt_for_refine(question, generated_code):
    
    PROMPT = f"""
    
    """
    return PROMPT



In [83]:
def get_prompt_with_error(question, generated_code, error_info):
    
    PROMPT = f"""
    """
    return PROMPT



In [84]:
def get_python_code(llm_reply_with_code):
    _python_code_re_pattern = "```python\n(.*?)```"
    llm_reply_without_code = re.sub(
                _python_code_re_pattern, "", llm_reply_with_code, flags=re.DOTALL
                )
    python_code_list = re.findall(_python_code_re_pattern, llm_reply_with_code, re.DOTALL)
    return python_code_list[0]

In [85]:
# for question in json_data: 
#     print(question['question'])
#     print(question['answer'])

In [86]:
result = {'success': 0, 'wrong_answer': 0, 'error': 0}
question_answer_summary = {'question': '',
                           'generated_code': '',
                           'true_answer':'', 
                           'answer': ''
                        }
result_list = []

for q_a_pair in json_data: 
    PROMPT = get_prompt(q_a_pair['question'])
    llm_reply_with_code = get_completion(MODEL, PROMPT)
    python_code_list = get_python_code(llm_reply_with_code)
    
    question_answer_summary['question'] = q_a_pair['question']
    question_answer_summary['true_answer'] = q_a_pair['answer']
    question_answer_summary['generated_code'] = python_code_list
    
    print(f"""Question: {q_a_pair['question']}\n True answer: {q_a_pair['answer']}""")

    try:
        exec(python_code_list)
    except Exception as E:
        question_answer_summary['answer'] = repr(E)
        print(f"""Non-executable code: {E}""")
        result['error'] += 1
        pass
    else:
        # print(f"""generated answer: {answer}""")
        if q_a_pair['answer'] == answer:
            question_answer_summary['answer'] = 'success'
            result['success'] += 1
        else:   
            question_answer_summary['answer'] = repr(answer)
            print(f"""Wrong answer: {answer}""")
            result['wrong_answer'] += 1
    
    result_list.append(question_answer_summary.copy())
    print(result)
    
# print(json.dumps(result_list, indent=4))
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(result_list, f, indent=4)
    

Question: How many meetings do I have attended in total?
 True answer: 18
{'success': 1, 'wrong_answer': 0, 'error': 0}
Question: How many events do I have scheduled for today?
 True answer: 1
Non-executable code: Can only use .dt accessor with datetimelike values
{'success': 1, 'wrong_answer': 0, 'error': 1}
Question: How many attendees are there for the meeting with ID masbk72a24cb0a8k9c7jo0e9s6?
 True answer: 2
Wrong answer: walter@ninjatech.ai(accepted)\n salesteam@ninjatech.ai(rejected)
{'success': 1, 'wrong_answer': 1, 'error': 1}
Question: What is the longest meeting ID on my calendar?
 True answer: malrq85j74yb0m3n8j8ro2v5d9
{'success': 2, 'wrong_answer': 1, 'error': 1}


IndexError: list index out of range

In [91]:
print(_python_code_re_pattern)

NameError: name '_python_code_re_pattern' is not defined

In [None]:
print(result_list) 

[{'question': 'How many meetings do I have attended in total?', 'generated_code': "import pandas as pd \n\ndef total_meetings(calendar_data):\n    # based on the logic that cancelled meetings won't be attended, we filtered out cancelled meetings\n    not_cancelled_meetings = calendar_data[calendar_data['status'] != 'cancelled']  \n    total_meetings_attended = not_cancelled_meetings.shape[0]\n    return total_meetings_attended\n\nanswer = total_meetings(calendar_data)\n", 'true_answer': 18, 'answer': 'success'}, {'question': 'How many events do I have scheduled for today?', 'generated_code': "import pandas as pd\nfrom datetime import datetime\n\ndef num_of_events_today(calendar_data):\n    # Convert the 'start' column to datetime datatype\n    calendar_data['start'] = pd.to_datetime(calendar_data['start'])\n\n    # Filter the dataframe to include the events of today\n    events_today = calendar_data[(calendar_data['start'].dt.date == datetime.now().date())]\n\n    return events_today.s