# Experiment 0527 - how good are GPT models in Fermi Estimation

In [None]:
# dependency
%pip install python-dotenv
%pip install openai
%pip install pandas

In [1]:
import os
from dotenv import load_dotenv, find_dotenv # load keys from .env file
import openai # use OpenAI API
import random
import json
import pandas as pd
import numpy as np
import datetime

_ = load_dotenv(find_dotenv()) # read local .env file
openai.api_key  = os.getenv('OPENAI_API_KEY')

In [2]:
def get_completion(prompt, model="gpt-3.5-turbo"):
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0, # this is the degree of randomness of the model's output
        max_tokens=500, # this is the maximum number of returned tokens
    )
    print(">>>>RESPONSE>>>>")
    print(str(response.choices[0].message["content"]))
    print("<<<<END<<<<")
    return response.choices[0].message["content"]

def get_completion_from_messages(messages, model="gpt-3.5-turbo", temperature=0):
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=20
    )
    return response.choices[0].message["content"]

In [None]:
# Uncomment and Run this code block to get an example output

# prompt_example= f"""What is Fermi Estimation and how does it work?"""
# response = get_completion(prompt_example)

## Prompt Design

In [3]:
# prompt 1 - baseline (minimal context and instructions + 0-shot)
def prompt_1(question):
    return f"""\
TASK: Here is a question: {question}. State your reason followed by your estimated answer in the order of magnitute 10^x.
REASON:
ANSWER: x = ?
"""

# prompt 2 - with classic example (rich context + 1-shot + CoT instruction )
def prompt_2(question):
    return f"""\
TASK: You are an AI assistant helping people practice Fermi estimation techniques.\
CONTEXT: Fermi estimation is a problem-solving technique that involves making rough estimates and approximations to arrive at a reasonable answer.
The basic idea behind Fermi estimation is to break down a complex problem into simpler parts and make educated guesses about the values of the variables involved.
For example, if you were asked to estimate the number of people in a city, you might start by estimating the population of a few neighborhoods and then extrapolating that to the entire city.
To make a Fermi estimate, you typically follow these steps:
1. Identify the key variables involved in the problem.
2. Make educated guesses about the values of these variables.
3. Use these guesses to calculate an approximate answer.
4. Check your answer for reasonableness and adjust your estimates if necessary.

Here is a question: {question}. State your reason followed by your estimated answer in the order of magnitute 10^x.
REASON:
ANSWER: x = ?
"""

# prompt 3 - competence prompt: "you KNOW the answer in the order of magnitude" (context 0-shot)
def prompt_3(question):
    return f"""\
TASK: A curious learner is asking you a question:
{question}
You know the answer in the order of magnitude. You will teach the learner, step by step, \
how they can work out an estimated answer themselves. You are allowed to make assumptions. Answer in the order of magnitude (10^x) and report value of x in ANSWER.
REASON:
ANSWER: x = ?
"""

# prompt 4 - prompt with uncertainty "you DON'T KNOW the exact answer, but you know it's in the order of magnitude of 10^x" (context 0-shot)
def prompt_4(question):
    return f"""\
TASK: A curious learner is asking you a question:
{question}
You don't know the exact answer, but you will teach the learner to work out a rough answer, step by step. Answer in the order of magnitude (10^x) and report value of x in ANSWER.
REASON:
ANSWER: x = ?
"""

# prompt 5 - explicit step-by-step instruction (CoT instruction + zero-shot)

# ask LLM to give its estimation chain of thought.
def prompt_5(question):
    return f"""\
TASK: You are an AI assistant helping people practice Fermi estimation techniques.\
A human will post a question and ask for help to estimate a quantity in the order of magnitude.\
The estimation should rely on human's common senses and knowledge about the world.\

To make a Fermi estimate, you follow these steps in REASON:
1. Identify the final quantity of interest, T, or the target quantity, that needs to be estimated.\
2. Analyse factors will determine the answer to the target quantity, and name those sub-quantities.\
3. Logically link relevant factors together and state your assumptions, to form a mathematical model for computation.\
4. Compute and convert your final answer in the order of magnitude (10^x)
You will answer succinctly and report the value of x in ANSWER.\

Question: {question}
REASON:
ANSWER: x = ?
"""

## Data collection / query the model

In [4]:
# sample questions from Eric or Allen
# open json file
database = ["Allen_realFP", "Eric_FP", "SOINC_FP"]
file_path = f"../../src/data/{database[0]}_suggestion.json"

def sample_questions(json_file):
    with open(json_file, 'r', encoding="utf-8") as f:
        data = json.load(f)

    sampled_questions = random.sample(data, 20)
    return sampled_questions


sampled_questions = sample_questions(file_path)
for entry in sampled_questions:
    print(f"{entry['name']} : {entry['answer']}") 


How hot does the superhero, the human torch, have to be to evaporate bullets in flight? : 3
How many times could you say the alphabet in 24 hours? : 4
How many bricks are in the exterior of our school building? : 6
On average and to be approximate, how much carbon emissions will be averted if 30,000 to 50,000 violent gang members were to be suddenly eliminated? : 6
How much volume would 417234 loosely stacked cherries occupy? : 6
The density of water is just about 1000 kg/m3. The density of air is just about 1000 times less. Estimate the height of the atmosphere. As a simplifying assumption, take the atmosphere to be of uniform density up to some height after which the density rapidly falls to zero. : 4
If all of the people in the United States move to Colorado, what would be the population density, people per square mile, of that state? : 5
What is the weight of air in your living room? : 2
How long would it have taken ENIAC (or another early computer) to mine 1 bitcoin? : 8
What is t

### Experiment with test data

In [None]:
# questions selected for testing purpose:
# sampled_questions_text = """How many hairs are on the average human head?
# What is the annual consumption of ice cream, in pounds, for the State of Ohio?
# How many toothpicks are required to cover a baseball diamond if the tooth picks are lying flat and sideby-side?
# The number of Earths that would fit inside of the sun?
# What is the maximum weight, in tons, of a Giant Sequoia tree?
# If every person who ever lived crowded together in one spot, how much area would be covered, in terms of the area of Rhode Island?
# If the annual interest that is owed on the national debt each year were divided equally amongst all U.S. citizens, how much would each citizen owe?
# How many lawyers are there in the United States?
# How many red blood cells are needed to blanket the surface area of an M&M?
# The My Lai Massacre occurred on this date in 1968. According to the US army, how many people were killed?
# How many car miles were avoided in 2010 by buses that took kids to school?
# How many tons one cubic cm of a neutron star would be?
# How many miles of railroad are operational in the U.S.?
# Hurricane Sandy was devastating and expensive. If you had a pile of pennies equal in value to Sandy's cost, what would be its weight in troy ounces?
# How many stacked 2 x 4 LEGO blocks would it take to reach the height of the Empire state Building?
# How many steps does the average American walk during their lifetime?
# According to the Census of Marine Life, with a membership of about 1000 scientists in 70 countries, what is the estimated number of life forms in the world's oceans?
# How many active volcanoes are on the surface of the earth?
# How many cups of coffee does Starbucks sell each year?
# Determine the density of oxygen in the air of this room at 25°C and 760 torr in g/L."""
# def string_to_list(input_string):
#     return input_string.split('\n')


# # prepare output in a csv file

# # start querying
# sampled_questions=string_to_list(sampled_questions_text)
# Define your DataFrame columns

In [5]:
# Get the current date and time
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

columns = ['Question', 'Prompt_1_result', 'Prompt_2_result', 'Prompt_3_result', 'Prompt_4_result', 'Prompt_5_result']

# Create an empty DataFrame
df = pd.DataFrame(columns=columns)

counter = 0
prompt_list = [prompt_1, prompt_2, prompt_3, prompt_4, prompt_5]
n = 10 # experiment will query the first n questions from the sampled questions
model = 'gpt-3.5-turbo'
print(f"Let's ROCK! with {model}\n*****************************************")

# iterate through each question in sampled question list
for number, question in enumerate(sampled_questions):
    # print(f"Question {number+1}: {question}")
    # Create a dictionary to store the results for this question
    results = {'Question': question}
    for index, prompt in enumerate(prompt_list[:n]):
        # print(f"Prompt {index+1}")
        num_trials = 3
        for i in range(num_trials): # max 3 trials
            try:
                counter += 1
                # Assuming that get_completion function returns the answer
                response = get_completion(prompt(question), model=model)
                # Add the result to the results dictionary
                results[f'Prompt_{index+1}_result'] = response
                break
            except Exception as e:
                print(f"Encountered an error at question {number} in trial {i+1}: {e}")
                response = "ERROR"
                # If it's the last trial, don't try again
                if i == num_trials - 1:
                    results[f'Prompt_{index+1}_result'] = response
                    break
                else:
                    print("Retrying...")
                # Add 'ERROR' to the results dictionary 
        # At the end of the inner loop, append the results to the DataFrame
    df = pd.concat([df, pd.DataFrame([results])], ignore_index=True)

# console output
print(df)


# save data to csv with timestamp for post analysis
csv_filename = f"estimation_output_{timestamp}.csv"
df.to_csv(csv_filename, index=False)

# save flattened data to txt file for easy copy-paste
txt_filename = f"estimation_output_{timestamp}.txt"
with open(txt_filename, 'w') as f:
    for answer in df.values.flatten():
        f.write(str(answer) + '\n')
        f.write("========================================\n")

Let's ROCK! with gpt-3.5-turbo
*****************************************
>>>>RESPONSE>>>>
As an AI language model, I cannot provide an estimated answer without sufficient information or context. Please provide more details or clarify the question.
<<<<END<<<<
>>>>RESPONSE>>>>
REASON: To estimate the temperature required to evaporate bullets in flight, we need to consider the melting point and boiling point of the material the bullets are made of, as well as the speed at which they are traveling. Assuming the bullets are made of lead, which has a melting point of 327.5°C and a boiling point of 1749°C, and that they are traveling at a typical speed of 1000 meters per second, we can estimate the temperature required to evaporate them in flight.

ANSWER: x = 4
Based on these assumptions, we can estimate that the temperature required to evaporate bullets in flight would be around 10^4 degrees Celsius. However, it's important to note that this is a very rough estimate and there are many fact

In [6]:
# An agent to handle the task of extracting answers from GPT output
df = pd.read_csv(f"./estimation_output_{timestamp}.csv")

# Step 1: Create a copy of the dataframe and set all columns but the first to NaN
df_copy = df.copy()
df_copy.iloc[:, 1:] = np.nan

# Function to apply on each string
def promptExtractOOM(question, context):
    # the prompt for GPT3.5 to extract the specific answer
    return f"""\
QUESTION: {question}
DESCRIPTION: {context}
TASK: Extract the answer to the given QUESTION from DESCRIPTION in the form of 10^(x).\
return only the value of x. If no answer is found, return NULL.
ANSWER:"""

def extract_answer_from_gpt_output(question, context, model="gpt-3.5-turbo", num_trials=3):
    # extract answer from GPT output
    messages =  [  
    {'role':'system', 'content':'You are an assistant to extract an answer from text'},    
    {'role':'user', 'content': promptExtractOOM(question, context)}]
    
    for i in range(num_trials): # max 3 trials
        try:
            # Assuming that get_completion function returns the answer
            response = get_completion_from_messages(messages,  model=model)
            print(response)
            return response
        except Exception as e:
            print(f"Encountered an error")
            # If it's the last trial, don't try again
            if i == num_trials - 1:
                return "ERROR"
            else:
                print("Retrying...")

# Step 2: Iterate over the remaining columns
cols = df.columns.tolist()
for col in cols[1:]:
    # Step 3: Apply the function to each value in the column
    df_copy[col] = df.apply(lambda row: extract_answer_from_gpt_output(row[cols[0]], row[col]), axis=1)

print(df_copy)


# save data to csv with timestamp for post analysis
csv_filename = f"extraction_output_{timestamp}.csv"
df_copy.to_csv(csv_filename, index=False)
# save flattened data to txt file for easy copy-paste
txt_filename = f"extraction_output_{timestamp}.txt"
with open(txt_filename, 'w') as f:
    for answer in df_copy.values.flatten():
        f.write(str(answer) + '\n')
        f.write("========================================\n")

NULL. The DESCRIPTION does not provide any information or context to extract the answer in the form of 
x = 4, so the answer is 10^4 = 10,000.
NULL
Encountered an error
Retrying...
NULL
6
x = 4.5
4
NULL
NULL
NULL
NULL
12
x = 5
1
5
NULL
The answer cannot be extracted from the given DESCRIPTION. Therefore, the answer is NULL.
NULL
NULL
NULL
4
x = 3
5.35
5
6
3
3
1
NULL
14
1
12
8
x = 22
x = 5.
13
-7
-7
x = 8
9.2
1
x = 4
6 bricks are not mentioned in the given DESCRIPTION, so the answer is NULL.
NULL
6
3
x = 3
x = 2
3
NULL
0
1
9
22
x = 5
3
-9
-8
x = 0
x = 9-10
0
4
5
NULL
6
NULL
3
1
-2
7
0
1
x = 11
x = 22
6
7
x = 0
-8
0
x = 9.13
7
4
3
4
5
2
3
1
x = 1
x = 18
0
14
Encountered an error
Retrying...
6
x = 25
x = 5
13
-10
-4
x = 9
x = 10
                                             Question   
0   {'id': 2364, 'name': 'How hot does the superhe...  \
1   {'id': 2147, 'name': 'How many times could you...   
2   {'id': 2126, 'name': 'How many bricks are in t...   
3   {'id': 2530, 'name': 'On average

In [32]:
# once the answer is obtained, manual labeling is required to determine if the answer is correct or not
df = pd.read_csv(f"./labelled_output_{timestamp}.csv") # vulnerable! timestamp may not be consistent

# Convert columns 2 to 6 to numeric, errors='coerce' turns non-numeric values into NaN
df.iloc[:, 1:6] = df.iloc[:, 1:6].apply(pd.to_numeric, errors='coerce')

# calculate the average of columns 2 to 6
df['Average'] = df.iloc[:, 1:6].mean(axis=1)

def calculate_score(truth, avg):
    return max(0, 1 - (truth - avg) / 3)

# Calculate performance score based on ground truth (col 6) and average (col 7)
# and store the value in the 'Performance_Score' column
df['Performance_Score'] = df.apply(lambda row: calculate_score(row[6], row[7]), axis=1)

for i in range(1, 6):
    df[f'Score_{i}'] = df.apply(lambda row: calculate_score(row[6], row[i]), axis=1)
print(df)

# output the average performance score
print(f"Average performance score: {df['Performance_Score'].mean()}")

# output the average performance score for each prompt
for i in range(1, 6):
    print(f"Average performance score for prompt {i}: {df[f'Score_{i}'].mean()}")

                                                                                                                   Question   
0   {'id': 2364, 'name': 'How hot does the superhero, the human torch, have to be to evaporate bullets in flight?', 'ans...  \
1                   {'id': 2147, 'name': 'How many times could you say the alphabet in 24 hours?', 'answer': 4, 'unit': ''}   
2              {'id': 2126, 'name': 'How many bricks are in the exterior of our school building?', 'answer': 6, 'unit': ''}   
3   {'id': 2530, 'name': 'On average and to be approximate, how much carbon emissions will be averted if 30,000 to 50,00...   
4       {'id': 2101, 'name': 'How much volume would 417234 loosely stacked cherries occupy?', 'answer': 6, 'unit': 'cm**3'}   
5   {'id': 2090, 'name': 'The density of water is just about 1000 kg/m3. The density of air is just about 1000 times les...   
6   {'id': 2209, 'name': 'If all of the people in the United States move to Colorado, what would be the populat

## Result inspection and data analysis

In [28]:
#print the entire rows where 'Performance_Score' is effectively 0.00
zero_score_rows = df[df['Performance_Score'] <= 0.00001]
pd.set_option('display.max_colwidth', 120)  # or 199
print(zero_score_rows.iloc[:, 0])

8     {'id': 2354, 'name': 'How long would it have taken ENIAC (or another early computer) to mine 1 bitcoin?', 'answer': ...
9     {'id': 2373, 'name': 'What is the total area of all currently existing spiderwebs in the world?', 'answer': 18, 'uni...
11                                 {'id': 2104, 'name': 'How much oil is there left in the world?', 'answer': 14, 'unit': ''}
12    {'id': 2115, 'name': 'Suppose a meteor made of nickel and iron falls to the earth from deep space. Estimate how much...
13                                 {'id': 2550, 'name': 'What is the mass, in kgs, of the moon?', 'answer': 23, 'unit': 'kg'}
15    {'id': 2449, 'name': 'At the height of summer, what is the total mass of all the leaves grown on trees in New Hampsh...
16    {'id': 2295, 'name': 'How dense is the Milky Way galaxy in terms of stars per cubic light year?', 'answer': 7, 'unit...
18    {'id': 2004, 'name': 'What would be the effect to the oceans If all fishing was banned for four days once a year