In [189]:
import json
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
from time import sleep
import random


# Load environment variables from a local .env file, if one exists
# (presumably where the OpenAI API key is kept — confirm against the project setup).
load_dotenv()

# Shared OpenAI client used by the API calls in the cells of this notebook.
client = OpenAI()

In [190]:
# Taxonomy of epistemological features used to describe a claim.
# Layout: dimension name -> label -> {"Description": str, "Examples": list of str}.
# Every dimension offers two opposing labels. The prompt-building cell reads only the
# "Description" text of the label selected in `design_point`; the "Examples" are not
# read by any cell visible in this notebook.
# NOTE: the variable name misspells "epistemological"; it is kept as-is because other
# cells reference it by this name.
epistimalogical_features = {
    "behavior": {
        "Social": {
            "Description": "Refers to beliefs, perceptions, preferences, and socially constructed rules that govern human experience; can be 'real' or opinion, but is intrinsically of human origins.",
            "Examples": [
                "I exist and am the same person I was yesterday.",
                "He yelled at me because he was angry.",
                "There are seven days in the week.",
            ],
        },
        "Physical": {
            "Description": "Refers to objective features of the world as described by physics, biology, engineering, mathematics, or other natural rules; can be measured empirically or derived logically.",
            "Examples": [
                "Men on average are taller than women.",
                "The Earth is the third planet from the Sun.",
                "Ants are smaller than elephants.",
            ],
        },
    },
    "everyday": {
        "Everyday": {
            "Description": "People encounter, or could encounter, situations like this in the course of their ordinary, everyday experiences.",
            "Examples": [
                "Touching a hot stove will burn you.",
                "Commuting at rush hour takes longer.",
                "It is rude to jump the line.",
            ],
        },
        "Abstract": {
            "Description": "Refers to regularities or conclusions that cannot be observed or arrived at solely through individual experience.",
            "Examples": [
                "Capitalism is a better economic system than Communism.",
                "Strict gun laws save lives.",
                "God exists.",
            ],
        },
    },
    "figure of speech": {
        "Figure of Speech": {
            "Description": "Contains an aphorism, metaphor, hyperbole.",
            "Examples": [
                "Birds of a feather flock together.",
                "A friend to all is a friend to none.",
            ],
        },
        "Literal Language": {
            "Description": "Plain and ordinary language that means exactly what it says.",
            "Examples": [
                "The sky is blue.",
                "Elephants are larger than dogs.",
                "Abraham Lincoln was a great president.",
            ],
        },
    },
    "judgment": {
        "Normative": {
            "Description": "Refers to a judgment, belief, value, social norm or convention.",
            "Examples": [
                "If you are going to the office, you should wear business attire, not a bathing suit.",
                "Treat others how you want them to treat you.",
                "Freedom is a fundamental human right.",
            ],
        },
        "Positive": {
            "Description": "Refers to something in the world such as an empirical regularity or scientific law.",
            "Examples": [
                "Hot things will burn you.",
                "The sun rises in the east and sets in the west.",
            ],
        },
    },
    "opinion": {
        "Opinion": {
            "Description": "Something that someone might think is true, or wants others to think is true, but can’t be demonstrated to be objectively correct or incorrect; it is inherently subjective.",
            "Examples": [
                "FDR was the greatest US president of the 20th Century.",
                "The Brooklyn Bridge is prettier than the Golden Gate.",
                "Vaccine mandates are a tolerable imposition on individual freedom.",
            ],
        },
        "Factual": {
            "Description": "Something that can be demonstrated to be correct or incorrect, independently of anyone’s opinion.",
            "Examples": [
                "The earth is the third planet from the sun.",
                "Obama was the 24th president of the United States.",
                "It will be sunny next Tuesday.",
            ],
        },
    },
    "reasoning": {
        "Knowledge": {
            "Description": "The claim refers to some observation about the world; it may be true or false, opinion or fact, subjective or objective.",
            "Examples": [
                "The sun rises in the east and sets in the west.",
                "Dogs are nicer than cats.",
                "Glasses break when they are dropped.",
            ],
        },
        "Reasoning": {
            "Description": "The claim presents a conclusion that is arrived at by combining knowledge and logic.",
            "Examples": [
                "The sun is in the east, therefore it is morning.",
                "My dog is wagging its tail, therefore it is happy.",
                "The glass fell off the table, therefore it will break and the floor will become wet.",
            ],
        },
    },
}

In [195]:
# The single feature combination ("design point") to generate statements for:
# maps each dimension of `epistimalogical_features` to the label selected for it.
design_point = {
    "behavior": "Social",
    "everyday": "Everyday",
    "figure of speech": "Figure of Speech",
    "judgment": "Normative",
    "opinion": "Opinion",
    "reasoning": "Knowledge",
}

In [196]:
# Topic categories used to scope statement generation.
# NOTE(review): the name suggests these mirror Wikipedia's top-level content
# categories — confirm the source. The prompt cell reads only the first entry.
wikipedia_categories = [
    "General reference",
    "Culture and the arts",
    "Geography and places",
    "Health and fitness",
    "History and events",
    "Human activities",
    "Mathematics and logic",
    "Natural and physical sciences",
    "People and self",
    "Philosophy and thinking",
    "Religion and belief systems",
    "Society and social sciences",
    "Technology and applied sciences",
]

In [198]:
def build_prompt(category, design, features):
    """Compose the user prompt for one topic category and one design point.

    Args:
        category: Topic category name quoted in the opening sentence.
        design: Mapping of feature dimension -> selected label.
        features: Taxonomy mapping dimension -> label -> {"Description": ...}.

    Returns:
        The opening sentence followed by one line per dimension of `features`,
        each quoting the dimension, the selected label and that label's description.

    Raises:
        KeyError: If `design` lacks a dimension present in `features`, or names a
            label that the taxonomy does not define for that dimension.
    """
    # The wording below is sent to the model verbatim, so it is reproduced unchanged.
    parts = [
        f'Give me the all statements that best belong to the following general category "{category}" with epistimalogical features \n'
    ]
    for dimension, labels in features.items():
        label = design[dimension]
        description = labels[label]["Description"]
        parts.append(f"\n'{dimension}' is '{label}' meaning {description}")
    return "".join(parts)


# Prompt for the first category and the design point configured above.
prompt = build_prompt(wikipedia_categories[0], design_point, epistimalogical_features)

print(prompt)

Give me the all statements that best belong to the following: 
 In the category "General reference" with epistimalogical features 

'behavior' is 'Social' meaning Refers to beliefs, perceptions, preferences, and socially constructed rules that govern human experience; can be 'real' or opinion, but is intrinsically of human origins.
'everyday' is 'Everyday' meaning People encounter, or could encounter, situations like this in the course of their ordinary, everyday experiences.
'figure of speech' is 'Figure of Speech' meaning Contains an aphorism, metaphor, hyperbole.
'judgment' is 'Normative' meaning Refers to a judgment, belief, value, social norm or convention.
'opinion' is 'Opinion' meaning Something that someone might think is true, or wants others to think is true, but can’t be demonstrated to be objectively correct or incorrect; it is inherently subjective.
'reasoning' is 'Knowledge' meaning The claim refers to some observation about the world; it may be true or false, opinion or 

In [199]:
# System turn: fixes the task and the output contract (a JSON object whose
# 'statements' key holds the list of claims).
system_message = {
    "role": "system",
    "content": "Write 100 short common sense claims that are open to different interpretations and has all the properties mentioned below. Do not explain the result, just provide a single short claim in a JSON list under the key 'statements'.",
}
# User turn: the category / feature description assembled in the prompt cell.
user_message = {"role": "user", "content": prompt}

# One synchronous request, used to eyeball the output before launching a batch.
completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[system_message, user_message],
    response_format={"type": "json_object"},
    presence_penalty=-1,
)

first_choice = completion.choices[0]
print(first_choice.message)

ChatCompletionMessage(content='{\n  "statements": [\n    "Actions speak louder than words.",\n    "Beauty is in the eye of the beholder.",\n    "Birds of a feather flock together.",\n    "Money can\'t buy happiness.",\n    "Laughter is the best medicine.",\n    "Time heals all wounds.",\n    "The early bird catches the worm.",\n    "Two heads are better than one.",\n    "When in Rome, do as the Romans do.",\n    "Every cloud has a silver lining.",\n    "Actions have consequences.",\n    "The pen is mightier than the sword.",\n    "Absence makes the heart grow fonder.",\n    "Practice makes perfect.",\n    "You can\'t judge a book by its cover.",\n    "A picture is worth a thousand words.",\n    "Honesty is the best policy.",\n    "The grass is always greener on the other side.",\n    "Better late than never.",\n    "Love makes the world go round.",\n    "You reap what you sow.",\n    "The squeaky wheel gets the grease.",\n    "You can’t have your cake and eat it too.",\n    "Good thing

In [200]:
# Decode the JSON text of the first choice into a Python object; as the last
# expression of the cell, the decoded value is what the notebook displays.
json.loads(completion.choices[0].message.content)

{'statements': ['Actions speak louder than words.',
  'Beauty is in the eye of the beholder.',
  'Birds of a feather flock together.',
  "Money can't buy happiness.",
  'Laughter is the best medicine.',
  'Time heals all wounds.',
  'The early bird catches the worm.',
  'Two heads are better than one.',
  'When in Rome, do as the Romans do.',
  'Every cloud has a silver lining.',
  'Actions have consequences.',
  'The pen is mightier than the sword.',
  'Absence makes the heart grow fonder.',
  'Practice makes perfect.',
  "You can't judge a book by its cover.",
  'A picture is worth a thousand words.',
  'Honesty is the best policy.',
  'The grass is always greener on the other side.',
  'Better late than never.',
  'Love makes the world go round.',
  'You reap what you sow.',
  'The squeaky wheel gets the grease.',
  'You can’t have your cake and eat it too.',
  'Good things come to those who wait.',
  'Beauty is only skin deep.',
  'What doesn’t kill you makes you stronger.',
  'You

## Create batch statement generation job

In [201]:
def get_dict(id, prompt, model="gpt-4o", temperature=None, max_tokens=4094):
    """Build one request entry for the batch input file.

    Args:
        id: Value embedded in the request's ``custom_id`` as ``request-{id}``.
            (The name shadows the ``id`` builtin; it is kept so existing
            callers continue to work.)
        prompt: Text sent as the user message.
        model: Model name placed in the request body.
        temperature: Sampling temperature. When ``None`` (the default) a value
            is drawn uniformly from [1, 1.5] and rounded to two decimals, so
            each request gets its own temperature. Pass a number to make the
            request reproducible.
        max_tokens: Completion token cap placed in the request body.

    Returns:
        A JSON-serialisable dict with the keys ``custom_id``, ``method``,
        ``url`` and ``body``; the body targets the chat-completions endpoint
        and asks for a JSON-object response.
    """
    if temperature is None:
        temperature = round(random.uniform(1, 1.5), 2)

    messages = [
        {
            "role": "system",
            "content": "Write 100 short common sense claims that are open to different interpretations and has all the properties mentioned below. Do not explain the result, just provide a single short claim in a JSON list under the key 'statements'.",
        },
        {"role": "user", "content": prompt},
    ]

    return {
        "custom_id": f"request-{id}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model,
            "messages": messages,
            "response_format": {"type": "json_object"},
            "max_tokens": max_tokens,
            "temperature": temperature,
            "presence_penalty": -1,
        },
    }


# Write the batch input file: one JSON request per line, every request carrying
# the same prompt and a custom_id numbered from 0.
request_count = 20
with open("statement-batch.jsonl", "w") as batch_file:
    batch_file.writelines(
        json.dumps(get_dict(request_id, prompt)) + "\n"
        for request_id in range(request_count)
    )

### Upload the data to the OpenAI Batch API

In [202]:

# Upload the JSONL request file for batch use. The context manager closes the
# local file handle once the upload call returns, instead of leaving it open.
with open("statement-batch.jsonl", "rb") as batch_upload:
    batch_input_file = client.files.create(
        file=batch_upload,
        purpose="batch",
    )

Create the batch job with the uploaded data

In [203]:
# Submit the uploaded request file as a batch job against the chat-completions
# endpoint, with a 24h completion window.
batch_metadata = {"description": "statement generation batch"}
statement_batch_job = client.batches.create(
    endpoint="/v1/chat/completions",
    input_file_id=batch_input_file.id,
    completion_window="24h",
    metadata=batch_metadata,
)

In [204]:
# Display the batch object as returned by the create call (id, status, file ids, ...).
statement_batch_job

Batch(id='batch_lqMkF3rdzPka4tR5nv9bqBti', completion_window='24h', created_at=1724410986, endpoint='/v1/chat/completions', input_file_id='file-FbDI24sQLxbBhpZV6514JFDD', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1724497386, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'statement generation batch'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [205]:
# Poll the batch until it reaches a state from which it will not progress further.
# Any other status (validating, in_progress, finalizing, cancelling, ...) keeps
# the loop going, so the job is not abandoned while it is still being finalised.
terminal_statuses = {"completed", "failed", "expired", "cancelled"}
poll_interval_seconds = 20

while statement_batch_job.status not in terminal_statuses:
    statement_batch_job = client.batches.retrieve(statement_batch_job.id)
    print(statement_batch_job.status)
    if statement_batch_job.status == "failed":
        # Failure details live on the `errors` attribute of the batch object
        # (there is no `error` attribute, so reading it would raise).
        print(statement_batch_job.errors)
        break
    elif statement_batch_job.status == "completed":
        print(statement_batch_job.output_file_id)
        break
    elif statement_batch_job.status in terminal_statuses:
        # expired / cancelled: nothing more to wait for.
        break
    sleep(poll_interval_seconds)

in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
completed
file-Pv0JPxiXF0aIsuifvq6I6TNP


In [206]:
# Download the batch results. `output_file` is only assigned when the job has
# completed; otherwise the current status is reported and nothing is fetched.
batch_status = statement_batch_job.status
if batch_status != "completed":
    print("Batch job status:", batch_status)
else:
    print("Batch job completed")
    output_file = client.files.content(statement_batch_job.output_file_id)

Batch job completed


In [207]:
# Save the downloaded batch results to disk so the parsing cell can read them
# back line by line.
with open("statement-batch-output.jsonl", "w") as results_handle:
    results_handle.write(output_file.text)

In [208]:
# Collect the generated statements from every response in the batch output file.
statement_list = []

with open("statement-batch-output.jsonl", "r") as file:
    for idx, line in enumerate(file):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline) without reporting an error.
            continue
        try:
            data = json.loads(line)
            body = data["response"]["body"]
            print(idx, body["usage"])
            # The message content is itself JSON text, expected to hold a
            # 'statements' list as requested in the system prompt.
            statement = body["choices"][0]["message"]["content"]
            statements = json.loads(statement)["statements"]
            if not isinstance(statements, list):
                raise TypeError(
                    f"'statements' is {type(statements).__name__}, expected list"
                )
            statement_list.extend(statements)
        except (json.JSONDecodeError, KeyError, IndexError, TypeError) as e:
            # Best effort: skip malformed, failed or truncated responses, but say
            # which line was dropped and why so the loss is traceable.
            print(f"Skipping line {idx}: {type(e).__name__}: {e}")


# De-duplicate exact repeats across requests, then export as a one-column CSV.
df = pd.DataFrame(statement_list, columns=["statement"]).drop_duplicates()
df.to_csv("statement-batch-output-2.csv", index=False)

0 {'prompt_tokens': 277, 'completion_tokens': 903, 'total_tokens': 1180}
1 {'prompt_tokens': 277, 'completion_tokens': 953, 'total_tokens': 1230}
2 {'prompt_tokens': 277, 'completion_tokens': 887, 'total_tokens': 1164}
3 {'prompt_tokens': 277, 'completion_tokens': 1655, 'total_tokens': 1932}
4 {'prompt_tokens': 277, 'completion_tokens': 910, 'total_tokens': 1187}
5 {'prompt_tokens': 277, 'completion_tokens': 850, 'total_tokens': 1127}
6 {'prompt_tokens': 277, 'completion_tokens': 914, 'total_tokens': 1191}
7 {'prompt_tokens': 277, 'completion_tokens': 925, 'total_tokens': 1202}
8 {'prompt_tokens': 277, 'completion_tokens': 912, 'total_tokens': 1189}
9 {'prompt_tokens': 277, 'completion_tokens': 776, 'total_tokens': 1053}
10 {'prompt_tokens': 277, 'completion_tokens': 978, 'total_tokens': 1255}
11 {'prompt_tokens': 277, 'completion_tokens': 822, 'total_tokens': 1099}
12 {'prompt_tokens': 277, 'completion_tokens': 908, 'total_tokens': 1185}
13 {'prompt_tokens': 277, 'completion_tokens': 