In [1]:
import json
from typing import Dict, List
import math
import random

In [2]:
def load_examples():
    """Load the labelled examples from ``./data/examples.jsonl``.

    The file is read as JSON Lines: every non-blank line is parsed with
    ``json.loads`` and appended to the result in file order.

    Returns:
        A list holding one parsed JSON value per non-blank line. Other cells
        of this notebook read the ``task``, ``input`` and ``label`` keys of
        each entry.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    examples: List[Dict[str, str]] = []

    # Explicit encoding so the result does not depend on the platform's
    # default locale encoding.
    with open('./data/examples.jsonl', 'r', encoding='utf-8') as file:
        for line in file:
            # Blank or whitespace-only lines (e.g. a stray empty line at the
            # end of the file) carry no record; json.loads would reject them.
            if not line.strip():
                continue
            examples.append(json.loads(line))

    return examples

In [3]:
import os

# Read the API keys from a local JSON file so the secret itself never
# appears in the notebook.
with open("keys.json", mode="r") as f:
    keys = json.load(f)

# Export the 'openai' entry through the environment variable set here.
os.environ['OPENAI_API_KEY'] = keys['openai']

In [4]:
import re
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate
from langchain.schema import BaseOutputParser

class LabelOutputParser(BaseOutputParser):
    """Pass-through output parser used as the last step of the chains in this notebook."""

    def parse(self, text: str) -> str:
        """Return ``text`` exactly as received (no trimming or normalisation)."""
        return text

# System prompt: a fixed instruction followed by the in-context examples,
# which are substituted for the {context} placeholder.
system_template = 'You are a helpful classifier that follows the examples below. Output just the correct label. \n {context}'
# Human turn: the single example to classify, substituted for
# {unrealized_example}.
human_template = '{unrealized_example}'

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_template),
    ("human", human_template),
])

# Classification pipeline: prompt -> chat model -> pass-through parser.
# The model name is fixed here; temperature=0 and a request_timeout of 10
# are passed to ChatOpenAI.
chain = chat_prompt | ChatOpenAI(model='gpt-3.5-turbo-1106', temperature=0, request_timeout=10) | LabelOutputParser()

In [5]:
def create_text_example(example: dict[str, str]) -> str:
    """Render one labelled example as a single prompt line.

    Args:
        example: Mapping that provides the ``input`` and ``label`` keys.

    Returns:
        The text ``Input: '<input>' Label: <label>``.
    """
    text, label = example['input'], example['label']
    return "Input: '{}' Label: {}".format(text, label)

In [7]:
def create_context(examples: List[Dict[str, str]]) -> str:
    """Build the in-context block: one rendered example per line, shuffled.

    The examples are shuffled to break any repeating pattern in their
    original order. The shuffle is deterministic (fixed seed 42) so repeated
    runs produce the same context.

    Args:
        examples: Examples to render; each is passed to
            ``create_text_example``. The list itself is not modified.

    Returns:
        The rendered examples joined with newlines, or an empty string when
        ``examples`` is empty.
    """
    # A private generator seeded with 42 produces the same permutation as
    # ``random.seed(42); random.shuffle(...)`` but leaves the global RNG
    # state untouched, so calling this function does not silently reseed
    # every other use of ``random`` in the notebook.
    rng = random.Random(42)
    # Shuffle a copy so the caller's list keeps its order.
    shuffled = list(examples)
    rng.shuffle(shuffled)
    return '\n'.join(create_text_example(example) for example in shuffled)

In [6]:
def evaluate(context: str, unrealized_example: Dict[str, str]) -> bool:
    """Ask the classifier chain to label one held-out example and score it.

    The example's input is formatted like the in-context examples but with
    the label left blank, then sent through the module-level ``chain``
    together with ``context``. The example and both labels are printed as a
    progress log.

    Args:
        context: Rendered in-context examples for the system prompt.
        unrealized_example: Example providing the ``input`` and ``label``
            keys; ``label`` is the expected answer.

    Returns:
        True when the model's answer equals the expected label, ignoring
        leading/trailing whitespace; False otherwise.
    """
    # Named ``example_input`` rather than ``input`` to avoid shadowing the
    # builtin.
    example_input = unrealized_example['input']
    true_label = unrealized_example['label']
    text_input = f"Input: '{example_input}' Label: "
    print(unrealized_example)
    llm_label = chain.invoke({"context": context, "unrealized_example": text_input})
    print(f'true_label: {true_label} llm_label: {llm_label}')

    # Trim surrounding whitespace before comparing so an otherwise correct
    # answer such as "True\n" is not scored as a miss.
    return str(llm_label).strip() == str(true_label).strip()

In [8]:
def get_accuracy_for_task(task: str, examples: List[Dict[str, str]], share_in_context = 0.5, split_seed = None) -> float:
    """Measure classification accuracy on the held-out part of one task.

    The examples whose ``task`` key equals ``task`` are split in two: the
    first ``floor(n * share_in_context)`` become the in-context examples and
    the rest are classified one by one with ``evaluate``.

    NOTE(review): by default the split is positional. If the examples of a
    task are grouped by label in the input, the context and the held-out set
    end up with different label distributions, which biases the measured
    accuracy. Pass ``split_seed`` to shuffle before splitting.

    Args:
        task: Task name to select.
        examples: All examples; each must provide the ``task`` key.
        share_in_context: Fraction of the task's examples used as context.
        split_seed: When not None, the task's examples are shuffled with a
            private ``random.Random(split_seed)`` before the split. The
            default (None) keeps the original order.

    Returns:
        The fraction of held-out examples for which ``evaluate`` returned
        True.

    Raises:
        ValueError: If no examples are left to evaluate (unknown task, or
            ``share_in_context`` assigns every example to the context).
    """
    task_examples = [example for example in examples if example["task"] == task]
    if split_seed is not None:
        # task_examples is a fresh list, so shuffling it in place does not
        # touch the caller's data.
        random.Random(split_seed).shuffle(task_examples)
    context_cutoff = math.floor(len(task_examples) * share_in_context)
    context_examples = task_examples[:context_cutoff]
    unrealized_examples = task_examples[context_cutoff:]
    # Fail early with a clear message instead of dividing by zero at the end.
    if not unrealized_examples:
        raise ValueError(
            f'No examples left to evaluate for task {task!r}: '
            f'{len(task_examples)} example(s) matched and '
            f'share_in_context={share_in_context} puts {context_cutoff} of them in the context.'
        )
    context = create_context(context_examples)
    print(context)
    evaluations = [evaluate(context, example) for example in unrealized_examples]
    return sum(evaluations) / len(evaluations)
    

In [31]:
def provide_reasoning(task: str, examples: List[Dict[str, str]], model = 'gpt-3.5-turbo-1106'):
    """Ask a model to articulate the rule behind a task and save its answer.

    All examples whose ``task`` key equals ``task`` are rendered with
    ``create_context`` and sent as the human message of a rule-finding
    prompt. The model's full response is written to
    ``./data/articulation/<model>/<task>.txt``, replacing any existing file.

    Args:
        task: Task name to select; also used as the output file name.
        examples: All examples; each must provide the ``task`` key.
        model: Model name passed to ``ChatOpenAI``; also used as the output
            directory name.

    Returns:
        None. The result is only written to disk.
    """
    system_template_rule = ('You are great at identifying rules from examples. '
    'You will receive examples from the user where input is paired with True if it follows the rule and False otherwise. '
    'Think step by step about what the rule is. '
    'Finally summarize the thinking by outputting "Rule:" followed by the rule you identified.')
    human_template_rule = '{examples}'
    chat_prompt_rule = ChatPromptTemplate.from_messages([
        ("system", system_template_rule),
        ("human", human_template_rule),
    ])

    chain_rule = chat_prompt_rule | ChatOpenAI(model=model, temperature=0) | LabelOutputParser()

    task_examples = [example for example in examples if example["task"] == task]
    context = create_context(task_examples)
    response = chain_rule.invoke({"examples": context})

    model_path = f'./data/articulation/{model}'
    # exist_ok=True already covers the "directory exists" case, so no
    # separate existence check is needed.
    os.makedirs(model_path, exist_ok=True)

    # Explicit UTF-8: the model's free-text answer may contain characters
    # the platform's default encoding cannot represent.
    with open(f'{model_path}/{task}.txt', 'w', encoding='utf-8') as f:
        f.write(response)


In [16]:
# Accuracy results, keyed first by model name and then by task name.
# Every model starts with its own empty task -> accuracy mapping.
accuracies = {
    model_name: {}
    for model_name in ('gpt-3.5-turbo-1106', 'gpt-4-1106-preview')
}

In [29]:
def evaluate_task_accuracy(task: str, model = 'gpt-3.5-turbo-1106', share_in_context=0.6):
    """Run the full evaluation for one task and record the results.

    Loads the examples, measures held-out accuracy with
    ``get_accuracy_for_task``, prints it, stores it in the module-level
    ``accuracies`` mapping under ``accuracies[model][task]`` and finally
    calls ``provide_reasoning`` for the same task.

    NOTE(review): ``model`` selects the ``accuracies`` bucket and the model
    used by ``provide_reasoning``, but it is not passed to
    ``get_accuracy_for_task``. The classification itself goes through the
    module-level ``chain``, so an accuracy stored under another model name
    is still measured with whatever model ``chain`` was built with.

    Args:
        task: Task name to evaluate.
        model: Model name used as the results key and for rule articulation.
        share_in_context: Fraction of the task's examples used as context.

    Returns:
        None. Results are stored in ``accuracies`` and written by
        ``provide_reasoning``.
    """
    examples = load_examples()
    accuracy = get_accuracy_for_task(task, examples, share_in_context=share_in_context)
    print(f'Accuracy for {task}: {accuracy}')
    # setdefault: a model name that was not pre-registered in ``accuracies``
    # must not raise KeyError after the whole evaluation has already run.
    accuracies.setdefault(model, {})[task] = accuracy
    provide_reasoning(task, examples=examples, model=model)

In [23]:
# Distinct task names present in the examples file.
tasks = {record['task'] for record in load_examples()}
tasks

{'about_animals',
 'about_cat',
 'contains_sun',
 'ends_with_period',
 'lowercase',
 'number_start',
 'word_length',
 'word_length_extreme'}

In [43]:
# Persist the collected accuracies as JSON, then show them as the cell result.
with open('./data/accuracies.json', mode='w') as f:
    f.write(json.dumps(accuracies))

accuracies

{'gpt-3.5-turbo-1106': {'about_animals': 0.6875,
  'contains_sun': 0.65625,
  'number_start': 0.78125,
  'word_length': 0.84375,
  'word_length_extreme': 1.0,
  'about_cat': 0.78125,
  'lowercase': 0.53125,
  'ends_with_period': 0.53125,
  'about_food': 0.71875,
  'question': 0.28125},
 'gpt-4-1106-preview': {}}

In [39]:
# Evaluate the 'question' task with the default model and context share.
evaluate_task_accuracy('question')

Input: 'Why do we dream?' Label: True
Input: 'Can fish breathe underwater?' Label: True
Input: 'Is there life on other planets?' Label: True
Input: 'Where is the nearest grocery store?' Label: True
Input: 'How often should I water plants?' Label: True
Input: 'What is your favorite color?' Label: True
Input: 'What are black holes?' Label: True
Input: 'Is Mars inhabited?' Label: True
Input: 'What is the capital of France?' Label: True
Input: 'Where did I leave my keys?' Label: True
Input: 'Are all stars the same size?' Label: True
Input: 'Who won the game last night?' Label: True
Input: 'Deserts have very little rainfall.' Label: False
Input: 'Are dinosaurs extinct?' Label: True
Input: 'Can you help me with this?' Label: True
Input: 'Can money buy happiness?' Label: True
Input: 'Can computers think like humans?' Label: True
Input: 'Trees grow leaves in the spring.' Label: False
Input: 'Can plants grow without sunlight?' Label: True
Input: 'Is global warming real?' Label: True
Input: 'Bir

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=10.0).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=10.0).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=10.0).


true_label: False llm_label: False
{'task': 'question', 'input': 'Cars are a common mode of transportation.', 'label': 'False'}
true_label: False llm_label: False
{'task': 'question', 'input': 'Snow is common in cold climates.', 'label': 'False'}
true_label: False llm_label: False
{'task': 'question', 'input': 'Computers have become essential in modern life.', 'label': 'False'}
true_label: False llm_label: True
{'task': 'question', 'input': 'The internet connects people globally.', 'label': 'False'}
true_label: False llm_label: True
{'task': 'question', 'input': 'Books can be found in a library.', 'label': 'False'}
true_label: False llm_label: True
{'task': 'question', 'input': 'A year has four seasons.', 'label': 'False'}


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=10.0).


true_label: False llm_label: False
{'task': 'question', 'input': 'The sun is a star in the solar system.', 'label': 'False'}
true_label: False llm_label: True
{'task': 'question', 'input': 'The heart pumps blood throughout the body.', 'label': 'False'}
true_label: False llm_label: True
{'task': 'question', 'input': 'Leaves often change color in autumn.', 'label': 'False'}
true_label: False llm_label: True
{'task': 'question', 'input': 'Fish live in water.', 'label': 'False'}
true_label: False llm_label: True
{'task': 'question', 'input': 'Light travels faster than sound.', 'label': 'False'}
true_label: False llm_label: False
{'task': 'question', 'input': 'Volcanoes can erupt and spew lava.', 'label': 'False'}
true_label: False llm_label: True
{'task': 'question', 'input': 'Art can be expressed in many forms.', 'label': 'False'}
true_label: False llm_label: False
{'task': 'question', 'input': 'Polar bears live in cold climates.', 'label': 'False'}
true_label: False llm_label: True
{'tas

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=10.0).


true_label: False llm_label: False
{'task': 'question', 'input': 'Bees are essential for pollination.', 'label': 'False'}
true_label: False llm_label: True
{'task': 'question', 'input': 'The Great Wall of China is centuries old.', 'label': 'False'}
true_label: False llm_label: True
{'task': 'question', 'input': 'Mathematics is a universal language.', 'label': 'False'}
true_label: False llm_label: False
{'task': 'question', 'input': "Seasons are caused by the Earth's tilt.", 'label': 'False'}
true_label: False llm_label: True
{'task': 'question', 'input': "The ocean's tides are influenced by the moon.", 'label': 'False'}
true_label: False llm_label: True
{'task': 'question', 'input': 'Wind turbines generate renewable energy.', 'label': 'False'}
true_label: False llm_label: True
{'task': 'question', 'input': 'Gravity is the force that attracts objects to each other.', 'label': 'False'}
true_label: False llm_label: True
{'task': 'question', 'input': 'The human body has 206 bones.', 'label