In [110]:
import json
from itertools import product
import timeit
import pickle
from os import path, getenv
import difflib
from statistics import mean

from dotenv import load_dotenv
import openai

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

OPENAI_API_KEY = getenv('OPENAI_API_KEY')

# `openai` is imported above, before load_dotenv() runs, so a key that lives only
# in the .env file was not yet in the environment when the library was imported.
# Hand the key to the client explicitly; skip the assignment when no key was
# found so a key configured by some other means is not overwritten with None.
if OPENAI_API_KEY:
    openai.api_key = OPENAI_API_KEY

# Run Experiments

In [102]:
def make_prompt(context, command):
    """Assemble the single-string prompt sent to the model.

    Args:
        context: JSON-serializable description of the device state; it is
            embedded in the prompt via ``json.dumps``.
        command: The user's command, interpolated verbatim.

    Returns:
        Four sentences joined by single spaces: the framing, the JSON-encoded
        device state, the user command, and the output-format instruction.
    """
    device_state = json.dumps(context)
    sentences = (
        'You are an AI that controls a smart home.',
        f'Here is the state of the devices in the home, in JSON format: {device_state}',
        f'The user issues the command: {command}. Change the device state as appropriate.',
        'Provide your response in JSON format.',
    )
    return ' '.join(sentences)

def query_chatgpt(prompt):
    """Send ``prompt`` to the completion endpoint and return the raw response.

    The request targets the ``text-davinci-003`` model with fixed sampling
    settings; whatever ``openai.Completion.create`` returns is passed back
    to the caller unmodified.
    """
    request_options = {
        'model': 'text-davinci-003',
        'prompt': prompt,
        'temperature': 0.7,
        'max_tokens': 2000,
        'top_p': 1,
        'frequency_penalty': 0,
        'presence_penalty': 0,
    }
    return openai.Completion.create(**request_options)

In [103]:
# load test contexts: a dict mapping a context name to its device-state description
# (the encoding is pinned so the file reads the same regardless of the OS default)
with open('./contexts.json', 'r', encoding='utf-8') as f:
    contexts = json.load(f)

In [104]:
# load test commands: a dict mapping a command name to the command text
# (the encoding is pinned so the file reads the same regardless of the OS default)
with open('./commands.json', 'r', encoding='utf-8') as f:
    commands = json.load(f)

In [105]:
# every (context name, command name) pairing, contexts varying slowest
trials = [
    (context_name, command_name)
    for context_name in contexts
    for command_name in commands
]

In [106]:
def run_trial(trial):
    """Run one (context, command) trial and append its outcome to a pickle file.

    Args:
        trial: Pair ``(context_key, command_key)`` indexing the module-level
            ``contexts`` and ``commands`` dicts.

    Side effects:
        Calls ``query_chatgpt`` once and appends one pickled dict (keys
        ``context``, ``command``, ``prompt``, ``response``, ``time_elapsed``)
        to ``./results/pickles/v1/<context_key>-context_<command_key>-command.pkl``.
        The file is opened in append mode, so repeated runs accumulate records.

    Raises:
        KeyError: If either key is missing from ``contexts`` / ``commands``.
    """
    import os  # local import: the notebook only imports `path` and `getenv` from os

    context = contexts[trial[0]]
    command = commands[trial[1]]
    prompt = make_prompt(context, command)

    # Create the output directory *before* issuing the request, so a missing
    # directory cannot cause an already-completed response to be thrown away.
    output_dir = './results/pickles/v1'
    os.makedirs(output_dir, exist_ok=True)

    # Time only the model call itself (seconds, via timeit.default_timer).
    t0 = timeit.default_timer()
    response = query_chatgpt(prompt)
    t1 = timeit.default_timer()

    result = {
        'context': context,
        'command': command,
        'prompt': prompt,
        'response': response,
        'time_elapsed': t1 - t0
    }

    with open(f'{output_dir}/{trial[0]}-context_{trial[1]}-command.pkl', 'ab') as f:
        pickle.dump(result, f)

In [107]:
# number of repetitions per (context, command) pair
N = 10

# trials vary slowest, so all N repetitions of one pair run back to back;
# every call issues one model request and appends one record to that pair's pickle
for trial, _ in product(trials, range(N)):
    run_trial(trial)

# Process Results
This translates the responses into human-readable Markdown files so they can be easily assessed for quality.

In [99]:
def load_results(trial):
    """Read back every record stored for one (context, command) trial.

    Args:
        trial: Pair ``(context_key, command_key)`` selecting the pickle file
            ``./results/pickles/v1/<context_key>-context_<command_key>-command.pkl``.

    Returns:
        A list of the unpickled objects in the order they were written; an
        empty list when the file holds no records.
    """
    pickle_path = f'./results/pickles/v1/{trial[0]}-context_{trial[1]}-command.pkl'

    def _records(stream):
        # The file is a sequence of back-to-back pickles; keep reading
        # until the stream is exhausted.
        # NOTE(review): pickle.load executes arbitrary code for crafted input,
        # so only files produced by this notebook should be read here.
        while True:
            try:
                yield pickle.load(stream)
            except EOFError:
                return

    with open(pickle_path, 'rb') as stream:
        return list(_records(stream))

def _describe_changes(context, raw_text):
    """Return the Markdown snippet showing what one response changed.

    When ``raw_text`` parses as JSON, the snippet is an ``ndiff`` between the
    pretty-printed context and the pretty-printed response. When it does not
    parse, the raw text is shown instead so the reviewer can still score it.
    """
    context_json = json.dumps(context, indent=2)
    try:
        response_json = json.dumps(json.loads(raw_text), indent=2)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers a non-string
        # payload. A malformed reply is an expected outcome (it is scored 0),
        # so it must not abort the report.
        return f'chatgpt\'s response could not be parsed as JSON; raw response: \n\n```\n{raw_text}\n```\n\n'

    diff = difflib.ndiff(
        context_json.splitlines(keepends=True),
        response_json.splitlines(keepends=True),
    )
    return f'chatgpt\'s changes to the devices: \n\n``` json\n{"".join(diff)}\n```\n\n'


def markdownify_results(trial):
    """Write a human-readable Markdown review file for one trial.

    Args:
        trial: Pair ``(context_key, command_key)``; its stored results are
            fetched with ``load_results``.

    Side effects:
        Creates ``./results/markdown/v1`` if needed and (over)writes
        ``<context_key>-context_<command_key>-command.md`` inside it. The file
        holds the scoring instructions followed by one section per stored
        result with the command and either a diff of the device state or, for
        replies that are not valid JSON, the raw reply text.
    """
    import os  # local import: the notebook only imports `path` and `getenv` from os

    results = load_results(trial)

    output_dir = './results/markdown/v1'
    os.makedirs(output_dir, exist_ok=True)

    # UTF-8 is pinned because model replies may contain characters that the
    # platform default encoding cannot represent.
    with open(f'{output_dir}/{trial[0]}-context_{trial[1]}-command.md', 'w', encoding='utf-8') as f:
        f.write(f'# {trial[0]} context, {trial[1]} command\n\n')
        f.write('## Instructions\n\n')
        f.write('For every trial in this file, please assign a binary score (0 or 1) based on the following:\n\n')
        f.write('- Poor response (0): The changes to the devices do not at all reflect the intent behind the user command, or the response is malformed/garbled.\n')
        # Adjacent literals instead of a backslash continuation, which would
        # embed the next line's indentation in the written text.
        f.write(
            '- Good response (1): The changes to the devices are reasonable for the command. You can imagine _someone_ '
            'being satisfied with the result, even if it is somewhat subjective (e.g., based on different personal preferences).\n\n'
        )

        for i, result in enumerate(results):
            f.write(f'## Trial {i} - {trial[0]} context, {trial[1]} command \n\n')
            f.write(f'command: **{result["command"]}**\n\n')
            f.write(_describe_changes(result['context'], result['response']['choices'][0]['text']))

In [100]:
# render one Markdown review file per (context, command) pair for qualitative analysis
for context_key, command_key in trials:
    markdownify_results((context_key, command_key))

In [112]:
# report the mean time spent in the model call (seconds) for every (context, command) pair
for trial in trials:
    average_latency = mean(record['time_elapsed'] for record in load_results(trial))
    print(f'{trial}: {average_latency:.2f} average latency')

('simple', 'simple'): 2.91 average latency
('simple', 'medium'): 3.71 average latency
('simple', 'ambiguous'): 3.86 average latency
('medium', 'simple'): 6.64 average latency
('medium', 'medium'): 6.25 average latency
('medium', 'ambiguous'): 6.43 average latency
('complex', 'simple'): 9.71 average latency
('complex', 'medium'): 9.16 average latency
('complex', 'ambiguous'): 11.73 average latency
