In [None]:
import requests
import json
import time
from tqdm import tqdm

# HTTP headers sent with every chat-completion request.
# NOTE(review): '<OPEN_AI_KEY>' is a placeholder; substitute a real key at run
# time and never commit one to the notebook.
headers = {
    'Authorization': 'Bearer <OPEN_AI_KEY>',
    'Content-Type': 'application/json',
}
MODEL_TYPE = "gpt-3.5-turbo-0613" # gpt-4

# Load the evaluation examples. The context manager closes the file handle
# (the bare json.load(open(...)) form leaked it), and the explicit encoding
# keeps the read independent of the platform's default locale.
with open("/PCA-EVAL/pca-eval/data/v1.0/Domestic Robot/meta_data.json", "r", encoding="utf-8") as meta_file:
    examples = json.load(meta_file)

In [None]:
def format_choices(choices):
    """Label each choice with a consecutive capital letter and join them with spaces.

    Example: ['Phoenix', 'Baton Rouge', 'Honolulu'] -> "(A) Phoenix (B) Baton Rouge (C) Honolulu"

    No punctuation is inserted between entries. An empty sequence yields "".
    """
    labelled = []
    for offset, text in enumerate(choices):
        letter = chr(ord('A') + offset)
        labelled.append("({}) {}".format(letter, text))
    return " ".join(labelled)

In [None]:
# Opening instruction sent as the first user turn of every dialogue built by
# chat_domestic.
# NOTE(review): the text contains typos ("Your are", doubled spaces). They are
# left untouched because any edit changes what the model is shown.
start_prompt = """
You are a professional multimodal embodied reasoner. Your are assisted with multiple visual api which can answer your questions about an  image. Your job is to select the best action to answer my question based on an  image.  Note that you can't directly see the image but through the answer of API. I will first give you the description of valid APIs and then give you the question. You can gather information from the api when giving the answer.
"""

# Python-stub description of the two "APIs" the model may ask for; sent as the
# second user turn by chat_domestic. The backslash-escaped triple quotes are
# emitted as plain triple quotes in the prompt text.
# NOTE(review): "don't need do find" looks like a typo for "to find"; left
# unchanged for the same reason as above.
api_prompt = """#API Description
def object_detection():
    \"""
    Detects objects in current view, which you don't need do find.
    :return: list of detected objects, e.g. ['chair','table']
    \"""
    pass

def list_items_in_hands():
    \"""
    Lists items in your hand, which you don't need to pick up
    :return: list of items in hand, e.g. ['coffee cup','milk']
    \"""
    pass"""

In [None]:
# Show the prompt exactly as the model receives it (escaped triple quotes
# appear as plain docstring quotes).
print(api_prompt)


#API Description
def object_detection():
    """
    Detects objects in current view, which you don't need do find.
    :return: list of detected objects, e.g. ['chair','table']
    """
    pass

def list_items_in_hands():
    """
    Lists items in your hand, which you don't need to pick up
    :return: list of items in hand, e.g. ['coffee cup','milk']
    """
    pass


# Multi-turn dialogue with cached API results

In [None]:
def _request_chat_completion(request, max_retries=10, retry_delay=1.0, timeout=120):
    """POST ``request`` to the chat-completions endpoint and return the first choice's message.

    Transient failures (network errors, non-JSON bodies, responses without a
    ``choices`` list such as rate-limit or auth errors) are retried with a
    capped exponential backoff instead of being retried immediately and
    silently.

    :param request: JSON-serialisable request body (``model`` + ``messages``).
    :param max_retries: maximum number of attempts; ``None`` retries forever.
    :param retry_delay: base delay in seconds, doubled after each failure (capped at 60 s).
    :param timeout: per-request timeout in seconds passed to ``requests.post``.
    :return: the ``message`` dict of the first choice in the response.
    :raises RuntimeError: when every attempt failed (chained to the last error).
    """
    # Serialise once, outside the retry loop: a non-serialisable request is a
    # programming error and must not be retried.
    body = json.dumps(request)

    attempt = 0
    while True:
        attempt += 1
        response = None
        try:
            response = requests.post('https://api.openai.com/v1/chat/completions',
                                     headers=headers,
                                     data=body,
                                     timeout=timeout)
            return json.loads(response.text)['choices'][0]['message']
        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as error:
            # Include a snippet of the body so API-side errors are diagnosable.
            detail = "" if response is None else " | response: {}".format(response.text[:300])
            if max_retries is not None and attempt >= max_retries:
                raise RuntimeError(
                    "chat completion failed after {} attempt(s): {!r}{}".format(attempt, error, detail)
                ) from error
            print("chat completion attempt {} failed: {!r}{}; retrying".format(attempt, error, detail))
            # Exponent is capped so an unbounded retry count cannot overflow.
            time.sleep(min(retry_delay * 2 ** min(attempt - 1, 6), 60))


def chat_domestic(example_json, model="gpt-4", max_retries=10, retry_delay=1.0, timeout=120):
    """Run one multi-turn dialogue for a Domestic Robot example and return the full request.

    The model is shown the task and action choices. Whenever its reply mentions
    one of the two described APIs, the corresponding cached result from
    ``example_json['api_cached_results']`` is fed back as the next user turn
    (each API is answered at most once). The dialogue stops after the first
    reply that triggers no new API answer.

    :param example_json: example dict providing ``question``, ``actions`` and
        ``api_cached_results`` (keys ``object_detection_alfred`` and
        ``list_items_in_hand_alfred``).
    :param model: model name placed in the request body.
    :param max_retries: attempts per HTTP call before giving up; ``None`` retries forever.
    :param retry_delay: base backoff delay in seconds between failed attempts.
    :param timeout: per-request timeout in seconds.
    :return: the request dict (``model`` + ``messages``). A user message is
        appended after every model reply, including the last one (where its
        content is empty), so the model's final answer is ``messages[-2]``.
    :raises RuntimeError: if an HTTP call still fails after ``max_retries`` attempts.
    """
    goal_prompt = example_json['question']
    actions_str = format_choices(example_json['actions'])

    cached_results = example_json['api_cached_results']
    objects = str(cached_results['object_detection_alfred'])
    items = str(cached_results['list_items_in_hand_alfred'])

    request = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": start_prompt
            },
            {
                "role": "assistant",
                "content": "Sure, please provide me with the description of the valid APIs and your question."
            },
            {
                "role": "user",
                "content": api_prompt
            },
            {
                "role": "assistant",
                "content": "Thank you for providing the descriptions of the valid APIs. Please go ahead and ask your question so that I can assist you in selecting the best action based on the image."
            },
            {
                "role": "user",
                "content": "You are a domestic robot that helps me do housework in the simulation room. Your goal is to {} Based on current condition, what is the best action to do next? {}".format(goal_prompt, actions_str)
            },
        ]
    }

    # Tracks which APIs have already been answered so each is served only once.
    api_answered = {
        "object_detection": False,
        "list_items_in_hands": False,
    }

    while True:
        model_response_json = _request_chat_completion(
            request, max_retries=max_retries, retry_delay=retry_delay, timeout=timeout)

        print(model_response_json)
        request['messages'].append(model_response_json)

        # Guard against a missing/None content so the substring checks cannot raise.
        content = model_response_json.get('content') or ""

        # Check whether the last response mentions an API that is still unanswered.
        api_response = ""

        if "object" in content and "detection" in content and not api_answered['object_detection']:
            api_response += "object_detection() = " + objects + "\n"
            api_answered['object_detection'] = True

        if "list_items_in_hands" in content and not api_answered['list_items_in_hands']:
            api_response += "list_items_in_hands() = " + items + "\n"
            api_answered['list_items_in_hands'] = True

        # Appended even when empty: downstream code reads the final model
        # answer from messages[-2].
        request['messages'].append({
            "role": "user",
            "content": api_response
        })

        # An empty api_response means no new API was requested: dialogue is done.
        if not api_response:
            break

    return request

In [None]:
# Inspect the first record to see the fields chat_domestic reads
# ('question', 'actions', 'api_cached_results').
examples[0]

{'version': '1.0',
 'domain': 'Domestic Robot',
 'index': 0,
 'image': 'FloorPlan10_Rotation_3.png',
 'question': 'fry eggs',
 'actions': ['Start the stoveburner to heat the pan',
  'Put eggs into Microwave',
  'Break the eggs and pour into the pan',
  'Start the induction cooker to heat the pan',
  'Find eggs',
  'Pick up eggs'],
 'anwser_index': 0,
 'reason': "We don't need to find or pick up eggs since we already hold them in hands. We need to use pan instead of microwave to fry eggs. We must heat the pan before frying eggs. There are only stoveburner can served as heat source (there are no induction cooker)",
 'key_concept': ['stoveburner', 'pan', 'eggs'],
 'api_cached_results': {'object_detection_alfred': ['CounterTop',
   'PepperShaker',
   'Spatula',
   'Pan',
   'StoveKnob',
   'StoveBase1',
   'StoveTopDoor1',
   'CoffeeMachine',
   'Cabinet',
   'Cup',
   'Window',
   'WindowStructure',
   'StoveBurner',
   'StoveTopGas',
   'Microwave',
   'SaltShaker',
   'Drawer',
   'Pape

In [None]:
# Collects one full dialogue (the request dict returned by chat_domestic) per example.
# Re-run this cell before re-running the evaluation loop; otherwise results are appended twice.
model_answers = []

In [None]:
# Query the model once per example and keep every complete dialogue.
for i in tqdm(examples):
    dialogue = chat_domestic(i, MODEL_TYPE)
    model_answers.append(dialogue)

In [None]:
# Persist the complete dialogues (all turns) for later inspection.
with open("robot_chatgpt_3.5_answer_full_dialog.json", mode="w") as dialog_file:
    json.dump(model_answers, fp=dialog_file, indent=4)

In [None]:
# Optional sanity check, disabled as in the original notebook:
#assert len(model_answers) == len(examples)

# Build the answer records. The last message of each dialogue is a user turn
# appended by chat_domestic, so the model's final reply is the second-to-last one.
outputs = [
    {"index": position, "model_output": dialogue['messages'][-2]['content']}
    for position, dialogue in enumerate(model_answers)
]

In [None]:
# Write the per-example final answers; this file feeds the automatic evaluation.
with open("robot_chatgpt_3.5_answer.json", mode="w") as answer_file:
    json.dump(outputs, fp=answer_file, indent=4)