In [23]:
import json
import os
import time

import requests
from tqdm import tqdm

# HTTP headers for the OpenAI REST API. The key is taken from the
# OPENAI_API_KEY environment variable so it never has to be committed with the
# notebook; the original placeholder is kept as the fallback value.
headers = {
    'Authorization': 'Bearer ' + os.environ.get("OPENAI_API_KEY", "<OPEN_AI_KEY>"),
    'Content-Type': 'application/json',
}
MODEL_TYPE = "gpt-3.5-turbo-0613" # gpt-4

# Load the benchmark records; the context manager closes the file handle,
# which the bare json.load(open(...)) form left open.
with open("/PCA-EVAL/pca-eval/data/v1.0/Autonomous Driving/meta_data.json","r",encoding="utf-8") as meta_file:
    examples = json.load(meta_file)

In [7]:
def format_choices(choices):
    """Label each choice with a consecutive capital letter and join them with spaces.

    Example:
        ['Phoenix', 'Baton Rouge', 'Honolulu', 'Cheyenne']
        -> "(A) Phoenix (B) Baton Rouge (C) Honolulu (D) Cheyenne"

    No punctuation is added after a choice; an empty list yields "".
    """
    labelled = []
    for position, option in enumerate(choices):
        letter = chr(ord('A') + position)
        labelled.append("({}) {}".format(letter, option))
    return " ".join(labelled)

In [8]:
# Opening instruction; chat_traffic sends it as the first user turn of every dialogue.
# NOTE: the text below (typos and double spaces included) is the literal prompt the
# model receives, so editing it changes the model's input.
start_prompt = """
You are a professional multimodal embodied reasoner. Your are assisted with multiple visual api which can answer your questions about an  image. Your job is to select the best action to answer my question based on an  image.  Note that you can't directly see the image but through the answer of API. I will first give you the description of valid APIs and then give you the question. You can gather information from the api when giving the answer.
"""

# Python-stub description of the visual APIs, sent by chat_traffic as the second user turn.
# chat_traffic detects a call by substring, so a reply mentioning image_caption() matches
# its "caption" check and is answered with a line labelled "caption() = ...".
api_prompt = "# API Description:\ndef detect_traffic_sign():\n    \"\"\"\n    Detects traffic signs in the image.\n    :return: list of detected traffic signs and coordinates, e.g. ['stop','max speed limit']\n    \"\"\"\n    pass\n\ndef object_detection():\n    \"\"\"\n    Detects objects in the image.\n    :return: dict of detected objects and number of the objects, e.g. {'car':10, 'person':1}\n    \"\"\"\n    pass\n\ndef ocr():\n    \"\"\"\n    Performs OCR on the image.\n    :return: list of detected text, e.g. ['Coffee Shop', 'Open 24/7']\n    \"\"\"\n    pass\n\ndef image_caption():\n    \"\"\"\n    Generates a caption for the image.\n    :return: caption, e.g. 'A red car driving down the street'\n    \"\"\"\n    pass\n\ndef weather_detection():\n    \"\"\"\n    Detect current weather.\n    :return: weather, e.g. 'rainy' or 'clear'\n    \"\"\"\n    pass"


In [4]:
# Show the prompt with its \n escapes expanded, i.e. the text as the model will receive it.
print(api_prompt)

# API Description:
def detect_traffic_sign():
    """
    Detects traffic signs in the image.
    :return: list of detected traffic signs and coordinates, e.g. ['stop','max speed limit']
    """
    pass

def object_detection():
    """
    Detects objects in the image.
    :return: dict of detected objects and number of the objects, e.g. {'car':10, 'person':1}
    """
    pass

def ocr():
    """
    Performs OCR on the image.
    :return: list of detected text, e.g. ['Coffee Shop', 'Open 24/7']
    """
    pass

def image_caption():
    """
    Generates a caption for the image.
    :return: caption, e.g. 'A red car driving down the street'
    """
    pass

def weather_detection():
    """
    Detect current weather.
    :return: weather, e.g. 'rainy' or 'clear'
    """
    pass


# Run the API-assisted dialogue for each example

In [24]:
def _post_chat_completion(request, timeout=120, initial_delay=1.0, max_delay=30.0):
    """POST ``request`` to the chat-completions endpoint and return the reply message.

    Any failure (network error, timeout, non-JSON body, or a body without
    ``choices``) is retried indefinitely. Each failure is printed and followed
    by a capped exponential back-off, so a persistent problem such as a bad API
    key is visible instead of silently busy-looping.

    :param request: JSON-serialisable request body (``model`` and ``messages``).
    :param timeout: seconds before a single HTTP request is abandoned and retried.
    :param initial_delay: seconds slept after the first failure.
    :param max_delay: upper bound for the back-off delay in seconds.
    :return: ``choices[0]['message']`` of the decoded response.
    """
    delay = initial_delay
    while True:
        try:
            response = requests.post('https://api.openai.com/v1/chat/completions',
                                     headers=headers,
                                     data=json.dumps(request),
                                     timeout=timeout)
            return json.loads(response.text)['choices'][0]['message']
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / kernel interrupt still stops the run.
            print("chat completion request failed ({!r}); retrying in {:.0f}s".format(e, delay))
            time.sleep(delay)
            delay = min(delay * 2, max_delay)


def chat_traffic(example_json,model="gpt-4"):
    """Run one API-assisted dialogue for a single example and return the full request.

    The conversation is seeded with ``start_prompt``, ``api_prompt`` and the
    example's question/actions. After every model reply, each API whose keyword
    appears in the reply (substring match) and that has not been answered yet is
    answered from ``example_json['api_cached_result']``. The loop stops at the
    first reply that triggers no new API.

    :param example_json: record with ``question``, ``actions`` and
        ``api_cached_result`` (keys ``detect_traffic_sign``, ``object_detection``,
        ``caption``, ``weather_detection``, ``ocr``).
    :param model: model name placed in the request body.
    :return: the request dict (``model`` and ``messages``). ``messages`` always
        ends with a user turn (empty on the last round), so the model's final
        reply is ``messages[-2]``.
    """
    goal_prompt = example_json['question']
    actions_str = format_choices(example_json['actions'])

    cached = example_json['api_cached_result']
    # keyword searched in the reply -> cached result as text. Every value goes
    # through str() so a non-string cached result cannot break concatenation.
    # Insertion order is the order in which results are listed in the answer.
    api_results = {
        "detect_traffic_sign": str(cached['detect_traffic_sign']),
        "object_detection": str(cached['object_detection']),
        "caption": str(cached['caption']),
        "weather_detection": str(cached['weather_detection']),
        "ocr": str(cached['ocr']),
    }

    request = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": start_prompt
            },
            {
                "role": "assistant",
                "content": "Sure, please provide me with the description of the valid APIs and your question."
            },
            {
                "role": "user",
                "content": api_prompt
            },
            {
                "role": "assistant",
                "content": "Thank you for providing the descriptions of the valid APIs. Please go ahead and ask your question so that I can assist you in selecting the best action based on the image."
            },
            {
                "role": "user",
                "content": "Question: {} Actions: {}".format(goal_prompt,actions_str)
            },
        ]
    }

    # APIs already answered; each one is served at most once per dialogue.
    answered = set()

    while True:
        model_response_json = _post_chat_completion(request)
        print(model_response_json)
        request['messages'].append(model_response_json)

        # Guard against a null content field so the substring checks cannot raise.
        content = model_response_json.get('content') or ""

        requested = [name for name in api_results
                     if name in content and name not in answered]
        api_response = "".join(
            "{}() = {}\n".format(name, api_results[name]) for name in requested
        )
        answered.update(requested)

        # Appended even when empty: the dialogue must end with a user turn
        # because the final answer is read from messages[-2].
        request['messages'].append({
            "role":"user",
            "content":api_response
        })

        if not requested:
            break

    return request

In [10]:
# Inspect the first record to see the fields chat_traffic reads
# (question, actions, api_cached_result).
examples[0]

{'version': 1.0,
 'domain': 'Autonomous Driving',
 'index': 0,
 'image': 'traffic_000.jpg',
 'question': 'You are a driving assistant. The car is at the speed of 50 km/h. Based on current image, what is the best action to take?',
 'actions': ['Slow down',
  'keep driving',
  'Stop the car',
  'Change to other lane',
  'Speed up'],
 'answer_index': 5,
 'reason': 'Based on the image, the lowest speed for current lane is 70km/h, current speed is 50km/h, which is lower than the limit, so you need to speed up.',
 'key_concept': ['Minimum Speed Limit 70 km/h', 'clear road'],
 'api_cached_result': {'detect_traffic_sign': ['Maximum Speed Limit 100 km/h',
   'Minimum Speed Limit 70 km/h'],
  'object_detection': [['car', 0], ['people', 0], ['bicycle', 0]],
  'caption': 'a car driving down a highway with signs on it',
  'weather_detection': 'clear',
  'ocr': 'None'}}

In [25]:
# Accumulator for the full request/dialogue dict returned for each example.
model_answers = list()

In [None]:
# Only process examples that have no dialogue yet. On a fresh run this is the
# whole dataset; after an interrupt it resumes where it stopped, and re-running
# the finished cell is a no-op instead of appending duplicate dialogues.
for example in tqdm(examples[len(model_answers):]):
  model_answers.append(chat_traffic(example,MODEL_TYPE))

In [None]:
# Persist every complete dialogue (request body with all turns) as indented JSON.
with open("traffic_chatgpt_3.5_answer_full_dialog.json","w") as dialog_file:
  dialog_file.write(json.dumps(model_answers,indent=4))

In [19]:
# Every example must have produced exactly one dialogue.
assert len(examples) == len(model_answers)

# write answers
# Each dialogue ends with a user turn, so messages[-2] holds the model's last reply.
# "index" is the position in model_answers, not the record's own 'index' field.
outputs = [
  {"index":position,"model_output":dialog['messages'][-2]['content']}
  for position,dialog in enumerate(model_answers)
]

In [20]:
# Compact per-example answers; this file is used for automatic evaluation.
with open("traffic_chatgpt_3.5_answer.json","w") as answer_file:
  answer_file.write(json.dumps(outputs,indent=4))