In [1]:
# !pip install numpy
# !pip install openai==0.28.0
# !pip install opencv-python

In [24]:
import prior
import openai

from integrated_agent import Agent, ACTIONS
from leolani_client import LeolaniChatClient, Action
from ai2thor.controller import Controller
from ipywidgets import Text, Button, Output, VBox, HBox
from IPython.display import display

In [25]:
# Load scene
#dataset = prior.load_dataset("procthor-10k")
#house = dataset["train"][11]

In [26]:
# Load the ProcTHOR-10K dataset, then overwrite the child at index 2 of
# object 9 in training house 5808 with a laptop entry before selecting
# that house as the scene to run.
dataset = prior.load_dataset("procthor-10k")

laptop_position = {
    'x': 5.308516502380371,
    'y': 0.960530161857605,
    'z': 3.317396640777588,
}
laptop_rotation = {'x': -0.0, 'y': 0.0, 'z': 0.0}
laptop_entry = {
    'assetId': 'Laptop_13',
    'id': "Laptop|surface|10|71",
    'kinematic': False,
    'openness': 0,
    'position': laptop_position,
    'rotation': laptop_rotation,
    'layer': 'Procedural1',
}

# Same lookup path as before: the house is fetched from the dataset again
# after the replacement has been written.
dataset["train"][5808]["objects"][9]["children"][2] = laptop_entry
house = dataset["train"][5808]

    pip install --upgrade ai2thor
Alternatively, to downgrade to the old version of ProcTHOR-10K, run:
   prior.load_dataset("procthor-10k", revision="ab3cacd0fc17754d4c080a3fd50b18395fae8647")


Loading train: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:02<00:00, 4874.99it/s]
Loading val: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 5163.10it/s]
Loading test: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 4644.40it/s]


In [27]:
# Read the OpenAI API key from the local file 'openaikey.txt' (surrounding
# whitespace removed) and hand it to the openai module.
with open('openaikey.txt') as key_file:
    raw_key = key_file.read()
api_key = raw_key.strip()
openai.api_key = api_key

In [28]:
# Build the one AI2-THOR controller used by the rest of the notebook.
# The settings are collected first and then passed on as keyword arguments.
controller_settings = dict(
    scene=house,
    visibilityDistance=10,
    width=750,
    height=750,
)
controller = Controller(**controller_settings)

In [30]:
# Where the dialogue scenario is stored and how the two speakers are labelled.
emissor_path = "./emissor"
HUMAN = "User"
AGENT = "AI2Thor"

# Client that records the dialogue, and the agent that acts through the
# controller created in an earlier cell.
leolaniClient = LeolaniChatClient(emissor_path=emissor_path, agent=AGENT, human=HUMAN)
agent = Agent(controller=controller)

actions_available = ", ".join(ACTIONS)
initial_utterance = f"Hi, I am your navigational agent. I can describe what I see, describe an exact object, move backward or forward, turn left or right. I can teleport to another room if you tell me to teleport or change room to explore. If you thought you found some interesting object from my description, please ask me to find the object in my view, and I'll return you the object id. These are the actions I can perform for you: {actions_available}."
initial_request = "Please describe in as much detail the room you see in the image shown. Also, describe the object you hope to find."

# Turn counters; they are updated here and by the Send-button handler.
human_turn_count = 0
agent_turn_count = 0

# Widgets that make up the chat interface.
output_area = Output()
user_input = Text(
    value='',
    placeholder='Type your message here...',
    description='User:',
    disabled=False,
    layout={'width': '50%'}
)
send_button = Button(description="Send", button_style='success')

# Opening messages. Each one is printed, recorded with the dialogue client
# and counted as an agent turn, so the saved scenario and agent_turn_count
# stay in agreement.
for opening_message in (initial_utterance, initial_request):
    with output_area:
        print(AGENT + ">" + opening_message)
    leolaniClient._add_utterance(AGENT, opening_message)
    agent_turn_count += 1  # Agent turn

# Conversation state shared with the Send-button handler.
conversation_active = True
human_description_stored = False

def on_send_clicked(b):
    """Handle a click on the Send button.

    Reads the text typed by the user, echoes and records it, and then does
    one of three things:

    * "stop", "bye" or "exit" (case-insensitive): stop the controller, save
      the scenario and disable the input widgets.
    * first message of the conversation: store it as the human description,
      let the agent take a 360 view, describe it and report the confidence
      returned by ``agent.compare_descriptions``.
    * any later message: forward it to ``agent.process_instruction`` and
      record the answers, perceptions and actions the agent collected.

    Empty input (after stripping whitespace) is ignored.

    Args:
        b: The argument passed by the button click callback; it is not used.

    Side effects:
        Updates the module-level ``human_turn_count``, ``agent_turn_count``,
        ``conversation_active`` and ``human_description_stored``.
    """
    # The counters are rebound with ``+=`` below, so they must be declared
    # global; otherwise the first increment raises UnboundLocalError.
    global conversation_active, human_description_stored
    global human_turn_count, agent_turn_count

    utterance = user_input.value.strip()
    user_input.value = ""
    if not utterance:
        return

    # Human utterance
    with output_area:
        print(HUMAN + ">" + utterance)
    leolaniClient._add_utterance(HUMAN, utterance)
    human_turn_count += 1  # Human turn

    if utterance.lower() in ["stop", "bye", "exit"]:
        conversation_active = False
        agent.controller.stop()
        leolaniClient._save_scenario()
        with output_area:
            print("Scenario saved and interaction ended.")
        # Block any further input once the scenario has been saved.
        send_button.disabled = True
        user_input.disabled = True
        return

    if not human_description_stored:
        # The first user response is the human description
        agent._human_description = utterance
        human_description_stored = True
        acknowledgement = "Thank you! I have stored your description. Maybe you want me to start navigation?"
        with output_area:
            print(AGENT + "> " + acknowledgement)
        leolaniClient._add_utterance(AGENT, acknowledgement)
        agent_turn_count += 1  # Agent turn

        # Now perform a 360 view, describe, and show confidence
        panorama_path = agent.perform_360_view()  # increments actions inside
        description = agent.describe_image_with_gpt(panorama_path)
        confidence_level = agent.compare_descriptions(description, agent._human_description)
        reply = f"In the room I started, I see the following: {description}.\nI think there is a possibility of '{confidence_level}'% that the object shows in my current view.\nWhat would you like to do next?"
        with output_area:
            print(AGENT + ">" + reply)
        leolaniClient._add_utterance(AGENT, reply)
        agent_turn_count += 1  # Agent turn

        return

    # For subsequent utterances, process normally
    agent.process_instruction(utterance)

    for ans in agent._answers:
        with output_area:
            print(AGENT + ">" + ans)
        leolaniClient._add_utterance(AGENT, ans)
        agent_turn_count += 1  # Agent turn for each response

    # Each perception is unpacked into four values; the first one is indexed
    # by 'name' when it is handed to the client.
    for obj, objectType, coord, image in agent._perceptions:
        leolaniClient._add_image(obj['name'], objectType, coord, image)

    for action in agent._actions:
        leolaniClient._add_action(action)

    # Clear for next round
    agent._answers.clear()
    agent._perceptions.clear()
    agent._actions.clear()

# Register the click handler on the Send button, then render the input box,
# the button and the output area stacked vertically.
send_button.on_click(on_send_clicked)
chat_panel = VBox([user_input, send_button, output_area])
display(chat_panel)

VBox(children=(Text(value='', description='User:', layout=Layout(width='50%'), placeholder='Type your message …

In [None]:
# Report how many turns each side used, their sum, and the agent's action
# count as exposed by agent.actions.
total_turn_count = human_turn_count + agent_turn_count

print(f"Number of dialogue turns used by human: {human_turn_count}.")
print(f"Number of dialogue turns used by agent: {agent_turn_count}.")
print(f"Total number of dialogue turns used: {total_turn_count}.")
print(f"Number of actions undertaken by the agent to search for the object {agent.actions}.")

In [None]:
# Start with an empty mapping; results are stored in it under one key per run.
mydict = dict()

In [None]:
# Snapshot the current counters and the agent's action count under 'test_1'.
test_1_stats = dict(
    human_turn_count=human_turn_count,
    agent_turn_count=agent_turn_count,
    total_turn_count=human_turn_count + agent_turn_count,
    action_count=agent.actions,
)
mydict['test_1'] = test_1_stats