In [1]:
# Uncomment to install the pinned dependencies into the kernel's environment.
# %pip install numpy
# %pip install openai==0.28.0
# %pip install opencv-python

In [2]:
import prior
import openai

from integrated_agent import Agent, ACTIONS
from leolani_client import LeolaniChatClient, Action
from ai2thor.controller import Controller
from ipywidgets import Text, Button, Output, VBox, HBox
from IPython.display import display

In [3]:
# Load scene: fetch the ProcTHOR-10K dataset, then pick one house from its "train" split.
dataset = prior.load_dataset("procthor-10k")
train_houses = dataset["train"]
house = train_houses[11]

    pip install --upgrade ai2thor
Alternatively, to downgrade to the old version of ProcTHOR-10K, run:
   prior.load_dataset("procthor-10k", revision="ab3cacd0fc17754d4c080a3fd50b18395fae8647")


Loading train: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:01<00:00, 5428.72it/s]
Loading val: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 5642.71it/s]
Loading test: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 5629.17it/s]


In [4]:
# Setup OpenAI key
# The key lives in a local text file so it never appears in the notebook itself.
# 'utf-8-sig' reads plain ASCII/UTF-8 unchanged and drops a leading byte-order
# mark, which some editors add and which would otherwise end up inside the key.
with open('openaikey.txt', encoding='utf-8-sig') as f:
    api_key = f.read().strip()

# Fail here with a clear message rather than later with an opaque
# authentication error on the first API request.
if not api_key:
    raise ValueError("openaikey.txt is empty; it must contain your OpenAI API key.")

openai.api_key = api_key

In [5]:
# Create a single controller instance.
# The settings are gathered in one mapping so they can be inspected or tweaked
# in one place before the simulator is started with the selected house.
controller_settings = {
    "scene": house,
    "visibilityDistance": 3,
    "width": 750,
    "height": 750,
}
controller = Controller(**controller_settings)

In [6]:
# Conversation configuration: the path handed to the Leolani client and the
# speaker labels used both in the transcript and in the logged utterances.
emissor_path = "./emissor"
HUMAN = "User"
AGENT = "AI2Thor"

leolaniClient = LeolaniChatClient(emissor_path=emissor_path, agent=AGENT, human=HUMAN)
agent = Agent(controller=controller)

actions_available = ", ".join(ACTIONS)
initial_utterance = f"This is what I can do: {actions_available}"
description_prompt = "Please describe in one sentence what you see in the image shown."

# Counts every utterance (human and agent) made so far.
turn_count = 0

# Widgets: the transcript area, the text box and the send button.
output_area = Output()
user_input = Text(
    value='',
    placeholder='Type your message here...',
    description='User:',
    disabled=False,
    layout={'width': '50%'}
)
send_button = Button(description="Send", button_style='success')

# Print initial message
with output_area:
    print(AGENT + ">" + initial_utterance)
leolaniClient._add_utterance(AGENT, initial_utterance)
turn_count += 1  # Agent turn

# Ask the human for initial description.
# This is an agent turn like any other, so it is shown under the AGENT label
# and logged with the client; otherwise the saved scenario would lack the
# question that the first human utterance answers.
with output_area:
    print(AGENT + ">" + description_prompt)
leolaniClient._add_utterance(AGENT, description_prompt)
turn_count += 1  # Agent turn

# Conversation state read and updated by the send-button handler.
conversation_active = True
human_description_stored = False

def _show_error(context, exc):
    """Print a failure notice into the chat transcript so it is not lost."""
    with output_area:
        print(f"[error] {context}: {type(exc).__name__}: {exc}")


def on_send_clicked(b):
    """Handle a click on the Send button: one human turn plus the agent's reaction.

    The text in ``user_input`` is taken as the human utterance, echoed to
    ``output_area``, logged with ``leolaniClient`` and counted in ``turn_count``.
    What happens next depends on the conversation state:

    * "stop", "bye" or "exit" (case-insensitive) ends the conversation: the
      widgets are disabled, the controller is stopped and the scenario saved.
    * The first other utterance is stored as the human description; the agent
      then takes a 360 view, describes it and reports how similar the two
      descriptions are.
    * Any later utterance is passed to ``agent.process_instruction``; the
      answers, perceptions and actions the agent collected are logged and the
      agent's buffers are cleared for the next round.

    Failures in the agent or the controller are printed to the transcript
    instead of propagating: this function runs as a widget callback, where an
    uncaught exception may never become visible in the notebook.

    Args:
        b: The button instance passed by ``Button.on_click``; unused.
    """
    global conversation_active, human_description_stored, turn_count

    # Nothing to do once the conversation has been closed.
    if not conversation_active:
        return

    utterance = user_input.value.strip()
    user_input.value = ""
    if not utterance:
        return

    # Human utterance
    with output_area:
        print(HUMAN + ">" + utterance)
    leolaniClient._add_utterance(HUMAN, utterance)
    turn_count += 1  # Human turn

    if utterance.lower() in ["stop", "bye", "exit"]:
        conversation_active = False
        send_button.disabled = True
        user_input.disabled = True
        # A failing shutdown of the simulator must not prevent the scenario
        # from being saved.
        try:
            agent.controller.stop()
        except Exception as exc:
            _show_error("Could not stop the controller", exc)
        leolaniClient._save_scenario()
        with output_area:
            print("Scenario saved and interaction ended.")
        return

    if not human_description_stored:
        # The first user response is the human description
        agent._human_description = utterance
        human_description_stored = True
        with output_area:
            print(AGENT + "> Thank you! I have stored your description.")
        leolaniClient._add_utterance(AGENT, "Thank you! I have stored your description.")
        turn_count += 1  # Agent turn

        # Now perform a 360 view, describe, and show confidence
        try:
            panorama_path = agent.perform_360_view()  # increments actions inside
            description = agent.describe_image_with_gpt(panorama_path)
            confidence_level = agent.compare_descriptions(description, agent._human_description)
        except Exception as exc:
            _show_error("Could not describe the scene", exc)
            return

        reply = f"I see: {description}.\nMy similarity confidence with your description is {confidence_level}%.\nWhat would you like to do next?"
        with output_area:
            print(AGENT + ">" + reply)
        leolaniClient._add_utterance(AGENT, reply)
        turn_count += 1  # Agent turn

        return

    # For subsequent utterances, process normally
    try:
        agent.process_instruction(utterance)
    except Exception as exc:
        # Report the failure, then still log whatever the agent collected
        # before it failed.
        _show_error("Could not process the instruction", exc)

    try:
        for ans in agent._answers:
            with output_area:
                print(AGENT + ">" + ans)
            leolaniClient._add_utterance(AGENT, ans)
            turn_count += 1  # Agent turn for each response

        for obj, objectType, coord, image in agent._perceptions:
            leolaniClient._add_image(obj['name'], objectType, coord, image)

        for action in agent._actions:
            leolaniClient._add_action(action)
    finally:
        # Clear for next round, even if logging failed, so stale items are
        # not replayed after the next instruction.
        agent._answers.clear()
        agent._perceptions.clear()
        agent._actions.clear()

# Hook the handler up to the button, then render the chat UI:
# text box on top, send button below it, transcript at the bottom.
send_button.on_click(on_send_clicked)
chat_ui = VBox(children=[user_input, send_button, output_area])
display(chat_ui)

VBox(children=(Text(value='', description='User:', layout=Layout(width='50%'), placeholder='Type your message …