In [3]:
import json
from openai import OpenAI
import plotly.graph_objects as go
import kaleido as kaleido
from nltk.corpus import stopwords
from collections import Counter
import nltk
import random
import os
import pandas as pd

# Default figure export size in pixels (width, height)
resolution_x, resolution_y = 3000, 1000

In [4]:
'''extract the data from the json files in each experiment folder and store it in a pandas dataframe'''


# Root folder that contains one sub-folder per experiment run
experiments_folder = 'logs'

# Full paths of the experiment sub-folders. os.listdir returns entries in
# arbitrary order, so sort them to keep the row order of `df` stable across runs.
experiment_folders = sorted(
    os.path.join(experiments_folder, exp)
    for exp in os.listdir(experiments_folder)
    if os.path.isdir(os.path.join(experiments_folder, exp))
)

print(len(experiment_folders))

# Message types copied from each step's "logs" object into the dataframe
message_fields = ("action", "plan", "memory", "observation", "task")
columns = ["exp_name", "scenario", "step", *message_fields]

data = []
for folder in experiment_folders:
    # get the log file for each experiment
    log_file = os.path.join(folder, "logs.json")
    # keep only the folder name (basename also handles Windows path separators)
    folder = os.path.basename(folder)
    # get the scenario name from the folder name. remove the `exp_` prefix and the -15-02-11 suffix
    scenario = folder.split("_")[1]
    scenario = scenario[:-9]
    if not os.path.exists(log_file):
        continue

    with open(log_file, "r") as f:
        logs = json.load(f)

    for step, step_data in logs.items():
        step_logs = step_data["logs"]
        entry = {"exp_name": folder, "scenario": scenario, "step": step}
        # A message type that was not logged for this step is stored as None
        for field in message_fields:
            entry[field] = step_logs[field]["message"] if field in step_logs else None
        data.append(entry)

# Passing `columns` keeps the expected columns even when no log entry was found,
# so the column accesses below do not fail on an empty dataframe.
df = pd.DataFrame(data, columns=columns)


def _action_field(raw_action, key):
    """Return `key` from a JSON-encoded action message, or None when no action was logged."""
    return json.loads(raw_action)[key] if raw_action else None


# add a column for the action type and a column for the action value
df["action_type"] = df["action"].apply(lambda raw: _action_field(raw, "action"))
df["action_value"] = df["action"].apply(lambda raw: _action_field(raw, "value"))

100


In [5]:
import nltk
from nltk.corpus import stopwords
from collections import Counter
import plotly.graph_objects as go
import random

# Parameters for the top-words figure
message_type = "observation"  # dataframe column whose text is analysed
num_of_words = 200            # number of most frequent words to keep
# NOTE: this reassigns the export size set in the first cell (3000 x 1000);
# any cell executed afterwards that reads resolution_x / resolution_y sees these values.
resolution_x = 2000
resolution_y = 1500

# Fetch the NLTK stop-word corpus and keep the English list as a set
nltk.download('stopwords')
custom_stopwords = set(stopwords.words('english'))

# Scenarios present in the data and the marker colour used for each one
scenarios = df["scenario"].unique()
start_colors = dict(
    base="#66c5cc",
    male="#f6cf71",
    night="#f89c74",
    tokyo="#dcb0f2",
    winter="#87c55f",
)


def get_top_words(text, num_of_words, stop_words=None):
    """Count the most frequent words in `text`.

    The text is lower-cased and split on whitespace. A token is kept only if it
    is purely alphabetic (so tokens carrying digits or punctuation are dropped)
    and is not a stop word.

    Args:
        text: The text to analyse.
        num_of_words: Maximum number of words to return.
        stop_words: Optional collection of words to ignore. When omitted, the
            module-level `custom_stopwords` set is used.

    Returns:
        A dict mapping each word to its count, ordered from most to least frequent.
    """
    if stop_words is None:
        # Resolved at call time so the notebook-level stop-word set is picked up
        stop_words = custom_stopwords
    kept_tokens = (
        token for token in text.lower().split()
        if token.isalpha() and token not in stop_words
    )
    return dict(Counter(kept_tokens).most_common(num_of_words))


# Create figure: one scatter trace per scenario, x = word, y = word count
fig = go.Figure()

# Dedicated, seeded generator so the annotation offsets (and therefore the
# exported image) are identical on every run.
annotation_rng = random.Random(0)

# Neutral colour for a scenario that has no entry in `start_colors`
fallback_color = "#b3b3b3"

for scenario in scenarios:
    text = " ".join(df[df["scenario"] == scenario][message_type].dropna())
    top_words = get_top_words(text, num_of_words)
    color = start_colors.get(scenario, fallback_color)
    # Split "#rrggbb" into its three channels for the rgba() marker colour
    red, green, blue = (int(color[pos:pos + 2], 16) for pos in (1, 3, 5))

    fig.add_trace(go.Scatter(
        x=list(top_words.keys()),
        y=list(top_words.values()),
        mode='markers',
        marker=dict(
            size=20,
            opacity=0.75,
            line=dict(width=1.5, color='grey'),
            color=f'rgba({red}, {green}, {blue}, 0.75)'
        ),
        name=scenario
    ))

    # Add annotations: label every 25th word plus the most frequent word(s).
    # The maximum is computed once per scenario instead of once per word.
    max_count = max(top_words.values(), default=0)
    for i, (word, count) in enumerate(top_words.items()):
        if i % 25 == 0 or count == max_count:
            # Random offset so labels of neighbouring points overlap less
            rnd = annotation_rng.randint(100, 240)
            fig.add_annotation(
                x=word, y=count,
                text=f"{word} ({count})",
                showarrow=True,
                font=dict(size=16, color='gray'),
                ax=- rnd,
                ay=- rnd,
                arrowwidth=0.2
            )

# Update layout
fig.update_layout(
    title=f"Top {num_of_words} terms in '{message_type}' by scenario",
    title_font=dict(size=40),
    xaxis_categoryorder='total ascending',
    legend_title="Scenarios",
    yaxis=dict(side='right'),
    yaxis_tickfont=dict(size=20),
    xaxis=dict(tickangle=-45, tickfont=dict(size=20)),
    hovermode="closest",
    plot_bgcolor="white",
    width=resolution_x,
    height=resolution_y,
    legend=dict(
        font=dict(size=32),
        x=0.05, y=0.99,
        xanchor='left', yanchor='top',
        bordercolor='grey'
    )
)

# Save and show figure
fig.write_image("ta-top-words.png",
                width=resolution_x, height=resolution_y, engine="kaleido")
fig.show()

[nltk_data] Downloading package stopwords to /Users/kai/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# FIND THE MAIN TOPICS FOR THE WORDS IN THE PLANS
 df['plan'][0] is: '{"thought":"The subway station sign is directly in front of me, and with the compass indicating a clear path, I should move forward without any immediate distractions or obstacles in my way."}'

- create an object with all 'plan' messages, after removing the '{"thought":"..."}' part



# The provided words can be clustered into the following main topics:

1. **Navigation and Direction**: This topic encompasses words related to guiding oneself through an environment, including "navigate," "direction," "path," "route," "forward," "turn," and "move."

2. **Urban Environment**: This category focuses on words related to the city landscape and surroundings, such as "subway," "station," "building," "area," "environment," "urban," and "surroundings."

3. **Movement and Progression**: This topic includes words that pertain to moving from one place to another, including "proceed," "reach," "moving," "steps," "continue," and "advancing."

4. **Clarity and Visibility**: This theme encompasses words that pertain to seeing and understanding routes or markers, including "clear," "visible," "sign," "indicates," "appears," and "reference."

5. **Obstacles and Challenges**: This category consists of terms that describe barriers or difficulties encountered in navigation, such as "obstacles," "blocking," "safe," "significant," and "efficient."

These clusters reflect the main themes present in the words associated with navigating an urban environment, focusing on direction, movement, visibility, and potential challenges.


{
    "predicated_topic": {
        "subway": "Urban Environment",
        "station": "Urban Environment",
        "left": "Direction",
        "towards": "Navigation and Direction",
        "sign": "Clarity and Visibility",
        "forward": "Movement and Progression",
        "directly": "Clarity and Visibility",
        "path": "Navigation and Direction",
        "right": "Direction",
        "clear": "Clarity and Visibility",


    },
    "top_words": {
        "subway": 376,
        "station": 307,
        "left": 263,
        "towards": 208,
        "sign": 200,
        "forward": 188,
        "directly": 187,
        "path": 164,
        "right": 157,
        "clear": 136,
    }
}

In [None]:
# Read the OpenAI API key from the environment instead of writing it into the
# notebook: export OPENAI_API_KEY before starting the kernel. The value is None
# when the variable is not set.
api_key = os.environ.get("OPENAI_API_KEY")

def analyze_plans(df, api_key, num_of_words=500):
    """Ask the chat model to cluster the most frequent plan words into topics.

    Args:
        df: Dataframe with a 'plan' column holding JSON strings that carry a
            "thought" key (falsy entries are ignored).
        api_key: Key used to create the OpenAI client.
        num_of_words: Number of most frequent words sent to the model.

    Returns:
        The text content of the model's first completion choice.
    """
    def _thought_of(raw_plan):
        # Falsy entries (no plan logged) become None and are dropped below
        return json.loads(raw_plan)["thought"] if raw_plan else None

    thoughts = df['plan'].apply(_thought_of).dropna()

    # Most frequent words over all thoughts joined into a single text
    frequent_words = get_top_words(" ".join(thoughts), num_of_words=num_of_words)

    system_prompt = '''
                    - You are an expert NLP researcher.
                    - You are given a list of words.
                    - Your task is to cluster these words into no more than 5 topics that capture the main themes of the plans.
                    - Respond with the topics you identified. 
                
                    '''
    user_prompt = "find the main topics for these words: " + \
        ", ".join(list(frequent_words.keys()))

    completion = OpenAI(api_key=api_key).chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )

    # Only the text of the first choice is returned
    return completion.choices[0].message.content


# Example call of analyze_plans, left disabled:
# topics = analyze_plans(df, api_key)
# print(topics)

# OpenAI client used by the topic-matching step
client = OpenAI(api_key=api_key)

# Topics for the plans, as inferred by the model
plan_topics = [
    "Navigation and Direction", "Urban Environment",
    "Movement and Progression", "Clarity and Visibility",
    "Obstacles and Challenges",
]

message_type = "plan"   # message column whose words are matched to topics
num_of_words = 100      # how many top words per scenario are matched
folder = "topics"       # output folder for the per-scenario JSON files

# Distinct scenario names found in the dataframe
scenarios = df["scenario"].unique()


def match_topic_to_top_words(words):
    """Ask the chat model to assign each word to one of `plan_topics`.

    Args:
        words: Iterable of word strings to classify.

    Returns:
        The model's JSON reply decoded into a Python object; the prompt asks
        for a {"word": "topic", ...} mapping.
    """
    system_prompt = '''
                - You are an expert in Natural Language Processing (NLP) with a PhD in Linguistics. Your approach is meticulous and detail-oriented.
                - You will receive a list of words and a corresponding list of topics. Your task is to accurately match each word to one specific topic that best represents its meaning.

                Instructions:
                - For each word provided, identify the single most appropriate topic from the list.
                - Respond with a JSON object where each key is a word and its corresponding value is the matched topic. The format should be: {"word1": "topic1", "word2": "topic2", ...}.
                
                Important Notes:
                - Each word must be matched to exactly one topic only.
                - Maintain the original wording of both the words and the topics; do not use synonyms or variations.
                '''
    user_prompt = f"match the words to the topics: {', '.join(words)} to {', '.join(plan_topics)}"

    # The request asks for a JSON object so the reply can be decoded directly
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        response_format={"type": "json_object"}
    )

    return json.loads(completion.choices[0].message.content)


# For every scenario: find the top words of its `message_type` messages, ask the
# model for a topic per word, and save both to a JSON file in `folder`.

# Create the output folder once, up front (no error if it already exists)
os.makedirs(folder, exist_ok=True)

for scenario in scenarios:

    # Get all messages of the selected type for the scenario
    type_messages = df[df["scenario"] == scenario][message_type].dropna()

    # Join all messages into a single text
    text = " ".join(type_messages)

    # Get the top words
    top_words = get_top_words(text, num_of_words=num_of_words)

    # Get the topics from OpenAI; skip the request when there is nothing to match
    predicated_topic = match_topic_to_top_words(
        list(top_words.keys())) if top_words else {}

    # Bundle the word -> topic mapping together with the word -> count mapping
    top_words_with_topic = {
        "predicated_topic": predicated_topic, "top_words": top_words}

    # save the top words to a file
    output_path = os.path.join(folder, f"{scenario}_top_words_with_topic.json")
    with open(output_path, "w") as f:
        json.dump(top_words_with_topic, f, indent=4)

In [123]:
import os
import json
import plotly.graph_objects as go

# Load all the JSON files written by the topic-matching step and flatten them
# into one record per (scenario, word).
scenarios = []
all_topics = set()
all_data = []

# Palette: one colour slot per topic
start_colors = {
    "topic_1": "#66c5cc", "topic_2": "#f6cf71", "topic_3": "#f89c74",
    "topic_4": "#dcb0f2", "topic_5": "#87c55f"
}

# The JSON files are read from a directory named "topics". os.listdir returns
# entries in arbitrary order, so sort for a reproducible record order.
json_files = sorted(f for f in os.listdir("topics") if f.endswith(".json"))

# Words the model returned that have no entry in the file's "top_words" counts
skipped_words = []

for file in json_files:
    # Derive the scenario name from the file name. Both the
    # "predicated_topic_for_<scenario>.json" and the
    # "<scenario>_top_words_with_topic.json" naming schemes are handled.
    scenario = (file.replace("predicated_topic_for_", "")
                .replace("_top_words_with_topic", "")
                .replace(".json", ""))
    scenarios.append(scenario)

    with open(os.path.join("topics", file), "r") as f:
        data = json.load(f)

    predicated_topic = data["predicated_topic"]
    top_words = data["top_words"]

    for word, topic in predicated_topic.items():
        # The model may alter a word; without a count it cannot be plotted
        if word not in top_words:
            skipped_words.append((scenario, word))
            continue
        all_data.append({
            "scenario": scenario,
            "word": word,
            "topic": topic,
            "count": top_words[word]
        })
        all_topics.add(topic)

if skipped_words:
    print(f"Skipped {len(skipped_words)} word(s) without a count: {skipped_words}")

# Ensure we have exactly 5 topics
assert len(all_topics) == 5, f"Expected 5 topics, but found {len(all_topics)}"

# Create a fixed color mapping for the 5 topics. Set iteration order for strings
# can change between interpreter runs, so sort to give each topic a stable colour.
topic_colors = {topic: start_colors[f"topic_{i+1}"]
                for i, topic in enumerate(sorted(all_topics))}

# Create the scatter plot: one trace per topic, x = word, y = word count
fig = go.Figure()

# Group the records by topic in a single pass instead of rescanning per topic
items_by_topic = {}
for item in all_data:
    items_by_topic.setdefault(item["topic"], []).append(item)

# Iterate in sorted order: iterating the set directly can yield a different
# order on each interpreter run, which would reorder the traces and the legend.
for topic in sorted(all_topics):
    topic_data = items_by_topic[topic]
    fig.add_trace(go.Scatter(
        x=[item["word"] for item in topic_data],
        y=[item["count"] for item in topic_data],
        mode='markers',
        name=topic,
        marker=dict(size=20, opacity=0.75, line=dict(width=1, color='grey'), color=topic_colors[topic]),
    ))

    # Label a point when its count is the topic's maximum or is one of
    # 10, 30, 50, ... below that maximum
    max_count = max(item["count"] for item in topic_data)
    labelled_counts = range(10, max_count, 20)
    for item in topic_data:
        if item["count"] == max_count or item["count"] in labelled_counts:
            fig.add_annotation(
                x=item["word"],
                y=item["count"],
                text=item["word"],
                showarrow=True,
                font=dict(size=24, color='gray'),
                ax=-100,
                ay=-100,
                arrowwidth=0.5,
            )

# Update layout
fig.update_layout(
    plot_bgcolor='white',
    title="Topical Distribution of Key Terms Across Scenarios",
    title_font=dict(size=40),
    width=resolution_x,
    height=resolution_y,
    yaxis=dict(tickfont=dict(size=16)),
    xaxis=dict(
        gridcolor='rgba(0,0,0,.1)',
        autorange='reversed',
        tickangle=-45,
        tickfont=dict(size=20),
        # NOTE(review): the top-words figure sets side='right' on the y axis;
        # here it is set on the x axis - confirm which axis was intended.
        side='right',
    ),
    legend=dict(
        x=0.05,
        y=1,
        xanchor='left',
        yanchor='top',
        bgcolor='rgba(255,255,255,1)',
        bordercolor='rgba(0,0,0,.1)',
        borderwidth=2,
        font=dict(size=24)
    ),
)

# Show the figure
fig.show()

# Save the figure as an image
fig.write_image("ta-topical.png",
                width=resolution_x, height=resolution_y, engine="kaleido")
