In [None]:
# Notebook setup: imports, working directory and global matplotlib settings.
import os
import json
from collections import Counter, defaultdict

# Switch to the parent directory; every relative path used further down
# ("tasks/...", "./code/...", "./figures/...") is resolved from there.
# NOTE: this is relative, so re-running this cell without restarting the
# kernel moves up one more directory each time.
os.chdir("..")
# NOTE(review): MatterSim is imported only after the chdir above - presumably
# it has to be resolved from the parent directory; confirm before reordering.
import MatterSim
import numpy as np
import cv2
import matplotlib as mpl
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns

# Show the directory the notebook is now running from.
print(os.getcwd())
mpl.rcParams['pdf.fonttype'] = 42  # Use TrueType fonts in PDF
mpl.rcParams['ps.fonttype'] = 42  # Same font type for PostScript output
mpl.rcParams['figure.dpi'] = 300  # Higher DPI for better rendering
mpl.rcParams['savefig.dpi'] = 300  # Default DPI for saved figures (individual savefig calls may override it)

## Low-Level Action Space

In [None]:
dataset = "v3-full"


def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with open(path, encoding="utf-8") as file:
        return json.load(file)


# Original R2R annotation files.
r2r_train = _read_json("tasks/R2R/data/R2R_train.json")
r2r_unseen = _read_json("tasks/R2R/data/R2R_val_unseen.json")
r2r_seen = _read_json("tasks/R2R/data/R2R_val_seen.json")

# Project dataset splits for the selected dataset version.
train_data = _read_json(f"./code/dataset_{dataset}/train/train_data.json")
val_data = _read_json(f"./code/dataset_{dataset}/val/val_data.json")
test_data = _read_json(f"./code/dataset_{dataset}/test/test_data.json")

# Number of distinct paths in the original R2R file each project split is
# compared against: "Val" uses R2R val_seen, "Test" uses R2R val_unseen.
original_r2r_paths = {
    "Train": len({entry["path_id"] for entry in r2r_train}),
    "Val": len({entry["path_id"] for entry in r2r_seen}),
    "Test": len({entry["path_id"] for entry in r2r_unseen}),
}

def avg_steps_per_path(data):
    """Return the mean number of samples per distinct ``path_id``.

    Args:
        data: Iterable of dict-like samples, each with a ``"path_id"`` key.

    Returns:
        float: Number of samples divided by the number of distinct path ids,
        rounded to two decimals; ``0.0`` when ``data`` contains no samples.
    """
    steps_per_path = Counter(item["path_id"] for item in data)
    if not steps_per_path:
        # An empty split has no paths; report 0.0 instead of dividing by zero.
        return 0.0
    return round(sum(steps_per_path.values()) / len(steps_per_path), 2)

# Per-split statistics
def compute_stats(data):
    """Summarise one split in a single pass over its samples.

    Args:
        data: List of dict samples with ``"path_id"``, ``"scan"`` and
            ``"gold_label"`` keys.

    Returns:
        tuple: ``(num_samples, num_paths, action_counts, unique_environments,
        avg_steps)``. ``action_counts`` is a ``Counter`` over the gold labels
        in which "Stop Navigation" is shortened to "Stop"; ``avg_steps`` is
        whatever ``avg_steps_per_path(data)`` returns.
    """
    path_ids = set()
    scans = set()
    action_counts = Counter()
    for sample in data:
        path_ids.add(sample["path_id"])
        scans.add(sample["scan"])
        action_counts[sample["gold_label"].replace("Stop Navigation", "Stop")] += 1
    return len(data), len(path_ids), action_counts, len(scans), avg_steps_per_path(data)

# Summarise every split and collect the per-action shares used for plotting.
splits = {"Train": train_data, "Val": val_data, "Test": test_data}

summary_data, plot_data = [], []
total_action_counts = Counter()

for split_name, data in splits.items():
    num_samples, num_paths, action_counts, unique_environments, avg_steps = compute_stats(data)
    summary_data.append(
        {
            "Split": split_name,
            "Num Samples": num_samples,
            "Num Paths": num_paths,
            "Unique Environments": unique_environments,
            "Avg Steps Per Path": avg_steps,
        }
    )
    total_action_counts.update(action_counts)

    # Share of each action within this split, in percent.
    total = sum(action_counts.values())
    plot_data.extend(
        {"Split": split_name, "Action": action, "Percentage": (count / total) * 100}
        for action, count in action_counts.items()
    )

# Actions sorted from most to least frequent over all splits; the categorical
# dtype carries that order into the plot.
ordered_actions = [pair[0] for pair in total_action_counts.most_common()]
plot_df = pd.DataFrame(plot_data)
plot_df["Action"] = pd.Categorical(plot_df["Action"], categories=ordered_actions, ordered=True)

total_paths_used = sum(row["Num Paths"] for row in summary_data)

# Percentage of the matching original R2R split's paths that this split uses.
for row in summary_data:
    share = row["Num Paths"] / original_r2r_paths[row["Split"]]
    row["Data Usage (%)"] = round(share * 100, 2)

# Final table with a fixed column order.
column_order = [
    "Split",
    "Num Samples",
    "Num Paths",
    "Data Usage (%)",
    "Unique Environments",
    "Avg Steps Per Path",
]
summary_df = pd.DataFrame(summary_data)[column_order]

print("=== Dataset Summary ===")
print(summary_df.to_string(index=False))

In [None]:
# Seaborn look: white grid background with "paper" scaling.
sns.set_theme(style="whitegrid")
sns.set_context("paper")

# Colour palette for the splits; later plotting cells also read `palette`.
palette = sns.color_palette("colorblind")

# Grouped bar chart: one group per action, one bar per split.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=plot_df,
    x="Action",
    y="Percentage",
    hue="Split",
    palette=palette,
    edgecolor="black",
    ax=ax,
)

# Write the percentage just above every bar.
all_bars = (bar for container in ax.containers for bar in container)
for bar in all_bars:
    height = bar.get_height()
    centre_x = bar.get_x() + bar.get_width() / 2
    ax.annotate(
        f"{height:.1f}%",
        xy=(centre_x, height),
        xytext=(0, 3),  # 3 points above bar
        textcoords="offset points",
        ha="center",
        va="bottom",
        fontsize=12,
    )

ax.set_ylabel("Percentage (%)", fontsize=18)
ax.set_xlabel("Action", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
ax.legend(title="Split", fontsize=13, title_fontsize=14)
plt.tight_layout()
plt.savefig("./figures/action_distribution_full.pdf", bbox_inches="tight", dpi=700)
plt.show()

In [None]:
# Steps per path
def compute_path_lengths(data):
    """Count how many samples belong to each path.

    Args:
        data: Iterable of dict samples with a ``"path_id"`` key.

    Returns:
        list[int]: One count per distinct ``path_id``, ordered by the first
        appearance of that id in ``data``.
    """
    return list(Counter(sample["path_id"] for sample in data).values())

# Instruction lengths (whitespace-separated tokens)
def compute_instruction_lengths(data):
    """Return the token count of every instruction, counting each path once.

    Only the first sample seen for a given ``path_id`` contributes; later
    samples with the same id are skipped.

    Args:
        data: Iterable of dict samples with ``"path_id"`` and
            ``"instructions"`` (an iterable of strings) keys.

    Returns:
        list[int]: Number of whitespace-separated tokens per instruction.
    """
    visited = set()
    token_counts = []
    for sample in data:
        pid = sample["path_id"]
        if pid in visited:
            continue
        visited.add(pid)
        token_counts.extend(len(text.split()) for text in sample["instructions"])
    return token_counts

# Only the training split is analysed here.
splits = {"Train": train_data}

# One record per path (steps) and one record per instruction (token count).
path_length_data = [
    {"Split": split_name, "Steps per Path": n_steps}
    for split_name, data in splits.items()
    for n_steps in compute_path_lengths(data)
]
instruction_length_data = [
    {"Split": split_name, "Instruction Length": n_tokens}
    for split_name, data in splits.items()
    for n_tokens in compute_instruction_lengths(data)
]

path_length_df = pd.DataFrame(path_length_data)
instr_length_df = pd.DataFrame(instruction_length_data)

# Histogram of the number of steps per path.
fig, ax = plt.subplots(figsize=(9, 5))
sns.histplot(
    data=path_length_df,
    x="Steps per Path",
    hue="Split",
    bins=20,
    multiple="dodge",
    palette="Set2",
    edgecolor="black",
    ax=ax,
)
ax.set_title("Distribution of Steps per Path")
ax.set_xlabel("Number of Steps")
ax.set_ylabel("Count")
plt.tight_layout()
plt.show()

In [None]:
# Apply the seaborn theme before any drawing happens.
sns.set_theme(style="whitegrid")
sns.set_context("paper")

# Instruction lengths of the training split only.
train_instr_df = instr_length_df.loc[instr_length_df["Split"] == "Train"]
train_lengths = train_instr_df["Instruction Length"]

mean_length = train_lengths.mean()
median_length = train_lengths.median()

print(mean_length)
fig, ax = plt.subplots(figsize=(9, 5))
sns.histplot(
    data=train_instr_df,
    x="Instruction Length",
    bins=50,
    color=sns.color_palette("colorblind")[0],
    edgecolor="black",
    alpha=1.0,
    ax=ax,
)

# Dashed markers for mean and median; `palette` is the module-level palette
# defined in the action-distribution plotting cell.
for value, label, line_color in (
    (mean_length, "Mean", palette[1]),
    (median_length, "Median", palette[2]),
):
    ax.axvline(value, color=line_color, linestyle="--", linewidth=1.5, label=f"{label}: {value:.1f}")
ax.legend(fontsize=14)

# Labels and tick styling.
ax.set_xlabel("Instruction Length (whitespace tokens)", fontsize=18)
ax.set_ylabel("Count", fontsize=18)
plt.xticks(range(0, train_lengths.max() + 10, 20), fontsize=16)
plt.yticks(fontsize=16)
plt.tick_params(axis="x", which="major", length=6, width=1.2, direction="out")
plt.tight_layout()
plt.savefig("./figures/histogram_instruction_length_full.pdf", bbox_inches="tight", dpi=700)
plt.show()

## Panoramic Action Space

In [None]:
dataset = "v5-full"


def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with open(path, encoding="utf-8") as file:
        return json.load(file)


# Original R2R annotation files.
r2r_train, r2r_unseen, r2r_seen = (
    _read_json("tasks/R2R/data/R2R_train.json"),
    _read_json("tasks/R2R/data/R2R_val_unseen.json"),
    _read_json("tasks/R2R/data/R2R_val_seen.json"),
)

# Project dataset splits for the selected dataset version.
train_data, val_data, test_data = (
    _read_json(f"./code/dataset_{dataset}/train/train_data.json"),
    _read_json(f"./code/dataset_{dataset}/val/val_data.json"),
    _read_json(f"./code/dataset_{dataset}/test/test_data.json"),
)
    
    # Helper to compute steps per path
def compute_path_lengths(data):
    """Return the number of samples recorded for each distinct ``path_id``.

    The counts are listed in the order in which the path ids first occur in
    ``data``.
    """
    counts = {}
    for sample in data:
        key = sample["path_id"]
        counts[key] = counts.get(key, 0) + 1
    return [*counts.values()]

def compute_nr_of_candidates(data):
    """Return ``len(sample["candidates"])`` for every sample, in input order."""
    return [len(sample["candidates"]) for sample in data]
        

# Instruction lengths (whitespace-separated tokens)
def compute_instruction_lengths(data):
    """Return per-instruction token counts, taking each path only once.

    For every distinct ``path_id`` the first sample carrying that id is used;
    each of its ``"instructions"`` strings is split on whitespace and the
    resulting token count is reported.
    """
    first_sample_by_path = {}
    for sample in data:
        # setdefault keeps the earliest sample seen for a path id.
        first_sample_by_path.setdefault(sample["path_id"], sample)
    return [
        len(text.split())
        for sample in first_sample_by_path.values()
        for text in sample["instructions"]
    ]

# Gather per-path, per-instruction and per-sample records for the training split.
splits = {
    "Train": train_data,
}

path_length_data = []
instruction_length_data = []
nr_of_candidates = []

for split_name, data in splits.items():
    # One record per path: number of samples (steps) belonging to it.
    path_length_data.extend(
        {"Split": split_name, "Steps per Path": length}
        for length in compute_path_lengths(data)
    )

    # One record per instruction: its whitespace token count.
    instruction_length_data.extend(
        {"Split": split_name, "Instruction Length": instr_len}
        for instr_len in compute_instruction_lengths(data)
    )

    # One record per sample: how many candidates it lists.
    nr_of_candidates.extend(
        {"Split": split_name, "Nr of Candidates": can}
        for can in compute_nr_of_candidates(data)
    )

# Convert to DataFrames
path_length_df = pd.DataFrame(path_length_data)
instr_length_df = pd.DataFrame(instruction_length_data)
can_df = pd.DataFrame(nr_of_candidates)

# Aggregate the numeric column explicitly. can_df also holds the string
# "Split" column, so DataFrame.mean()/median() on the whole frame raises a
# TypeError on pandas >= 2.0 (numeric_only defaults to False there), and
# float() of the resulting Series is deprecated on newer pandas anyway.
can_mean = float(can_df["Nr of Candidates"].mean())
can_median = float(can_df["Nr of Candidates"].median())


# Plot: histogram of the number of candidates per sample
fig, ax = plt.subplots(figsize=(9, 5))
sns.histplot(
    data=can_df,
    x="Nr of Candidates",
    bins=13,
    multiple="dodge",
    color=sns.color_palette("colorblind")[0],
    edgecolor="black",
    alpha=1.0,
    ax=ax,
)

# Dashed markers for mean and median; `palette` is the module-level palette
# defined in the action-distribution plotting cell.
for value, label, line_color in (
    (can_mean, "Mean", palette[1]),
    (can_median, "Median", palette[2]),
):
    ax.axvline(value, color=line_color, linestyle="--", linewidth=1.5, label=f"{label}: {value:.1f}")
ax.legend(fontsize=14)

ax.set_xlabel("Number of Candidates", fontsize=18)
ax.set_ylabel("Count", fontsize=18)
plt.yticks(fontsize=16)
plt.xticks(fontsize=16)
plt.tight_layout()
plt.savefig("./figures/histogram_candidates_full.pdf", bbox_inches="tight", dpi=700)
plt.show()

In [None]:
# Number of distinct paths in the original R2R file each project split is
# compared against: "Val" uses R2R val_seen, "Test" uses R2R val_unseen.
original_r2r_paths = {
    split_label: len({entry["path_id"] for entry in annotations})
    for split_label, annotations in (
        ("Train", r2r_train),
        ("Val", r2r_seen),
        ("Test", r2r_unseen),
    )
}

def avg_steps_per_path(data):
    """Return the mean number of samples per distinct ``path_id``.

    Args:
        data: Iterable of dict-like samples, each with a ``"path_id"`` key.

    Returns:
        float: Number of samples divided by the number of distinct path ids,
        rounded to two decimals; ``0.0`` when ``data`` contains no samples.
    """
    steps_per_path = Counter(item["path_id"] for item in data)
    if not steps_per_path:
        # An empty split has no paths; report 0.0 instead of dividing by zero.
        return 0.0
    return round(sum(steps_per_path.values()) / len(steps_per_path), 2)


# Function to compute stats
def compute_stats(data):
    """Summarise one split of the panoramic-action dataset.

    Args:
        data: List of dict samples with ``"path_id"``, ``"scan"``,
            ``"gold_label"`` and ``"candidates"`` keys.

    Returns:
        tuple: ``(num_samples, num_paths, action_counts, unique_environments,
        avg_steps, avg_nr_candidates)``. ``action_counts`` is a ``Counter``
        over the raw gold labels, ``avg_steps`` is whatever
        ``avg_steps_per_path(data)`` returns, and ``avg_nr_candidates`` is the
        unrounded mean candidate count per sample (``0.0`` for an empty
        split).
    """
    num_samples = len(data)
    num_paths = len({item["path_id"] for item in data})
    unique_environments = len({item["scan"] for item in data})
    avg_steps = avg_steps_per_path(data)
    # Guard the mean against an empty split instead of dividing by zero.
    avg_nr_candidates = (
        sum(compute_nr_of_candidates(data)) / num_samples if num_samples else 0.0
    )

    action_counts = Counter(item["gold_label"] for item in data)
    return num_samples, num_paths, action_counts, unique_environments, avg_steps, avg_nr_candidates

# Summarise every split and collect the per-label shares.
splits = {"Train": train_data, "Val": val_data, "Test": test_data}

summary_data, plot_data = [], []
total_action_counts = Counter()

for split_name, data in splits.items():
    (
        num_samples,
        num_paths,
        action_counts,
        unique_environments,
        avg_steps,
        avg_nr_candidates,
    ) = compute_stats(data)
    summary_data.append(
        {
            "Split": split_name,
            "Num Samples": num_samples,
            "Num Paths": num_paths,
            "Unique Environments": unique_environments,
            "Avg Steps Per Path": avg_steps,
            "Avg Nr Candidates": round(avg_nr_candidates, 2),
        }
    )
    total_action_counts.update(action_counts)

    # Share of each gold label within this split, in percent.
    total = sum(action_counts.values())
    plot_data.extend(
        {"Split": split_name, "Action": action, "Percentage": (count / total) * 100}
        for action, count in action_counts.items()
    )

# Labels sorted from most to least frequent over all splits; the categorical
# dtype stores that order.
ordered_actions = [pair[0] for pair in total_action_counts.most_common()]
plot_df = pd.DataFrame(plot_data)
plot_df["Action"] = pd.Categorical(plot_df["Action"], categories=ordered_actions, ordered=True)

total_paths_used = sum(row["Num Paths"] for row in summary_data)

# Percentage of the matching original R2R split's paths that this split uses.
for row in summary_data:
    share = row["Num Paths"] / original_r2r_paths[row["Split"]]
    row["Data Usage (%)"] = round(share * 100, 2)

# Final table with a fixed column order.
column_order = [
    "Split",
    "Num Samples",
    "Num Paths",
    "Data Usage (%)",
    "Unique Environments",
    "Avg Steps Per Path",
    "Avg Nr Candidates",
]
summary_df = pd.DataFrame(summary_data)[column_order]

print("=== Dataset Summary ===")
print(summary_df.to_string(index=False))