In [None]:
import os
import json
import pandas as pd

# Work from the parent directory; later cells build their relative paths from
# it.  The directory the kernel started in is remembered so that re-running
# this cell always lands in the same place instead of climbing one level
# higher on every execution.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, ".."))
print(os.getcwd())

In [None]:
import json
import os
from collections import defaultdict

def calculate_results(action_space, dataset, split, base_path):
    """Print step-level precision/recall/F1 for one JSON result file.

    The file at ``base_path`` is loaded as a mapping whose values each hold a
    ``"gold"`` and a ``"model"`` sequence.  Both sequences are compared
    position by position up to the length of the shorter one: a match is a
    true positive for that action, a mismatch is a false positive for the
    predicted action and a false negative for the gold action.

    Per-action scores are printed first, then the macro average (unweighted
    mean over actions) and the micro average (from the pooled counts).

    Args:
        action_space: Label shown in the report header.
        dataset: Accepted but not used by this function.
        split: Label shown in the report header.
        base_path: Path of the JSON file to evaluate.

    Returns:
        None; the report is written to standard output.
    """
    with open(base_path, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    hits, spurious, missed = {}, {}, {}

    for record in records.values():
        expected_seq = record["gold"]
        predicted_seq = record["model"]
        # zip() stops at the shorter sequence, so unmatched trailing steps are skipped.
        for expected, predicted in zip(expected_seq, predicted_seq):
            if predicted == expected:
                hits[expected] = hits.get(expected, 0) + 1
                continue
            spurious[predicted] = spurious.get(predicted, 0) + 1
            missed[expected] = missed.get(expected, 0) + 1

    def ratio(numerator, denominator):
        # A zero denominator yields 0 instead of raising.
        return numerator / denominator if denominator > 0 else 0

    labels = sorted(set(hits) | set(spurious) | set(missed))

    print(f"Results for {split} [{action_space}]\n")
    scores = []
    for label in labels:
        n_hit = hits.get(label, 0)
        p = ratio(n_hit, n_hit + spurious.get(label, 0))
        r = ratio(n_hit, n_hit + missed.get(label, 0))
        f1 = ratio(2 * p * r, p + r)
        print(f"{label}: Precision={p:.3f}, Recall={r:.3f}, F1={f1:.3f}")
        scores.append((p, r, f1))

    # Macro average: plain mean of the per-action scores.
    n_labels = len(labels)
    macro_p = ratio(sum(score[0] for score in scores), n_labels)
    macro_r = ratio(sum(score[1] for score in scores), n_labels)
    macro_f1 = ratio(sum(score[2] for score in scores), n_labels)

    print("\nMacro-Averaged Metrics")
    print(f"Precision: {macro_p:.3f}")
    print(f"Recall:    {macro_r:.3f}")
    print(f"F1 Score:  {macro_f1:.3f}")
    print("-" * 40)

    # Micro average: scores derived from the counts pooled over all actions.
    pooled_tp = sum(hits.values())
    pooled_fp = sum(spurious.values())
    pooled_fn = sum(missed.values())

    micro_p = ratio(pooled_tp, pooled_tp + pooled_fp)
    micro_r = ratio(pooled_tp, pooled_tp + pooled_fn)
    micro_f1 = ratio(2 * micro_p * micro_r, micro_p + micro_r)

    print("\nMicro-Averaged Metrics")
    print(f"Precision: {micro_p:.3f}")
    print(f"Recall:    {micro_r:.3f}")
    print(f"F1 Score:  {micro_f1:.3f}")
    print("=" * 40)

def print_results(action_space):
    """Print the metric report of every model/split pair for one action space.

    Result files are read from
    ``./preprocessed-results/<action_space>/<model>/<split>_path_dict.json``
    and handed to ``calculate_results``, which does the printing.

    Args:
        action_space: Sub-directory of ``./preprocessed-results`` to evaluate;
            also selects the dataset label passed to ``calculate_results``.
    """
    results_root = f"./preprocessed-results/{action_space}"
    if action_space == "low-level":
        dataset = "dataset_v3-full"
    else:
        dataset = "dataset_v5-full"

    for model in ("full-data",):
        print(f"model: {model}")
        for split in ("val", "test"):
            json_path = os.path.join(results_root, model, f"{split}_path_dict.json")
            calculate_results(action_space, dataset, split, json_path)

        # Blank lines separate the reports of consecutive models.
        print("\n"*4)

# Print the val/test metric reports for the "low-level" action space.
print_results("low-level")

In [None]:
import json
import os
from collections import defaultdict
import pandas as pd
from IPython.display import display, Markdown

def calculate_results(action_space, dataset, split, base_path):
    """Score one JSON result file and return its summary metrics.

    The file at ``base_path`` is loaded as a mapping whose values each hold a
    ``"gold"`` and a ``"model"`` sequence.  Both sequences are compared
    position by position up to the length of the shorter one: a match is a
    true positive for that action, a mismatch is a false positive for the
    predicted action and a false negative for the gold action.

    Args:
        action_space: Accepted for call compatibility; not used here.
        dataset: Accepted for call compatibility; not used here.
        split: Accepted for call compatibility; not used here.
        base_path: Path of the JSON file to evaluate.

    Returns:
        A tuple ``(accuracy, action_metrics, conservative_success_rate,
        macro_f1)`` where

        * ``accuracy`` is the share of compared steps that match,
        * ``action_metrics`` maps each action to its F1 score (keys sorted),
        * ``conservative_success_rate`` is the share of entries whose whole
          predicted sequence equals the gold sequence,
        * ``macro_f1`` is the unweighted mean of the per-action F1 scores.

        Every ratio whose denominator is zero (empty file, or entries with
        empty sequences only) is reported as 0 instead of raising.
    """
    with open(base_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    tp = defaultdict(int)
    fp = defaultdict(int)
    fn = defaultdict(int)
    exact_matches = 0

    for entry in data.values():
        gold_seq = entry["gold"]
        pred_seq = entry["model"]

        # Whole-sequence match, length included.
        if gold_seq == pred_seq:
            exact_matches += 1

        # zip() stops at the shorter sequence, so only aligned steps are scored.
        for gold, pred in zip(gold_seq, pred_seq):
            if pred == gold:
                tp[gold] += 1
            else:
                fp[pred] += 1
                fn[gold] += 1

    actions = sorted(set(tp) | set(fp) | set(fn))

    action_metrics = {}
    for action in actions:
        predicted_total = tp[action] + fp[action]
        gold_total = tp[action] + fn[action]
        p = tp[action] / predicted_total if predicted_total > 0 else 0
        r = tp[action] / gold_total if gold_total > 0 else 0
        action_metrics[action] = 2 * p * r / (p + r) if (p + r) > 0 else 0

    # Each compared step is either a tp or an fn, so tp + fn is the step count.
    total_tp = sum(tp.values())
    total = total_tp + sum(fn.values())
    accuracy = total_tp / total if total > 0 else 0

    # Guard the remaining averages like accuracy: no actions or no entries -> 0.
    macro_f1 = sum(action_metrics.values()) / len(action_metrics) if action_metrics else 0
    conservative_success_rate = exact_matches / len(data) if len(data) > 0 else 0

    return accuracy, action_metrics, conservative_success_rate, macro_f1


def print_results(action_space):
    """Collect the metrics of every split/model pair into a DataFrame.

    For each split ("val", "test") and model ("full-data") the file
    ``./preprocessed-results/<action_space>/<model>/<split>_path_dict.json``
    is scored with ``calculate_results``; all reported values are rounded to
    two decimals.

    Args:
        action_space: Sub-directory of ``./preprocessed-results`` to read;
            also selects the dataset label passed to ``calculate_results``.

    Returns:
        pandas.DataFrame with one row per (split, model) pair holding
        accuracy, the F1 of "Right", "Left", "Move" and "Stop" (0 when an
        action is absent), macro F1 and the conservative success rate.
    """
    results_root = f"./preprocessed-results/{action_space}"
    if action_space == "low-level":
        dataset = "dataset_v3-full"
    else:
        dataset = "dataset_v5-full"

    def summarise(split, model):
        # Score one result file and flatten the outcome into a table row.
        json_path = os.path.join(results_root, model, f"{split}_path_dict.json")
        accuracy, per_action_f1, success_rate, macro_f1 = calculate_results(
            action_space, dataset, split, json_path
        )
        return {
            "Split": split,
            "Model": model,
            "Accuracy": round(accuracy, 2),
            "Right f1": round(per_action_f1.get("Right", 0), 2),
            "Left f1": round(per_action_f1.get("Left", 0), 2),
            "Move f1": round(per_action_f1.get("Move", 0), 2),
            "Stop f1": round(per_action_f1.get("Stop", 0), 2),
            "Macro f1": round(macro_f1, 2),
            "Conservative Success Rate": round(success_rate, 2),
        }

    return pd.DataFrame(
        [summarise(split, model) for split in ("val", "test") for model in ("full-data",)]
    )

# Column layout of the summary table displayed below.
columns_order = (
    ["Split", "Model", "Accuracy"]
    + ["Right f1", "Left f1", "Move f1", "Stop f1"]
    + ["Macro f1", "Conservative Success Rate"]
)

# Evaluate the low-level action space and arrange the columns as listed above.
results_df = print_results("low-level")[columns_order]

display(Markdown("### 📊 Low-level-results"))
display(results_df)

In [None]:
def print_results(action_space):
    """Collect the metrics of every split/model pair into a DataFrame.

    For each split ("val", "test") and model ("full-data") the file
    ``./preprocessed-results/<action_space>/<model>/<split>_path_dict.json``
    is scored with ``calculate_results``; all reported values are rounded to
    two decimals.  Of the per-action scores only the "Stop" F1 is kept.

    Args:
        action_space: Sub-directory of ``./preprocessed-results`` to read;
            also selects the dataset label passed to ``calculate_results``.

    Returns:
        pandas.DataFrame with one row per (split, model) pair.
    """
    results_root = f"./preprocessed-results/{action_space}"
    if action_space == "low-level":
        dataset = "dataset_v3-full"
    else:
        dataset = "dataset_v5-full"

    def summarise(split, model):
        # Score one result file and flatten the outcome into a table row.
        json_path = os.path.join(results_root, model, f"{split}_path_dict.json")
        accuracy, per_action_f1, success_rate, macro_f1 = calculate_results(
            action_space, dataset, split, json_path
        )
        return {
            "Split": split,
            "Model": model,
            "Accuracy": round(accuracy, 2),
            "Stop f1": round(per_action_f1.get("Stop", 0), 2),
            "Macro f1": round(macro_f1, 2),
            "Conservative Success Rate": round(success_rate, 2),
        }

    return pd.DataFrame(
        [summarise(split, model) for split in ("val", "test") for model in ("full-data",)]
    )

# Column layout of the summary table displayed below.  The "Macro f1" column
# returned by print_results is not part of this selection.
columns_order = ["Split", "Model", "Accuracy"] + ["Stop f1", "Conservative Success Rate"]

# Evaluate the panoramic action space and keep only the columns listed above.
results_df = print_results("panoramic")[columns_order]

display(Markdown("### 📊 panoramic-results"))
display(results_df)

In [None]:
def print_results(action_space):
    """Collect the zero-shot metrics of every model into a DataFrame.

    For the "val" split and each model ("qwen2-full", "qwen2.5-freeze") the
    file ``./preprocessed-results/<action_space>/<model>/zero-shot.json`` is
    scored with ``calculate_results``.  The unrounded macro F1 of every file
    is printed; the values stored in the table are rounded to two decimals.

    Args:
        action_space: Sub-directory of ``./preprocessed-results`` to read;
            also selects the dataset label passed to ``calculate_results``.

    Returns:
        pandas.DataFrame with one row per (split, model) pair holding
        accuracy, the F1 of "Right", "Left", "Move" and "Stop" (0 when an
        action is absent), macro F1 and the conservative success rate.
    """
    results_root = f"./preprocessed-results/{action_space}"
    if action_space == "low-level":
        dataset = "dataset_v3_5"
    else:
        dataset = "dataset_v5"

    def summarise(split, model):
        # Score one result file and flatten the outcome into a table row.
        json_path = os.path.join(results_root, model, "zero-shot.json")
        accuracy, per_action_f1, success_rate, macro_f1 = calculate_results(
            action_space, dataset, split, json_path
        )
        print(macro_f1)
        return {
            "Split": split,
            "Model": model,
            "Accuracy": round(accuracy, 2),
            "Right f1": round(per_action_f1.get("Right", 0), 2),
            "Left f1": round(per_action_f1.get("Left", 0), 2),
            "Move f1": round(per_action_f1.get("Move", 0), 2),
            "Stop f1": round(per_action_f1.get("Stop", 0), 2),
            "Macro f1": round(macro_f1, 2),
            "Conservative Success Rate": round(success_rate, 2),
        }

    return pd.DataFrame(
        [
            summarise(split, model)
            for split in ("val",)
            for model in ("qwen2-full", "qwen2.5-freeze")
        ]
    )

# Column layout of the summary table displayed below.
columns_order = (
    ["Split", "Model", "Accuracy"]
    + ["Right f1", "Left f1", "Move f1", "Stop f1"]
    + ["Macro f1", "Conservative Success Rate"]
)

# Evaluate the low-level action space and arrange the columns as listed above.
results_df = print_results("low-level")[columns_order]

display(Markdown("### 📊 Low-level-results"))
display(results_df)

In [None]:
def print_results(action_space):
    """Collect the zero-shot metrics of every model into a DataFrame.

    For the "val" split and each model ("qwen2-full", "qwen2.5-freeze") the
    file ``./preprocessed-results/<action_space>/<model>/zero-shot.json`` is
    scored with ``calculate_results``.  The unrounded macro F1 of every file
    is printed; the values stored in the table are rounded to two decimals.
    Of the per-action scores only the "Stop" F1 is kept.

    Args:
        action_space: Sub-directory of ``./preprocessed-results`` to read;
            also selects the dataset label passed to ``calculate_results``.

    Returns:
        pandas.DataFrame with one row per (split, model) pair.
    """
    results_root = f"./preprocessed-results/{action_space}"
    if action_space == "low-level":
        dataset = "dataset_v3_5"
    else:
        dataset = "dataset_v5"

    def summarise(split, model):
        # Score one result file and flatten the outcome into a table row.
        json_path = os.path.join(results_root, model, "zero-shot.json")
        accuracy, per_action_f1, success_rate, macro_f1 = calculate_results(
            action_space, dataset, split, json_path
        )
        print(macro_f1)
        return {
            "Split": split,
            "Model": model,
            "Accuracy": round(accuracy, 2),
            "Stop f1": round(per_action_f1.get("Stop", 0), 2),
            "Macro f1": round(macro_f1, 2),
            "Conservative Success Rate": round(success_rate, 2),
        }

    return pd.DataFrame(
        [
            summarise(split, model)
            for split in ("val",)
            for model in ("qwen2-full", "qwen2.5-freeze")
        ]
    )

# Column layout of the summary table displayed below.
columns_order = ["Split", "Model", "Accuracy"] + [
    "Stop f1",
    "Macro f1",
    "Conservative Success Rate",
]

# Evaluate the panoramic action space and arrange the columns as listed above.
results_df = print_results("panoramic")[columns_order]

display(Markdown("### 📊 Panoramic -zero-shot-results"))
display(results_df)

# No-adjust action

In [None]:
def print_results(action_space):
    """Collect the metrics of every split/model pair into a DataFrame.

    For each split ("val", "test") and model ("qwen2.5-freezed") the file
    ``./preprocessed-results/<action_space>/<model>/<split>_path_dict.json``
    is scored with ``calculate_results``.  The unrounded macro F1 of every
    file is printed; the values stored in the table are rounded to two
    decimals.

    Args:
        action_space: Sub-directory of ``./preprocessed-results`` to read;
            also selects the dataset label passed to ``calculate_results``.

    Returns:
        pandas.DataFrame with one row per (split, model) pair holding
        accuracy, the F1 of "Right", "Left", "Move" and "Stop" (0 when an
        action is absent), macro F1 and the conservative success rate.
    """
    results_root = f"./preprocessed-results/{action_space}"
    if action_space == "low-level":
        dataset = "dataset_v3_5"
    else:
        dataset = "dataset_v5"

    def summarise(split, model):
        # Score one result file and flatten the outcome into a table row.
        json_path = os.path.join(results_root, model, f"{split}_path_dict.json")
        accuracy, per_action_f1, success_rate, macro_f1 = calculate_results(
            action_space, dataset, split, json_path
        )
        print(macro_f1)
        return {
            "Split": split,
            "Model": model,
            "Accuracy": round(accuracy, 2),
            "Right f1": round(per_action_f1.get("Right", 0), 2),
            "Left f1": round(per_action_f1.get("Left", 0), 2),
            "Move f1": round(per_action_f1.get("Move", 0), 2),
            "Stop f1": round(per_action_f1.get("Stop", 0), 2),
            "Macro f1": round(macro_f1, 2),
            "Conservative Success Rate": round(success_rate, 2),
        }

    return pd.DataFrame(
        [
            summarise(split, model)
            for split in ("val", "test")
            for model in ("qwen2.5-freezed",)
        ]
    )

# Column layout of the summary table displayed below.
columns_order = (
    ["Split", "Model", "Accuracy"]
    + ["Right f1", "Left f1", "Move f1", "Stop f1"]
    + ["Macro f1", "Conservative Success Rate"]
)

# Evaluate the "82-vfov" results and arrange the columns as listed above.
results_df = print_results("82-vfov")[columns_order]

# NOTE(review): the heading reads "Low-level" although the data comes from the
# "82-vfov" directory - confirm this label is intended.
display(Markdown("### 📊 Low-level-results"))
display(results_df)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

# Plot configuration: font type 42 for PDF/PS export, 300 dpi for display and saving.
mpl.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'figure.dpi': 300,
    'savefig.dpi': 300,
})

# Manually entered F1 scores, keyed by split and model; the values of each
# model follow the order of `action_names`.
action_names = ["Right", "Left", "Move", "Stop"]
f1_table = {
    "Val Seen": {
        "Qwen2-VL-full": [0.65, 0.59, 0.57, 0.71],
        "Qwen2-VL-frozen": [0.70, 0.68, 0.73, 0.78],
        "Qwen2.5-VL-frozen": [0.71, 0.69, 0.74, 0.78],
    },
    "Val Unseen": {
        "Qwen2-VL-full": [0.66, 0.63, 0.58, 0.72],
        "Qwen2-VL-frozen": [0.66, 0.68, 0.70, 0.74],
        "Qwen2.5-VL-frozen": [0.70, 0.72, 0.72, 0.74],
    },
}

# Flatten the table into long format: one [split, model, action, score] row each.
data = [
    [split_name, model_name, action_name, score]
    for split_name, per_model in f1_table.items()
    for model_name, model_scores in per_model.items()
    for action_name, score in zip(action_names, model_scores)
]

df = pd.DataFrame(data, columns=["Split", "Model", "Action", "F1 Score"])
sns.set_theme(style="whitegrid")
sns.set_context("paper")
palette = sns.color_palette("colorblind")

# One grouped bar chart per split, with the models as hue.
for split in ("Val Seen", "Val Unseen"):
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.barplot(
        data=df[df["Split"] == split],
        x="Action",
        y="F1 Score",
        hue="Model",
        palette=palette,
        ax=ax,
    )
    ax.set_title(f"F1 Scores by Action - {split}")
    ax.set_ylim(0, 1)
    ax.legend(title="Model")

    # Write the value above each bar; patches without positive height are skipped.
    for bar in ax.patches:
        bar_height = bar.get_height()
        if bar_height > 0:
            ax.annotate(
                f'{bar_height:.2f}',
                (bar.get_x() + bar.get_width() / 2., bar_height),
                ha='center',
                va='bottom',
                fontsize=9,
                xytext=(0, 3),
                textcoords='offset points',
            )

    fig.tight_layout()
    plt.show()

In [None]:
#### import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
%matplotlib inline

# Plot configuration: font type 42 for PDF/PS export, 300 dpi for display and saving.
mpl.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'figure.dpi': 300,
    'savefig.dpi': 300,
})

# Read the two R2R validation splits from their JSON files.
with open("../tasks/R2R/data/R2R_val_seen.json", "r", encoding="utf-8") as seen_file:
    data_seen = json.load(seen_file)

with open("../tasks/R2R/data/R2R_val_unseen.json", "r", encoding="utf-8") as unseen_file:
    data_unseen = json.load(unseen_file)

def avg_lengths(data, num_instructions=3):
    """Return the mean word count of each instruction slot over all samples.

    Words are separated by any run of whitespace, so repeated, leading or
    trailing spaces, tabs and newlines do not produce empty "words".

    Args:
        data: Sequence of samples; each sample's ``"instructions"`` entry must
            hold at least ``num_instructions`` strings.
        num_instructions: Number of leading instruction slots to average.

    Returns:
        List of ``num_instructions`` floats; element ``i`` is the average
        number of words of ``sample["instructions"][i]``.

    Raises:
        ZeroDivisionError: If ``data`` is empty.
    """
    totals = [0] * num_instructions
    for sample in data:
        instructions = sample["instructions"]
        for i in range(num_instructions):
            # split() without a separator collapses consecutive whitespace.
            totals[i] += len(instructions[i].split())
    return [total / len(data) for total in totals]

# Average instruction length per instruction index for each validation split.
seen_avg = avg_lengths(data_seen)
unseen_avg = avg_lengths(data_unseen)

# Long-format table: three rows per split, one for each instruction index.
df = pd.DataFrame(
    {
        "Split": ["Val Seen"] * 3 + ["Val Unseen"] * 3,
        "Instruction Index": [0, 1, 2] * 2,
        "Avg Length": seen_avg + unseen_avg,
    }
)

sns.set_theme(style="whitegrid")
sns.set_context("paper")
palette = sns.color_palette("colorblind")

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(
    data=df,
    x="Instruction Index",
    y="Avg Length",
    hue="Split",
    palette=palette,
    ax=ax,
)

# Extra headroom above the tallest bar.
ax.set_ylim(0, max(df["Avg Length"]) + 11)
ax.set_xlabel("Instruction Index", fontsize=24)
ax.set_ylabel("Average Length (words)", fontsize=24)
ax.legend(title="Data Split", fontsize=16, title_fontsize=16)

plt.xticks(fontsize=24)
plt.yticks(fontsize=18)

# Write the value above each bar; patches without positive height are skipped.
for bar in ax.patches:
    bar_height = bar.get_height()
    if bar_height > 0:
        ax.annotate(
            f'{bar_height:.1f}',
            (bar.get_x() + bar.get_width() / 2., bar_height),
            ha='center',
            va='bottom',
            fontsize=18,
            xytext=(0, 3),
            textcoords='offset points',
        )

fig.tight_layout()
plt.savefig("../figures/plots/instruction-lenths-over-indexes.pdf", bbox_inches="tight", dpi=700)
plt.show()