In [None]:
import os
import json
import math

import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

# Move the working directory one level up; later cells open relative paths
# such as 'connectivity/...' and 'tasks/R2R/data/...' (presumably relative to
# the project root — TODO confirm where this notebook lives).
# NOTE(review): not idempotent — every re-run of this cell climbs one more level.
os.chdir("..")
print(os.getcwd())

In [None]:
def get_path_mapping(data):
    """ Index the entries of `data` by their "path_id".

    When several entries share a "path_id", the last one wins.
    """
    return {entry["path_id"]: entry for entry in data}


def load_nav_graphs(scans):
    ''' Build the weighted connectivity graph of every scan in `scans`.

    For each scan id, 'connectivity/<scan>_connectivity.json' is read relative
    to the current working directory. Two entries are linked when both are
    flagged 'included' and the first lists the second as 'unobstructed'; the
    edge weight is the straight-line distance between them. Each entry with at
    least one such link also gets a 'position' node attribute.

    Returns a dict mapping scan id -> graph.
    '''

    def distance(pose1, pose2):
        ''' Straight-line distance between the positions of two graph entries '''
        # Indices 3, 7 and 11 of 'pose' are used as x, y, z
        # (presumably the translation column of a flattened 4x4 matrix — TODO confirm).
        dx = pose1['pose'][3] - pose2['pose'][3]
        dy = pose1['pose'][7] - pose2['pose'][7]
        dz = pose1['pose'][11] - pose2['pose'][11]
        return (dx**2 + dy**2 + dz**2)**0.5

    graphs = {}
    for scan in scans:
        with open('connectivity/%s_connectivity.json' % scan) as f:
            entries = json.load(f)

        G = nx.Graph()
        positions = {}
        for i, item in enumerate(entries):
            if not item['included']:
                continue
            for j, conn in enumerate(item['unobstructed']):
                if not (conn and entries[j]['included']):
                    continue
                other = entries[j]
                positions[item['image_id']] = np.array(
                    [item['pose'][3], item['pose'][7], item['pose'][11]])
                # The reverse link must exist as well, otherwise the file is inconsistent
                assert other['unobstructed'][i], 'Graph should be undirected'
                G.add_edge(item['image_id'], other['image_id'], weight=distance(item, other))
        nx.set_node_attributes(G, values=positions, name='position')
        graphs[scan] = G
    return graphs

def fix_path(node_path):
    """ Collapse runs of consecutive identical nodes into a single node.

    Low-level actions can leave the agent on the same node for several steps;
    only the first occurrence of each run is kept. The comparison starts
    against None, so a leading None entry is dropped.
    """
    nodes = list(node_path)
    predecessors = [None] + nodes[:-1]
    return [node for before, node in zip(predecessors, nodes) if node != before]
        
def get_path_length(graph, node_path):
    """ Sum the "weight" of every edge between consecutive nodes of `node_path`.

    Paths with fewer than two nodes have length 0.
    """
    total = 0
    for here, there in zip(node_path, node_path[1:]):
        total += graph[here][there]["weight"]

    return total

def get_osr(graph, node_path, gold_node, threshold=3.0):
    """ Oracle success for a single path.

    Returns 1 if any node of `node_path` lies closer than `threshold`
    (shortest weighted graph distance) to `gold_node`, otherwise 0.
    """

    def distance_to_gold(start):
        route = nx.shortest_path(graph, source=start, target=gold_node, weight="weight")
        return sum(graph[u][v]["weight"] for u, v in zip(route[:-1], route[1:]))

    # any() stops at the first node that is close enough
    return int(any(distance_to_gold(node) < threshold for node in node_path))

def get_ne(graph, source, target):
    """ Navigation error: weighted length of the shortest path from `source` to `target`. """
    route = nx.shortest_path(graph, source=source, target=target, weight="weight")
    hops = zip(route[:-1], route[1:])

    return sum(graph[u][v]["weight"] for u, v in hops)
    

def calculate_pc(reference, predicted, graph, threshold=3.0):
    """ Path coverage of `predicted` with respect to `reference`.

    For every reference node, the shortest weighted graph distance to the
    closest predicted node is turned into a score exp(-distance / threshold).
    The result is the mean score over all reference nodes.

    Args:
        reference: sequence of node ids of the ground-truth path.
        predicted: sequence of node ids of the predicted path.
        graph: weighted graph supporting graph[u][v]["weight"].
        threshold: distance scale of the exponential decay.

    Returns:
        The mean coverage score. 0.0 when `reference` is empty; a reference
        node contributes exactly 0.0 when `predicted` is empty.
    """
    if len(reference) == 0:
        # Nothing to cover; avoids dividing by zero below.
        return 0.0

    def graph_distance(ref, pred):
        shortest_path = nx.shortest_path(graph, source=ref, target=pred, weight="weight")
        return sum(graph[u][v]["weight"] for u, v in zip(shortest_path[:-1], shortest_path[1:]))

    pc = 0
    for ref in reference:
        # math.inf instead of a finite sentinel: large distances are not capped
        # and "no predicted node at all" scores exactly 0.
        min_distance = min((graph_distance(ref, pred) for pred in predicted), default=math.inf)
        pc += math.exp(-(min_distance/threshold))

    return pc/len(reference)

def calculate_epl(reference, predicted, pc, graph):
    """ Expected path length: the reference "distance" scaled by the path coverage `pc`.

    `predicted` and `graph` are accepted but not used.
    """
    return pc * reference["distance"]

def calculate_ls(reference, predicted, pc, graph):
    """ Length score: expected / (expected + |expected - actual|).

    `expected` comes from calculate_epl and `actual` is the length of
    `predicted` on `graph`.
    """
    expected = calculate_epl(reference, predicted, pc, graph)
    actual = get_path_length(graph, predicted)
    return expected/(expected + abs(expected - actual))
    

def get_results(r2r_data, data, actions, instruction_index, threshold=3.0, max_steps=32):
    """ Score one run of predicted paths against the R2R ground truth.

    Args:
        r2r_data: list of R2R episodes; each is read for "path_id", "scan",
            "path", "distance" and "instructions".
        data: mapping of path id -> predicted node path (consecutive
            duplicates are allowed; they are collapsed before scoring).
        actions: mapping of str(path id) -> sequence of actions. An episode
            only counts as a success when its last action is "Stop".
        instruction_index: which entry of "instructions" to record in the
            per-episode stats.
        threshold: distance below which the goal counts as reached; also the
            decay scale used for path coverage.
        max_steps: a success additionally requires the raw (non-collapsed)
            path to have fewer than this many entries.

    Returns:
        (success_rate, navigation_error, average_path_length,
         oracle_success_rate, spl, cls, stats), where the first six are
        averages over `data` and `stats` is a list with one dict per episode
        ("path_id", "success", "instruction", "path_length").

    Raises:
        ValueError: if `data` is empty (the averages would be undefined).
    """
    print(len(data))
    if len(data) == 0:
        # Fail before loading any graph instead of dividing by zero at the end.
        raise ValueError("get_results needs at least one predicted path; `data` is empty")

    r2r_mapping = get_path_mapping(r2r_data)
    scans = set([p["scan"] for p in r2r_mapping.values()])

    G = load_nav_graphs(scans)

    sr = 0
    ne = 0
    pl = 0
    osr = 0
    spl = 0
    cls = 0

    stats = []

    for path_id, path in data.items():
        path_data = r2r_mapping[int(path_id)]
        gold_node = path_data["path"][-1]
        tmp_g = G[path_data["scan"]]

        # Keep the raw path: its length is what the step limit is checked against.
        old_path = path
        path = fix_path(path)

        current_path_length = get_path_length(tmp_g, path)
        current_navigation_error = get_ne(tmp_g, path[-1], gold_node)
        pc = calculate_pc(path_data["path"], path, tmp_g, threshold=threshold)

        # metrics
        cls += pc * calculate_ls(path_data, path, pc, tmp_g)
        pl += current_path_length
        ne += current_navigation_error
        osr += get_osr(tmp_g, path, gold_node, threshold=threshold)

        success_indicator = 0
        # success = close enough to the goal, within the step limit, and explicitly stopped
        if current_navigation_error < threshold and len(old_path) < max_steps:
            path_actions = actions[str(path_id)]
            # An empty action list can never have ended with "Stop".
            if path_actions and path_actions[-1] == "Stop":
                sr += 1
                success_indicator = 1

        spl += (success_indicator*(path_data["distance"]/max(path_data["distance"], current_path_length)))

        stats.append({
            "path_id" : path_id,
            "success" : bool(success_indicator),
            "instruction" : path_data["instructions"][instruction_index],
            "path_length" : current_path_length
        })

    num_paths = len(data)
    success_rate = sr/num_paths
    navigation_error = ne/num_paths
    average_path_length = pl/num_paths
    oracle_success_rate = osr/num_paths
    spl = spl/num_paths
    cls = cls/num_paths

    return success_rate, navigation_error, average_path_length, oracle_success_rate, spl, cls, stats

In [None]:
# Configuration of the single run evaluated in this cell
model = "qwen2_5"
split = "seen"
action_space = "non_panoramic"
threshold = 3.0

# NOTE(review): this first pair is overwritten by the "full-data" pair right
# below, so it currently has no effect.
base_path = f"./model_paths/alternative-setups/no-adjust/{model}_{action_space}_no_adjust_{split}_inst_0.json"
action_path = f"./model_paths/alternative-setups/no-adjust/actions_{model}_{action_space}_no_adjust_{split}_inst_0.json"

# Effective paths: predicted node paths and the matching action sequences
base_path = f"./model_paths/full-data/low-level/{model}_{action_space}_{split}_inst_0.json"
action_path = f"./model_paths/full-data/low-level/actions_{model}_{action_space}_{split}_inst_0.json"

#base_path = f"./model_paths/alternative-setups/82-vfov/{model}_{action_space}_82-vfov_{split}_inst_0.json"
#action_path = f"./model_paths/alternative-setups/82-vfov/actions_{model}_{action_space}_82-vfov_{split}_inst_0.json"

#print(action_path)
#print("actions_qwen2_5_non-panoramic_no_adjust_unseen_inst_2.json")
# Load R2R data
with open(f"tasks/R2R/data/R2R_val_{split}.json", "r", encoding="utf-8") as file:
    r2r_data = json.load(file)
    

# NOTE(review): `stats` is not read again in this cell.
stats = []

# Predicted node paths
with open(base_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Action sequences belonging to the predicted paths
with open(action_path, "r", encoding="utf-8") as file:
    action_data = json.load(file)
        
# Instruction index 0 matches the "_inst_0" suffix of the files loaded above
results = get_results(r2r_data, data, action_data, 0, threshold)
    


In [None]:
# `results` is the tuple returned by get_results:
# (success_rate, navigation_error, average_path_length, oracle_success_rate, spl, cls, stats)
print(f"path_length: {round(results[2], 2)}")
print(f"navigation error: {round(results[1], 2)}")
print(f"oracle SR: {round(results[3], 2)}")
print(f"Succeses rate: {round(results[0], 2)}")
print(f"SPL: {round(results[4], 2)}")
print(f"cls: {round(results[5], 2)}")

#print(r2r_data[503])

In [None]:
import pandas as pd
import numpy as np

# Modified version that stores the full metrics and calculates stds
def evaluate_full_metrics():
    """ Evaluate every (action space, split, model) combination over all runs.

    For each combination the result files `<base>_inst_<i>.json` and the
    matching action files are scored with get_results. The six metrics are
    averaged over the runs, and their standard deviation across runs is taken
    with numpy's default `std`. Path and instruction lengths are averaged
    separately for successful and failed episodes, pooled over all runs.

    Returns:
        Six DataFrames, in this order: means (low-level), means (panoramic),
        stds (low-level), stds (panoramic), lengths (panoramic),
        lengths (low-level). Note that the two length frames come
        panoramic-first, unlike the others.
    """
    models = ["qwen2", "qwen2_5"]
    splits = ["seen", "unseen"]
    action_spaces = ["low-level", "panoramic"]
    data_types = ["half-data"]
    threshold = 3.0
    num_runs = 3  # one result file per instruction index 0..2

    # Column names in the same order as the per-run metric rows built below
    metric_columns = [
        "Path Length",
        "Navigation Error",
        "Oracle Success Rate",
        "Success Rate",
        "Success-Weighted by Path Length",
        "Coverage Weighted by Length Score",
    ]

    def avg_path_and_inst(stats_list):
        """ Mean path length and mean instruction word count; (0, 0) if empty. """
        if not stats_list:
            return (0, 0)
        avg_pl = sum(s["path_length"] for s in stats_list) / len(stats_list)
        avg_inst = sum(len(s["instruction"].split(" ")) for s in stats_list) / len(stats_list)
        return avg_pl, avg_inst

    full_results = {space: [] for space in action_spaces}
    std_results = {space: [] for space in action_spaces}
    length_results = {space: [] for space in action_spaces}

    for d in data_types:
        for action_space in action_spaces:
            other_name = "non_panoramic" if action_space == "low-level" else "panoramic"
            if d == "full-data":
                other_name = "non-panoramic_full" if action_space == "low-level" else "panoramic_full"
            for split in splits:
                # The ground truth only depends on the split, so read it once per split.
                r2r_file = f"./tasks/R2R/data/R2R_val_{split}.json"
                with open(r2r_file, "r", encoding="utf-8") as file:
                    r2r_data = json.load(file)

                for model in models:
                    print(f"Evaluating: {model} | {action_space} | {split}")

                    base_path = f"./model_paths/{d}/{action_space}/{model}_{other_name}_{split}_inst"
                    action_path = f"./model_paths/{d}/{action_space}/actions_{model}_{other_name}_{split}_inst"

                    metrics_runs = []
                    stats = []
                    for i in range(num_runs):
                        with open(f"{base_path}_{i}.json", "r", encoding="utf-8") as file:
                            data = json.load(file)

                        with open(f"{action_path}_{i}.json", "r", encoding="utf-8") as file:
                            action_data = json.load(file)

                        sr, ne, pl, osr, spl, cls, run_stats = get_results(r2r_data, data, action_data, i, threshold)
                        # Row order must match `metric_columns`
                        metrics_runs.append([pl, ne, osr, sr, spl, cls])
                        stats.extend(run_stats)

                    # Aggregate stats
                    success_stats = [s for s in stats if s["success"]]
                    failed_stats = [s for s in stats if not s["success"]]
                    print(f"success stats length: {len(success_stats)}")
                    print(f"failed stats length: {len(failed_stats)}")

                    s_pl, s_inst = avg_path_and_inst(success_stats)
                    f_pl, f_inst = avg_path_and_inst(failed_stats)
                    model_name = "Qwen2.5-VL" if model == "qwen2_5" else "Qwen2-VL"
                    length_results[action_space].append({
                        "Model": model_name,
                        "Action Space": action_space,
                        "Split": split,
                        "Avg. Path Length (Success)": round(s_pl, 2),
                        "Avg. Instruction Length (Success)": round(s_inst, 2),
                        "Avg. Path Length (Failure)": round(f_pl, 2),
                        "Avg. Instruction Length (Failure)": round(f_inst, 2)
                    })

                    metrics_np = np.array(metrics_runs)
                    means = metrics_np.mean(axis=0)
                    stds = metrics_np.std(axis=0)

                    mean_row = {"Split": split, "Model": model_name}
                    mean_row.update({name: round(value, 2) for name, value in zip(metric_columns, means)})
                    full_results[action_space].append(mean_row)

                    std_row = {"Split": split, "Model": model_name}
                    std_row.update({name: round(value, 3) for name, value in zip(metric_columns, stds)})
                    std_results[action_space].append(std_row)

    return pd.DataFrame(full_results["low-level"]), pd.DataFrame(full_results["panoramic"]), pd.DataFrame(std_results["low-level"]), pd.DataFrame(std_results["panoramic"]), pd.DataFrame(length_results["panoramic"]), pd.DataFrame(length_results["low-level"]) 

In [None]:
from IPython.display import display, Markdown
# Run the evaluation. `pan_*` hold the low-level (non-panoramic) frames and
# `pano_*` the panoramic ones; the two length frames come back panoramic-first.
(pan_results, pano_results,
 pan_std, pano_std,
 pano_length, non_length) = evaluate_full_metrics()

# Column order of the mean / standard-deviation tables
columns_order = [
    "Split",
    "Model",
    "Path Length",
    "Navigation Error",
    "Oracle Success Rate",
    "Success Rate",
    "Success-Weighted by Path Length",
    "Coverage Weighted by Length Score"
]

# Column order of the success/failure length tables
other_column_order = [
    "Split",
    "Model",
    "Avg. Path Length (Success)",
    "Avg. Instruction Length (Success)",
    "Avg. Path Length (Failure)",
    "Avg. Instruction Length (Failure)"
]

pan_results, pano_results, pan_std, pano_std = [
    frame[columns_order] for frame in (pan_results, pano_results, pan_std, pano_std)
]
pano_length, non_length = pano_length[other_column_order], non_length[other_column_order]

# Each table is shown under its own markdown heading
sections = [
    ("### 📊 Non-Panoramic Results (Means)", pan_results),
    ("### 📈 Non-Panoramic Results (Standard Deviations)", pan_std),
    ("### 📊 Panoramic Results (Means)", pano_results),
    ("### 📈 Panoramic Results (Standard Deviations)", pano_std),
    ("### 📈 Non-panoamic Lengths", non_length),
    ("### 📈 Panoramic Lengths", pano_length),
]
for heading, frame in sections:
    display(Markdown(heading))
    display(frame)

In [None]:
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns

# Set plotting style
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['figure.dpi'] = 300
mpl.rcParams['savefig.dpi'] = 300

# Label mapping for nicer axis names
label_map = {
    "Avg. Path Length (Success)": 'Path Length (Success)',
    "Avg. Path Length (Failure)": 'Path Length (Failure)',
    "Avg. Instruction Length (Success)": 'Instruction Length (Success)',
    "Avg. Instruction Length (Failure)": 'Instruction Length (Failure)'
}

# The theme is global state, so set it once rather than on every loop iteration
sns.set_theme(style="whitegrid")
sns.set_context("paper")
palette = sns.color_palette("colorblind")

# Separate Path Lengths and Instruction Lengths
path_metrics = ['Path Length (Success)', 'Path Length (Failure)']
instruction_metrics = ['Instruction Length (Success)', 'Instruction Length (Failure)']


def plot_length_bars(plot_data, ylim_scale, output_path, bar_palette):
    """ Draw one grouped bar chart (one group per model, one bar per metric),
    annotate every bar with its height, save it to `output_path` and show it.

    `ylim_scale` stretches the y-axis above the tallest bar to leave room for
    the legend and the bar annotations.
    """
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(
        data=plot_data,
        x="Model",
        y="Value",
        hue="Metric",
        palette=bar_palette,
        edgecolor="black"
    )
    ax.set_ylim(0, plot_data["Value"].max() * ylim_scale)
    for container in ax.containers:
        for bar in container:
            height = bar.get_height()
            ax.annotate(
                f"{height:.1f}",
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3),
                textcoords="offset points",
                ha='center',
                va='bottom',
                fontsize=24
            )
    plt.ylabel("Length", fontsize=32)
    plt.xlabel("")
    plt.xticks(fontsize=24, rotation=45)
    plt.yticks(fontsize=18)
    plt.legend(fontsize=18, loc="upper left")
    plt.tight_layout()
    plt.savefig(output_path, bbox_inches="tight", dpi=700)
    plt.show()


action_spaces = ["low-level", "panoramic"]

for space in action_spaces:
    if space == "low-level":
        result_data = non_length
    else:
        result_data = pano_length

    # Melt DataFrame: one row per (split, model, metric)
    length_melted = result_data.melt(
        id_vars=['Split', 'Model'],
        value_vars=[
            "Avg. Path Length (Success)", "Avg. Path Length (Failure)",
            "Avg. Instruction Length (Success)", "Avg. Instruction Length (Failure)"
        ],
        var_name='Metric',
        value_name='Value'
    )
    length_melted['Metric'] = length_melted['Metric'].map(label_map)

    # Iterate over the splits of the frame actually being plotted
    for split in result_data["Split"].unique():
        print(f"Split: {split}")
        subset = length_melted[length_melted['Split'] == split]

        # --- PATH LENGTH PLOT ---
        plot_length_bars(
            subset[subset['Metric'].isin(path_metrics)],
            1.5,
            f"./figures/plots/{space}/{space}-path-length-{split}.pdf",
            palette,
        )

        # --- INSTRUCTION LENGTH PLOT ---
        plot_length_bars(
            subset[subset['Metric'].isin(instruction_metrics)],
            1.6,
            f"./figures/plots/{space}/{space}-instruction-length-{split}.pdf",
            palette,
        )

## SR, SPL and OSR over success thresholds

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json

def get_plots_over_thresholds():
    """ Re-score every run at several success thresholds.

    For each (action space, split, model) combination the three run files are
    scored with get_results once per threshold.

    Returns:
        A DataFrame with one row per (threshold, split, model, action space,
        run) holding "Navigation Error", "Oracle Success Rate",
        "Success Rate" and "SPL". Despite its name, this function does not
        plot anything.
    """
    models = ["qwen2", "qwen2_5"]
    action_spaces = ["non_panoramic", "panoramic"]
    splits = ["seen", "unseen"]
    thresholds = [5.0, 6.0, 7.0, 8.0, 9.0]
    num_runs = 3  # one result file per instruction index 0..2

    all_results = []

    for action_space in action_spaces:
        action_space_folder = "low-level" if action_space == "non_panoramic" else "panoramic"
        for split in splits:
            # The ground truth only depends on the split, so read it once per split.
            r2r_file = f"tasks/R2R/data/R2R_val_{split}.json"
            with open(r2r_file, "r", encoding="utf-8") as file:
                r2r_data = json.load(file)

            for model in models:
                print(f"Evaluating: {model} | {action_space} | {split}")

                base_path = f"./model_paths/half-data/{action_space_folder}/{model}_{action_space}_{split}_inst"
                action_path = f"./model_paths/half-data/{action_space_folder}/actions_{model}_{action_space}_{split}_inst"

                # The predictions do not depend on the threshold: read each
                # run's files once instead of once per threshold.
                runs = []
                for i in range(num_runs):
                    with open(f"{base_path}_{i}.json", "r", encoding="utf-8") as file:
                        data = json.load(file)

                    with open(f"{action_path}_{i}.json", "r", encoding="utf-8") as file:
                        action_data = json.load(file)

                    runs.append((data, action_data))

                for t in thresholds:
                    for run_id, (data, action_data) in enumerate(runs):
                        sr, ne, pl, osr, spl, cls, run_stats = get_results(r2r_data, data, action_data, run_id, t)
                        all_results.append({
                            "Threshold": t,
                            "Split": split,
                            "Model": model,
                            "Action Space": action_space,
                            "Run": run_id,
                            "Navigation Error": ne,
                            "Oracle Success Rate": osr,
                            "Success Rate": sr,
                            "SPL": spl
                        })

    # Create DataFrame
    df = pd.DataFrame(all_results)

    return df

df = get_plots_over_thresholds()

In [None]:
# Shared look for every threshold-sweep figure
sns.set_theme(style="whitegrid", context="paper")
palette = sns.color_palette("colorblind", n_colors=2)

# One figure per (action space, split, metric)
metrics = ["Success Rate"]
splits = ["seen", "unseen"]
action_spaces = ["panoramic", "non_panoramic"]

plot_grid = [(a, s, m) for a in action_spaces for s in splits for m in metrics]

for action_space, split, metric in plot_grid:
    plt.figure(figsize=(8, 6))

    # Keep only the rows of the current split / action space
    in_split = df["Split"] == split
    in_space = df["Action Space"] == action_space
    df_filtered = df[in_split & in_space]

    # NOTE(review): `ci` is deprecated in newer seaborn releases in favour of
    # `errorbar="sd"` — confirm against the installed version.
    ax = sns.lineplot(
        data=df_filtered,
        x="Threshold", y=metric,
        hue="Model",
        style="Model",  # optional, if you want different dashes
        markers=True,
        ci="sd",  # This shows standard deviation as shaded region
        palette=palette
    )

    # TODO: change the font, add nicer display names for the models and tidy up the rest

    #ax.set_title(f"{metric} vs. Threshold\n({split}, {action_space} action space)", fontsize=16)
    y_label = "Success-Weighted by Path Length" if metric == "SPL" else metric
    ax.set_xlabel("Threshold (meters)", fontsize=32)
    ax.set_ylabel(y_label, fontsize=32)
    ax.tick_params(labelsize=12)
    plt.xlim(left=5, right=10)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    ax.legend(title="Model", fontsize=17, title_fontsize=17)
    plt.tight_layout()

    # Save plot with descriptive name
    filename = f"{metric.replace(' ', '_').lower()}_{split}_{action_space}_plot.pdf"
    plt.savefig(f"./figures/{filename}", bbox_inches="tight", dpi=700)
    plt.close()


In [None]:
import json
import os

def evaluate_models():
    """ Average path and instruction lengths of successful vs. failed episodes.

    For every (action space, split, model) combination the three run files are
    scored with get_results at a threshold of 3.0 and the per-episode stats of
    all runs are pooled.

    Returns:
        A list with one dict per combination ("Model", "Action Space",
        "Split", "Success Path Length", "Success Inst Length",
        "Failed Path Length", "Failed Inst Length"), values rounded to two
        decimals. Instruction length is the number of space-separated tokens.
    """
    models = ["qwen2", "qwen2_5"]
    action_spaces = ["non_panoramic", "panoramic"]
    splits = ["seen", "unseen"]
    num_runs = 3  # Assume 3 seeds/runs

    def avg_path_and_inst(stats_list):
        """ Mean path length and mean instruction word count; (0, 0) if empty. """
        if not stats_list:
            return (0, 0)
        avg_pl = sum(s["path_length"] for s in stats_list) / len(stats_list)
        avg_inst = sum(len(s["instruction"].split(" ")) for s in stats_list) / len(stats_list)
        return avg_pl, avg_inst

    # Store all results
    all_results = []

    for action_space in action_spaces:
        for split in splits:
            # The ground truth only depends on the split, so read it once per split.
            r2r_file = f"tasks/R2R/data/R2R_val_{split}.json"
            with open(r2r_file, "r", encoding="utf-8") as file:
                r2r_data = json.load(file)

            for model in models:
                print(f"Evaluating: {model} | {action_space} | {split}")

                base_path = f"./model_paths/{model}_{action_space}_{split}_inst"
                action_path = f"./model_paths/actions_{model}_{action_space}_{split}_inst"

                stats = []

                for i in range(num_runs):
                    with open(f"{base_path}_{i}.json", "r", encoding="utf-8") as file:
                        data = json.load(file)

                    with open(f"{action_path}_{i}.json", "r", encoding="utf-8") as file:
                        action_data = json.load(file)

                    # Only the per-episode stats (last element) are needed here
                    run_stats = get_results(r2r_data, data, action_data, i, threshold=3.0)[-1]
                    stats.extend(run_stats)

                # Aggregate stats
                success_stats = [s for s in stats if s["success"]]
                failed_stats = [s for s in stats if not s["success"]]

                print(f"success stats length: {len(success_stats)}")
                print(f"failed stats length: {len(failed_stats)}")

                s_pl, s_inst = avg_path_and_inst(success_stats)
                f_pl, f_inst = avg_path_and_inst(failed_stats)

                all_results.append({
                    "Model": model,
                    "Action Space": action_space,
                    "Split": split,
                    "Success Path Length": round(s_pl, 2),
                    "Success Inst Length": round(s_inst, 2),
                    "Failed Path Length": round(f_pl, 2),
                    "Failed Inst Length": round(f_inst, 2)
                })

    return all_results

evaluate_models()

In [None]:
# Dump the table returned by get_plots_over_thresholds() (one row per threshold / split / model / action space / run)
print(df)