# Merge .csv files with interrogation metrics into one dataset

In [3]:
import os
import pandas as pd

# Define the top-level results folders for each model
model_folders = [
    "results_qwen_tot",
    "results_llava_tot",
    "results_deepseek_tot",
    "results_instructblip_tot",
]

# Label written to the "Model" column for each known results folder. These are
# the same labels this notebook uses when merging the results.txt files, so
# the two datasets can be compared model by model.
model_labels = {
    "results_qwen_tot": "Qwen2.5-VL",
    "results_llava_tot": "LLaVA",
    "results_deepseek_tot": "DeepSeek-VL",
    "results_instructblip_tot": "InstructBLIP",
}


def model_label(model_dir):
    """Return the model label for a results folder.

    Known folders use ``model_labels``; any other folder falls back to its
    name with the ``results_`` / ``_tot`` parts removed and capitalized.
    """
    folder = os.path.basename(os.path.normpath(model_dir))
    if folder in model_labels:
        return model_labels[folder]
    return folder.replace("results_", "").replace("_tot", "").capitalize()


def list_subdirs(parent):
    """Return sorted ``(name, path)`` pairs for the directories inside *parent*."""
    pairs = ((name, os.path.join(parent, name)) for name in sorted(os.listdir(parent)))
    return [(name, path) for name, path in pairs if os.path.isdir(path)]


def load_performance_log(csv_path, model_name, category, difficulty, video_id):
    """Read one performance_log.csv and tag its rows with where it came from.

    The ``Readability`` column is dropped when present. A ``Model`` column is
    added only when the CSV does not already provide one, so logs that carry
    their own model name are left untouched.
    """
    df = pd.read_csv(csv_path)
    df = df.drop(columns=["Readability"], errors="ignore")

    # Without a model identifier, rows from different models that share the
    # same category/difficulty/video_id cannot be told apart after merging.
    if "Model" not in df.columns:
        df["Model"] = model_name

    df["category"] = category
    df["difficulty"] = difficulty
    df["video_id"] = video_id
    return df


def collect_performance_logs(folders):
    """Load every <model>/<category>/<difficulty>/<video>/performance_log.csv.

    Missing model folders are reported and skipped; non-directory entries at
    the category and difficulty levels are ignored; a CSV that fails to load
    is reported and skipped so one bad file does not abort the whole merge.

    Returns a list with one DataFrame per successfully loaded video log.
    """
    logs = []

    for model_dir in folders:
        if not os.path.isdir(model_dir):
            print(f"Skipping missing folder: {model_dir}")
            continue

        model_name = model_label(model_dir)

        for category, category_path in list_subdirs(model_dir):
            for difficulty, difficulty_path in list_subdirs(category_path):
                for video_id in sorted(os.listdir(difficulty_path)):
                    csv_path = os.path.join(
                        difficulty_path, video_id, "performance_log.csv"
                    )
                    if not os.path.isfile(csv_path):
                        continue

                    try:
                        df = load_performance_log(
                            csv_path, model_name, category, difficulty, video_id
                        )
                    except Exception as e:
                        # Deliberately best-effort: report and keep going.
                        print(f"Failed to load {csv_path}: {e}")
                    else:
                        logs.append(df)

    return logs


all_logs = collect_performance_logs(model_folders)

# Combine and save
if all_logs:
    full_df = pd.concat(all_logs, ignore_index=True)
    full_df.to_csv("model.csv", index=False)
    print(f"Aggregated {len(full_df)} rows from {len(all_logs)} video logs.")
else:
    print("No CSV logs found.")

Aggregated 1513061 rows from 12000 video logs.


# Merge .txt files with interrogation outputs into a .csv dataset

In [8]:
import os
import re
import pandas as pd

model_folders = [
    "results_qwen_tot",
    "results_llava_tot",
    "results_deepseek_tot",
    "results_instructblip_tot",
]

model_name_map = {
    "results_qwen_tot": "Qwen2.5-VL",
    "results_llava_tot": "LLaVA",
    "results_deepseek_tot": "DeepSeek-VL",
    "results_instructblip_tot": "InstructBLIP",
}

# Column order of the merged dataset. Passing it to the DataFrame constructor
# keeps the columns defined even when no result line was found.
RESULT_COLUMNS = [
    "Frame",
    "Time (s)",
    "Confidence",
    "Response",
    "Inference Time (ms)",
    "Model",
    "category",
    "difficulty",
    "video_id",
]

# Match both dash and en-dash; handle lowercase answers
pattern = re.compile(
    r"Frame (\d+) \(Time: ([\d.]+)s\) [–-] Confidence: ([\d.]+) [–-] (yes|no|unclear) [–-] Processing Time: ([\d.]+)ms",
    re.IGNORECASE
)


def parse_result_line(line):
    """Parse one results.txt line into its five measured fields.

    Returns a dict with the keys ``Frame``, ``Time (s)``, ``Confidence``,
    ``Response`` (capitalized) and ``Inference Time (ms)``, or ``None`` when
    the line does not match ``pattern``.
    """
    match = pattern.search(line.strip())
    if match is None:
        return None

    frame, seconds, confidence, answer, elapsed_ms = match.groups()
    return {
        "Frame": int(frame),
        "Time (s)": float(seconds),
        "Confidence": float(confidence),
        "Response": answer.capitalize(),
        "Inference Time (ms)": float(elapsed_ms),
    }


def list_result_subdirs(parent):
    """Return sorted ``(name, path)`` pairs for the directories inside *parent*."""
    pairs = ((name, os.path.join(parent, name)) for name in sorted(os.listdir(parent)))
    return [(name, path) for name, path in pairs if os.path.isdir(path)]


def collect_results(folders, name_map):
    """Parse every <model>/<category>/<difficulty>/<video>/results.txt.

    Model folders that are not existing directories are skipped, as are video
    entries without a results.txt and lines that do not match ``pattern``.
    ``name_map[folder]`` supplies the value of the ``Model`` column and raises
    ``KeyError`` for a folder that is present on disk but missing from the map.

    Returns a list of row dicts keyed by ``RESULT_COLUMNS``.
    """
    rows = []

    for model_folder in folders:
        # isdir rather than exists: a plain file with this name cannot be listed.
        if not os.path.isdir(model_folder):
            continue

        model_name = name_map[model_folder]

        for category, cat_path in list_result_subdirs(model_folder):
            for difficulty, diff_path in list_result_subdirs(cat_path):
                for video_id in sorted(os.listdir(diff_path)):
                    results_file = os.path.join(diff_path, video_id, "results.txt")
                    if not os.path.isfile(results_file):
                        continue

                    # The separator may be a non-ASCII en-dash, so decode
                    # explicitly as UTF-8 instead of relying on the locale
                    # default (assumes the result files are UTF-8 encoded).
                    with open(results_file, "r", encoding="utf-8") as f:
                        for line in f:
                            row = parse_result_line(line)
                            if row is None:
                                continue
                            row["Model"] = model_name
                            row["category"] = category
                            row["difficulty"] = difficulty
                            row["video_id"] = video_id
                            rows.append(row)

    return rows


def build_results_frame(rows):
    """Return *rows* as a DataFrame sorted by model, category, difficulty, video and frame.

    The columns are always ``RESULT_COLUMNS``, so an empty *rows* list yields
    an empty frame with the full schema instead of failing on the sort keys.
    """
    frame = pd.DataFrame(rows, columns=RESULT_COLUMNS)
    return frame.sort_values(
        by=["Model", "category", "difficulty", "video_id", "Frame"]
    )


all_logs = collect_results(model_folders, model_name_map)
df = build_results_frame(all_logs)

if df.empty:
    # Do not overwrite an existing dataset with an empty file.
    print("No results found; 'dataset_results.csv' was not written.")
else:
    df.to_csv("dataset_results.csv", index=False)
    print(f"Saved {len(df)} rows to 'dataset_results.csv'")

Saved 1513061 rows to 'dataset_results.csv'
