Analyzing the Agent Task Dataset

In [19]:
import json
from collections import defaultdict
import pandas as pd

def analyze_data(filepath):
    """
    Print category/task analytics for a JSON dataset.

    The file is expected to contain a JSON list of record objects. Each record
    may have a ``category`` entry and a ``sampled_nodes`` list whose items are
    objects with a ``task`` entry. Two reports are printed to stdout:

    1. a summary matrix (records and unique tasks per category, plus a total
       row), and
    2. for every category, how many sampled nodes reference each task.

    Malformed entries are tolerated instead of aborting the analysis:
    records or nodes that are not JSON objects are skipped, and a missing,
    null or empty ``category`` is reported as ``Uncategorized``.

    Args:
        filepath (str): The path to the JSON dataset file.

    Returns:
        None. All results (or an error message) are printed.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.")
        return
    except json.JSONDecodeError:
        print(f"Error: The file '{filepath}' is not a valid JSON file.")
        return

    # Iterating a dict would yield its keys (strings), not records.
    if not isinstance(data, list):
        print(f"Error: The file '{filepath}' does not contain a JSON list of records.")
        return

    # --- Data structures for analytics ---
    unique_tasks = set()
    category_to_tasks = defaultdict(set)
    category_record_counts = defaultdict(int)
    category_task_counts = defaultdict(lambda: defaultdict(int))
    total_records = 0
    skipped_records = 0

    # --- Process each record in the dataset ---
    for record in data:
        if not isinstance(record, dict):
            skipped_records += 1
            continue
        total_records += 1

        # `.get(key, default)` still returns None for an explicit JSON null,
        # and a None key cannot be sorted together with string keys below.
        category = str(record.get('category') or 'Uncategorized')
        category_record_counts[category] += 1

        nodes = record.get('sampled_nodes', [])
        if not isinstance(nodes, list):
            continue

        for node in nodes:
            if not isinstance(node, dict):
                continue
            task = node.get('task')
            if task:
                unique_tasks.add(task)
                category_to_tasks[category].add(task)
                category_task_counts[category][task] += 1

    # --- Display the results ---
    print("--------- Dataset Analytics Results ---------")
    if skipped_records:
        print(f"\n[!] Skipped {skipped_records} entries that are not JSON objects.")

    # Prepare one row per category for the summary matrix
    table_data = [
        {
            "Category": category,
            "Records": category_record_counts[category],
            "Unique Tasks": len(category_to_tasks.get(category, set())),
        }
        for category in sorted(category_record_counts)
    ]

    if table_data:
        # Create DataFrame and set the Category as the index
        df = pd.DataFrame(table_data).set_index("Category")

        # Add a "TOTAL" row at the end. Unique tasks are counted across the
        # whole dataset, so this is not necessarily the column sum.
        total_row = pd.DataFrame({
            'Records': [total_records],
            'Unique Tasks': [len(unique_tasks)]
        }, index=['--- TOTAL ---'])
        df = pd.concat([df, total_row])

        print("\n[1] Summary Matrix:")
        # Use to_string() to ensure the full table is printed
        print(df.to_string())
    else:
        print("\n[1] Summary Matrix: No data to display.")

    # --- Display detailed breakdown, grouped by category ---
    print("\n[2] Records per unique task (grouped by category):")
    # Categories and their tasks are both listed alphabetically
    for category, tasks in sorted(category_task_counts.items()):
        print(f"    - {category}:")
        for task, count in sorted(tasks.items()):
            print(f"        - {task}: {count} records")

    print("\n-------------------------------------------")


if __name__ == "__main__":
    # Location of the combined dataset whose categories and tasks are summarised.
    DATASET_FILE = "/Users/val/MA/Code/main_folder/data/dataset_combined.json"
    analyze_data(filepath=DATASET_FILE)

--------- Dataset Analytics Results ---------

[1] Summary Matrix:
                           Records  Unique Tasks
business_and_productivity       50            20
data                            50            20
entertainment_and_media         50            20
finance                         50            20
travel_and_transportation       50            20
--- TOTAL ---                  250           100

[2] Records per unique task (grouped by category):
    - business_and_productivity:
        - apply_for_job: 3 records
        - auto_housework_by_robot: 3 records
        - borrow_book_online: 2 records
        - consult_lawyer_online: 2 records
        - create_team_poll: 3 records
        - enroll_in_course: 3 records
        - generate_meeting_summary: 3 records
        - initiate_code_repository: 2 records
        - log_work_hours: 2 records
        - manage_crm_contact: 3 records
        - organize_meeting_online: 2 records
        - print_document: 3 records
        - query_i

Analyzing the model output files for completeness

In [8]:
import os
from collections import defaultdict, Counter
import itertools

def _parse_output_filename(filename, expected_endings):
    """Split an output filename into ``(file_id, model_name, file_ending)``.

    The name is split on ``_``. The ID is the last part of the form ``ID<digits>``
    and the model is the last part starting with ``Qwen3``. Returns ``None`` when
    the ID, the model, or one of ``expected_endings`` cannot be found.
    """
    file_id, model_name = None, None
    for part in filename.split('_'):
        # Require a numeric suffix: parts such as "IDEA" or "ID5.dill" are not
        # IDs and could not be converted to an integer when results are returned.
        if part.startswith("ID") and part[2:].isdecimal():
            file_id = part
        elif part.startswith("Qwen3"):
            model_name = part

    if not file_id or not model_name:
        return None

    for ending in sorted(expected_endings):
        if filename.endswith(ending):
            return file_id, model_name, ending
    return None


def _id_number(file_id):
    """Return the integer encoded in an ``ID<digits>`` string."""
    return int(file_id[2:])


def analyze_files_in_subdirectories(parent_directory_path):
    """
    Analyzes files in subdirectories to ensure each unique ID has the correct
    number of files and to identify duplicates.

    Every immediate subdirectory of the parent directory is treated as a
    category. For each ID found in the filenames, the function expects one
    file per (model, file ending) pair, for the models ``Qwen3-0.6B`` and
    ``Qwen3-1.7B`` and the endings ``agent_out.dill`` and ``inseq_out.dill``.
    A report is printed listing IDs with missing files, IDs with duplicate
    files, and IDs that have files for an unexpected model.

    Directories and files are visited in sorted order and IDs are reported in
    numeric order, so the printed report is reproducible between runs.

    Args:
        parent_directory_path (str): The path to the parent directory to scan.

    Returns:
        tuple[list, list]: A tuple containing two lists:
                           1. A sorted list of string ID numbers with missing files.
                           2. A sorted list of string ID numbers with duplicate files.
                           Both lists are empty when the directory does not
                           exist or contains no matching files.
    """
    if not os.path.isdir(parent_directory_path):
        print(f"Error: Parent directory not found at '{parent_directory_path}'")
        return [], []

    # Use a list for combinations to allow for duplicate detection.
    id_data = defaultdict(lambda: {"category": None, "combinations": []})

    expected_models = {'Qwen3-0.6B', 'Qwen3-1.7B'}
    expected_endings = {'agent_out.dill', 'inseq_out.dill'}
    expected_combinations = set(itertools.product(expected_models, expected_endings))

    print(f"Scanning subdirectories in: '{parent_directory_path}'...\n")
    # os.listdir() returns entries in arbitrary order; sort for stable output.
    for category_dir in sorted(os.listdir(parent_directory_path)):
        category_path = os.path.join(parent_directory_path, category_dir)
        if not os.path.isdir(category_path):
            continue
        print(f"  Checking '{category_dir}'...")
        for filename in sorted(os.listdir(category_path)):
            if not os.path.isfile(os.path.join(category_path, filename)):
                continue
            parsed = _parse_output_filename(filename, expected_endings)
            if parsed is None:
                continue
            file_id, model_name, file_ending = parsed
            # If an ID shows up in several categories, the last one scanned wins.
            id_data[file_id]["category"] = category_dir
            id_data[file_id]["combinations"].append((model_name, file_ending))

    if not id_data:
        print("\nNo files matching the required ID and Model pattern were found.")
        return [], []

    total_unique_ids = len(id_data)
    print("\n" + "-" * 30)
    print(f"Analysis complete. Found a total of {total_unique_ids} unique IDs.")
    print("-" * 30)

    # --- Analysis and Segregation of IDs ---
    incomplete_ids = {}
    duplicate_ids = {}
    unexpected_ids = {}

    for file_id, details in id_data.items():
        combinations_list = details["combinations"]
        unique_combinations = set(combinations_list)

        # Missing means an expected combination is absent. Files for an extra
        # model do not make an ID incomplete; they are tracked separately.
        if expected_combinations - unique_combinations:
            incomplete_ids[file_id] = details
        if unique_combinations - expected_combinations:
            unexpected_ids[file_id] = details

        # Check for duplicate files
        if len(combinations_list) > len(unique_combinations):
            duplicate_ids[file_id] = details

    def group_by_category(ids):
        """Return ``[(category, [(file_id, details), ...]), ...]``, sorted by category then ID number."""
        grouped = defaultdict(list)
        for file_id, details in ids.items():
            grouped[details["category"]].append((file_id, details))
        return [
            (category, sorted(members, key=lambda member: _id_number(member[0])))
            for category, members in sorted(grouped.items(), key=lambda item: item[0])
        ]

    # --- Reporting Section ---
    all_ok = True
    # Report 1: Incomplete IDs
    if not incomplete_ids:
        print("✅ Success! No IDs are missing files.")
    else:
        all_ok = False
        print(f"❌ Found {len(incomplete_ids)} ID(s) that are missing corresponding files:")
        for category, id_list in group_by_category(incomplete_ids):
            print(f"\n{category}:")
            for file_id, details in id_list:
                missing_combinations = expected_combinations - set(details["combinations"])
                missing_by_model = defaultdict(list)
                for model, ending in missing_combinations:
                    missing_by_model[model].append(ending)
                report_parts = [
                    f"Model {model}, Filetype/s {' + '.join(sorted(endings))}"
                    for model, endings in sorted(missing_by_model.items())
                ]
                print(f"- {file_id} file/s missing: {'; '.join(report_parts)}")

    print("-" * 30)
    # Report 2: Duplicate IDs
    if not duplicate_ids:
        print("✅ Success! No IDs have duplicate files.")
    else:
        all_ok = False
        print(f"🔍 Found {len(duplicate_ids)} ID(s) with duplicate files:")
        for category, id_list in group_by_category(duplicate_ids):
            print(f"\n{category}:")
            for file_id, details in id_list:
                counts = Counter(details["combinations"])
                report_parts = [
                    f"Model {model}, Filetype {ending} (found {count} times)"
                    for (model, ending), count in sorted(counts.items())
                    if count > 1
                ]
                print(f"- {file_id} has duplicates: {'; '.join(report_parts)}")

    # Report 3: files for models outside the expected set (only when present)
    if unexpected_ids:
        all_ok = False
        print("-" * 30)
        print(f"⚠️ Found {len(unexpected_ids)} ID(s) with unexpected model/filetype files:")
        for category, id_list in group_by_category(unexpected_ids):
            print(f"\n{category}:")
            for file_id, details in id_list:
                extras = sorted(set(details["combinations"]) - expected_combinations)
                report_parts = [f"Model {model}, Filetype {ending}" for model, ending in extras]
                print(f"- {file_id} has unexpected files: {'; '.join(report_parts)}")

    if all_ok:
        print(f"\n✅✅ Overall Success! All IDs have exactly {len(expected_combinations)} unique files.")

    # --- Return Section ---
    # IDs are returned as bare numbers ("ID0012" -> "12"), in numeric order.
    incomplete_keys = [str(number) for number in sorted(_id_number(key) for key in incomplete_ids)]
    duplicate_keys = [str(number) for number in sorted(_id_number(key) for key in duplicate_ids)]

    return incomplete_keys, duplicate_keys


if __name__ == "__main__":
    # Directory whose category subfolders hold the model output files.
    parent_directory = "outputs23-09"

    missing_ids_list, duplicate_ids_list = analyze_files_in_subdirectories(parent_directory)

    # Echo each returned list only when it has entries: separator, heading, IDs.
    if missing_ids_list:
        print("\n" + "-" * 30, "Returned list of IDs with MISSING files:", missing_ids_list, sep="\n")

    if duplicate_ids_list:
        print("\n" + "-" * 30, "Returned list of IDs with DUPLICATE files:", duplicate_ids_list, sep="\n")



Scanning subdirectories in: 'outputs23-09'...

  Checking 'business_and_productivity'...
  Checking 'travel_and_transportation'...
  Checking 'entertainment_and_media'...
  Checking 'finance'...
  Checking 'data'...

------------------------------
Analysis complete. Found a total of 250 unique IDs.
------------------------------
❌ Found 30 ID(s) that are missing corresponding files:

finance:
- ID1072 file/s missing: Model Qwen3-1.7B, Filetype/s inseq_out.dill
- ID1073 file/s missing: Model Qwen3-1.7B, Filetype/s agent_out.dill + inseq_out.dill
- ID1074 file/s missing: Model Qwen3-1.7B, Filetype/s agent_out.dill + inseq_out.dill
- ID1075 file/s missing: Model Qwen3-1.7B, Filetype/s agent_out.dill + inseq_out.dill
- ID1076 file/s missing: Model Qwen3-1.7B, Filetype/s agent_out.dill + inseq_out.dill
- ID1077 file/s missing: Model Qwen3-1.7B, Filetype/s agent_out.dill + inseq_out.dill
- ID1078 file/s missing: Model Qwen3-1.7B, Filetype/s agent_out.dill + inseq_out.dill
- ID1079 file/s mis