In [2]:
import json
from collections import defaultdict

def analyze_data(filepath):
    """
    Analyzes a JSON dataset to count tasks and categories.

    The file is expected to contain a JSON array of record objects. Each
    record may have a 'category' value and a 'sampled_nodes' list whose items
    are objects carrying a 'task' string. Anything that does not match this
    shape (non-object records, non-list 'sampled_nodes', non-object nodes,
    missing/empty/non-string tasks) is skipped rather than raising.

    A report with three sections is printed to stdout:
      [1] the sorted list of unique task strings,
      [2] for each task, the number of records that contain it (a task that
          appears several times inside one record counts once for that record),
      [3] for each category, the number of unique tasks seen in it.

    Load problems (missing file, invalid JSON, top-level value that is not an
    array) are reported with a printed error message and the function returns
    early.

    Args:
        filepath (str): The path to the JSON dataset file.

    Returns:
        None
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.")
        return
    except json.JSONDecodeError:
        print(f"Error: The file '{filepath}' is not a valid JSON file.")
        return

    # Iterating a JSON object would yield its keys (strings), which have no
    # .get(); report the problem in the same style as the other load errors.
    if not isinstance(data, list):
        print(f"Error: The file '{filepath}' does not contain a JSON array of records.")
        return

    # --- Data structures for analytics ---
    # To store all unique task strings
    unique_tasks = set()
    # To count, for each task, the number of records that contain it
    task_record_counts = defaultdict(int)
    # To map categories to the unique tasks they contain
    category_to_tasks = defaultdict(set)

    # --- Process each record in the dataset ---
    for record in data:
        if not isinstance(record, dict):
            continue  # Skip malformed records

        # A record with "category": null must not produce a None key: None
        # cannot be ordered against string categories when sorting the report.
        category = record.get('category')
        if category is None:
            category = 'Uncategorized'

        nodes = record.get('sampled_nodes', [])
        if not isinstance(nodes, list):
            continue  # Skip if 'sampled_nodes' isn't a list

        # Collect the distinct tasks of this record first, so that a task
        # repeated within the record is counted as one record, not several.
        record_tasks = set()
        for node in nodes:
            if not isinstance(node, dict):
                continue  # Skip malformed nodes
            task = node.get('task')
            if isinstance(task, str) and task:
                record_tasks.add(task)

        if not record_tasks:
            continue  # Nothing to record; avoid creating empty category entries

        unique_tasks |= record_tasks
        category_to_tasks[category] |= record_tasks
        for task in record_tasks:
            task_record_counts[task] += 1

    # --- Display the results ---
    print("--------- Dataset Analytics Results ---------")

    # 1. Count and list all unique task strings
    print(f"\n[1] Found {len(unique_tasks)} unique tasks:")
    for task in sorted(unique_tasks):
        print(f"    - {task}")

    # 2. Count records per unique task string
    print("\n[2] Records per unique task:")
    for task, count in sorted(task_record_counts.items()):
        print(f"    - {task}: {count} records")

    # 3. Count unique tasks per category
    # Sort on the category's string form so that a non-string category
    # (e.g. a number) cannot break ordering against string categories.
    print("\n[3] Unique tasks per category:")
    for category, tasks in sorted(category_to_tasks.items(), key=lambda item: str(item[0])):
        print(f"    - {category}: {len(tasks)} unique tasks")

    print("\n-------------------------------------------")


if __name__ == "__main__":
    # Dataset file to analyze, given as an absolute path.
    # NOTE(review): this location is specific to one machine; adjust it (or
    # derive it from a configurable data directory) before running elsewhere.
    DATASET_FILE = "/Users/val/MA/Code/main_folder/data/filtered_dataset.json"
    analyze_data(DATASET_FILE)


--------- Dataset Analytics Results ---------

[1] Found 40 unique tasks:
    - apply_for_job
    - apply_for_passport
    - attend_meeting_online
    - auto_driving_to_destination
    - auto_housework_by_robot
    - book_car
    - book_flight
    - book_hotel
    - book_restaurant
    - borrow_book_online
    - buy_insurance
    - consult_lawyer_online
    - daily_bill_payment
    - deliver_package
    - do_tax_return
    - enroll_in_course
    - get_news_for_topic
    - get_weather
    - make_video_call
    - make_voice_call
    - online_banking
    - online_shopping
    - order_food_delivery
    - order_taxi
    - organize_meeting_online
    - pay_for_credit_card
    - play_movie_by_title
    - play_music_by_title
    - print_document
    - recording_audio
    - search_by_engine
    - see_doctor_online
    - sell_item_online
    - send_email
    - send_sms
    - set_alarm
    - share_by_social_network
    - software_management
    - stock_operation
    - take_note

[2] Records per u

In [5]:
import json
from collections import defaultdict

def analyze_data(filepath):
    """
    Analyzes a JSON dataset to count tasks and categories.

    NOTE(review): this cell redefines the `analyze_data` from an earlier cell
    and additionally reports the total number of records; keeping a single
    definition in the notebook would avoid silent shadowing.

    The file is expected to contain a JSON array of record objects. Each
    record may have a 'category' value and a 'sampled_nodes' list whose items
    are objects carrying a 'task' string. Anything that does not match this
    shape (non-object records, non-list 'sampled_nodes', non-object nodes,
    missing/empty/non-string tasks) is skipped rather than raising.

    A report is printed to stdout:
      [*] the total number of top-level records in the file,
      [1] the sorted list of unique task strings,
      [2] for each task, the number of records that contain it (a task that
          appears several times inside one record counts once for that record),
      [3] for each category, the number of unique tasks seen in it.

    Load problems (missing file, invalid JSON, top-level value that is not an
    array) are reported with a printed error message and the function returns
    early.

    Args:
        filepath (str): The path to the JSON dataset file.

    Returns:
        None
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.")
        return
    except json.JSONDecodeError:
        print(f"Error: The file '{filepath}' is not a valid JSON file.")
        return

    # Iterating a JSON object would yield its keys (strings), which have no
    # .get(), and len() would count keys rather than records.
    if not isinstance(data, list):
        print(f"Error: The file '{filepath}' does not contain a JSON array of records.")
        return

    # --- Data structures for analytics ---
    # To store all unique task strings
    unique_tasks = set()
    # To count, for each task, the number of records that contain it
    task_record_counts = defaultdict(int)
    # To map categories to the unique tasks they contain
    category_to_tasks = defaultdict(set)

    # --- Process each record in the dataset ---
    for record in data:
        if not isinstance(record, dict):
            continue  # Skip malformed records

        # A record with "category": null must not produce a None key: None
        # cannot be ordered against string categories when sorting the report.
        category = record.get('category')
        if category is None:
            category = 'Uncategorized'

        nodes = record.get('sampled_nodes', [])
        if not isinstance(nodes, list):
            continue  # Skip if 'sampled_nodes' isn't a list

        # Collect the distinct tasks of this record first, so that a task
        # repeated within the record is counted as one record, not several.
        record_tasks = set()
        for node in nodes:
            if not isinstance(node, dict):
                continue  # Skip malformed nodes
            task = node.get('task')
            if isinstance(task, str) and task:
                record_tasks.add(task)

        if not record_tasks:
            continue  # Nothing to record; avoid creating empty category entries

        unique_tasks |= record_tasks
        category_to_tasks[category] |= record_tasks
        for task in record_tasks:
            task_record_counts[task] += 1

    # --- Display the results ---
    print("--------- Dataset Analytics Results ---------")

    # Counts every top-level entry of the array, including skipped ones.
    print(f"\n[*] Total records analyzed: {len(data)}")

    # 1. Count and list all unique task strings
    print(f"\n[1] Found {len(unique_tasks)} unique tasks:")
    for task in sorted(unique_tasks):
        print(f"    - {task}")

    # 2. Count records per unique task string
    print("\n[2] Records per unique task:")
    for task, count in sorted(task_record_counts.items()):
        print(f"    - {task}: {count} records")

    # 3. Count unique tasks per category
    # Sort on the category's string form so that a non-string category
    # (e.g. a number) cannot break ordering against string categories.
    print("\n[3] Unique tasks per category:")
    for category, tasks in sorted(category_to_tasks.items(), key=lambda item: str(item[0])):
        print(f"    - {category}: {len(tasks)} unique tasks")

    print("\n-------------------------------------------")


if __name__ == "__main__":
    # Dataset file to analyze, given as an absolute path.
    # NOTE(review): this location is specific to one machine; adjust it (or
    # derive it from a configurable data directory) before running elsewhere.
    DATASET_FILE = "/Users/val/MA/Code/main_folder/data/dataset_combined.json"
    analyze_data(DATASET_FILE)



--------- Dataset Analytics Results ---------

[*] Total records analyzed: 256

[1] Found 100 unique tasks:
    - add_item_to_trip_itinerary
    - analyze_monthly_spending
    - apply_for_job
    - apply_for_passport
    - attend_meeting_online
    - auto_driving_to_destination
    - auto_housework_by_robot
    - automate_loan_payment
    - book_car
    - book_flight
    - book_hotel
    - book_movie_tickets
    - book_restaurant
    - book_train_ticket
    - borrow_book_online
    - buy_insurance
    - check_credit_score
    - check_flight_status
    - clean_dataset
    - compare_rental_car_prices
    - compare_two_documents
    - compress_file
    - consult_lawyer_online
    - convert_currency
    - convert_file_format
    - create_qr_code
    - create_savings_goal
    - create_spreadsheet_from_data
    - create_team_poll
    - daily_bill_payment
    - deliver_package
    - do_tax_return
    - enroll_in_course
    - extract_data_from_website
    - extract_tabular_data_from_pdf
    - 