In [3]:
import json
import os

from datasets import load_dataset

# --- 1. Configuration ---
# All the settings are at the top for easy changes.

# Dataset repository, configuration (subset), and split passed to load_dataset().
DATASET_NAME = "microsoft/Taskbench"
SUBSET_NAME = "dailylifeapis"
SPLIT_NAME = "test"

# Path of the JSON file that process_dataset() writes.
# The path is relative, so it resolves against the current working directory.
OUTPUT_FILENAME = "data/taskbench_processed_parsed_v2.json"

# Columns dropped from the dataset before the records are saved.
COLUMNS_TO_REMOVE = ["seed", "type", "sampled_links", "tool_links"]

# Columns whose values are JSON encoded as strings; process_dataset() decodes
# each of them with json.loads so the saved file contains real nested objects.
COLUMNS_TO_PARSE = ["sampled_nodes", "tool_steps", "tool_nodes"]


# --- 2. Main Processing ---

def _normalize_record(record):
    """
    Cleans one record in place: decodes JSON-string columns and converts 'id' to int.

    A value that cannot be decoded or converted is left unchanged and a
    warning is printed, so a single bad field does not abort the whole run.

    Args:
        record: Dict-like row of the dataset; modified in place.

    Returns:
        The same record object, for convenience.
    """
    # A. Parse the nested JSON strings into Python objects.
    for key in COLUMNS_TO_PARSE:
        value = record.get(key)
        if isinstance(value, str):
            try:
                record[key] = json.loads(value)
            except json.JSONDecodeError:
                print(f"  - Warning: Could not parse key '{key}' in record ID {record.get('id')}.")

    # B. Convert the 'id' field from a string to an integer.
    if isinstance(record.get('id'), str):
        try:
            record['id'] = int(record['id'])
        except ValueError:
            print(f"  - Warning: Could not convert id '{record['id']}' to an integer.")

    return record


def process_dataset():
    """
    Loads, filters, cleans, and saves the Hugging Face dataset.

    Steps:
      1. Load split SPLIT_NAME of configuration SUBSET_NAME from DATASET_NAME.
      2. Keep only the records whose 'n_tools' equals 1.
      3. Drop the columns listed in COLUMNS_TO_REMOVE.
      4. Decode the JSON-string columns in COLUMNS_TO_PARSE and convert 'id'
         from str to int (fields that fail are kept as-is, with a warning).
      5. Write the records to OUTPUT_FILENAME as indented JSON, creating the
         parent directory first if it does not exist.

    Progress is reported with print() at every step.

    Returns:
        None. The result is the file written to OUTPUT_FILENAME.
    """
    print(f"Loading '{SUBSET_NAME}' from '{DATASET_NAME}'...")
    dataset = load_dataset(DATASET_NAME, name=SUBSET_NAME, split=SPLIT_NAME)
    print(f"  - Loaded {len(dataset)} records.")

    print("Filtering records where 'n_tools' is 1...")
    filtered_dataset = dataset.filter(lambda example: example['n_tools'] == 1)
    print(f"  - Found {len(filtered_dataset)} matching records.")

    print("Removing specified columns...")
    cleaned_dataset = filtered_dataset.remove_columns(COLUMNS_TO_REMOVE)
    print(f"  - Columns remaining: {cleaned_dataset.column_names}")

    print("Converting, parsing, and cleaning data types...")
    # Materialize the rows as plain dicts so they can be edited and JSON-serialized.
    data_list = list(cleaned_dataset)
    for record in data_list:
        _normalize_record(record)

    print(f"Saving the final, clean data to '{OUTPUT_FILENAME}'...")
    # open(..., 'w') does not create missing directories, so make sure the
    # parent exists. dirname is '' for a bare filename, which makedirs rejects.
    output_dir = os.path.dirname(OUTPUT_FILENAME)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(OUTPUT_FILENAME, 'w', encoding='utf-8') as f:
        json.dump(data_list, f, indent=4)

    print(f"\nDone! Your clean data is ready in '{OUTPUT_FILENAME}'.")


# --- 3. Run the script ---
# The guard keeps the download and file write from running if this code is
# imported as a module; it still runs when executed directly, where __name__
# is "__main__" (as it was for the notebook run whose output follows).
if __name__ == "__main__":
    process_dataset()

Loading 'dailylifeapis' from 'microsoft/Taskbench'...
  - Loaded 4318 records.
Filtering records where 'n_tools' is 1...
  - Found 1257 matching records.
Removing specified columns...
  - Columns remaining: ['id', 'n_tools', 'sampled_nodes', 'instruction', 'tool_steps', 'tool_nodes']
Converting, parsing, and cleaning data types...
Saving the final, clean data to 'data/taskbench_processed_parsed_v2.json'...

Done! Your clean data is ready in 'data/taskbench_processed_parsed_v2.json'.


In [4]:
import json

# Sanity check: reload the file written by process_dataset() and show the
# task name and prompt of its first record.
file_path = OUTPUT_FILENAME

with open(file_path, mode='r', encoding='utf-8') as f:
    data = json.loads(f.read())

first_record = data[0]
# 'sampled_nodes' is indexed as a list of dicts here; take the first node's task.
print("task: " + first_record["sampled_nodes"][0]["task"])
print("prompt: " + first_record["instruction"])

task: play_movie_by_title
prompt: I've been hearing a lot about this movie called 'Inception', could you help me watch it?
