In [1]:
# This script loads the dataset associated with the APIGen paper 
# and extracts a list of unique REST APIs.
# Uniqueness is determined by the combination of an API's name and description.

# First, ensure you have the 'datasets' library installed:
# pip install datasets

from datasets import load_dataset
import json
from utils import standardize, standardize_category

def get_apigen_tools():
  """
  Loads the APIGen dataset and returns a list of unique API dictionaries.

  Returns:
    list: A list of unique APIs, where each API is a dictionary with
          'name', 'description', and 'parameters' keys.
  """
  print("Loading the Salesforce/xlam-function-calling-60k dataset...")
  
  # Load the dataset from the Hugging Face Hub
  try:
    dataset = load_dataset("Salesforce/xlam-function-calling-60k", split='train')
  except Exception as e:
    print(f"Failed to load dataset. Error: {e}")
    return []

  print("Extracting unique APIs...")

  # Use a dictionary to store unique APIs, with (name, description) as the key
  unique_apis_dict = {}

  # Iterate through each example in the dataset to find all unique tools
  for row in dataset:    
    try:
      # Assuming each row `data` has a 'tools' key with a JSON string
      tools_list = json.loads(row['tools'])
      if not isinstance(tools_list, list):
        continue
    except (json.JSONDecodeError, KeyError):
        continue

    for tool in tools_list:
      api_name = standardize(tool['name'])
      api_description = tool['description']
      api_parameters = tool.get('parameters', {})

      # Create a hashable, order-independent key from the parameter names
      # A frozenset is used because it's immutable and can be a dict key.
      parameter_names = frozenset(standardize(k) for k in api_parameters.keys())
      
      # Create a unique key based on both name and description
      api_key = (api_name, parameter_names)
      
      if api_key not in unique_apis_dict:
        # If this (name, parameters) combination is new, add it to the dictionary
        unique_apis_dict[api_key] = {
          "name": api_name,
          "descriptions": {api_description},
          "parameters": api_parameters
        }
      else:
        # If it already exists, just add the new description to the set
        unique_apis_dict[api_key]["descriptions"].add(api_description)
  
  # Convert the 'descriptions' set to a list for JSON serialization
  for api in unique_apis_dict.values():
    api['descriptions'] = list(api['descriptions'])

  return unique_apis_dict

apigen_apis_dict = get_apigen_tools()
apigen_apis_list = list(apigen_apis_dict.values())
print(f"Found {len(apigen_apis_list)} unique APIs.")

with open("apigen_apis.json", 'w', encoding='utf-8') as f:
  json.dump(apigen_apis_list, f, indent=2, ensure_ascii=False)
# Pretty-print the resulting list of API dictionaries
# print("-" * 50)
# I'm using an indent of 2 as you prefer.
# print(json.dumps(unique_apis_list, indent=2))

Loading the Salesforce/xlam-function-calling-60k dataset...
Extracting unique APIs...
Found 3935 unique APIs.


In [2]:
print("APIs with multiple descriptions:")
ctr = 0
for api in apigen_apis_dict.values():
  if len(api['descriptions']) > 1:
    ctr += 1
    print(f"\n{api['name']}")
    for desc in sorted(api['descriptions']):
      print(f": {desc}")
print(f"\nFound {ctr} APIs with multiple descriptions.")

APIs with multiple descriptions:

get_product
: Fetches a product's information from the API using the given product ID.
: Fetches product details from a RapidAPI endpoint using the provided product ID and RapidAPI key.
: Fetches product details from an API using the given product ID.
: Fetches product details from the RapidAPI endpoint using the given product ID.
: Fetches product details from the RapidAPI endpoint using the provided product ID and API key.
: Fetches the product details for a given product ID from the RapidAPI demo project.

auto_complete
: Fetch auto-complete suggestions for a given query using the Wayfair API.
: Fetches auto-complete suggestions for a given query using the Twitter API.
: Get autocomplete suggestions for a given query using the Realty-in-AU API.

matchschedules
: Fetches football match schedules for a specific date using the Toolbench RapidAPI service.
: Fetches the cricket match schedules for a given date, including match timings, teams, and other r

In [4]:
import os
import json
import csv
import sys
from pathlib import Path

# Import utils functions for standardization
sys.path.append('server')

def get_toolbench_tools(tools_dir):
  """Scan the tools directory and extract all unique APIs."""
  unique_apis = {}
  api_count = 0

  if not os.path.exists(tools_dir):
    print(f"Error: Tools directory {tools_dir} does not exist")
    return {}

  print(f"Scanning tools directory: {tools_dir}")

  # Iterate through category directories
  for category_name in os.listdir(tools_dir):
    category_path = os.path.join(tools_dir, category_name)

    if not os.path.isdir(category_path):
      continue

    print(f"Processing category: {category_name}")
    standard_category = standardize_category(category_name)

    # Iterate through JSON files in each category
    for item in os.listdir(category_path):
      if item.endswith('.json'):
        json_path = os.path.join(category_path, item)

        try:
          with open(json_path, 'r', encoding='utf-8') as f:
            tool_data = json.load(f)

          # Extract tool information
          tool_name = tool_data.get('tool_name', item.replace('.json', ''))
          tool_description = tool_data.get('tool_description', '')
          standardized_tool_name = standardize(tool_name)

          # Process each API in the tool
          api_list = tool_data.get('api_list', [])
          for api in api_list:
            api_name = api.get('name', '')
            api_description = api.get('description', '')
            api_parameters = api.get('required_parameters', []) + api.get('optional_parameters', [])

            standardized_api_name = standardize(api_name)
            standardized_parameter_names = frozenset(standardize(p['name']) for p in api_parameters)

            # Create unique identifier using name and description
            api_key = (standardized_api_name, standardized_parameter_names)

            # Store unique API
            if api_key not in unique_apis:
              unique_apis[api_key] = {
                'category': standard_category,
                'tool_name': standardized_tool_name,
                'api_name': standardized_api_name,
                'api_description': api_description,
                'tool_description': tool_description,
                'required_parameters': api.get('required_parameters', []),
                'optional_parameters': api.get('optional_parameters', [])
              }
              api_count += 1

        except Exception as e:
          print(f"Error processing {json_path}: {e}")
          continue

  # print(f"Found {api_count} unique APIs across {len(os.listdir(tools_dir))} categories")
  return unique_apis

def save_to_json(unique_apis, output_file):
  """Save the unique APIs to a JSON file."""
  with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(list(unique_apis.values()), f, indent=2, ensure_ascii=False)

  # print(f"Saved {len(unique_apis)} unique APIs to {output_file}")

# Default paths
tools_dir = "tools/"
json_output = "toolbench_apis.json"

print("=" * 60)
print("API Compilation Script")
print("=" * 60)

# Scan tools directory
toolbench_apis = get_toolbench_tools(tools_dir)
# Save results
save_to_json(toolbench_apis, json_output)

# Print summary
print("\n" + "=" * 60)
print("SUMMARY")
print("=" * 60)

categories = set(api['category'] for api in toolbench_apis.values())
tools = set(api['tool_name'] for api in toolbench_apis.values())

print(f"Total unique APIs: {len(toolbench_apis)}")
print(f"Total categories: {len(categories)}")
print(f"Total tools: {len(tools)}")
print(f"Output file: {json_output}")

API Compilation Script
Scanning tools directory: tools/
Processing category: Transportation
Processing category: Cryptography
Processing category: Energy
Processing category: Finance
Processing category: Translation
Processing category: Email
Processing category: Media
Processing category: Payments
Processing category: News_Media
Processing category: Weather
Processing category: Artificial_Intelligence_Machine_Learning
Processing category: Business
Processing category: Storage
Processing category: Gaming
Processing category: Communication
Processing category: Customized
Processing category: Business_Software
Processing category: Other
Processing category: Jobs
Processing category: Video_Images
Processing category: Education
Processing category: Food
Processing category: Mapping
Processing category: Entertainment
Processing category: Cybersecurity
Processing category: Events
Processing category: Sports
Processing category: Devices
Processing category: Advertising
Processing category: Lo

In [5]:
def standardize_type(type_str):
  """Converts various type formats to a standard JSON schema type."""
  if not isinstance(type_str, str):
    return "string" # Default for unknown types
  
  type_str_lower = type_str.lower()
  type_map = {
    "str": "string",
    "string": "string",
    "int": "integer",
    "integer": "integer",
    "float": "number",
    "double": "number",
    "number": "number",
    "bool": "boolean",
    "boolean": "boolean"
  }
  return type_map.get(type_str_lower, "string") # Default to string if no match

def standardize_api(apigen_api, toolbench_api):
  """
  Formats a matched APIGen-toolbench API pair into the standard function definition.
  """
  function_def = {
    "name": toolbench_api.get("api_name"),
    "description": toolbench_api.get("api_description"),
    "parameters": {
      "type": "object",
      "properties": {},
      "required": []
    },
    "category": toolbench_api.get("category"),
    "descriptions": apigen_api.get("descriptions"),
    "tool_name": toolbench_api.get("tool_name"),
    "tool_description": toolbench_api.get("tool_description")
  }

  required_params_list = [p.get("name") for p in toolbench_api.get("required_parameters", []) if p.get("name")]
  if required_params_list:
    function_def["parameters"]["required"] = required_params_list

  all_params = toolbench_api.get("required_parameters", []) + toolbench_api.get("optional_parameters", [])

  for param in all_params:
    param_name = param.get("name")
    if not param_name:
      continue
    
    param_info = {
      "type": standardize_type(param.get("type")),
      "description": param.get("description", "")
    }
    
    function_def["parameters"]["properties"][param_name] = param_info

  return {
    "type": "function",
    "function": function_def
  }

def match_apis(apigen_apis, toolbench_apis):
  """
  Finds exact matches for apigen_apis within toolbench_apis based on API name and parameters.

  An exact match requires two conditions to be met:
  1. The 'name' in apigen_apis must be identical to the 'api_name' in toolbench_apis.
  2. The set of parameter names for both APIs must be identical.

  Args:
    apigen_apis: A list of APIs from the apigen.json file.
    toolbench_apis: A list of APIs from the toolbench.json file.

  Returns:
    A list of dictionaries, where each dictionary contains the apigen API and
    its corresponding exact match from the toolbench file.
  """
  matches = []
  print(f"Comparing {len(apigen_apis)} APIGen APIs with {len(toolbench_apis.keys())} ToolBench APIs...")
  for apigen_api in apigen_apis:
    apigen_name = apigen_api.get("name", "")
    apigen_params = frozenset(apigen_api.get("parameters", {}).keys())
    
    key = (apigen_name, apigen_params)
    
    # Perform a quick dictionary lookup to find a match.
    if key in toolbench_apis:
      match = standardize_api(apigen_api, toolbench_apis[key])
      matches.append(match)
      # print(f"  Exact ma tch found for API: '{apigen_name}'")

  return matches

matches = match_apis(apigen_apis_list, toolbench_apis)
print(f"Found matches for {len(matches)} APIGen APIs.")

with open("matched_apis.json", 'w', encoding='utf-8') as f:
  json.dump(matches, f, indent=2, ensure_ascii=False)

Comparing 3935 APIGen APIs with 41403 ToolBench APIs...
Found matches for 3360 APIGen APIs.
