In [3]:
import json
import os

import pandas as pd
from tqdm.auto import tqdm

from sft_data_gen import prompt_generation

In [4]:
# Root folder that holds the tool documentation, relative to this notebook.
tool_dir = "../toolset"

# Selected tools, written as "<source>/<tool>". They are kept in the same
# batches, and in the same order, in which they are appended to `tool_names`.
_tool_batches = (
    (
        "rapidapi/weatherapi_com",
        "rapidapi/news_api",
        "rapidapi/public_holiday",
        "dev_tools/calendar",
        "rapidapi/recipe_by_api_ninjas",
    ),
    (
        "rapidapi/objects_detection",
        "rapidapi/ocr_extract_text",
        "dev_tools/image_translation",
        "dev_tools/image_processing",
        "dev_tools/image_remove_bg",
        "rapidapi/web_capture",
    ),
    (
        "rapidapi/onecompiler_apis",
        "rapidapi/bing",
        "rapidapi/bing_search_apis",
        "rapidapi/google_api",
    ),
    (
        "rapidapi/skyscanner80",
        "rapidapi/currency_exchange",
        "rapidapi/geocoding_by_api_ninjas",
        "rapidapi/airports",
        "rapidapi/list_of_all_countries_and_languages_with_their_codes",
        "rapidapi/tourist_attraction",
    ),
    (
        "rapidapi/world_time_by_api_ninjas",
        "rapidapi/google_translate",
        "rapidapi/ip_geo_location",
        "dev_tools/calculator",
        "dev_tools/access_user_info",
        "dev_tools/agent_equipments",
    ),
)

# Flatten the batches into the single list the rest of the notebook iterates over.
tool_names = [entry for batch in _tool_batches for entry in batch]

In [2]:
# Collect one (source, tool, api_name) triple for every API declared in the
# JSON doc of each selected tool.
api_list = []
for name in tool_names:
    # Each entry has the form "<source>/<tool>"; its doc is read from
    # <tool_dir>/<source>/docs/<tool>.json.
    tool_src, tool = name.split("/")
    doc_fn = os.path.join(tool_dir, tool_src, "docs", tool + ".json")
    # Explicit UTF-8 so decoding does not depend on the platform's locale
    # encoding; the file is closed as soon as it has been parsed.
    with open(doc_fn, encoding="utf-8") as file:
        doc_obj = json.load(file)
    for api_obj in doc_obj["APIs"]:
        api_name = api_obj["function"]["name"]
        api_list.append((tool_src, tool, api_name))

# Optional export of the collected triples (left disabled, as in the original):
# df = pd.DataFrame(data = api_list, columns = ["source", "tool_name", "api_name"])
# df.to_csv("selected_tools.csv")

In [None]:
# Generate instructions for every selected tool and write one CSV per tool
# under ./output/instruction_data/type-i.
toolset_dir = os.path.join(os.getcwd(), "../toolset")
out_dir = "./output"

# The destination folder is the same for every tool: create it once, up front,
# so a bad output location fails before any generation work is done.
type_out_dir = os.path.join(out_dir, "instruction_data/type-i")
os.makedirs(type_out_dir, exist_ok=True)

for idx, tn in enumerate(tool_names):
    # 1-based progress so the final line reads N/N rather than (N-1)/N.
    print(f"==>> {idx + 1}/{len(tool_names)} {tn}")
    src, tool_name = tn.split('/')
    doc_path = os.path.join(toolset_dir, f"{src}/docs/{tool_name}.json")
    print(doc_path)
    # NOTE(review): the positional arguments 1 and 10 are passed through
    # unchanged; presumably the instruction type and a count — confirm
    # against prompt_generation.gen_instructions_given_tool_doc.
    instruct_df = prompt_generation.gen_instructions_given_tool_doc(doc_path, 1, 10, toolset_dir)
    out_fn = os.path.join(type_out_dir, f"{tool_name}.csv")
    instruct_df.to_csv(out_fn)

In [None]:
# Generate instructions for every selected tool and write one CSV per tool
# under ./output/instruction_data/type-ii.
toolset_dir = os.path.join(os.getcwd(), "../toolset")
out_dir = "./output"

# The destination folder is the same for every tool: create it once, up front,
# so a bad output location fails before any generation work is done.
type_out_dir = os.path.join(out_dir, "instruction_data/type-ii")
os.makedirs(type_out_dir, exist_ok=True)

for idx, tn in enumerate(tool_names):
    # 1-based progress so the final line reads N/N rather than (N-1)/N.
    print(f"==>> {idx + 1}/{len(tool_names)} {tn}")
    src, tool_name = tn.split('/')
    doc_path = os.path.join(toolset_dir, f"{src}/docs/{tool_name}.json")
    print(doc_path)
    # NOTE(review): the positional arguments 2 and 10 are passed through
    # unchanged; presumably the instruction type and a count — confirm
    # against prompt_generation.gen_instructions_given_tool_doc.
    instruct_df = prompt_generation.gen_instructions_given_tool_doc(doc_path, 2, 10, toolset_dir)
    out_fn = os.path.join(type_out_dir, f"{tool_name}.csv")
    instruct_df.to_csv(out_fn)

In [None]:
# Generate instructions for every selected tool and write one CSV per tool
# under ./output/instruction_data/type-iii.
toolset_dir = os.path.join(os.getcwd(), "../toolset")
out_dir = "./output"

# The destination folder is the same for every tool: create it once, up front,
# so a bad output location fails before any generation work is done.
type_out_dir = os.path.join(out_dir, "instruction_data/type-iii")
os.makedirs(type_out_dir, exist_ok=True)

for idx, tn in enumerate(tool_names):
    # 1-based progress so the final line reads N/N rather than (N-1)/N.
    print(f"==>> {idx + 1}/{len(tool_names)} {tn}")
    src, tool_name = tn.split('/')
    doc_path = os.path.join(toolset_dir, f"{src}/docs/{tool_name}.json")
    print(doc_path)
    # NOTE(review): the positional arguments 3 and 10 are passed through
    # unchanged; presumably the instruction type and a count — confirm
    # against prompt_generation.gen_instructions_given_tool_doc.
    instruct_df = prompt_generation.gen_instructions_given_tool_doc(doc_path, 3, 10, toolset_dir)
    out_fn = os.path.join(type_out_dir, f"{tool_name}.csv")
    instruct_df.to_csv(out_fn)

In [None]:
# Merge the per-tool instruction CSVs of one instruction type into a single
# CSV, filling blank tool/API cells from the closest preceding row.

instruction_type = "type-i"  # can change it to type-ii, type-iii
data_dir = f"output/instruction_data/{instruction_type}"
out_dir = "./output/instruction_data/"

# os.listdir returns entries in arbitrary order and also reports non-CSV
# entries (e.g. a ".ipynb_checkpoints" folder). Keep only regular *.csv files
# and sort them so the merged output is deterministic across runs.
name_list = sorted(
    name
    for name in os.listdir(data_dir)
    if name.endswith(".csv") and os.path.isfile(os.path.join(data_dir, name))
)

merge_data_list = []
for name in name_list:
    fn = os.path.join(data_dir, name)
    df = pd.read_csv(fn, index_col=0)
    # Last non-missing tool / API seen in this file; reset for every file.
    cur_tool, cur_api = None, None
    # Iterate rows positionally: this does not depend on index labels being
    # unique, and still raises if a file does not have exactly four columns.
    for tool_name, api_name, instruction, ins_t in df.itertuples(index=False, name=None):
        if pd.isna(tool_name):
            tool_name = cur_tool
        else:
            cur_tool = tool_name

        if pd.isna(api_name):
            api_name = cur_api
        else:
            cur_api = api_name

        # A file whose first rows leave the tool or API blank cannot be filled.
        assert not pd.isna(tool_name) and not pd.isna(api_name), f"Exception: tool_name: {tool_name}, api_name: {api_name}"
        merge_data_list.append((tool_name, api_name, instruction, ins_t))

merge_df = pd.DataFrame(data=merge_data_list, columns=["tool_name", "api_name", "instruction", "type"])
merge_df.to_csv(os.path.join(out_dir, f"merged_instructions_{instruction_type}.csv"))