In [None]:
# Download and decompress [data.tar.gz](https://huggingface.co/datasets/reasonwang/ToolGen-Datasets/blob/main/data.tar.gz). Other datasets are at [ToolGen-Datasets](https://huggingface.co/datasets/reasonwang/ToolGen-Datasets).

'''
pip install:
    deepspeed
    UniTok
    pigmento
    refconfig
    fastparquet
    transformers==4.40.0
    tokenizers==0.19.1
    numpy==1.25.2
    pyarrow==18.1.0
    backoff==2.2.1
    click==8.1.7
    faiss_cpu
    Flask==3.0.3
    flask_cors==5.0.0
    fschat==0.2.36
    httpx==0.27.2
    huggingface_hub==0.24.6
    nltk==3.9.1
    openai
    pandas==2.2.3
    peft
    psutil
    pydantic==2.9.2
    rank_bm25==0.2.2
    Requests==2.32.3
    scikit_learn==1.5.2
    scipy==1.14.1
    sentence_transformers==3.1.0
    tenacity==8.5.0
    termcolor==2.5.0
    tiktoken==0.7.0
    torch==2.4.1
    tqdm
    Unidecode
    fastapi==0.110.0
    PyYAML==6.0.1
    slowapi==0.1.9
    uvicorn==0.28.0

'''

In [None]:
# Extract the training data required by the codebook from the tools.json file of ToolBench, and transfer it to GenCodebook\data\tb_train and GenCodebook\data\tb_dev directories.
import csv
import json
import os
"""
{
    'product_id': 'api_2c3bbf59-df39-4b01-b91b-0f176c8effd9', 
    'tool_description': "Extract the information on a Thai driver's license and return text results such as driver's license number and personal information.", 
    'home_url': 'https://rapidapi.com/the-brainstem-brainbotapi/api/thai-drivers-license-ocr/', 
    'name': 'Thai Drivers License OCR', 
    'title': 'Thai Drivers License OCR', 
    'pricing': 'FREEMIUM', 
    'tool_name': 'Thai Drivers License OCR', 
    'score': None, 
    'host': 'thai-drivers-license-ocr.p.rapidapi.com', 
    'api_list': [{
        'name': "Driver's  License", 
        'url': 'https://thai-drivers-license-ocr.p.rapidapi.com/api/v1/ocr-licensedriver', 
        'description': "Extract the information on a Thai driver's license and return text results such as driver's license number and personal information.", 
        'method': 'POST', 
        'required_parameters': [], 
        'optional_parameters': [], 
        'code': 'import requests\n\nurl = "https://thai-drivers-license-ocr.p.rapidapi.com/api/v1/ocr-licensedriver"\n\nheaders = {\n            "X-RapidAPI-Key": "SIGN-UP-FOR-KEY",\n            "X-RapidAPI-Host": "thai-drivers-license-ocr.p.rapidapi.com"\n        }\n\nresponse = requests.post(url, headers=headers)\nprint(response.json())\n', 
        'convert_code': 'import requests\n\nurl = "https://thai-drivers-license-ocr.p.rapidapi.com/api/v1/ocr-licensedriver"\n\nheaders = {\n            "X-RapidAPI-Key": "SIGN-UP-FOR-KEY",\n            "X-RapidAPI-Host": "thai-drivers-license-ocr.p.rapidapi.com"\n        }\n\nresponse = requests.post(url, headers=headers)\nprint(response.json())\n', 
        'test_endpoint': '', 
        'statuscode': 200, 
        'schema': {}
        }
    ], 
    'category_name': 'Video_Images'
}

"""
# Load the ToolBench tool catalogue. The encoding is pinned to UTF-8 so the read
# does not depend on the platform default (the TSV below is written as UTF-8 too).
with open("GenCodebook/data/tools.json", 'r', encoding='utf-8') as f:
    tool_infos = json.load(f)

# Column-oriented table: each key is a column and all lists grow in lockstep.
# Row 0 is a synthetic "Finish" entry with aid 0, so real APIs are numbered from 1.
all_datas = {
    "aid": [0],
    "cat": [""],
    "namedesc": ["Finish"],   # tool_name&&api_name
    "acode": [""],   # code
    "aexam": ["The end of processing."]   # desc
}
idx = 1
for tool in tool_infos:
    for api in tool["api_list"]:
        all_datas["aid"].append(idx)
        all_datas["cat"].append(tool['category_name'])
        all_datas["namedesc"].append(f"{tool['tool_name']}&&{api['name']}")
        # API entries without a "code" key get an empty string.
        all_datas["acode"].append(api.get("code", ""))
        all_datas["aexam"].append(api["description"])
        idx += 1


def write_to_tsv(data, filename):
    """Write a column-oriented mapping to a tab-separated file.

    :param data: mapping of column name -> sequence of cell values; the key
        order defines the column order.
    :param filename: destination path, (over)written as UTF-8.

    Rows are produced by zipping the columns together, so output stops at the
    shortest column.
    """
    header = [name for name in data]
    records = zip(*[data[name] for name in header])

    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as out:
        tsv = csv.writer(out, delimiter='\t')
        tsv.writerow(header)
        for record in records:
            tsv.writerow(record)

# Destination of the tool table; later cells read this file back to map
# "tool_name&&api_name" to its numeric aid.
filename = "GenCodebook/data/tb_train/tools.tsv"
# Create the target folder on a fresh checkout so opening the file for writing cannot
# fail because the directory is missing.
os.makedirs(os.path.dirname(filename), exist_ok=True)
write_to_tsv(all_datas, filename)

In [None]:
# Train the codebook
# run train to get the compressing model: 
# > python worker.py --model llama3 --data tb --batch_size 12 --num_gist 2 --warmup True --mode train
# run export to get the gist embedding:
# > python worker.py --model llama3 --data tb --batch_size 12 --num_gist 2 --warmup True --mode export
# run cluster to get the codebook:
# > python cluster_kmeans.py --model llama3 --sign 8fa057 --num_comp 32 --num_clst 512 --num_dept 2
# the codebook might be named 8fa057@32-512-2.code

In [None]:
# Change codebook number into str, e.g. 1,2 --> <a_1><b_1>
import json
store_codebook_str = {}
codes = {}   # code sequence rendered as text -> number of entries sharing that sequence
with open("GenCodebook/tuning/Qwen3/8fa057@32-512-2.code", 'r', encoding='utf-8') as f:
    store_codebook = json.load(f)

for tool_id, code in store_codebook.items():
    # Position i is tagged with the i-th lowercase letter: [1, 2] -> ["<a_1>", "<b_2>"].
    codechars = [f"<{chr(i + ord('a'))}_{c}>" for i, c in enumerate(code)]

    # Count how many entries share exactly the same code sequence.
    code_key = ' '.join(map(str, code))
    codes[code_key] = codes.get(code_key, 0) + 1

    store_codebook_str[tool_id] = codechars

# Metadata stored under the reserved key "aid"; the first three values are hard-coded
# and the last is the largest number of entries sharing one code sequence.
store_codebook_str["aid"] = [2, 2, 512, max(codes.values())]   # gists, depth, kernels, overlaps

with open('GenCodebook/tuning/Qwen3/8fa057@32-512-2-2.code', 'w', encoding='utf-8') as f:
    json.dump(store_codebook_str, f, indent=4)

In [None]:
# The process of reusing the codebook
# 1. Make sure that the .kmeans file produced by cluster_kmeans.py has been generated properly.
# 2. Place the new "tool.tsv" file in the "data/xxx_train" or "data/xxx_dev" folder. Remember to rename the old "tool.tsv" file, and also rename the folder "data/xxx"
# 3. Run the command with "mode" set to "export" and set "batch_size" to 1. This is to enable sign to be distinguishable from the previous one (at this point, the "./prepare/" directory will generate a new "sign" folder. If the previous "tool.tsv" file is not correctly replaced and needs correction, this folder should also be cleaned up afterwards).
# 4. Run the python script "cluster_kmeans_inference.py", and make sure to correctly replace the paths within it.
# 5. The new codebook for the new "tool.tsv" file will be generated. You do not need to train the model again.

In [None]:
# Create a mapping dictionary 'Tool2hierarchicalId.json' for tool names and codebooks. This requires reference to ToolGen's Tool2AtomicId.json.
import json
import pandas as pd
from unidecode import unidecode

def remove_accents(text):
    """Return `text` after passing it through `unidecode`."""
    ascii_text = unidecode(text)
    return ascii_text


with open("data/Tool2AtomicId.json", 'r') as f:
    Tool2AtomicId = json.load(f)

# Invert the mapping (atomic id -> tool name). While doing so, every "_"-separated name
# segment equal to "fil" is rewritten to "url". The entry
# "e_url_for_audio_transcoder_compression_optimization_download_url" is not covered by
# this rule and still needs to be modified manually.
AtomicId2Tool = {
    atomic_id: "_".join("url" if part == "fil" else part for part in tool_name.split("_"))
    for tool_name, atomic_id in Tool2AtomicId.items()
}
print(len(AtomicId2Tool))

Tool2hierarchicalId = {}

# "tool_name&&api_name" -> aid, as stored in the tools table.
tools_table = pd.read_csv("GenCodebook/data/tb_train/tools.tsv", sep='\t')
tool2id_ = tools_table.set_index(['namedesc'])['aid'].to_dict()
# Same mapping keyed by the accent-stripped name; the two printed sizes show whether
# stripping accents merged any entries.
tool2id = {remove_accents(raw_name): aid for raw_name, aid in tool2id_.items()}
print("tool2id_: ", len(tool2id_))
print("tool2id: ", len(tool2id))

with open("GenCodebook/tuning/Qwen3/8fa057@32-512-2-2.code", 'r') as f:
    codebook = json.load(f)

for atomicId, tool_name in AtomicId2Tool.items():
    # Drop the first and last two characters of the atomic id
    # (presumably the "<<" / ">>" wrapper - confirm against Tool2AtomicId.json).
    apiname = atomicId[2:-2]
    # Report and skip ids that cannot be resolved instead of aborting the whole export.
    if apiname not in tool2id or str(tool2id[apiname]) not in codebook:
        print(atomicId)
        continue
    tool_idx = tool2id[apiname]
    # Keep the joined token string in its own variable so the `codebook` dict stays
    # available for the next iteration.
    hierarchical_id = "".join(map(str, codebook[str(tool_idx)]))
    Tool2hierarchicalId[tool_name] = hierarchical_id
print(len(Tool2hierarchicalId))

with open("data/Tool2hierarchicalId.json", 'w') as f:
    json.dump(Tool2hierarchicalId, f, indent=4)

In [None]:
# Add the unique tokens in the codebook, such as <a_1>, <b_1>, to the vocabulary of the base model.
import torch
import transformers
from unidecode import unidecode
import os
import numpy as np
from typing import Type, cast
import pandas as pd
import json

# tool2id: "tool_name&&api_name" -> aid, plus a variant keyed by the unidecode'd name
tools_table = pd.read_csv("GenCodebook/data/tb_train/tools.tsv", sep='\t')
tool2id_ = tools_table.set_index(['namedesc'])['aid'].to_dict()
tool2id = {unidecode(raw_name): aid for raw_name, aid in tool2id_.items()}
print("tool2id_: ", len(tool2id_))
print("tool2id: ", len(tool2id))

# id2embedding
# NOTE: allow_pickle=True unpickles arbitrary Python objects; only load .npy files you
# produced yourself.
id2emb = np.load("GenCodebook/tuning/Qwen3/8fa057_reconstructed_32.npy", allow_pickle=True).item()
id2emb = cast(dict, id2emb)

# id2union_codebook
# Load the token-string codebook ("...-2-2.code") written by the conversion cell: it is
# the file that carries the "aid" metadata entry popped below and whose codes are token
# strings such as "<a_1>", which the rest of this cell tokenizes.
with open("GenCodebook/tuning/Qwen3/8fa057@32-512-2-2.code", 'r') as f:
    union_codebook = json.load(f)
gists, depth, kernels, overlops = union_codebook.pop("aid")
# Number of code positions that receive new tokens.
runds = gists*depth

# Kernel embeddings: collect, for every code token, the embeddings found at the
# position where that token occurs. The last element of each code is skipped.
codeId2emb = {}
for item, code in union_codebook.items():
    for position, codeId in enumerate(code[:-1]):
        codeId2emb.setdefault(codeId, []).append(id2emb[item][position])
print(codeId2emb.keys())

# One token per (position letter, kernel index) pair: <a_0> ... <a_{kernels-1}>, <b_0>, ...
new_tokens = [
    f"<{chr(pos + ord('a'))}_{i}>"
    for pos in range(runds)
    for i in range(kernels)
]

# Extend the base tokenizer with the codebook tokens (added as regular, non-special tokens).
model_name_or_path = "models/qwen2.5-3B"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)
num_added_tokens = tokenizer.add_tokens(new_tokens=new_tokens, special_tokens=False)
print(f"Added {num_added_tokens} new tokens. Model now has {len(tokenizer)} tokens.")

# Load the base model in bfloat16 and grow its embedding table to the new vocabulary size.
load_options = {"torch_dtype": torch.bfloat16}
model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, **load_options)
model.resize_token_embeddings(len(tokenizer))

# Letter that would tag the position right after the last one that received new tokens.
beyond_last_letter = chr(runds + ord('a'))
for codeId, embs in codeId2emb.items():
    token_id = tokenizer(codeId, add_special_tokens=False).input_ids
    # Every codebook token must map to exactly one vocabulary id.
    assert len(token_id) == 1

    if beyond_last_letter in codeId:
        print("?")
        continue

    # Initialise the token's input embedding with the mean of its collected embeddings.
    stacked = torch.from_numpy(np.array(embs))
    embedding = stacked.mean(dim=0)
    embedding_dim = model.model.embed_tokens.weight.data.size(1)
    assert embedding.shape[0] == embedding_dim, f"Embedding dimension mismatch: {embedding.shape[0]} != {embedding_dim}"
    model.model.embed_tokens.weight.data[token_id[0]] = embedding

# Save the extended model and tokenizer side by side.
model.save_pretrained("models/qwen2.5-3B-virtualized-2@2", safe_serialization=True)
tokenizer.save_pretrained("models/qwen2.5-3B-virtualized-2@2")
# tool2id_:  46985
# tool2id:  46985
# Added 2048 new tokens. Model now has 130304 tokens.


In [None]:
# Construct domain-specific training data, including query-tool pairs and trajectories
import re
import pandas as pd
from unidecode import unidecode

tool2id_ = pd.read_csv("GenCodebook/data/tb_train/tools.tsv", sep='\t').set_index(['namedesc'])['aid'].to_dict()
tool2id = {}
for raw_name, aid in tool2id_.items():
    # Register each tool under two keys: the name with backslash escapes interpreted,
    # and the name as stored; both are passed through unidecode.
    # NOTE(review): .decode('unicode_escape') raises on malformed escape sequences.
    escaped_form = raw_name.encode('utf-8').decode('unicode_escape')
    for variant in (escaped_form, raw_name):
        tool2id[unidecode(variant)] = aid
print("tool2id_: ", len(tool2id_))
print("tool2id: ", len(tool2id))

import json
def _read_json(path):
    """Parse and return the JSON document stored at `path`."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Query-tool pairs, trajectories, and the token-string codebook.
toolgen_atomic_retrieval_G123 = _read_json("data/toolgen_atomic_retrieval_G123.json")
print("toolgen_atomic_retrieval_G123: ", len(toolgen_atomic_retrieval_G123))
toolgen_atomic_G123_dfs = _read_json("data/toolgen_atomic_G123_dfs.json")
print("toolgen_atomic_G123_dfs: ", len(toolgen_atomic_G123_dfs))


store_codebook = _read_json("GenCodebook/tuning/Qwen3/8fa057@32-512-2-2.code")
print("store_codebook: ", len(store_codebook))

# ===
toolscaler_hierarchical_retrieval_G123 = []
for item in toolgen_atomic_retrieval_G123:
    # Records without a conversation are printed and left out of the output.
    if "conversations" not in item:
        print(item)
        continue
    # The second turn holds the tool identifier; its first and last two characters
    # are dropped to obtain the tools-table key.
    answer_turn = item["conversations"][1]
    tool_api = answer_turn["content"][2:-2]
    tool_idx = tool2id[tool_api]
    # Replace the identifier with the concatenated codebook tokens (mutates `item` in place).
    answer_turn["content"] = "".join(store_codebook[str(tool_idx)])
    toolscaler_hierarchical_retrieval_G123.append(item)

with open("data/toolscaler_hierarchical_2@2_retrieval_G123_qwen3.json", 'w') as f:
    json.dump(toolscaler_hierarchical_retrieval_G123, f, indent=4)


# ===
def replace_custom_tags(text):
    """
    Replace every <<...>> tag in the text with the codebook tokens of the tool it names.

    A tag whose name is not a key of `tool2id` is left unchanged.

    :param text: original text string
    :return: text string after replacement
    """
    def _to_codebook(match):
        name = match.group(1)   # Tool_name&&Api_name
        # Interpret backslash escapes, then apply unidecode, mirroring how the
        # `tool2id` keys were built.
        lookup_key = unidecode(name.encode('utf-8').decode('unicode_escape'))
        if lookup_key not in tool2id:
            return f"<<{name}>>"
        return "".join(store_codebook[str(tool2id[lookup_key])])

    # Non-greedy match of the content between << and >>.
    return re.sub(r'<<(.*?)>>', _to_codebook, text)

toolscaler_hierarchical_G123_dfs = []
for item in toolgen_atomic_G123_dfs:
    if "conversations" not in item:
        print("something wrong: ", item)
    else:
        # Rewrite the tool tags of every turn in place.
        for step in item["conversations"]:
            step["value"] = replace_custom_tags(step["value"])
    # Records are kept in the output even when they have no conversation.
    toolscaler_hierarchical_G123_dfs.append(item)

with open("data/toolscaler_hierarchical_2@2_G123_dfs_qwen3.json", 'w') as f:
    json.dump(toolscaler_hierarchical_G123_dfs, f, indent=4)


In [None]:
# Install llama-factory, and then configure the information of the dataset in LLaMA-Factory/data/dataset_info.json, such as:
"""
"toolscaler_hierarchical_2@2_retrieval_G123_qwen3": {
    "file_name": "data/toolscaler_hierarchical_2@2_retrieval_G123_qwen3.json",
    "formatting": "sharegpt",
    "columns": {
      "messages": "conversations"
    },
    "tags": {
      "role_tag": "role",
      "content_tag": "content",
      "user_tag": "user",
      "assistant_tag": "assistant"
    }
  },
"""

In [None]:
# Toolscaler Retrieval - Full-scale fine-tuning:
# > CUDA_VISIBLE_DEVICES=1,2,3,4 FORCE_TORCHRUN=1 llamafactory-cli train Scripts/full/gist_ret_sft_ds3.yaml
# 
#   Eval the retrieval model with NDCG:
#   > cp data/Tool2hierarchicalId.json Evaluator/data/toolenv/Tool2hierarchicalId.json
#   > cd Evaluator
#   > python eval_toolscaler_qwen.py

In [None]:
# ToolScaler E2E - Full-scale fine-tuning:
# > bash E2eTraining/scripts/train.sh
# 
#   Eval the E2E model with StableToolBench:
#   Get ToolBench key from https://github.com/OpenBMB/ToolBench repo. Then deploy https://github.com/THUNLP-MT/StableToolBench following the instructions in their repo.
#   To start StableToolBench/server, run the following command in the folder: 
#   > tmux new -d -s e2e 'python main.py > out.log 2>&1'
#   > cd Evaluator
#   > bash scripts/inference/inference_toolscaler_pipeline_virtual.sh
#   > bash scripts/convert_answer/run_convert_answer.sh
#   > bash scripts/pass_rate/run_pass_rate.sh
#   > bash scripts/preference/run_preference.sh