In [2]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("/root/PycharmProjects/ProtAgent")
import os
import yaml
import random
import json

from agent.utils.others import setup_seed
from agent.tools.tool_manager import ToolManager
from openai import OpenAI
from tqdm import tqdm
from agent.agent.multi_agent_backbone import ProteinReActAgent

  from .autonotebook import tqdm as notebook_tqdm


# Load all available tools

In [238]:
tool_dir = "/root/PycharmProjects/ProtAgent/agent/tools"
tools = {}
input_type2tool = {}
output_type2tool = {}

exclude_list = [
    "protrek_text2text"
]

for dir_name in os.listdir(tool_dir):
    dir_path = os.path.join(tool_dir, dir_name)
    
    # Load a tool
    if os.path.isdir(dir_path) and os.path.exists(os.path.join(dir_path, "config.yaml")):
        config_path = os.path.join(dir_path, "config.yaml")
        with open(config_path, "r") as f:
            config = yaml.safe_load(f)
            
            # Load tool document
            docs = config["document"]
            if isinstance(docs, dict):
                docs = [docs]
            
            for tool_doc in docs:
                if tool_doc["tool_name"] in exclude_list:
                    continue
                
                try:
                    # Record input types
                    input_type_cnt = {}
                    input_types = []
                    for param in tool_doc["required_parameters"]:
                        detailed_type = param["detailed_type"]
                        # Count the number of the same type
                        input_type_cnt[detailed_type] = input_type_cnt.get(detailed_type, 0) + 1
                        input_types.append(f"{detailed_type}_{input_type_cnt[detailed_type]}")
                        
                    # Record output types    
                    output_type_cnt = {}
                    output_types = []
                    for param in tool_doc["return_values"]:
                        detailed_type = param["detailed_type"]
                        # Count the number of the same type
                        output_type_cnt[detailed_type] = output_type_cnt.get(detailed_type, 0) + 1
                        output_types.append(f"{detailed_type}_{output_type_cnt[detailed_type]}")
                    
                    tools[tool_doc["tool_name"]] = {
                        "input_types": input_types,
                        "output_types": output_types,
                        "description": tool_doc["tool_description"],
                    }
                    
                    for input_type in input_types:
                        if input_type not in input_type2tool:
                            input_type2tool[input_type] = []
                        input_type2tool[input_type].append(tool_doc["tool_name"])
                    
                    for output_type in output_types:
                        if output_type not in output_type2tool:
                            output_type2tool[output_type] = []
                        output_type2tool[output_type].append(tool_doc["tool_name"])
                
                except Exception as e:
                    print(dir_name, e)
                    pass

saprot_task 'detailed_type'


In [338]:
print(tools["saprot_mutation_byinfo"])
print(tools["hmmsearch"])

{'input_types': ['AA_SEQUENCE_1', 'MUTATION_INFO_1'], 'output_types': ['MUTATION_SCORE_1'], 'description': 'Predicts the mutational effect score of a given specific mutation on a wild type sequence.'}
{'input_types': ['FASTA_PATH_1', 'FASTA_PATH_2'], 'output_types': ['AA_SEQUENCE_1'], 'description': "HMMER's hmm_search tool is used to search for sequence homologs of a query protein domain model within a sequence database. It utilizes Hidden Markov Models (HMMs) for identifying domains and sequences with similar structural and functional motifs."}


In [339]:
print(input_type2tool.keys())
type2case = {
    "A3M_PATH_1": "example_1.a3m",
    "FASTA_PATH_1": "example_1.fasta",
    "FASTA_PATH_2": "example_2.fasta",
    "AA_SEQUENCE_1": "MSATAEQNARNPKGKGGFARTVSQRKRKRLFLIGGALAVLAVAVGLMLTAFNQDIRFFRTPADLTEQDMTSGARFRLGGLVEEGSVSRTGSELRFTVTDTIKTVKVVFEGIPPDLFREGQGVVAEGRFGSDGLFRADNVLAKHDENYVPKDLADSLKKKGVWEGK",
    "AA_SEQUENCE_2": "MITLDWEKANGLITTVVQDATTKQVLMVAYMNQESLAKTMATGETWFWSRSRKTLWHKGATSGNIQTVKTIAVDCDADTLLVTVDPAGPACHTGHISCFYRHYPEGKDLT",
    "FULL_STRUCTURE_PATH_1": "example_1.pdb",
    "FULL_STRUCTURE_PATH_2": "example_2.pdb",
    "FOLDSEEK_SEQUENCE_1": "dddddddddddddddpdpppvcppvnvvvvvvvvvvvvvvvvvvvvvvvvvvqdpqdedeqvrddpcqqpvqhkhkykafwappqwdddpqkiwtwghnppgiaieieghdappqddhrfikifiaghdpvrhtygdhidtdddpddddvvnvvvcvvvvndpdd",
    "TEXT_1": "Catalyzes the hydrolysis of cutin, a polyester that forms the structure of plant cuticle.",
    "RFDIFFUSION_CONTIGS_1": "A:50",
    "MUTATION_INFO_1": "A123B:C124D",
    "SMILES_1": "CC(=O)OC1=C(C(=C(C=C1)C(=O)O)C(=O)O)C(=O)O",
    "PDB_ID_1": "1A2B",
    "HMM_PATH_1": "example.hmm",
    "UNIPROT_ID_1": "P05067",
    "PFAM_ID_1": "PF00085",
    "UNIPROT_SUBSECTION_1": "Function",
    'QUESTION_1': "What is the function of the protein?",
}

dict_keys(['A3M_PATH_1', 'FASTA_PATH_1', 'AA_SEQUENCE_1', 'FULL_STRUCTURE_PATH_1', 'FOLDSEEK_SEQUENCE_1', 'TEXT_1', 'RFDIFFUSION_CONTIGS_1', 'SAPROT_ADAPTOR_1', 'MUTATION_INFO_1', 'AA_SEQUENCE_2', 'LABEL_NUM_1', 'ADAPTOR_DIRECTORY_1', 'FULL_STRUCTURE_PATH_2', 'SMILES_1', 'FASTA_PATH_2', 'PDB_ID_1', 'HMM_PATH_1', 'TRAINING_DATASET_1', 'UNIPROT_ID_1', 'PFAM_ID_1', 'UNIPROT_SUBSECTION_1', 'QUESTION_1'])


In [1304]:
"value.pdbci".endswith((".pdb", ".cif"))

False

In [437]:
print(input_type2tool.keys())
print(output_type2tool.keys())
print(input_type2tool["MUTATION_INFO_1"])

dict_keys(['A3M_PATH_1', 'FASTA_PATH_1', 'AA_SEQUENCE_1', 'FULL_STRUCTURE_PATH_1', 'FOLDSEEK_SEQUENCE_1', 'TEXT_1', 'RFDIFFUSION_CONTIGS_1', 'SAPROT_ADAPTOR_1', 'MUTATION_INFO_1', 'AA_SEQUENCE_2', 'LABEL_NUM_1', 'ADAPTOR_DIRECTORY_1', 'FULL_STRUCTURE_PATH_2', 'SMILES_1', 'FASTA_PATH_2', 'PDB_ID_1', 'HMM_PATH_1', 'TRAINING_DATASET_1', 'UNIPROT_ID_1', 'PFAM_ID_1', 'UNIPROT_SUBSECTION_1', 'QUESTION_1'])
dict_keys(['HMM_PATH_1', 'AA_SEQUENCE_LIST_1', 'SEQUENCE_COUNT_1', 'FASTA_PATH_1', 'FULL_STRUCTURE_PATH_1', 'CHAIN_ID_1', 'AA_SEQUENCE_1', 'CHAIN_COUNT_1', 'UNIPROT_ID_1', 'PROTEIN_LENGTH_1', 'SEQUENCE_IDENTITY_1', 'PROTREK_SCORE_1', 'FOLDSEEK_SEQUENCE_1', 'TEXT_1', 'ALN_PATH_1', 'XML_PATH_1', 'CLASSIFICATION_RESULT_1', 'MUTATION_SCORE_1', 'REGRESSION_RESULT_1', 'TOKEN_CLASSIFICATION_RESULT_1', 'MPNN_CSV_PATH_1', 'TEXT_2', 'URL_1', 'DIRECTORY_1', 'TMSCORE_1', 'DATE_1', 'DOI_1', 'A3M_PATH_1', 'HHR_PATH_1', 'MMSEQS_ID_PATH_1', 'ADAPTOR_DIRECTORY_1', 'LABEL_NUM_1', 'PINAL_CSV_PATH_1'])
['sapr

# Sample tool calling trajectories

In [322]:
setup_seed(20000812)

# Decide the number of steps
step = 4
num = 10
cases = {}

while len(cases) < num:
    # Randomly sample the initial tool
    init_tool = random.choice(list(tools.keys()))
    
    tool_chain = [init_tool]
    # Inputs that the user has to provide
    required_inputs = set(tools[init_tool]["input_types"])
    # Inputs that later steps can obtain
    available_inputs = required_inputs.union(set(tools[init_tool]["output_types"]))
    
    for i in range(step-1):
        # Obtain available tools for the next step
        output_types = tools[tool_chain[-1]]["output_types"]
        next_tools = []
        for output_type in output_types:
            if output_type in input_type2tool:
                next_tools.extend(input_type2tool[output_type])
        
        # If no next tool is available, break the loop
        if next_tools == []:
            break
        
        # Randomly sample the next tool (can be the same as the previous one)
        while True:
            next_tool = random.choice(next_tools)
            if next_tool not in tool_chain:
                break
        tool_chain.append(next_tool)
        
        # Update the required inputs and available inputs
        new_required_inputs = set(tools[next_tool]["input_types"]) - available_inputs
        required_inputs = required_inputs.union(new_required_inputs)
        available_inputs = available_inputs.union(set(tools[next_tool]["output_types"]))
    
    # If some required inputs are not provided, skip this tool chain
    if required_inputs - type2case.keys() != set():
        continue
    
    else:
        tool_order = " -> ".join(tool_chain)
        cases[tool_order] = {
            "tool_chain": tool_chain,
            "required_inputs": required_inputs,
        }


print(cases)

{'hmmsearch -> saprot_mutation_byinfo': {'tool_chain': ['hmmsearch', 'saprot_mutation_byinfo'], 'required_inputs': {'MUTATION_INFO_1', 'FASTA_PATH_2', 'FASTA_PATH_1'}}, 'pinal': {'tool_chain': ['pinal'], 'required_inputs': {'TEXT_1'}}, 'saprot_mutation_byinfo': {'tool_chain': ['saprot_mutation_byinfo'], 'required_inputs': {'MUTATION_INFO_1', 'AA_SEQUENCE_1'}}, 'uniprot_fetch_subsection -> biorxiv -> pinal': {'tool_chain': ['uniprot_fetch_subsection', 'biorxiv', 'pinal'], 'required_inputs': {'UNIPROT_ID_1', 'UNIPROT_SUBSECTION_1'}}, 'seq2fasta -> clustalw': {'tool_chain': ['seq2fasta', 'clustalw'], 'required_inputs': {'AA_SEQUENCE_1'}}, 'hhblits -> hhmake -> hhsearch': {'tool_chain': ['hhblits', 'hhmake', 'hhsearch'], 'required_inputs': {'AA_SEQUENCE_1'}}, 'pfam_entry -> biorxiv -> wikipedia -> pinal': {'tool_chain': ['pfam_entry', 'biorxiv', 'wikipedia', 'pinal'], 'required_inputs': {'PFAM_ID_1'}}, 'rfdiffusion_binder_design -> diffab_optimize': {'tool_chain': ['rfdiffusion_binder_desi

# Generate user query

In [3]:
tool_manager = ToolManager(enable_quick_run=True)
tool_manager.set_out_dir("/home/public/ProtAgent/examples")

In [196]:
# Record how can a input type be converted to another type by a tool
transfer_matrix = {}
for tool_name, obj in tool_manager.tools.items():
    doc = obj.config.document
    input_types = set([param["detailed_type"] for param in doc.required_parameters])
    output_types = set([param["detailed_type"] for param in doc.return_values])
    
    if len(input_types) == 1:
        input_type = input_types.pop()
        # Each output type can be generated given the input type and the tool
        if input_type not in transfer_matrix:
            transfer_matrix[input_type] = {output_type: [tool_name] for output_type in output_types}
        
        else:
            for output_type in output_types:
                transfer_matrix[input_type][output_type] = transfer_matrix[input_type].get(output_type, []) + [tool_name]
    

def find_path(input_type: str, output_type: str, exclusive: set = None) -> list:
    """
    Find the shortest paths from input_type to output_type in the transfer matrix.
    Args:
        input_type: Input type
        output_type: Output type

    Returns:
        A list of tool chains that can convert the input_type to output_type.
    """
    assert input_type != output_type, f"input type and output type are the same: {input_type}"
    
    if input_type not in transfer_matrix:
        return []
    
    # If the input type can be converted to the output type directly
    if output_type in transfer_matrix[input_type]:
        return [[tool] for tool in transfer_matrix[input_type][output_type]]
    
    else:
        if exclusive is None:
            exclusive = set()
        new_exclusive = exclusive.union({input_type})
        
        shortest_paths = []
        for available_output_type, tools in transfer_matrix[input_type].items():
            # If the output type is not in the exclusive list
            if available_output_type not in new_exclusive:
                # If the output type can be converted to the target output type
                paths = find_path(available_output_type, output_type, new_exclusive)
                if paths != []:
                    paths = [[tool] + path for tool in tools for path in paths]
                    shortest_paths.extend(paths)
        
        if shortest_paths:
            # Filter the paths to keep the shortest ones
            min_length = min([len(path) for path in shortest_paths])
            shortest_paths = [path for path in shortest_paths if len(path) == min_length]
        
        return shortest_paths


find_path("FASTA_PATH", "AA_SEQUENCE_LIST")

[['fasta2seq']]

In [195]:
tool_manager.tools["fasta2seq"].config.document

{'category_name': 'sequence_operator',
 'tool_name': 'fasta2seq',
 'tool_description': 'This tool converts a FASTA file into a list of protein sequences.',
 'required_parameters': [{'name': 'fasta_file',
   'type': 'PATH',
   'detailed_type': 'FASTA_PATH',
   'description': 'The FASTA file containing the protein sequences to be converted.'}],
 'optional_parameters': [],
 'return_values': [{'name': 'protein_sequences',
   'type': 'LIST',
   'detailed_type': 'AA_SEQUENCE_LIST',
   'description': 'A list of protein sequences extracted from the FASTA file.'},
  {'name': 'count',
   'type': 'INTEGER',
   'detailed_type': 'SEQUENCE_COUNT',
   'description': 'The number of protein sequences extracted from the FASTA file.'}]}

In [332]:
def generate_query(tool_chain: list, required_inputs: set) -> str:
    prompt = """\
    You are a helpful AI assistant. Your task is to generate a reasonable user query based on a given tool chain so that this query could be answered after the tool chain is executed successfully.
    
    The description of each tool is provided below:
    PUT_TOOL_DESCRIPTION_HERE
    
    The order of the tools is provided below:
    PUT_TOOL_ORDER_HERE
    
    The inputs you should provide are:
    PUT_TOOL_INPUT_HERE
    
    You have to generate a query like a normal user. The query should be a natural language question or command that reflect what the user wants to do, usually regarding a real-world scenario. The query should be clear and concise. The query should contain the inputs you provided, not the keys but only the values. And you should not directly mention the tool names in your query.
    
    Example:
    1. Predict the structure of the sequence "AEGIKL", and the calculate the tmscore between this structure and "/example.pdb".
    
    2. Can you fetch the 3D structural data for the protein with UniProt ID P05067 and then align it with the structure in "/example_2.cif" to determine the TM-score?
    
    Now, please generate a user query based on the above information. Directly output the generated query.
    """
    tool_desc = tool_manager.brief_documents(tool_chain)
    
    tool_order = " -> ".join(tool_chain)
    user_input =  "\n".join([f"{key}: '{type2case[key]}'" for key in required_inputs])
    prompt = prompt.replace("PUT_TOOL_DESCRIPTION_HERE", tool_desc).replace("PUT_TOOL_ORDER_HERE", tool_order).replace("PUT_TOOL_INPUT_HERE", user_input)
    
    client = OpenAI(
        api_key="1192ecb8bb9947e1add0e27a7a61500e.6R9bireyVXRL8szS",
        base_url="https://open.bigmodel.cn/api/paas/v4/"
    )
    
    response = client.chat.completions.create(
        model="glm-4",  
        messages=[
            {"role": "user", "content": prompt},
        ],
        temperature=0
    )
    
    return response.choices[0].message.content

In [340]:
for key, case_dict in tqdm(cases.items()):
    query = generate_query(case_dict["tool_chain"], case_dict["required_inputs"])
    cases[key]["query"] = query

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:38<00:00,  3.89s/it]


In [342]:
save_path = "/home/public/ProtAgent/agent_testset.tsv"
with open(save_path, "w") as f:
    f.write("tool_chain\trequired_inputs\tquery\n")
    for key, case_dict in tqdm(cases.items()):
        f.write(f"{key}\t{case_dict['required_inputs']}\t{case_dict['query']}\n")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 89813.79it/s]


# Evaluation

In [344]:
model_path = "/root/PycharmProjects/ProtAgent/modelhub/Qwen2.5-7B-Instruct"
# api_key = "1192ecb8bb9947e1add0e27a7a61500e.6R9bireyVXRL8szS"
api_key = "7fb9d05edadc426aaa836e334c706ae2.s4aLWRTeepiRDZPl"
tool_manager = ToolManager("/root/PycharmProjects/ProtAgent/output/temp")
agent = ProteinReActAgent(
    None,
    api_key,
    tool_manager
)

In [345]:
agent.tool_manager.set_out_dir("/home/public/ProtAgent/examples")

In [411]:
for case, case_dict in cases.items():
    query = case_dict["query"]
    
    messages = [
        {"role": "user", "content": query}
    ]

    lm = agent.stream_chat(messages)
    for response in lm:
        from IPython import display
        display.clear_output(wait=True)
        content = response.content
        print(query)
        for text_type, (st, ed) in response.fragments.items():
            print(response.content[st:ed])
    break

Predict the homologous protein domains in the sequence database provided in 'example_1.fasta', and then estimate the mutational effect score of the mutation 'A123B:C124D' on the wild type sequence from 'example_2.fasta'.
**Analysis:**

<TypeAnalyzer>
+ protein sequence(text) -> homologous protein domains(text)
+ mutation description(text) -> mutational effect score(text)
</TypeAnalyzer>
**Plan:**

<Planner>
1. Use a sequence analysis tool to identify homologous protein domains in the sequence database provided in 'example_1.fasta'.
2. Use the identified homologous protein domains as input to estimate the mutational effect score of the mutation 'A123B:C124D' on the wild type sequence from 'example_2.fasta'.
3. Provide the mutational effect score as the output of the workflow.
</Planner>
**Thought 1:**

<StepLocator>
The current task is to use a sequence analysis tool to identify homologous



KeyboardInterrupt



# Query parser

In [335]:
from agent.agent.multi_agent_api.query_parser import QueryParserAPI

query_parser = QueryParserAPI(model_name="qwen-turbo", tool_manager=tool_manager)

In [336]:
request = """\
Find and align protein sequences that catalyze the hydrolysis of cutin, and filter the results to remove redundancy, focusing on diversity and sequence coverage.
"""

lm = query_parser.stream_chat(request)
st = 0
for response in lm:
    content = response.content
    print(content[st:], end="")
    st = len(content)

<QueryParser>
{
    "TEXT": ["hydrolysis of cutin"],
    "AA_SEQUENCE": []
}
</QueryParser><QueryParser>
{
    "TEXT": ["hydrolysis of cutin"]
}
</QueryParser>

# Planner

In [6]:
from agent.agent.sujin_multi_agent_api.new_planner import NewPlannerAPI
from agent.utils.constants import AGENT_STATUS, AgentResponse

new_planner = NewPlannerAPI(model_name="qwen-turbo", tool_manager=tool_manager)

In [21]:
# query = "What can you tell me about the protein with this Foldseek structure sequence: 'dddddddddddddddpdpppvcppvnvvvvvvvvvvvvvvvvvvvvvvvvvvqdpqdedeqvrddpcqqpvqhkhkykafwappqwdddpqkiwtwghnppgiaieieghdappqddhrfikifiaghdpvrhtygdhidtdddpddddvvnvvvcvvvvndpdd'?"
# query = "Calculate TMscore"
# query = "Find the 3D structure of the protein with the given amino acid sequence 'MSATAEQNARNPKGKGGFARTVSQRKRKRLFLIGGALAVLAVAVGLMLTAFNQDIRFFRTPADLTEQDMTSGARFRLGGLVEEGSVSRTGSELRFTVTDTIKTVKVVFEGIPPDLFREGQGVVAEGRFGSDGLFRADNVLAKHDENYVPKDLADSLKKKGVWEGK', then identify similar structures, match it to a Pfam entry, and provide any relevant research articles on this protein family."
query = "I have a sequence, predict its structure for me"
# query = "I have uploaded the following files: \"\"example_1.pdb`` ``example_2.pdb`` compute tmscore"

lm = new_planner.stream_chat(query, message_pool=[])
# lm = new_planner.stream_chat(query, prev_plan=prev_plan, feedback=feedback)
for response in lm:
    from IPython import display
    display.clear_output(wait=True)
    
    print(response.content, end="")


{
    "sender": "planner",
    "content": {
        "user_request": "I have a sequence, predict its structure for me",
        "analysis": "The user wants to predict the structure of a protein based on its sequence. We will use the ESMFold tool to predict the 3D structure of the protein since it is suitable for proteins lacking homologous sequence information.",
        "step_1": {
            "tool": "esmfold",
            "reason": "ESMFold is a powerful tool for predicting the 3D structure of proteins directly from their amino acid sequences."
        },
        "step_2": {
            "tool": "chat",
            "reason": "After predicting the structure using ESMFold, we will communicate the results to the user."
        }
    }
}


In [41]:
plan = {
    "user_request": "Calculate tmscore. One is 'a.pdb'",
    "analysis": "The user wants to calculate the TM-score, which is a measure of structural similarity between two protein structures. This requires aligning the two structures and then calculating the TM-score.",
    "1": {
        "tool": "tmalign",
        "goal": "Align two protein structures and calculate the TM-score."
    }
}

from agent.agent.sujin_multi_agent_api.tool_executor import ToolExecutorAPI
from agent.utils.constants import AGENT_STATUS, AgentResponse

api_key = "896b80950544431e8c583e03781bd14a.v2sd48fGT009KSo6"
tool_executor = ToolExecutorAPI(api_key, tool_manager, model="glm-4-flash")

TypeError: ToolExecutorAPI.__init__() takes 1 positional argument but 3 were given

In [217]:
lm = tool_executor.stream_chat(plan)
st = 0
for response in lm:
    content = response.content
    print(content[st:], end="")
    st = len(content)

NameError: name 'tool_executor' is not defined

# Plan optimizer

In [24]:
from agent.agent.sujin_multi_agent_api.plan_optimizer import PlanOptimizerAPI

plan_optimizer = PlanOptimizerAPI(model_name="qwen-turbo", tool_manager=tool_manager)

In [51]:
plan = {
    "sender": "planner",
    "content": {
        "user_request": "I have a sequence, predict its structure for me",
        "analysis": "The user wants to predict the structure of a protein based on its sequence. We will use the ESMFold tool to predict the 3D structure of the protein since it is suitable for proteins lacking homologous sequence information.",
        "step_1": {
            "tool": "esmfold",
            "reason": "ESMFold is a powerful tool for predicting the 3D structure of proteins directly from their amino acid sequences."
        },
        "step_2": {
            "tool": "chat",
            "reason": "After predicting the structure using ESMFold, we will communicate the results to the user."
        }
    }
}


lm = plan_optimizer.stream_chat(plan["content"])
st = 0
for response in lm:
    # from IPython import display
    # display.clear_output(wait=True)
    # 
    # print(response.content, end="")
    pass

type_dict {'AA_SEQUENCE': [], 'FULL_STRUCTURE_PATH': []}
true_dict {}
wrong_dict {}


In [119]:
print(response.content)

{
   "sender": "plan_evaluator",
   "content": {
        "analysis": This plan is invalid. The tool pfam_match requires the following input types: {'UNIPROT_ID'}, but the available input types are: {'QUESTION', 'AA_SEQUENCE', 'FULL_STRUCTURE_PATH'}. Missing types: {'UNIPROT_ID'}. To call the tool, you should call other tools that return the missing types {'UNIPROT_ID'}.,
        "accept": no,
   }
}


# Argument Extractor

In [1113]:
from agent.agent.sujin_multi_agent_api.arg_extractor import ArgumentExtractorAPI
from agent.utils.constants import AGENT_STATUS, AgentResponse

# Zhipu AI
api_key = "896b80950544431e8c583e03781bd14a.v2sd48fGT009KSo6"
# Qwen
api_key = "sk-ad8e7bbdc1b344efbcdb1e48bab78b84"
arg_extractor = ArgumentExtractorAPI(api_key, tool_manager, model="qwen-turbo")

In [1114]:
plan = {
        "user_request": "Could you extract the foldseek structural sequence for the protein 6Z4G from the PDB, which is associated with viral entry into host cells?",
        "analysis": "The user wants to extract the foldseek structural sequence for the protein 6Z4G from the PDB. This task requires a tool capable of processing PDB files and extracting the foldseek sequence. The tool 'pdb2aaseq' can extract amino acid sequences from PDB files, and 'foldseek' can generate foldseek sequences from protein structures. Therefore, the first step is to retrieve the PDB file for the protein 6Z4G using the 'get_pdb' tool, and then extract the foldseek sequence using the 'foldseek' tool.",
        "step_1": {
            "tool": "get_pdb",
            "reason": "Retrieve the PDB file for the protein 6Z4G using the 'get_pdb' tool."
        },
        "step_2": {
            "tool": "foldseek",
            "reason": "Extract the foldseek structural sequence for the protein 6Z4G from the retrieved PDB file using the 'foldseek' tool."
        }
    }

task_id = "step_2"
lm = arg_extractor.stream_chat(plan, task_id, message_pool)
# lm = arg_extractor.stream_chat(plan, task_id, message_pool=message_pool[:-1])
st = 0
for response in lm:
    content = response.content
    print(content[st:], end="")
    st = len(content)

<ArgumentExtractor>
{
    "sender": "argument_extractor",
    "content": {
        "current_step": "step_2",
        "tool_name": "foldseek",
        "analysis": "The user request mentions the protein 6Z4G, which is the target protein structure. The 'foldseek' tool requires the PDB file path as an argument to extract the foldseek sequence. The PDB file for the protein 6Z4G has already been retrieved and saved at /home/public/ProtAgent/examples/6Z4G.pdb. Therefore, the path to this PDB file should be provided as an argument for the 'foldseek' tool. However, there is a persistent permission issue when running the 'foldseek.sh' script. Despite this, the required argument is still present, so it can be extracted.",
        "tool_arg": {
            "pdb_path": "/home/public/ProtAgent/examples/6Z4G.pdb"
        }
    }
}


# Tool executor

In [60]:
from agent.agent.sujin_multi_agent_api.tool_executor import ToolExecutorAPI
from agent.utils.constants import AGENT_STATUS, AgentResponse

tool_executor = ToolExecutorAPI(model_name="qwen-turbo", tool_manager=tool_manager)

In [76]:
plan = {
"user_request":"Calculate TMscore for two uploaded PDB files: example_1.pdb and example_2.pdb.",
"analysis":"The user wants to calculate the structural similarity between two protein structures using TMscore. The required inputs are the paths to the two PDB files.",
"step_1":{
"tool":"tmalign",
"reason":"TMAlign is the appropriate tool to calculate the structural similarity between two protein structures using TMscore."
},
"step_2":{
"tool":"chat",
"reason":"After calculating the TMscore, the result will be communicated to the user."
}
}

lm = tool_executor.stream_chat(message_pool[9]["content"], "step_1", message_pool[:10])
for response in lm:
    from IPython import display
    display.clear_output(wait=True)
    
    print(response.content, end="")

{
    "sender": "tool_executor",
    "content": {
        "current_step": "step_1",
        "tool_name": "seq2fasta",
        "analysis": "Based on the user request and the analysis provided, the required argument 'protein_sequence' can be extracted directly from the user request.",
        "tool_arg": {
            "protein_sequence": "AAAAAAAAAAAAAAAAAAAAAAAA"
        },
        "running_log": "Running log: \nNo log file\n\nResults: \n{'error': \"module 'posixpath' has no attribute 'makedirs'\"}",
        "results": {
            "error": "module 'posixpath' has no attribute 'makedirs'"
        },
        "status": "error"
    }
}

In [66]:
print(response.content)

{
    "sender": "tool_executor",
    "content": {
        "current_step": "step_1",
        "tool_name": "chat",
        "analysis": "Based on the user request and the analysis provided, the required argument 'protein_sequence' can be extracted directly from the user request.",
        "tool_arg": {},
        "status": "success"
    }
}


# Responder

In [791]:
from agent.agent.sujin_multi_agent_api.responder import ResponderAPI
from agent.utils.constants import AGENT_STATUS, AgentResponse

api_key = "896b80950544431e8c583e03781bd14a.v2sd48fGT009KSo6"
responder = ResponderAPI(api_key, model="glm-4-flash")

In [795]:
query = "Hello?"
lm = responder.stream_chat(query, message_pool)
st = 0
for response in lm:
    content = response.content
    print(content[st:], end="")
    st = len(content)

<Responder>
{
    "sender": "Planner",
    "content": "Hello there! How can I assist you today?"
}
</Responder>

# Plan-execution loop

In [1343]:
query = """\
Please design a novel amino acid sequence that maintains the structure of example_1.pdb except for segment A:50, then create a binder for this modified sequence, and finally, design an antibody that targets the binder.
"""

message_pool = []
task_complete = False
# Maximum number of turns for plan generation
max_plan_turn = 2
# Maximum number of turns for each step
max_step_turn = 2 

for _ in range(max_plan_turn):
    # Assume the task is complete. If any step is not complete, set it to False later.
    task_complete = True
    
    lm = new_planner.stream_chat(query, message_pool)
    st = 0
    for response in lm:
        if response.status == AGENT_STATUS.GENERATING:
            content = response.content
            print(content[st:], end="")
            st = len(content)
    
    plan = json.loads(response.content)
    message_pool.append(plan)
    print()
    
    # Plan evaluation
    # lm = plan_evaluator.stream_chat(plan)
    # st = 0
    # for response in lm:
    #     if response.status == AGENT_STATUS.GENERATING:
    #         content = response.content
    #         print(content[st:], end="")
    #         st = len(content)
    # 
    # break
    
    # Execute the plan
    for i in range(len(plan["content"]) - 2):
        # task_complete is False means the execution fa
        if not task_complete:
            break
        
        task_id = f"step_{i + 1}"
        # Break the loop when the next tool is a chat tool
        if plan["content"][task_id]["tool"] == "chat":
            break
        
        # Extract arguments and execute the tool
        for _ in range(max_step_turn):
            lm = tool_executor.stream_chat(plan["content"], task_id, message_pool)
            st = 0
            for response in lm:
                if response.status == AGENT_STATUS.GENERATING:
                    content = response.content
                    print(content[st:], end="")
                    st = len(content)
            
            exec_dict = json.loads(response.content)
            message_pool.append(exec_dict)
            print()
            
            if "error" in exec_dict["content"]:
                # When error occurs during the argument extraction process, the plan is not valid
                step_complete = False
                break
            
            # Tool execution
            elif exec_dict["content"]["status"] == "success":
                step_complete = True
                break
            
            else:
                # Execution failed. Continue to the next turn
                step_complete = False
        
        # Go back and revise the plan
        if not step_complete:
            task_complete = False
            break
    
    # Break the loop when the task is complete
    if task_complete:
        break

print()
lm = responder.stream_chat(query, message_pool)
st = 0
for response in lm:
    if response.status == AGENT_STATUS.GENERATING:
        content = response.content
        print(content[st:], end="")
        st = len(content)

final_response = json.loads(response.content)
message_pool.append(final_response)

# raise 
# feedback = json.loads(response.content)
# 
# lm = new_planner.stream_chat(query, plan, feedback)
# st = 0
# for response in lm:
#     if response.status == AGENT_STATUS.GENERATING:
#         content = response.content
#         print(content[st:], end="")
#         st = len(content)

<Planner>
{
    "sender": "planner",
    "content": {
        "user_request": "Please design a novel amino acid sequence that maintains the structure of example_1.pdb except for segment A:50, then create a binder for this modified sequence, and finally, design an antibody that targets the binder.",
        "analysis": "The user wants to modify a specific segment of a given protein structure, design a binder for the modified sequence, and then design an antibody targeting the binder. We will need to use several tools to achieve this step-by-step.",
        "step_1": {
            "tool": "rfdiffusion_partial_diffusion",
            "reason": "This tool is used to design a novel amino acid sequence that maintains the structure of the given protein except for a specific segment."
        },
        "step_2": {
            "tool": "rfdiffusion_binder_design",
            "reason": "After modifying the protein structure, we need to design a binder for the modified sequence. This tool is sui

# Multi-agent backbone

In [22]:
from agent.agent.sujin_multi_agent_backbone import MultiAgentBackbone


backbone = MultiAgentBackbone(model_name="qwen-turbo", tool_manager=tool_manager)

In [54]:
tool_manager.set_out_dir("/sujin/PycharmProjects/ProtAgent/output/sujin")
# request = """\
# I have a sequence, predict its structure for me
# """

request = "scaffold the residues index 10-30 of pdb id 1xiw chain A. Also give me the aa sequence and 3D structure of this scaffold."

for response in backbone.stream_chat(request):
    from IPython import display
    display.clear_output(wait=True)
    
    for msg in json.loads(response.content):
        if isinstance(msg, str):
            print(msg)
        
        else:
            print(json.dumps(msg, indent=4))

{
    "sender": "planner",
    "content": {
        "user_request": "scaffold the residues index 10-30 of pdb id 1xiw chain A. Also give me the aa sequence and 3D structure of this scaffold.",
        "analysis": "The user wants to scaffold residues 10-30 of chain A from the PDB ID 1xiw. They also require the amino acid sequence and 3D structure of the scaffold. Since the PDB ID is provided, we can directly fetch the structure and sequence from the PDB database.",
        "step_1": {
            "tool": "get_pdb",
            "reason": "Fetch the PDB file for the given PDB ID 1xiw."
        },
        "step_2": {
            "tool": "pdb2aaseq",
            "reason": "Extract the amino acid sequence from the fetched PDB file."
        },
        "step_3": {
            "tool": "rfdiffusion_motif_scaffolding",
            "reason": "Use the RFDiffusion Motif Scaffolding tool to scaffold the residues 10-30 of chain A from the PDB structure."
        },
        "step_4": {
            "to

In [329]:
s = """\
{
    "sender": "planner",
    "content": {
        "user_request": "Based on the given protein sequence MSATAEQNARNPKGKGGFARTVSQRKRKRLFLIGGALAVLAVAVGLMLTAFNQDIRFFRTPADLTEQDMTSGARFRLGGLVEEGSVSRTGSELRFTVTDTIKTVKVVFEGIPPDLFREGQGVVAEGRFGSDGLFRADNVLAKHDENYVPKDLADSLKKKGVWEGK, design a novel protein sequence that has similar characteristics but with improved stability.",
        "analysis": "The user wants to design a novel protein sequence based on the given sequence. The goal is to improve the stability of the protein. The user has provided the protein sequence, so we can use it directly. We need to use a protein design tool to generate the novel sequence with improved stability.",
        "step_1": {
                "tool": "proteinmpnn",
                "reason": "proteinmpnn is a tool that aims to design proteins with specific structures and functions. It can alter or design amino acid sequences to obtain novel proteins with desired properties, such as improved stability."
            },
        "step_2": {
                "tool": "chat",
                "reason": "After designing the novel protein sequence, we need to communicate the result to the user."
            }
        }
    }
}
"""
json.loads(s)

JSONDecodeError: Extra data: line 16 column 1 (char 1203)

In [1209]:
log_path = "/sujin/temp/log.txt"
with open(log_path, "w") as w:
    for msg_dict in message_pool:
        w.write(json.dumps(msg_dict, indent=4) + "\n")

In [977]:
arg_dict = {
    "current_step": "step_1",
    "tool_name": "tmalign",
    "analysis": "The context mentions that the user wants to calculate the TM-score between 'asa.pdb' and 'b.pdb'. These are the two protein structures that need to be aligned using the TMAlign tool. Therefore, the required arguments for the tool are the paths to these PDB files.",
    "tool_arg": {
        "structure_path_1": "asddsa.pdb",
        "structure_path_2": "b.pdb"
    }
}
tool_name = arg_dict["tool_name"]
tool_arg = arg_dict["tool_arg"]
lm = tool_executor.stream_chat(tool_name, tool_arg)
st = 0
for response in lm:
    content = response.content
    print(content[st:], end="")
    st = len(content)

Running log: 
Total CPU time is  0.00 seconds


Results: 
{'error': 'max() arg is an empty sequence'}

In [971]:
tool = tool_manager.get_tool("tmalign")
res = tool.run(
    structure_path_1="example_1.pdb",
    structure_path_2="example_2.pdb"
)
print(res)

{'error': 'max() arg is an empty sequence', 'tmscore': 1.0, 'duration': 0.044183}


# Response evaluator

In [4]:
from agent.agent.sujin_multi_agent_api.evaluator import ResponseEvaluatorAPI
from agent.utils.constants import AGENT_STATUS, AgentResponse

response_evaluator = ResponseEvaluatorAPI(model_name="qwen-turbo")

In [5]:
query = """\
Using the protein with UniProt ID P05067, I want to first find its Pfam entry, then convert a text description of the protein to structural information, fetch its actual 3D structure from UniProt, and finally align this structure with the one in the file 'example_2.pdb' to obtain the TM-score. Could you perform these steps for me?
"""
response = "The protein with UniProt ID P05067 has the following Pfam entries: Kunitz/Bovine pancreatic trypsin inhibitor domain (PF00014), Amyloid A4 N-terminal heparin-binding (PF02177), Beta-amyloid peptide (beta-APP) (PF03494), Beta-amyloid precursor protein C-terminus (PF10515), Copper-binding of amyloid precursor, CuBD (PF12924), and E2 domain of amyloid precursor protein (PF12925). Using its amino acid sequence, we generated structural information via Protrek. The actual 3D structure was fetched from UniProt and aligned with the structure in 'example_2.pdb' using TM-align, resulting in a TM-score of 0.33552."

lm = response_evaluator.stream_chat(query, response)
st = 0
for response in lm:
    content = response.content
    print(content[st:], end="")
    st = len(content)

<Evaluator>
{
    "sender": "evaluator",
    "content": {
        "analysis": "The user's request involves several specific tasks related to the protein with UniProt ID P05067: finding its Pfam entries, converting a text description to structural information, fetching its 3D structure from UniProt, and aligning it with the structure in 'example_2.pdb' to compute the TM-score. The response provides the Pfam entries for the protein, mentions that structural information was generated via Protrek, states that the actual 3D structure was fetched from UniProt, and reports a TM-score of 0.33552 after alignment with 'example_2.pdb'. This output directly addresses each part of the user's request and provides the requested information in a clear manner. Therefore, the response meets the user's expectations.",
        "decision": "Yes"
    }
}
</Evaluator>

# Chat namer

In [7]:
from agent.agent.sujin_multi_agent_api.chat_namer import ChatNamerAPI
from agent.utils.constants import AGENT_STATUS, AgentResponse

chat_namer = ChatNamerAPI(model_name="glm-4-flash")

In [9]:
query = """\
I have uploaded the following files:\n\"example_1.pdb\"\n\"esmfold_predicted_structure.pdb\"\ncompute tmscore
"""
response = """\
I'm sorry, but I couldn't find specific information about 'ltenjoy'. It might be a less common term or a typo. If you can provide more context, I might be able to help further.
"""

lm = chat_namer.stream_chat(query, response)
st = 0
for response in lm:
    content = response.content
    print(content[st:], end="")
    st = len(content)

Retrying...


KeyboardInterrupt: 

In [49]:
json_path = "/sujin/PycharmProjects/ProtAgent/output/sujin/chat_history/15.json"
with open(json_path, "r") as f:
    chat_history = json.load(f)

message_pool = json.loads(chat_history["messages"][-1]["content"])
for msg in message_pool:
    if isinstance(msg, str):
        print(msg)
    
    else:
        print(json.dumps(msg, indent=4))

{
    "sender": "planner",
    "content": {
        "user_request": "predict the structure of AAAAAAAAAAAAAAAAAAAAAAA",
        "analysis": "The user wants to predict the structure of the given protein sequence. There is no previous chat history provided. The sequence is short and lacks evolutionary information, so tools like AlphaFold2 or ESMFold would be suitable for predicting the structure.",
        "step_1": {
            "tool": "esmfold",
            "reason": "ESMFold is suitable for predicting the structure of proteins without homologous sequence information, which is ideal for this short sequence."
        },
        "step_2": {
            "tool": "chat",
            "reason": "After obtaining the predicted structure, inform the user about the result."
        }
    }
}
{
    "sender": "plan_optimizer",
    "content": {
        "user_request": "predict the structure of AAAAAAAAAAAAAAAAAAAAAAA",
        "analysis": "The user wants to predict the structure of the given protei

In [57]:
message_pool[:10]

[{'sender': 'planner',
  'content': {'user_request': 'predict the structure of AAAAAAAAAAAAAAAAAAAAAAA',
   'analysis': 'The user wants to predict the structure of the given protein sequence. There is no previous chat history provided. The sequence is short and lacks evolutionary information, so tools like AlphaFold2 or ESMFold would be suitable for predicting the structure.',
   'step_1': {'tool': 'esmfold',
    'reason': 'ESMFold is suitable for predicting the structure of proteins without homologous sequence information, which is ideal for this short sequence.'},
   'step_2': {'tool': 'chat',
    'reason': 'After obtaining the predicted structure, inform the user about the result.'}}},
 {'sender': 'plan_optimizer',
  'content': {'user_request': 'predict the structure of AAAAAAAAAAAAAAAAAAAAAAA',
   'analysis': 'The user wants to predict the structure of the given protein sequence. There is no previous chat history provided. The sequence is short and lacks evolutionary information, s