In [None]:
# !pip install langchain openai
# !pip install python-arango
# !pip install langchain-community
# !pip install langchain-openai
# !pip install --upgrade langchain langchain-community langchain-openai langgraph
# !pip install langgraph
# !pip install biomart

# !pip install DeepPurpose 
# !pip install torch torchvision torchaudio

# !pip install git+https://github.com/bp-kelley/descriptastorus
# !pip install pandas-flavor

In [1]:
import os
import sys
import requests
import ast
import json
import hashlib
from datetime import datetime
from glob import glob
from io import StringIO

import pandas as pd
import numpy as np

from dotenv import load_dotenv
from arango import ArangoClient
from biomart import BiomartServer

from transformers import AutoTokenizer, AutoModel
import torch

from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import MemorySaver
from langchain_openai import ChatOpenAI
from langchain_community.graphs import ArangoGraph
from langchain_community.chains.graph_qa.arangodb import ArangoGraphQAChain
from langchain_core.tools import tool

from DeepPurpose import utils
from DeepPurpose import DTI as models

from rdkit import Chem, DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem import Draw, AllChem

from Bio.PDB import MMCIFParser

import faiss

In [2]:
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

In [3]:
db = ArangoClient(hosts="http://localhost:8529").db('NeuThera', username='root', password='openSesame')
arango_graph = ArangoGraph(db)

drug_collection = db.collection('drug')
link_collection = db.collection('drug-protein') 

In [39]:
cursor = db.aql.execute("FOR doc IN drug RETURN {key: doc._key, embedding: doc.embedding}")

drug_keys = []
embeddings = []

for doc in cursor:
    if doc and "embedding" in doc and "key" in doc:
        drug_keys.append(doc["key"])
        embeddings.append(doc["embedding"])

embeddings = np.array(embeddings, dtype=np.float32)

print("Embeddings shape:", embeddings.shape)
print("Number of compounds:", len(drug_keys))

Embeddings shape: (9010, 768)
Number of compounds: 9010


## Tooling

In [5]:
@tool
def text_to_aql(query: str):
    """Execute a Natural Language Query in ArangoDB, and return the result as text."""
    
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o")

    chain = ArangoGraphQAChain.from_llm(
        llm=llm,
        graph=arango_graph,  # Assuming arango_graph is already initialized
        verbose=True,
        allow_dangerous_requests=True
    )
    
    result = chain.invoke(query)

    return str(result["result"])

In [6]:
@tool
def predict_binding_affinity(X_drug, X_target, y=[7.635]):
    """
    Predicts the binding affinity for given drug and target sequences.

    Parameters:
    X_drug (list): List containing the SMILES representation of the drug.
    X_target (list): List containing the amino acid sequence of the protein target.

    Returns:
    float: Predicted binding affinity (log(Kd) or log(Ki)).
    """

    print("Predicting binding affinity: ", X_drug, X_target)
    
    model = models.model_pretrained(path_dir='DTI_model')

    X_pred = utils.data_process(X_drug, X_target, y,
                                drug_encoding='CNN', 
                                target_encoding='CNN', 
                                split_method='no_split')
   
    predictions = model.predict(X_pred)

    return predictions[0]


In [7]:
@tool
def get_amino_acid_sequence_from_pdb(pdb_id):    
    """
    Extracts amino acid sequences from a given PDB structure file in CIF format.

    Args:
        pdb_id (str): pdb id of the protein.

    Returns:
        dict: A dictionary where keys are chain IDs and values are amino acid sequences.
    """

    print("Getting Amino Acid sequence for ", pdb_id)

    cif_file_path = f"./database/PDBlib/{pdb_id.lower()}.cif"

    parser = MMCIFParser(QUIET=True)
    structure = parser.get_structure("protein", cif_file_path)
    
    sequences = {}
    for model in structure:
        for chain in model:
            seq = "".join(residue.resname for residue in chain if residue.id[0] == " ")
            sequences[chain.id] = seq 
            
    return sequences

In [8]:
sys.path.append(os.path.abspath("./TamGen"))

In [None]:
# Helper Functions for TamGen

tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
model = AutoModel.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")

def get_chemberta_embedding(smiles):
    """
    Generate a ChemBERTa vector embedding for a given molecule represented as a SMILES string.

    Args:
        smiles (str): A valid SMILES representation of a molecule.

    Returns:
        List[float] or None: A 768-dimensional vector as a list of floats if successful, 
                             otherwise None if the input is invalid.
    """
    
    print("Getting vector embedding")

    if not isinstance(smiles, str) or not smiles.strip():
        return None 

    inputs = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).tolist()[0]

def generate_key(smiles):
    """Generate a unique _key for the compound using SMILES hash."""
    hash_value = hashlib.sha256(smiles.encode()).hexdigest()[:8]
    return f"GEN:{hash_value}"

In [10]:
from TamGen_custom import TamGenCustom

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

worker = TamGenCustom(
    data="./TamGen_Demo_Data",
    ckpt="checkpoints/crossdock_pdb_A10/checkpoint_best.pt",
    use_conditional=True
)

@tool
def prepare_pdb_data(pdb_id):
    """
    Checks if the PDB data for the given PDB ID is available.  
    If not, downloads and processes the data.

    ALWAYS RUN THIS FUNCTION BEFORE WORKING WITH PDB

    Args:
        pdb_id (str): PDB ID of the target structure.

    """

    DemoDataFolder="TamGen_Demo_Data"
    ligand_inchi=None
    thr=10

    out_split = pdb_id.lower()
    FF = glob(f"{DemoDataFolder}/*")
    for ff in FF:
        if f"gen_{out_split}" in ff:
            print(f"{pdb_id} is downloaded")
            return
    
    os.makedirs(DemoDataFolder, exist_ok=True)
    
    with open("tmp_pdb.csv", "w") as fw:
        if ligand_inchi is None:
            print("pdb_id", file=fw)
            print(f"{pdb_id}", file=fw)
        else:
            print("pdb_id,ligand_inchi", file=fw)
            print(f"{pdb_id},{ligand_inchi}", file=fw)

    script_path = os.path.abspath("TamGen/scripts/build_data/prepare_pdb_ids.py")
    os.system(f"python {script_path} tmp_pdb.csv gen_{out_split} -o {DemoDataFolder} -t {thr}")
    os.remove("tmp_pdb.csv")

@tool
def generate_compounds(pdb_id, num_samples=10, max_seed=30):
    """
    Generates and sorts compounds based on similarity to a reference molecule, 
    all generated compounds are added back to the database for futher inference.

    Parameters:
    - pdb_id (str): The PDB ID of the target protein.
    - num_samples (int): Number of compounds to generate. (DEFAULT=500)
    - max_seed (int): Maximum seed variations. (DEFAULT=30)

    Returns:
    - dict: {
        'generated': [list of rdkit Mol objects],
        'reference': rdkit Mol object,
        'reference_smile': SMILE string of the reference compound
        'generated_smiles': [list of SMILES strings, sorted by similarity to reference]
      }
    """

    print("Generating Compounds for PDB ", pdb_id)
    try:
        # Ensure the required PDB data is prepared
        # prepare_pdb_data(pdb_id)

        worker.reload_data(subset=f"gen_{pdb_id.lower()}")

        print(f"Generating {num_samples} compounds...")
        generated_mols, reference_mol = worker.sample(
            m_sample=num_samples, 
            maxseed=max_seed
        )

        if reference_mol:
            # Ensure reference_mol is an RDKit Mol object
            if isinstance(reference_mol, str):
                reference_mol = Chem.MolFromSmiles(reference_mol)

            fp_ref = MACCSkeys.GenMACCSKeys(reference_mol)

            gens = []
            for mol in generated_mols:
                if isinstance(mol, str):  # Convert string SMILES to Mol
                    mol = Chem.MolFromSmiles(mol)
                if mol:  # Ensure conversion was successful
                    fp = MACCSkeys.GenMACCSKeys(mol)
                    similarity = DataStructs.FingerprintSimilarity(fp_ref, fp, metric=DataStructs.TanimotoSimilarity)
                    gens.append((mol, similarity))

            sorted_mols = [mol for mol, _ in sorted(gens, key=lambda e: e[1], reverse=True)]
        
        else:
            sorted_mols = generated_mols

        generated_smiles = [Chem.MolToSmiles(mol) for mol in sorted_mols if mol]

        reference_smile = Chem.MolToSmiles(reference_mol)
        
        print("Inserting to ArangoDB...")
        for smiles in generated_smiles:
            _key = generate_key(smiles) 
            drug_id = f"drug/{_key}"
            protein_id = f"protein/{pdb_id}"

            if drug_collection.has(_key):
                continue

            embedding = get_chemberta_embedding(smiles)
            doc = {
                "_key": _key,
                "_id": drug_id, 
                "accession": "NaN",
                "drug_name": "NaN",
                "cas": "NaN",
                "unii": "NaN",
                "synonym": "NaN",
                "key": "NaN",
                "chembl": "NaN",
                "smiles": smiles,
                "inchi": "NaN",
                "generated": True,
                "embedding": embedding
            }
            drug_collection.insert(doc)

            existing_links = list(db.aql.execute(f'''
                FOR link IN `drug-protein` 
                FILTER link._from == "{drug_id}" AND link._to == "{protein_id}" 
                RETURN link
            '''))

            if not existing_links:
                link_doc = {
                    "_from": drug_id,
                    "_to": protein_id,
                    "generated": True
                }
                link_collection.insert(link_doc)

        return {
            "generated": sorted_mols,
            "reference": reference_mol,
            "reference_smile": reference_smile,
            "generated_smiles": generated_smiles
        }

    except Exception as e:
        print(f"Error in compound generation: {str(e)}")
        return {"error": str(e)}

Namespace(no_progress_bar=False, log_interval=1000, log_format=None, tensorboard_logdir='', tbmf_wrapper=False, seed=1, cpu=False, fp16=False, memory_efficient_fp16=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, min_loss_scale=0.0001, threshold_loss_scale=None, user_dir=None, criterion='cross_entropy', tokenizer=None, bpe=None, optimizer='nag', lr_scheduler='fixed', task='translation_coord', num_workers=1, skip_invalid_size_inputs_valid_test=False, max_tokens=1024, max_sentences=None, required_batch_size_multiple=8, dataset_impl=None, gen_subset='gen_8fln', num_shards=1, shard_id=0, path='checkpoints/crossdock_pdb_A10/checkpoint_best.pt', remove_bpe=None, quiet=False, model_overrides='{}', results_path=None, beam=20, nbest=20, max_len_a=0, max_len_b=200, min_len=1, match_source_len=False, no_early_stop=False, unnormalized=False, no_beamable_mm=False, lenpen=1, unkpen=0, replace_unk=None, sacrebleu=False, score_reference=False, prefix_size=0, prefix_string

In [11]:
@tool
def generate_report(columns, rows):
    """
    Generate a report in CSV format with a timestamped filename. This function uses pandas to create a CSV.
    
    Parameters:
    columns (list): List of column names.
    rows (list of lists): Data rows corresponding to the columns.
    
    Returns:
    str: Path of the generated CSV report.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"report_{timestamp}.csv"
    
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(filename, index=False)
    
    return filename

In [None]:
@tool
def find_similar_drugs(smile, top_k=5):
    """
    Finds the top K most similar drugs based on given smile of a query molecule. Automatically gets vector embeddings.

    Args:
        smile (string): Smile of the query molecule.
        top_k (int, optional): Number of most similar drugs to retrieve. Default is 5.

    Returns:
        List[Dict{str, [float]}]: A list of (drug_name, similarity_score) sorted by similarity.
    """
    
    print("Finding similar drugs...")

    embedding = get_chemberta_embedding(smile)
    
    aql_query = f"""
    LET query_vector = @query_vector
    FOR doc IN drug
        LET score = COSINE_SIMILARITY(doc.embedding, query_vector)
        SORT score DESC
        LIMIT @top_k
        RETURN {{ drug: doc._key, similarity_score: score }}
    """
    
    cursor = db.aql.execute(aql_query, bind_vars={"query_vector": embedding, "top_k": top_k})
    
    return list(cursor)

### Agentic RAG

In [77]:
tools = [ text_to_aql, get_amino_acid_sequence_from_pdb, prepare_pdb_data, generate_compounds, predict_binding_affinity, generate_report, get_chemberta_embedding, find_similar_drugs ]

def query_graph(query):
    
    query_template = f"""
    USER INPUT: {query}

    You are an advanced drug discovery assistant with multiple tools.

    - Use your tools as needed to assist in end-to-end drug discovery and answer user queries.
    - Always structure your output as valid JSON string so it can be parsed in python.
    - If possible, always try to generate reports for whatever output you get. Don't generate reports for errors.
    - Do not add explanations or any extra text.
    - When working with multiple outputs, run functions one by one for everything unless stated otherwise by the user.

    TO NOTE:
    - DO NOT USE get_chemberta_embedding TOOL AS ALL GENERATED COMPOUNDS ALREADY HAVE THEIR EMBEDDINGS IN THE DATABASE,
        USE AQL INSTEAD TO FETCH
    """
    
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o")
    app = create_react_agent(llm, tools)    
    final_state = app.invoke({"messages": [{"role": "user", "content": query_template}]})
    return final_state

In [57]:
    # query_template = f"""
    # user input: {query}

    # You are an advanced drug discovery assistant with multiple tools.

    # - Use your tools as needed to assist in end-to-end drug discovery and answer user queries.
    # - Always structure your output as valid JSON string so it can be parsed in python.
    # - If possible, always try to generate reports for whatever output you get. Don't generate reports for errors
    # - Do not add explanations or any extra text.

    # !WARNING!
    # YOU RUN FOR DEMO PURPOSES ONLY

    # - DO NOT RUN TOOLS ESPECIALLY generate_compounds AND predict_binding_affinity SIMULTANEOUly
    # - IF MULTIPLE OUTPUTS ARE FOUND FOR PROCESSING, JUST PICK THE FIRST ONE AND RUN THE PROCEEDING FUNCTIONS WITH THAT.
    # """

### Example Queries + Results

In [None]:
output = query_graph("What proteins can you find me related to mitochondrial ribosomal protein L36? Generate some compounds and test their binding affinity")

output



[1m> Entering new ArangoGraphQAChain chain...[0m
AQL Query (1):[32;1m[1;3m
WITH gene, protein, gene-protein
FOR gene_doc IN gene
  FILTER gene_doc.gene_name == "mitochondrial ribosomal protein L36"
  FOR edge IN gene-protein
    FILTER edge._from == gene_doc._id
    FOR protein_doc IN protein
      FILTER protein_doc._id == edge._to
      RETURN protein_doc
[0m
AQL Query Execution Error: 
[33;1m[1;3msyntax error, unexpected - operator near '-protein
FOR gene_doc IN gene
  ...' at position 2:25[0m

AQL Query (2):[32;1m[1;3m
WITH gene, protein, `gene-protein`
FOR gene_doc IN gene
  FILTER gene_doc.gene_name == "mitochondrial ribosomal protein L36"
  FOR edge IN `gene-protein`
    FILTER edge._from == gene_doc._id
    FOR protein_doc IN protein
      FILTER protein_doc._id == edge._to
      RETURN protein_doc
[0m
AQL Result:
[32;1m[1;3m[{'_key': '3j7y', '_id': 'protein/3j7y', '_rev': '_jUzh2ea-_B'}, {'_key': '3j9m', '_id': 'protein/3j9m', '_rev': '_jUzh2ea-_C'}, {'_key': '5

  0%|          | 0/1 [00:00<?, ?it/s]

Processing 6zm6


100%|██████████| 1/1 [00:13<00:00, 13.18s/it]


Namespace(no_progress_bar=False, log_interval=1000, log_format=None, tensorboard_logdir='', tbmf_wrapper=False, seed=1, cpu=False, fp16=False, memory_efficient_fp16=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, min_loss_scale=0.0001, threshold_loss_scale=None, user_dir=None, criterion='cross_entropy', tokenizer=None, bpe=None, optimizer='nag', lr_scheduler='fixed', task='translation', source_lang='tg', target_lang='m1', trainpref=None, validpref=None, testpref='TamGen_Demo_Data/src/gen_3j7y', destdir='TamGen_Demo_Data/tmp', thresholdtgt=0, thresholdsrc=0, tgtdict='/Users/redomic/Documents/Projects/Hackathons/GDG/GDG-25/TamGen/dict/dict.m1.txt', srcdict='/Users/redomic/Documents/Projects/Hackathons/GDG/GDG-25/TamGen/dict/dict.tg.txt', nwordstgt=-1, nwordssrc=-1, alignfile=None, dataset_impl='mmap', joined_dictionary=False, only_source=False, padding_factor=8, workers=4)
| [tg] Dictionary: 24 types
| [tg] TamGen_Demo_Data/src/gen_3j7y.tg: 2 sents, 152 toke

Traceback (most recent call last):
  File "/Users/redomic/Documents/Projects/Hackathons/GDG/GDG-25/TamGen/scripts/build_data/prepare_pdb_ids.py", line 68, in <module>
    main()
  File "/Users/redomic/Documents/Projects/Hackathons/GDG/GDG-25/TamGen/scripts/build_data/prepare_pdb_ids.py", line 47, in main
    with csv_reader(args.pdb_id_list, dict_reader=True) as reader:
  File "/opt/anaconda3/envs/TamGen/lib/python3.12/contextlib.py", line 137, in __enter__
    return next(self.gen)
           ^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/TamGen/lib/python3.12/site-packages/fy_common_ext/io/io_wrappers.py", line 49, in csv_reader
    with open(filename, 'r', encoding='utf-8') as f:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: 'tmp_pdb.csv'
Traceback (most recent call last):
  File "/Users/redomic/Documents/Projects/Hackathons/GDG/GDG-25/TamGen/scripts/build_data/prepare_pdb_ids.py", line 68, in <module>
    main()
  File "/Users/re

Generating Compounds for PDB Generating Compounds for PDB  3j9m
 3j7y
Generating Compounds for PDB  6nu3
Generating Compounds for PDB  5ool
| loaded 2 examples from: ./TamGen_Demo_Data/gen_3j7y.tg-m1.tg
| loaded 1 examples from: ./TamGen_Demo_Data/gen_5ool.tg-m1.tg
| loaded 2 examples from: ./TamGen_Demo_Data/gen_3j9m.tg-m1.tg
| loaded 2 examples from: ./TamGen_Demo_Data/gen_6nu3.tg-m1.tg
| loaded 2 examples from: ./TamGen_Demo_Data/gen_3j7y.tg-m1.m1
| ./TamGen_Demo_Data gen_3j7y tg-m1 2 examples
| loaded 1 examples from: ./TamGen_Demo_Data/gen_5ool.tg-m1.m1
| ./TamGen_Demo_Data gen_5ool tg-m1 1 examples
| loaded 2 examples from: ./TamGen_Demo_Data/gen_3j9m.tg-m1.m1
| ./TamGen_Demo_Data gen_3j9m tg-m1 2 examples
| loaded 2 examples from: ./TamGen_Demo_Data/gen_6nu3.tg-m1.m1
| ./TamGen_Demo_Data gen_6nu3 tg-m1 2 examples
| loaded 2 examples from: ./TamGen_Demo_Data/gen_3j7y.tg-m1.tg.coord
Generating 10 compounds...
| loaded 1 examples from: ./TamGen_Demo_Data/gen_5ool.tg-m1.tg.coord
Gen

  0%|          | 0/30 [00:00<?, ?it/s]

| loaded 2 examples from: ./TamGen_Demo_Data/gen_6nu3.tg-m1.tg.coord
Generating 10 compounds...
| loaded 2 examples from: ./TamGen_Demo_Data/gen_3j9m.tg-m1.tg.coord
Generating 10 compounds...



[A

[A[Ahuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlo

Error in compound generation: index out of range in self


  torch.topk(
  torch.gather(
  torch.topk(
  torch.topk(
  3%|▎         | 1/30 [00:57<27:57, 57.83s/it]


Inserting to ArangoDB...
Getting vector embedding
Getting vector embedding
Getting vector embedding
Getting vector embedding
Getting vector embedding
Getting vector embedding
Getting vector embedding
Getting vector embedding
Getting vector embedding
Getting vector embedding
Getting vector embedding
Getting vector embedding
Getting vector embedding


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

  3%|▎         | 1/30 [01:00<29:19, 60.68s/it]


Inserting to ArangoDB...
Getting vector embedding


  7%|▋         | 2/30 [01:26<20:13, 43.34s/it]


Inserting to ArangoDB...
Getting vector embedding
Getting vector embedding
Getting vector embedding
Getting vector embedding
Getting Amino Acid sequence for  3j7y
Getting Amino Acid sequence for  5ool
Getting Amino Acid sequence for  6nu3




Predicting binding affinity: Predicting binding affinity:  ['Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O', 'Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)OP(N)(=O)O)[C@@H](O)[C@H]1O', 'Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)NP(=O)(O)O)[C@@H](O)[C@H]1O', 'Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)[C@H](F)P(=O)(O)O)[C@@H](O)[C@H]1O', 'Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)CP(=O)(O)O)[C@@H](O)[C@H]1O', 'Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)OP(O)(O)=S)[C@@H](O)[C@H]1O', 'Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)CP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O', 'Nc1ncnc2c1ncn2[C@@H]1O[C@H](CO[P@@](O)(=S)OP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O', 'N=c1nc(O)c2ncn([C@@H]3O[C@H](COP(=O)(O)OP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]3O)c2[nH]1', 'N=c1nc(O)c2ncn([C@@H]3O[C@H](COP(=O)(O)OP(=O)(O)NP(=O)(O)O)[C@@H](O)[C@H]3O)'] ['GCUAAACCUAGCCCCAAACCCCCACCUUACUACCAAACCUUAGCCAAACCAUUUACAUAAAGUAUAGGCGAUAGAAAUUGGGCGCAAUAGAUAUAGUACCGCAAGGGAAAGAUGAAAAAACCAAGCAUAAUAUAGC

{'messages': [HumanMessage(content="\n    USER INPUT: What proteins can you find me related to mitochondrial ribosomal protein L36? Generate some compounds and test their binding affinity\n\n    You are an advanced drug discovery assistant with multiple tools.\n\n    - Use your tools as needed to assist in end-to-end drug discovery and answer user queries.\n    - Always structure your output as valid JSON string so it can be parsed in python.\n    - If possible, always try to generate reports for whatever output you get. Don't generate reports for errors.\n    - Do not add explanations or any extra text.\n    - When working with multiple outputs, run functions one by one for everything unless stated otherwise by the user.\n    - When working with drugs, ignore embeddings.\n    ", additional_kwargs={}, response_metadata={}, id='10bec45d-deed-47fa-863c-e7d1e92e41b4'),
  AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_ITmgLhpxgFG2quBI8mstGVGE', 'function': {'arguments

In [67]:
message = json.loads(output["messages"][-1].content)
message

{'report_path': 'report_20250307_134844.csv'}

In [78]:
# output = query_graph("Given pdb 5ool, generate some compounds")
# output = query_graph("Find proteins related to disease Anaphylaxis, generate compounds and test their binding affinity")
output = query_graph("Take a random drug from the database and find top 10 most similar drugs to it")



[1m> Entering new ArangoGraphQAChain chain...[0m
AQL Query (1):[32;1m[1;3m
WITH drug
FOR drug IN drug
RETURN drug.smiles
LIMIT 1
[0m
AQL Query Execution Error: 
[33;1m[1;3msyntax error, unexpected LIMIT declaration, expecting end of query string near 'LIMIT 1
' at position 5:1[0m

AQL Query (2):[32;1m[1;3m
WITH drug
FOR drug IN drug
LIMIT 1
RETURN drug.smiles
[0m
AQL Result:
[32;1m[1;3m['CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CO)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@H](Cc1cnc[nH]1)NC(=O)[C@@H]1CCC(=O)N1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)NNC(N)=O'][0m

[1m> Finished chain.[0m
Finding similar drugs...
Getting vector embedding


In [79]:
output

{'messages': [HumanMessage(content="\n    USER INPUT: Take a random drug from the database and find top 10 most similar drugs to it\n\n    You are an advanced drug discovery assistant with multiple tools.\n\n    - Use your tools as needed to assist in end-to-end drug discovery and answer user queries.\n    - Always structure your output as valid JSON string so it can be parsed in python.\n    - If possible, always try to generate reports for whatever output you get. Don't generate reports for errors.\n    - Do not add explanations or any extra text.\n    - When working with multiple outputs, run functions one by one for everything unless stated otherwise by the user.\n\n    TO NOTE:\n    - DO NOT USE get_chemberta_embedding TOOL AS ALL GENERATED COMPOUNDS ALREADY HAVE THEIR EMBEDDINGS IN THE DATABASE,\n        USE AQL INSTEAD TO FETCH\n    ", additional_kwargs={}, response_metadata={}, id='50c7f0bd-5c13-4c5f-ae5e-c2a5a9236ed5'),
  AIMessage(content='', additional_kwargs={'tool_calls': 

In [80]:
message = json.loads(output["messages"][-1].content)
message

{'report_path': 'report_20250307_143953.csv'}