### In this tutorial, we give a few basic search queries that can be used to query the Elasticsearch database we prepared in Tutorial 2

In [1]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from typing import List

In [2]:
class SearchResult():
    """Represents a product (molecule document) returned from elasticsearch.

    Attributes:
        id: Document id (``from_doc`` reads it from ``doc.meta.id``).
        fragments: Value of the document's ``fragments`` field.
        smiles: Value of the document's ``smiles`` field.
    """
    def __init__(self, id_, image, smiles):
        """Store the id, fragments and SMILES of one search hit.

        Args:
            id_: Document id.
            image: The document's fragments. The parameter name is kept as-is
                so existing positional and keyword callers keep working.
            smiles: The document's SMILES string.
        """
        self.id = id_
        # The original read an undefined name `fragments` here, which raised
        # NameError on every construction; the value arrives as `image`.
        self.fragments = image
        self.smiles = smiles

    @staticmethod
    def from_doc(doc) -> 'SearchResult':
        """Build a SearchResult from a hit exposing ``meta.id``, ``fragments`` and ``smiles``.

        Declared as a static method because it takes no ``self``; without the
        decorator it only worked when called on the class, not on an instance.
        """
        return SearchResult(
            id_=doc.meta.id,
            image=doc.fragments,
            smiles=doc.smiles,
        )

### Instantiating and connecting to the ElasticSearch server and creating the DSL search object

In [6]:
# Index to query and the document type handed to the Search object.
INDEX_NAME = 'chembl_data'
DOC_TYPE = 'mol_frags'

# Client created with its default arguments.
es = Elasticsearch()

# Search object bound to the client; every query in this notebook is built from `s`.
s = Search(index=INDEX_NAME, doc_type=DOC_TYPE, using=es)

### The first type of queries are Term queries
### Term queries look for the EXACT term in a provided field. They can be used to find things like ChEMBL IDs, mol_regno values, or UniProt IDs, which are unique identifiers of molecules and proteins

In [7]:
## Sample "terms" query: select the molecules whose mol_regno equals any of the listed values
sample_query = {
    "terms": {
        "mol_regno": ["1065000", "1080000", "1075000", "1070000"],
    },
}

In [8]:
## Execute the search: attach sample_query to the Search object `s` and run it;
## the response is kept in `docs` and iterated over in the next cell
docs = s.query(sample_query).execute()

In [9]:
## Print, for every hit, its document id, its mol_regno field and its Elasticsearch score
for doc in docs:
    print(
        'CHEMBL_ID : ' + doc.meta['id'],
        'MOL_REGNO : {}'.format(doc.mol_regno),
        'Elasticsearch_Score : {}'.format(doc.meta['score']),
    )


CHEMBL_ID : CHEMBL1699269 MOL_REGNO : 1080000 Elasticsearch_Score : 1.0
CHEMBL_ID : CHEMBL1649610 MOL_REGNO : 1065000 Elasticsearch_Score : 1.0
CHEMBL_ID : CHEMBL1681819 MOL_REGNO : 1075000 Elasticsearch_Score : 1.0
CHEMBL_ID : CHEMBL1671868 MOL_REGNO : 1070000 Elasticsearch_Score : 1.0


#### The doc.meta object contains meta information for that document. Here we look at the score: it is equal to 1.0 for every hit, and we retrieve exactly the documents that match our mol_regno values

In [16]:
## In this query, we try to match molecules with similar smiles as the following query

sample_query = {'match':{'smiles':'C\\C(=C/c1cc(F)c(OCCC(F)F)cc1F)\\C(=O)N[C@@H]2[C@H](O)[C@@H](O)[C@H]3OCO[C@H]3[C@@H]2O'}}

In [17]:
## Run the smiles match query; `docs` now holds the hits for this query (it replaces the earlier results)
docs = s.query(sample_query).execute()

In [None]:
## To evaluate how similar the search results are to our initial sample query, we use rdkit's fingerprint similarity

In [18]:
import rdkit.Chem
from rdkit.Chem import MACCSkeys
from rdkit import DataStructs

In [19]:
# Same SMILES string as in the match query above, kept here so it can be parsed by RDKit
query_smiles = "C\\C(=C/c1cc(F)c(OCCC(F)F)cc1F)\\C(=O)N[C@@H]2[C@H](O)[C@@H](O)[C@H]3OCO[C@H]3[C@@H]2O"
# Parse the SMILES into a molecule and generate its MACCS-keys fingerprint (reference for the comparisons below)
query_fingerprint = MACCSkeys.GenMACCSKeys(rdkit.Chem.MolFromSmiles(query_smiles))

In [20]:
## Compare every hit with the query: MACCS fingerprint similarity next to the Elasticsearch score
for doc in docs:
    # Fingerprint of the hit, built from its stored SMILES.
    retrieved_fingerprint = MACCSkeys.GenMACCSKeys(rdkit.Chem.MolFromSmiles(doc.smiles))
    similarity = DataStructs.FingerprintSimilarity(query_fingerprint, retrieved_fingerprint)
    print(
        'CHEMBL_ID : ' + doc.meta['id'],
        'MACCS_similarity : {}'.format(round(similarity, 2)),
        'Elasticsearch_Score : {}'.format(doc.meta['score']),
    )

CHEMBL_ID : CHEMBL1644781 MACCS_similarity : 1.0 Elasticsearch_Score : 55.43736
CHEMBL_ID : CHEMBL1644782 MACCS_similarity : 0.89 Elasticsearch_Score : 53.282974
CHEMBL_ID : CHEMBL1644783 MACCS_similarity : 0.92 Elasticsearch_Score : 48.45689
CHEMBL_ID : CHEMBL1644784 MACCS_similarity : 0.9 Elasticsearch_Score : 39.24539
CHEMBL_ID : CHEMBL1644786 MACCS_similarity : 0.92 Elasticsearch_Score : 38.983902
CHEMBL_ID : CHEMBL1723900 MACCS_similarity : 0.54 Elasticsearch_Score : 37.94413
CHEMBL_ID : CHEMBL1644785 MACCS_similarity : 0.93 Elasticsearch_Score : 37.829983
CHEMBL_ID : CHEMBL1650941 MACCS_similarity : 0.38 Elasticsearch_Score : 36.028313
CHEMBL_ID : CHEMBL1673377 MACCS_similarity : 0.61 Elasticsearch_Score : 34.83581
CHEMBL_ID : CHEMBL1683011 MACCS_similarity : 0.4 Elasticsearch_Score : 34.223824


#### Here we see that the highest scoring query result from elasticsearch has a similarity of 1.0 (identical MACCS fingerprints). Similarly, the next 4 molecules have good similarities (>0.8)

In [21]:
## In this query, we try to match molecules whose fragments are similar to the following query fragment

sample_query = {
    'match': {
        'fragments': "[Xe]c1cc(F)c(F)cc1F",
    },
}

In [22]:
## Run the fragments match query; `docs` now holds the hits for this query
docs = s.query(sample_query).execute()

In [24]:
### Check, hit by hit, whether the exact query fragment string is among the document's fragments
for doc in docs:
    print(
        'Fragment Found! ' if "[Xe]c1cc(F)c(F)cc1F" in doc.fragments else 'Fragment Not Found! ',
        'CHEMBL_ID : ' + doc.meta['id'],
        'Elasticsearch_Score : {}'.format(doc.meta['score']),
    )

Fragment Found!  CHEMBL_ID : CHEMBL1689667 Elasticsearch_Score : 14.380017
Fragment Found!  CHEMBL_ID : CHEMBL1645418 Elasticsearch_Score : 14.260451
Fragment Not Found!  CHEMBL_ID : CHEMBL1729097 Elasticsearch_Score : 14.138317
Fragment Found!  CHEMBL_ID : CHEMBL1683979 Elasticsearch_Score : 13.887561
Fragment Found!  CHEMBL_ID : CHEMBL1683977 Elasticsearch_Score : 13.803261
Fragment Found!  CHEMBL_ID : CHEMBL1683978 Elasticsearch_Score : 13.748348
Fragment Found!  CHEMBL_ID : CHEMBL1683558 Elasticsearch_Score : 13.447982
Fragment Found!  CHEMBL_ID : CHEMBL1683559 Elasticsearch_Score : 13.370429
Fragment Found!  CHEMBL_ID : CHEMBL1681811 Elasticsearch_Score : 13.362166
Fragment Found!  CHEMBL_ID : CHEMBL1683986 Elasticsearch_Score : 13.321281


#### Here we see that only one document did not have the fragment query

In [25]:
### Let's look at how similar the fragments of the mismatched document are to the query fragment
query_smiles = "[Xe]c1cc(F)c(F)cc1F"
query_fingerprint = MACCSkeys.GenMACCSKeys(rdkit.Chem.MolFromSmiles(query_smiles))

# Pick the mismatched document by content instead of the hard-coded docs[2]:
# the position of a hit depends on Elasticsearch's ranking, and docs[2] raises
# IndexError when fewer than three hits come back.
mismatched_doc = next((doc for doc in docs if query_smiles not in doc.fragments), None)

# (similarity, fragment) pairs for each fragment of the mismatched document.
# Left empty when every hit contains the query fragment.
tuple_score_fingerprint = []
if mismatched_doc is not None:
    for frag in mismatched_doc.fragments:
        frag_fingerprint = MACCSkeys.GenMACCSKeys(rdkit.Chem.MolFromSmiles(frag))
        similarity = DataStructs.FingerprintSimilarity(query_fingerprint, frag_fingerprint)
        tuple_score_fingerprint.append((similarity, frag))

In [26]:
# Order the (similarity, fragment) pairs from most to least similar.
tuple_score_fingerprint.sort(reverse=True)

In [28]:
### Here we see that, even though the string match isn't exact, the MACCS fingerprint similarity shows
### that the top-ranked fragment of this document is very similar to our query fragment
tuple_score_fingerprint

[(1.0, '[Xe]c1ccc(F)cc1F'),
 (0.3333333333333333, '[Xe]C(F)(F)F'),
 (0.19047619047619047, '[Xe]c1cc([Xe])nc([Xe])n1'),
 (0.06666666666666667, '[Xe]SC'),
 (0.06666666666666667, '[Xe]O[Xe]')]

In [29]:
### Boolean query: hits MUST contain the fragment [Xe]c1ccc(F)cc1F and SHOULD (optionally) also contain [Xe]c1cc([Xe])nc([Xe])n1

sample_query = {
    "bool": {
        "must": [
            {"match": {"fragments": '[Xe]c1ccc(F)cc1F'}},
        ],
        "should": [
            {"match": {"fragments": "[Xe]c1cc([Xe])nc([Xe])n1"}},
        ],
    },
}
                

In [30]:
## Run the bool query; `docs` now holds the hits for this query
docs = s.query(sample_query).execute()

In [31]:
## Classify every hit by whether the exact MUST / SHOULD fragment strings are among its fragments
must_fragment = "[Xe]c1ccc(F)cc1F"
should_fragment = "[Xe]c1cc([Xe])nc([Xe])n1"

for doc in docs:
    has_must = must_fragment in doc.fragments
    has_should = should_fragment in doc.fragments
    if has_must and has_should:
        label = 'Exact Match Found! '
    elif has_must:
        # The original tested `must or should` here, so a hit containing only
        # the SHOULD fragment was wrongly reported as meeting the MUST criteria.
        label = 'MUST Criteria Met! '
    else:
        label = 'MUST Criteria Not Met! '
    print(label, 'CHEMBL_ID : ' + doc.meta['id'], 'Elasticsearch_Score : ' + str(doc.meta['score']))

Exact Match Found!  CHEMBL_ID : CHEMBL1729097 Elasticsearch_Score : 17.376602
MUST Criteria Met!  CHEMBL_ID : CHEMBL1722796 Elasticsearch_Score : 16.957664
MUST Criteria Not Met!  CHEMBL_ID : CHEMBL1716756 Elasticsearch_Score : 15.275121
MUST Criteria Not Met!  CHEMBL_ID : CHEMBL1726494 Elasticsearch_Score : 14.338987
MUST Criteria Met!  CHEMBL_ID : CHEMBL1688410 Elasticsearch_Score : 13.874198
MUST Criteria Not Met!  CHEMBL_ID : CHEMBL1703330 Elasticsearch_Score : 13.368966
MUST Criteria Not Met!  CHEMBL_ID : CHEMBL1725290 Elasticsearch_Score : 13.261469
MUST Criteria Met!  CHEMBL_ID : CHEMBL1688421 Elasticsearch_Score : 13.12858
MUST Criteria Met!  CHEMBL_ID : CHEMBL1688422 Elasticsearch_Score : 13.12858
MUST Criteria Met!  CHEMBL_ID : CHEMBL1688423 Elasticsearch_Score : 13.002883


#### You might be surprised that some molecules do not fulfil the matching criteria at all, for example documents 2, 3, 5 and 6

In [32]:
# Hit at position 2: in the recorded output, the text c1ccc(F)cc1F only occurs inside a longer fragment string
docs[2].fragments

['[Xe]NC(=NS(=O)(=O)c1ccc(F)cc1F)c1ccccc1', '[Xe]c1cc(C)on1']

In [154]:
# Hit at position 3: in the recorded output, neither query fragment is present; one fragment looks like the SHOULD fragment with F in place of [Xe]
docs[3].fragments

['[Xe]NC', '[Xe]c1cc(F)nc(F)n1']

In [155]:
# Hit at position 5: in the recorded output, the text c1ccc(F)cc1F only occurs inside a longer fragment string
docs[5].fragments

['[Xe]S(=O)(=O)c1ccc(F)cc1F', '[Xe]NC(=Nc1cccnc1)c1ccccc1']

In [156]:
# Hit at position 6: in the recorded output, the text c1ccc(F)cc1F only occurs inside a longer fragment string
docs[6].fragments

['[Xe]S(=O)(=O)c1ccc(C)cc1', '[Xe]NC(=Nc1ccc(F)cc1F)c1ccc(F)cc1']

#### In the next tutorial, we will talk about how special characters affect the way Elasticsearch indexes and retrieves text, and how we can control these settings to get better matches
#### Unlike regular text, the SMILES format of molecules does not conform to the normal rules of tokenization, so we have to go deeper and control how we index our molecules