# Similarities between molecules

The goal of this notebook is to use the fingerprints provided in `mol_bits.pkl` to find similar molecules.

The essential idea is to retrieve similar items from a database. We assume that these items are complex objects with a variable number of features.

In [29]:
import pickle
import matplotlib.pyplot as plt 
import numpy as np
from datasketch import MinHash, MinHashLSH
from nltk.metrics import jaccard_distance

In [2]:
# Load the precomputed fingerprints into MOL_BITS (used below as a mapping:
# it is iterated with .items() and indexed by key).
# NOTE(review): pickle.load can execute arbitrary code while deserialising,
# so only run this cell on a mol_bits.pkl obtained from a trusted source.
with open("../mol_bits.pkl", "rb") as f:
    MOL_BITS = pickle.load(f)

In [37]:
# Build a MinHash signature from an iterable of values
def create_minhash(values, num_perm=256):
    """Return a MinHash signature summarising ``values``.

    Parameters
    ----------
    values : iterable
        Items to hash. Each item is converted with ``str`` and UTF-8 encoded
        before being added to the signature.
    num_perm : int, optional
        Number of permutation functions used by the signature. Defaults to
        256, the value this notebook also uses for its MinHashLSH index; a
        signature can only be inserted into or queried against an index
        built with the same ``num_perm``.

    Returns
    -------
    MinHash
        The signature after every item of ``values`` has been added.
    """
    signature = MinHash(num_perm=num_perm)
    for value in values:
        signature.update(str(value).encode("utf8"))
    return signature

In [38]:
# Create the LSH index. num_perm has to equal the value used when building
# the MinHash signatures; threshold is the Jaccard similarity the index is
# tuned for.
lsh = MinHashLSH(
    num_perm=256,
    threshold=0.5,
)

In [39]:
# Index every entry of MOL_BITS: hash its values and store the signature
# under its key.
# Keys that are already in the index are skipped, so this cell can be re-run
# without MinHashLSH.insert raising ValueError for a duplicate key.
for key, values in MOL_BITS.items():
    if key in lsh:
        continue
    m = create_minhash(values)
    lsh.insert(key, m)

In [60]:
# Find the keys whose fingerprints are similar to the target's
def find_similar_keys(target_key, threshold=0.5):
    """Return the indexed keys whose Jaccard similarity to ``target_key`` exceeds ``threshold``.

    Candidates are fetched from the module-level ``lsh`` index and then
    re-scored with the exact Jaccard similarity of the fingerprint sets stored
    in ``MOL_BITS``, so approximate LSH hits below the threshold are dropped.

    Parameters
    ----------
    target_key : hashable
        Key of the query entry in ``MOL_BITS``.
    threshold : float, optional
        Exclusive lower bound on the exact Jaccard similarity. Defaults to
        0.5. Only candidates returned by ``lsh`` are scored, so a value below
        the threshold the index was built with may miss some matches.

    Returns
    -------
    list of dict
        One ``{(target_key, key): score}`` dict per match, ordered by
        descending score.

    Raises
    ------
    KeyError
        If ``target_key`` is not present in ``MOL_BITS``.
    """
    target_values = MOL_BITS[target_key]
    # Built once here rather than once per candidate inside the loop.
    target_bits = set(target_values)
    candidates = lsh.query(create_minhash(target_values))

    matches = []
    for key in candidates:
        candidate_bits = set(MOL_BITS[key])
        if target_bits or candidate_bits:
            score = 1 - jaccard_distance(target_bits, candidate_bits)
        else:
            # Both fingerprints are empty: jaccard_distance would divide by
            # the size of an empty union, so treat the pair as identical.
            score = 1.0
        if score > threshold:
            matches.append((key, score))

    # list.sort is stable, so ties keep the order in which lsh returned them.
    matches.sort(key=lambda pair: pair[1], reverse=True)
    return [{(target_key, key): score} for key, score in matches]

In [64]:
# Example query: look up the keys similar to KEY and print each match.
# Every printed item is a {(KEY, matching_key): score} dict, and the list
# returned by find_similar_keys is already ordered by descending score.
KEY =  "CHEMBL2022243"
similar_keys = find_similar_keys(KEY)

for d in similar_keys:
    print(d)


{('CHEMBL2022243', 'CHEMBL2022243'): 1.0}
{('CHEMBL2022243', 'CHEMBL2022244'): 0.8148148148148149}
{('CHEMBL2022243', 'CHEMBL2022245'): 0.7951807228915663}
{('CHEMBL2022243', 'CHEMBL2022249'): 0.7283950617283951}
{('CHEMBL2022243', 'CHEMBL2022250'): 0.5531914893617021}
{('CHEMBL2022243', 'CHEMBL2022251'): 0.5106382978723405}
