# Similarities between molecules

The goal of this notebook is to use the fingerprints provided in `mol_bits.pkl` to find similar molecules.

The essential idea is to retrieve similar items from a database. We assume that these items are complex objects with a variable number of features.

In [1]:
import pickle
import matplotlib.pyplot as plt 
import numpy as np
from datasketch import MinHash, MinHashLSH
from nltk.metrics import jaccard_distance

In [2]:
# Load the pickled fingerprints. The path is relative to the notebook's
# working directory. MOL_BITS is iterated with .items() further down, so it is
# expected to be a mapping of molecule key -> iterable of fingerprint values.
# NOTE(review): pickle.load can execute arbitrary code; only open files that
# come from a trusted source.
with open("../mol_bits.pkl", "rb") as f:
    MOL_BITS = pickle.load(f)

In [3]:
def create_minhash(values, num_perm=256):
    """Build a MinHash signature from an iterable of values.

    Each value is converted with ``str()`` and UTF-8 encoded before being fed
    to the signature, so values of any printable type are accepted.

    Args:
        values: Iterable of items describing one object (e.g. the fingerprint
            entries of a molecule).
        num_perm: Number of permutations used by the signature. It must equal
            the ``num_perm`` of any ``MinHashLSH`` index the signature is
            inserted into or queried against. Defaults to 256.

    Returns:
        The populated ``MinHash`` object.
    """
    m = MinHash(num_perm=num_perm)
    for val in values:
        m.update(str(val).encode('utf8'))
    return m

In [4]:
# Create the LSH index. num_perm must match the value used when building the
# MinHash signatures (256 in create_minhash); threshold is the similarity
# threshold the index is tuned for.
lsh = MinHashLSH(threshold=0.5, num_perm=256)

In [5]:
# Index the MinHash signature of every molecule.
# MinHashLSH.insert raises ValueError when a key is already present, so keys
# that were indexed by a previous run of this cell are skipped. This keeps the
# cell safe to re-run without recreating `lsh` first.
for key, values in MOL_BITS.items():
    if key in lsh:
        continue
    m = create_minhash(values)
    lsh.insert(key, m)

In [6]:
def find_similar_keys(target_key, threshold=0.5):
    """Find the keys whose fingerprints are similar to ``target_key``'s.

    Candidates are retrieved from the module-level ``lsh`` index and then
    re-scored with the exact Jaccard similarity of the two fingerprint sets
    taken from ``MOL_BITS``. Only candidates scoring strictly above
    ``threshold`` are kept.

    Args:
        target_key: Key of the query item in ``MOL_BITS``.
        threshold: Minimum (exclusive) exact Jaccard similarity for a
            candidate to be returned. Defaults to 0.5.

    Returns:
        A list of one-item dicts ``{(target_key, key): score}``, sorted by
        score in descending order.

    Raises:
        KeyError: If ``target_key`` is not present in ``MOL_BITS``.
    """
    # Built once: the query fingerprint does not change between candidates.
    target_bits = set(MOL_BITS[target_key])
    candidates = lsh.query(create_minhash(MOL_BITS[target_key]))

    scored = []
    for key in candidates:
        candidate_bits = set(MOL_BITS[key])
        if not target_bits and not candidate_bits:
            # jaccard_distance divides by the size of the union, which is 0
            # here; two empty fingerprints are treated as identical.
            score = 1.0
        else:
            score = 1 - jaccard_distance(target_bits, candidate_bits)
        if score > threshold:
            scored.append({(target_key, key): score})

    # Each entry holds exactly one value: its score.
    return sorted(scored, key=lambda entry: next(iter(entry.values())), reverse=True)

In [7]:
# Example query: look up the molecules similar to KEY using the default
# threshold of find_similar_keys.
KEY =  "CHEMBL2022258"
similar_keys = find_similar_keys(KEY)

# Each entry is a one-item dict {(query_key, match_key): score}; the list is
# already sorted by score, highest first.
for d in similar_keys:
    print(d)


{('CHEMBL2022258', 'CHEMBL2022258'): 1.0}
{('CHEMBL2022258', 'CHEMBL2022247'): 0.825}
{('CHEMBL2022258', 'CHEMBL2022576'): 0.7816091954022988}
{('CHEMBL2022258', 'CHEMBL2022577'): 0.7391304347826086}
{('CHEMBL2022258', 'CHEMBL2047149'): 0.7391304347826086}
{('CHEMBL2022258', 'CHEMBL2047150'): 0.7391304347826086}
{('CHEMBL2022258', 'CHEMBL2047151'): 0.7311827956989247}
{('CHEMBL2022258', 'CHEMBL2047153'): 0.7311827956989247}
{('CHEMBL2022258', 'CHEMBL2047152'): 0.7311827956989247}
{('CHEMBL2022258', 'CHEMBL1829174'): 0.7311827956989247}
{('CHEMBL2022258', 'CHEMBL2179472'): 0.7311827956989247}
{('CHEMBL2022258', 'CHEMBL2047155'): 0.7234042553191489}
{('CHEMBL2022258', 'CHEMBL2047156'): 0.7234042553191489}
{('CHEMBL2022258', 'CHEMBL2047158'): 0.7234042553191489}
{('CHEMBL2022258', 'CHEMBL2047159'): 0.7234042553191489}
{('CHEMBL2022258', 'CHEMBL2047154'): 0.7234042553191489}
{('CHEMBL2022258', 'CHEMBL2047161'): 0.7083333333333333}
{('CHEMBL2022258', 'CHEMBL2022257'): 0.7078651685393258}
{(