In [1]:
import pickle
import pandas as pd
from nltk.metrics import jaccard_distance

In [3]:
# Column labels applied to the two activity CSVs (both are read with header=None,
# so their first row is treated as data, not as a header).
train_columns = ['uniprot_id', 'mol_id', 'activity']
test_columns = ['uniprot_id', 'mol_id', 'activity']

## import the data
ACTIVITY_TRAIN = pd.read_csv(
    "../activity_train.csv",
    header=None,
    names=train_columns,
)
ACTIVITY_TEST_BLANKED = pd.read_csv(
    "../activity_test_blanked.csv",
    header=None,
    names=test_columns,
)

# NOTE: pickle.load can execute arbitrary code embedded in the file;
# only load mol_bits.pkl from a trusted source.
with open("../mol_bits.pkl", "rb") as bits_file:
    MOL_BITS = pickle.load(bits_file)

In [4]:
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity |s1 & s2| / |s1 | s2| of two sets.

    Parameters
    ----------
    s1 : set
        First collection; must provide ``intersection`` and ``union``.
    s2 : iterable
        Second collection (anything accepted by ``set.intersection``).

    Returns
    -------
    float
        A value in [0.0, 1.0]. Two empty inputs are treated as identical
        and yield 1.0 instead of raising ``ZeroDivisionError``.
    """
    union = len(s1.union(s2))
    if union == 0:
        # Both inputs are empty: the ratio is 0/0, so fall back to the
        # "identical sets" convention rather than crashing.
        return 1.0
    intersection = len(s1.intersection(s2))
    return float(intersection) / union

In [4]:
# Distinct protein identifiers, in order of first appearance:
# first those present in the blanked test set, then those present in training.
test_protein_column = ACTIVITY_TEST_BLANKED.loc[:, "uniprot_id"]
train_protein_column = ACTIVITY_TRAIN.loc[:, "uniprot_id"]

keys_to_predict = test_protein_column.unique().tolist()
keys_that_we_have_data = train_protein_column.unique().tolist()

In [5]:
# Report every protein we must predict for that never appears in the training data.
# The missing list is built once, against a set, instead of scanning the
# training list twice with quadratic list-membership tests.
known_keys = set(keys_that_we_have_data)
missing_keys = [key for key in keys_to_predict if key not in known_keys]
if missing_keys:
    print("We have missing data")
    print(missing_keys)

In [6]:
# Distinct molecule identifiers in the test and training sets.
# These must come from the "mol_id" column; reading "uniprot_id" here would
# just duplicate the protein-level lists.
mols_to_predict = ACTIVITY_TEST_BLANKED["mol_id"].unique().tolist()
mols_that_we_have = ACTIVITY_TRAIN["mol_id"].unique().tolist()

In [7]:
# Report every entry of mols_to_predict that is absent from mols_that_we_have.
# The condition and the printed list are derived from the same collection
# (mols_to_predict), so they can no longer disagree.
known_mols = set(mols_that_we_have)
missing_mols = [mol_id for mol_id in mols_to_predict if mol_id not in known_mols]
if missing_mols:
    print("We have missing data")
    print(missing_mols)

In [7]:
# Example for the first protein of the dataset

uniprot_id = keys_to_predict[0]
df_uniprot = ACTIVITY_TRAIN[ACTIVITY_TRAIN["uniprot_id"] == uniprot_id]

# Molecule whose activity we want to estimate, and the minimum Jaccard
# similarity a training molecule needs in order to be kept as a neighbour.
target_mol_id = "CHEMBL2022258"
similarity_threshold = 0.5

mol_to_predict = MOL_BITS[target_mol_id]
target_bits = set(mol_to_predict)  # built once instead of on every iteration

# One {mol_id: bits} dict per training row of this protein.
# IDs are stripped because MOL_BITS is keyed by the whitespace-free ID.
mols = []
for raw_mol_id in df_uniprot["mol_id"]:
    mol_id = raw_mol_id.strip()
    mols.append({mol_id: MOL_BITS[mol_id]})

# Similarity matrix: one {(train_mol, target_mol): (score, activity)} entry per
# sufficiently similar training molecule. Each molecule is paired with the
# activity of its own row, which avoids re-filtering the frame per molecule
# and does not depend on the exact whitespace padding of the raw mol_id.
similarity_matrix = []
row_activities = df_uniprot["activity"].tolist()
for mol_entry, activity in zip(mols, row_activities):
    for key, value in mol_entry.items():
        score = jaccard_similarity(set(value), target_bits)
        if score > similarity_threshold:
            similarity_matrix.append({(key, target_mol_id): (score, activity)})


In [8]:
# Show the collected {(train_mol_id, target_mol_id): (score, activity)} entries.
similarity_matrix


[{('CHEMBL2022247', 'CHEMBL2022258'): (0.825, 4)},
 {('CHEMBL2022249', 'CHEMBL2022258'): (0.5106382978723404, 3)},
 {('CHEMBL2022252', 'CHEMBL2022258'): (0.6666666666666666, 6)},
 {('CHEMBL2022253', 'CHEMBL2022258'): (0.5742574257425742, 4)},
 {('CHEMBL2022254', 'CHEMBL2022258'): (0.6020408163265306, 3)},
 {('CHEMBL2022255', 'CHEMBL2022258'): (0.5789473684210527, 5)},
 {('CHEMBL2022256', 'CHEMBL2022258'): (0.5544554455445545, 8)},
 {('CHEMBL2022257', 'CHEMBL2022258'): (0.7078651685393258, 6)},
 {('CHEMBL2022576', 'CHEMBL2022258'): (0.7816091954022989, 7)},
 {('CHEMBL2022577', 'CHEMBL2022258'): (0.7391304347826086, 6)},
 {('CHEMBL2047149', 'CHEMBL2022258'): (0.7391304347826086, 6)},
 {('CHEMBL2047150', 'CHEMBL2022258'): (0.7391304347826086, 4)},
 {('CHEMBL2047151', 'CHEMBL2022258'): (0.7311827956989247, 7)},
 {('CHEMBL2047152', 'CHEMBL2022258'): (0.7311827956989247, 6)},
 {('CHEMBL2047153', 'CHEMBL2022258'): (0.7311827956989247, 4)},
 {('CHEMBL2047154', 'CHEMBL2022258'): (0.723404255319