In [1]:
import pickle
import numpy as np
import pandas as pd
import json

test = pd.read_csv("data/activity_test_blanked.csv", header=None)
test.columns = ["uniprot_id", "mol_id", "activity"]
test["mol_id"] = test["mol_id"].str.strip()

train = pd.read_csv("data/activity_train.csv", header=None)
train.columns = ["uniprot_id", "mol_id", "activity"]
train["mol_id"] = train["mol_id"].str.strip()


with open("data/mol_bits.pkl", "rb") as f:
    data = pickle.load(f)

with open("buckets.json", "r") as f:
    buckets = json.load(f)

In [2]:
def jaccard_similarity(ids):
    id1, id2 = ids
    set1 = set(data[id1])
    set2 = set(data[id2])
    return len(set1.intersection(set2)) / len(set1.union(set2))

## Collaborative filtering 

$$r_{xi} = \frac{\sum_{j \in N(i;x)}S_{ij} \,\cdot\, r_{xj}}{\sum_{j \in N(i;x)}S_{ij}}$$

In [61]:
buckets

{'50812': ['CHEMBL2022243',
  'CHEMBL405446',
  'CHEMBL2022245',
  'CHEMBL256822',
  'CHEMBL256873',
  'CHEMBL272363',
  'CHEMBL272571',
  'CHEMBL444800',
  'CHEMBL480584',
  'CHEMBL257886',
  'CHEMBL218699',
  'CHEMBL4592284',
  'CHEMBL3685834',
  'CHEMBL16529',
  'CHEMBL3676917',
  'CHEMBL1744039',
  'CHEMBL227977',
  'CHEMBL560953',
  'CHEMBL1990884',
  'CHEMBL3290578',
  'CHEMBL517503',
  'CHEMBL471628',
  'CHEMBL1085872',
  'CHEMBL100426',
  'CHEMBL3690184',
  'CHEMBL16195',
  'CHEMBL250026',
  'CHEMBL371903',
  'CHEMBL1957007',
  'CHEMBL3775554',
  'CHEMBL141035',
  'CHEMBL386701',
  'CHEMBL4555340',
  'CHEMBL388844',
  'CHEMBL3967683',
  'CHEMBL3946901',
  'CHEMBL3647302',
  'CHEMBL3618336',
  'CHEMBL2331807',
  'CHEMBL402162',
  'CHEMBL392402',
  'CHEMBL1766030',
  'CHEMBL1935748',
  'CHEMBL3669548'],
 '94844': ['CHEMBL4205970',
  'CHEMBL2022244',
  'CHEMBL9615',
  'CHEMBL4852077',
  'CHEMBL575498',
  'CHEMBL577407',
  'CHEMBL3105385',
  'CHEMBL3105384',
  'CHEMBL3105467',
  'C

In [60]:
# for all different proteins get each similarity and do the averages

# P34972, CHEMBL259255, 6
# P21554, CHEMBL259255, 6
# lets predict first for P34972

buckets["87760"]

ref = "CHEMBL259255"
sims = {}
prots = []
for prot in buckets["87760"]:
    if prot != ref:
        sims[prot] = jaccard_similarity((ref, prot))
        prots.append(prot)

query = "P21554"
rating = 0
for prot in prots:
    subset = train[(train["uniprot_id"] == query) & train["mol_id"].isin([prot])]
    if subset.shape[0] > 0:
        print(subset)
        sim = sims[prot]
        rating += sim * subset["activity"].values[0]


print(rating)

      uniprot_id        mol_id  activity
31995     P21554  CHEMBL261381         6
4.342105263157895


In [5]:
test["mol_id"].value_counts()

mol_id
CHEMBL1242923    12
CHEMBL2031737     9
CHEMBL555146      9
CHEMBL467094      9
CHEMBL1112        8
                 ..
CHEMBL4063825     1
CHEMBL4093165     1
CHEMBL4175043     1
CHEMBL4164392     1
CHEMBL4205802     1
Name: count, Length: 4047, dtype: int64