In [139]:
import json
import pickle
import numpy as np
import pandas as pd
from utils import *

def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: pickle.load can execute arbitrary code while deserializing;
    # only point this at files produced by this project.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _read_activity_csv(path):
    """Read a headerless activity CSV into (uniprot_id, mol_id, activity) rows."""
    frame = pd.read_csv(path, header=None)
    frame.columns = ["uniprot_id", "mol_id", "activity"]
    # Coerce molecule ids to whitespace-free strings
    # (presumably so they match the keys of mol_bits; confirm).
    frame["mol_id"] = frame["mol_id"].astype(str).str.strip()
    return frame


# Precomputed artifacts. buckets and mol_bits are later handed to k_neighbors;
# the keys of mol_bits are molecule ids.
buckets = _load_pickle("data/results/buckets.pkl")
mol_bits = _load_pickle("data/mol_bits.pkl")
test_mols_neighbors = _load_pickle("data/results/test_mols_neighbors.pkl")

# Long-format activity tables.
train = _read_activity_csv("data/activity_train.csv")
test = _read_activity_csv("data/activity_test_blanked.csv")

# Protein x molecule matrix; pairs without a measurement become 0.
train_pivot = train.pivot(
    index="uniprot_id",
    columns="mol_id",
    values="activity",
).fillna(0)

# Positional lookups: a molecule's integer index is its position in mol_ids.
mol_ids = list(mol_bits.keys())
prot_ids = list(train["uniprot_id"].unique())

# Baseline Estimation for CF
- Define similarity $s_{ij}$ of molecules i and j.
- Select k nearest neighbors N(i;x).
    - Molecules most similar to i, that have activity on protein x.
- Estimate rating $r_{xi}$ as the weighted average:

$$ r_{xi} = b_{xi} + \frac{\sum_{j\in N(i;x)} s_{ij} \cdot (r_{xj} - b_{xj})}{\sum_{j\in N(i;x)} s_{ij}} $$
Where $b_{xi}$ is the baseline estimate for $r_{xi}$:
$$ b_{xi} = \mu + b_x + b_i $$

- $\mu$ - overall mean protein activity
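- $b_x$ - activity deviation of protein x (protein bias)
- $b_i$ - activity deviation of molecule i (molecule bias)

Note: in the code, `Build_bxi` returns `bx` as the per-column (molecule) deviation and `bi` as the per-row (protein) deviation, i.e. the variable names are swapped relative to this notation.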


## Obtaining $b_{xi}$

In [140]:
def Build_bxi(df):
    """Build the additive baseline ``mu + column deviation + row deviation``.

    Parameters
    ----------
    df : pd.DataFrame
        Numeric matrix. In this notebook it is the activity pivot with
        ``uniprot_id`` rows and ``mol_id`` columns, where missing pairs were
        filled with 0, so those zeros take part in every mean below.

    Returns
    -------
    bxi : pd.DataFrame
        Same index and columns as ``df``; entry (r, c) is
        ``mu + bx[c] + bi[r]``.
    bx : pd.Series
        Column means minus ``mu``, indexed by ``df.columns``.
    bi : pd.Series
        Row means minus ``mu``, indexed by ``df.index``.
    """
    row_means = df.mean(axis=1)
    mu = row_means.mean()  # mean of the row means
    bx = df.mean(axis=0) - mu
    bi = row_means - mu

    # Broadcast (1 x n_cols) + (n_rows x 1) instead of assigning row by row.
    # The addition order matches the former loop, so values are unchanged,
    # and label lookups are avoided, so non-unique row labels work too.
    baseline = (mu + bx.to_numpy())[np.newaxis, :] + bi.to_numpy()[:, np.newaxis]
    bxi = pd.DataFrame(baseline, index=df.index, columns=df.columns)
    return bxi, bx, bi

## Similarities and Nearest Neighbors

- Jaccard similarity - $J(A, B) = \frac{|A \cap B|}{|A \cup B|}$
- Within each bucket, find the k neighbors with the highest similarity.


In [141]:
# Subsetting for validation: hold out every observed activity whose protein
# and molecule both lie past the positional cut-offs below.

prots = 120  # first held-out row position in train_pivot
molecules = 60_000  # first held-out column position in train_pivot

validation_set = (
    train_pivot.iloc[prots:, molecules:]
    .reset_index()
    .melt(id_vars="uniprot_id", value_name="activity")
    # 0 is the fillna placeholder for "no measurement": drop those pairs.
    # Filtering and casting inside one chain avoids assigning a column on a
    # filtered frame (SettingWithCopyWarning); the melted index is kept.
    .loc[lambda pairs: pairs["activity"] != 0]
    .astype({"activity": int})
)

# Remove the held-out (protein, molecule) pairs from the raw training rows.
training_set = (
    train.set_index(["uniprot_id", "mol_id"])
    .drop(validation_set.set_index(["uniprot_id", "mol_id"]).index)
    .reset_index()
)

# Each held-out pair must have left the training rows exactly once.
assert len(training_set) + len(validation_set) == len(train), (
    "training/validation split does not partition the training rows"
)

training_set_pivot = training_set.pivot(
    index="uniprot_id", columns="mol_id", values="activity"
).fillna(0)

In [142]:
# Fit the baseline on the training pivot only (validation pairs already removed)
# and display the resulting protein x molecule baseline matrix.
bxi, bx, bi = Build_bxi(training_set_pivot)
bxi

mol_id,CHEMBL10,CHEMBL1000,CHEMBL100003,CHEMBL100004,CHEMBL100045,CHEMBL100052,CHEMBL10007,CHEMBL100071,CHEMBL10009,CHEMBL100104,...,CHEMBL99939,CHEMBL99967,CHEMBL9997,CHEMBL99979,CHEMBL9998,CHEMBL99982,CHEMBL99983,CHEMBL99987,CHEMBL9999,CHEMBL99994
uniprot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
O14842,0.003481,-0.010408,-0.045131,-0.045131,0.114592,0.065981,0.031258,-0.045131,0.072925,0.045147,...,0.003481,0.024314,0.072925,0.003481,-0.024297,0.010425,-0.031242,0.003481,-0.045131,-0.031242
O43193,-0.000706,-0.014595,-0.049317,-0.049317,0.110405,0.061794,0.027072,-0.049317,0.068738,0.040961,...,-0.000706,0.020127,0.068738,-0.000706,-0.028484,0.006238,-0.035428,-0.000706,-0.049317,-0.035428
O43613,0.078303,0.064414,0.029692,0.029692,0.189414,0.140803,0.106081,0.029692,0.147748,0.119970,...,0.078303,0.099137,0.147748,0.078303,0.050526,0.085248,0.043581,0.078303,0.029692,0.043581
O43614,0.107581,0.093692,0.058970,0.058970,0.218692,0.170081,0.135359,0.058970,0.177026,0.149248,...,0.107581,0.128415,0.177026,0.107581,0.079803,0.114526,0.072859,0.107581,0.058970,0.072859
O95665,0.003311,-0.010577,-0.045300,-0.045300,0.114423,0.065811,0.031089,-0.045300,0.072756,0.044978,...,0.003311,0.024145,0.072756,0.003311,-0.024466,0.010256,-0.031411,0.003311,-0.045300,-0.031411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9UBY5,-0.002369,-0.016258,-0.050980,-0.050980,0.108742,0.060131,0.025408,-0.050980,0.067075,0.039297,...,-0.002369,0.018464,0.067075,-0.002369,-0.030147,0.004575,-0.037092,-0.002369,-0.050980,-0.037092
Q9UKP6,0.008062,-0.005827,-0.040549,-0.040549,0.119173,0.070562,0.035840,-0.040549,0.077506,0.049729,...,0.008062,0.028895,0.077506,0.008062,-0.019716,0.015006,-0.026660,0.008062,-0.040549,-0.026660
Q9Y5N1,0.286152,0.272263,0.237541,0.237541,0.397263,0.348652,0.313930,0.237541,0.355597,0.327819,...,0.286152,0.306985,0.355597,0.286152,0.258374,0.293097,0.251430,0.286152,0.237541,0.251430
Q9Y5X5,-0.002087,-0.015976,-0.050699,-0.050699,0.109024,0.060413,0.025690,-0.050699,0.067357,0.039579,...,-0.002087,0.018746,0.067357,-0.002087,-0.029865,0.004857,-0.036810,-0.002087,-0.050699,-0.036810


In [143]:
# Error of the baseline against the training pivot it was fitted on.
# masked_MSE comes from utils; presumably it ignores the zero (unobserved)
# entries. TODO confirm against utils.
masked_MSE(bxi.values, training_set_pivot.values)

0.5304278338328786

In [144]:
# Held-out (uniprot_id, mol_id, activity) pairs; the index is inherited from
# the melted pivot, so it is not contiguous.
validation_set

Unnamed: 0,uniprot_id,mol_id,activity
520,Q9H3N8,CHEMBL461360,1
640,Q9H3N8,CHEMBL461535,1
664,Q9H3N8,CHEMBL461536,1
688,Q9H3N8,CHEMBL461537,5
717,Q9Y5N1,CHEMBL461550,4
...,...,...,...
294817,Q14416,CHEMBL94990,1
295587,Q15722,CHEMBL95453,10
295941,Q9Y5N1,CHEMBL95645,10
295968,Q14289,CHEMBL95692,1


In [145]:
## get neighbors
# Every molecule id that occurs in the validation or the test split; handed to
# k_neighbors as test_ids.
all_test_val_mol_ids = set(validation_set["mol_id"].unique()) | set(
    test["mol_id"].unique()
)
# NOTE(review): `id` shadows the builtin; the name is kept because later cells
# may still read it.
id = mol_ids.index("CHEMBL461360")
prot = "Q9H3N8"


neighbors = k_neighbors(
    id, buckets, mol_bits, mol_ids, k=5, test_ids=all_test_val_mol_ids
)

## does the neighbor have activity for said interaction

# Each neighbor is indexed as (position in mol_ids, similarity).
mols = [mol_ids[i[0]] for i in neighbors]

# get the activity of the similar molecules (computed once, reused below)
similar_activities = train[(train["mol_id"].isin(mols)) & (train["uniprot_id"] == prot)]
if not similar_activities.empty:
    print("Similar Molecules with activity for the same protein found")

# Keep only the neighbors that have a measurement for `prot`, in neighbor
# order. Indexing with all of `mols` raised KeyError as soon as one neighbor
# had no such measurement, and the former `if mol_ids[i[0]]` filter was always
# true, so `sims` could get out of step with the activity rows.
mols_with_activity = set(similar_activities["mol_id"])
active_neighbors = [i for i in neighbors if mol_ids[i[0]] in mols_with_activity]
active_mols = [mol_ids[i[0]] for i in active_neighbors]

# reorder the molecules so rows line up one-to-one with `sims`
similar_activities = (
    similar_activities.set_index("mol_id").loc[active_mols].reset_index()
)
sims = np.array([i[1] for i in active_neighbors])

Similar Molecules with activity for the same protein found


## collaborative filtering
- Get the nearest neighbors of the molecule.
- Check which neighbors have activity on the same protein.
- Apply the weighted-average equation above to compute $r_{xi}$.