In [1]:
import json
import pickle
import numpy as np
import pandas as pd
from utils import *

def _load_pickle(path):
    """Return the object unpickled from the binary file at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only point this at files produced by this project.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Pickled artifacts read from disk.
buckets = _load_pickle("data/results/buckets.pkl")
mol_bits = _load_pickle("data/mol_bits.pkl")
test_mols_neighbors = _load_pickle("data/results/test_mols_neighbors.pkl")

# The CSV is read without a header row, so the three column labels are
# attached explicitly.
train = pd.read_csv("data/activity_train.csv", header=None).set_axis(
    ["uniprot_id", "mol_id", "activity"], axis=1
)
# Cast molecule ids to str and strip surrounding whitespace.
train["mol_id"] = train["mol_id"].astype(str).str.strip()

# Wide matrix: one row per uniprot_id, one column per mol_id; pairs missing
# from the training data are filled with 0.
train_pivot = train.pivot(
    index="uniprot_id",
    columns="mol_id",
    values="activity",
).fillna(0)

# mol_ids follows the key order of mol_bits; prot_ids follows the order of
# first appearance in the training CSV.
mol_ids = [*mol_bits.keys()]
prot_ids = list(pd.unique(train["uniprot_id"]))

# Baseline Estimation for CF
- Define similarity $s_{ij}$ of molecules i and j.
- Select k nearest neighbors N(i;x).
    - Molecules most similar to i, that have activity on protein x.
- Estimate rating $r_{xi}$ as the weighted average:

$$ r_{xi} = b_{xi} + \frac{\sum_{j\in N(i;x)} s_{ij} \cdot (r_{xj} - b_{xj})}{\sum_{j\in N(i;x)} s_{ij}} $$
Where $b_{xi}$ is the baseline estimate for $r_{xi}$:
$$ b_{xi} = \mu + b_x + b_i $$

- $\mu$ - overall mean activity
- $b_x$ - activity deviation of protein x (protein bias)
- $b_i$ - activity deviation of molecule i (molecule bias)


## Obtaining $b_{xi}$

In [2]:
# Baseline estimate for every (uniprot_id, mol_id) cell of train_pivot:
#     bxi = mu + bx + bi
# In this cell bx is indexed by mol_id (column means) and bi by uniprot_id
# (row means); the sum is symmetric, so the naming does not affect bxi.
# NOTE(review): train_pivot was built with fillna(0), so all means below also
# average over the zero-filled cells — confirm that is intended.
row_means = train_pivot.mean(axis=1)

mu = row_means.mean()                # mean of the per-row means
bx = train_pivot.mean(axis=0) - mu   # deviation of each column mean from mu
bi = row_means - mu                  # deviation of each row mean from mu

# Build the whole matrix with one broadcast instead of assigning row by row.
# (mu + bx) is formed first and bi is added afterwards, matching the
# evaluation order of the expression `mu + bx + bi.loc[i]`.
bxi = pd.DataFrame(
    (mu + bx.to_numpy())[np.newaxis, :] + bi.to_numpy()[:, np.newaxis],
    index=train_pivot.index,
    columns=train_pivot.columns,
)

In [3]:
# MSE of the baseline estimates against the training matrix.
# NOTE(review): the estimates are passed first and the observed matrix second
# here; confirm this matches the argument order masked_MSE expects.
masked_MSE(bxi.to_numpy(), train_pivot.to_numpy())

0.5274407791195926

## Similarities and Nearest Neighbors

- Jaccard similarity - $J(A, B) = \frac{|A \cap B|}{|A \cup B|}$
- Within each bucket, find the top k most similar neighbors.


In [4]:
# Hyperparameters for Reg_SGD (defined in utils).
K = 60   # passed as k; presumably the number of latent factors — TODO confirm
L = 0.1  # passed as both Lp and Lq; presumably regularization strengths — TODO confirm
R = train_pivot.to_numpy()  # zero-filled uniprot_id x mol_id activity matrix

# Fit on R with a fixed seed; returns two factor matrices and a third value
# bound to mses_final.
P, Q, mses_final = Reg_SGD(
    R,
    k=K,
    epochs=150,
    LR=0.001,
    Lp=L,
    Lq=L,
    seed=42,
)

In [10]:
# MSE of the reconstruction P @ Q against the training matrix R.
# NOTE(review): the observed matrix R is the first argument here, whereas the
# baseline evaluation earlier in this notebook passes the observed matrix as
# the second argument — confirm masked_MSE treats both arguments symmetrically.
masked_MSE(R, P @ Q)

0.4401520085678894

In [15]:
# Show a reproducible random sample of 1000 training rows.
train.sample(n=1000, random_state=42)

Unnamed: 0,uniprot_id,mol_id,activity
127208,Q99705,CHEMBL209109,8
34158,P21554,CHEMBL3950235,1
108257,P41594,CHEMBL3603910,1
96976,P35462,CHEMBL3959698,10
116476,P49286,CHEMBL4084568,10
...,...,...,...
86393,P35367,CHEMBL1193187,8
31744,P21554,CHEMBL229698,8
14664,P0DMS8,CHEMBL383144,7
32020,P21554,CHEMBL408174,1


In [23]:
# Shapes of the two factor matrices returned by Reg_SGD.
(P.shape, Q.shape)

((144, 60), (60, 72632))

In [34]:
# Predicted activity for one (protein, molecule) pair from the latent factors.
#
# P and Q were fit on R = train_pivot.values, so (assuming Reg_SGD keeps R's
# row/column order) row r of P corresponds to train_pivot.index[r] and column
# c of Q to train_pivot.columns[c]. Positions are therefore looked up in
# train_pivot's own labels. prot_ids (order of first appearance in the CSV)
# and mol_ids (key order of mol_bits) are not guaranteed to share that
# ordering, and using them can silently select the wrong row/column.
prot_pos = train_pivot.index.get_loc("P35367")
mol_pos = train_pivot.columns.get_loc("CHEMBL1193187")

P[prot_pos] @ Q[:, mol_pos]

0.06333157868769368

## With latent factors 