# Using Collaborative Filtering algorithm

**Objective**: Implement a hybrid method of Collaborative Filtering (using user-based and item-based)

1. Pre process the data
    - Load the data
    - Select some lines to test our estimator
    - Create a miniHashLsh where we can retrieve the K nearest neighbors of a molecule (based on Jaccard similarity).
    - Create a activity matrix (prot x activity)
    - Center the data (to be used in the user-based algorithm)

2. Implement the user based algorithm
    - Given a ```uniprot_id``` this algorithm should return the top 5 most similar proteins assuming a minimum of 0.5 similarity.

3. Implement the item based algorithm
    - Given a ```molecule_id```.
    - Retrieve the K nearest neighbors of the molecule (Molecules most similar to the given molecule, that have an activity value for the proteins identified in the user-based algorithm)
    - Estimate the activity level as weighted average.

4. Test the estimator with the selected lines (notice that the line of the protein does not count as a neighbor, only as groundTruth).

5. Make the predictions for the ```activity_test_blanked.csv``` file.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# imports
import random
import pickle

import pandas as pd
import numpy as np

from helpers import *
from datasketch import MinHash, MinHashLSH
from nltk.metrics import jaccard_distance

# 1. Pre process the data

Load the data

In [3]:
with open("../mol_bits.pkl", "rb") as f:
    MOL_BITS = pickle.load(f)

ACTIVITY_TRAIN = pd.read_csv('./activity_train.csv', names=['uniprot_id', 'mol_id', 'activity'])
ACTIVITY_TRAIN["mol_id"] = ACTIVITY_TRAIN["mol_id"].apply(remove_blank_space)

ACTIVITY_TEST = pd.read_csv('./activity_test_blanked.csv', names=['uniprot_id', 'mol_id', 'activity'])
ACTIVITY_TEST["mol_id"] = ACTIVITY_TEST["mol_id"].apply(remove_blank_space)

print("Total molecules: ", len(MOL_BITS))
print("Total rows in the train dataset: ", len(ACTIVITY_TRAIN))
print("Total rows in the test dataset: ", len(ACTIVITY_TEST))

Total molecules:  73865
Total rows in the train dataset:  135711
Total rows in the test dataset:  4628


Select some lines to test our estimator (33% of the data)

In [4]:
random.seed(42)
random_indexes = random.sample(range(0, len(ACTIVITY_TRAIN)), len(ACTIVITY_TRAIN) // 33)
ACTIVITY_VAL = ACTIVITY_TRAIN.iloc[random_indexes]
ACTIVITY_VAL["predicted"] = [0] * len(ACTIVITY_VAL)
print("Total rows in the validation dataset: ", len(ACTIVITY_VAL))

Total rows in the validation dataset:  4112


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ACTIVITY_VAL["predicted"] = [0] * len(ACTIVITY_VAL)


In [5]:
# Create MinHashLSH
LSH = MinHashLSH(threshold=0.5, num_perm=256)

# Create MinHash for each molecule
def create_minhash(values):
    m = MinHash(num_perm=256)
    for val in values:
        m.update(str(val).encode('utf8'))
    return m

# Insert MinHashes into LSH
for key, values in MOL_BITS.items():
    m = create_minhash(values)
    LSH.insert(key, m)

"""Find similar keys for a given target key
"""
def find_similar_keys(target_key, threshold=0.5):
    target_minhash = create_minhash(MOL_BITS[target_key])
    result = LSH.query(target_minhash)
    toReturn = {}
    for key in result:
        if key == target_key:
            continue
        score = 1 - jaccard_distance(set(MOL_BITS[target_key]), set(MOL_BITS[key]))
        if score > threshold:
            toReturn[key] = score

    return toReturn

Create the activity matrix (prot x activity)

In [12]:
def RowCenterMatrix(M):
    V  = M.T
    mat= np.nanmean(V, axis=0)
    VC = V - mat
    VC[np.isnan(VC)]=0
    return VC.T

def create_activityMatrix(rowLabel, colLabel, df):
    rows = list(set(ACTIVITY_TRAIN[rowLabel]))
    cols = list(set(ACTIVITY_TRAIN[colLabel]))
    n_rows = len(rows)
    n_cols = len(cols)

    rows = dict(zip(rows, np.arange(n_rows)))
    cols = dict(zip(cols, np.arange(n_cols)))
    mat = np.zeros((n_rows, n_cols))
    # mat[mat==0]=np.nan
    for rw in df.values:
        mat[rows[rw[0]], cols[rw[1]]]=rw[2]
    return mat, rows, cols

In [13]:
MAT, MAP_PROTS, MAP_MOLS=create_activityMatrix("uniprot_id", "mol_id", ACTIVITY_TRAIN)
MAT_CENTERED = RowCenterMatrix(MAT)

In [14]:
def CosSim_SingleRow(M, row_index):
    # Extraia a linha especificada
    target_row = M[row_index, :]
    
    # Calcule as normas de todas as linhas
    norms = np.sqrt(np.sum(M * M, axis=1))
    norms[norms < 0.001] = 0.001  # Solução para linhas ou colunas sem variância
    
    # Calcule a norma da linha alvo
    target_norm = norms[row_index]
    
    # Calcule o produto escalar da linha alvo com todas as outras linhas
    dot_products = np.dot(M, target_row)
    
    # Calcule a similaridade coseno dividindo pelo produto das normas
    cos_similarities = dot_products / (norms * target_norm)
    
    return cos_similarities

----------------------------

# 2. Implement the user based algorithm

This algorith uses the ```unitprot_id``` and check in with ```Pearson Correlation Coefficient``` the similarity between the proteins, returning the top 5 most similar proteins.

In [32]:
def get_similar_prots(target_prot, threshold=0.5):
    target_prot_index = MAP_PROTS[target_prot]
    df = pd.DataFrame(CosSim_SingleRow(MAT_CENTERED, target_prot_index), index=MAP_PROTS.keys(), columns=[target_prot])
    df_filtrado = df[df[target_prot] > threshold].sort_values(by=target_prot, ascending=False).head()
    return df_filtrado.to_dict()[target_prot]

# 3. Implement the item based algorithm

In [243]:
use_case = ACTIVITY_VAL.iloc[222]
knn = find_similar_keys(use_case["mol_id"], threshold=0.5)

In [244]:
use_case

uniprot_id          P30968
mol_id        CHEMBL176284
activity                 5
predicted                0
Name: 69521, dtype: object

In [245]:
similar_prots = get_similar_prots(use_case["uniprot_id"])

In [246]:
similar_prots

{'P30968': 1.0000000000000082}

In [247]:
uniprot_mols = set(ACTIVITY_TRAIN[ACTIVITY_TRAIN["uniprot_id"].isin(similar_prots.keys())]["mol_id"].values)

In [248]:
sim_mols = set(knn.keys())
intercept_mols = sim_mols.intersection(uniprot_mols)

In [249]:
subset = ACTIVITY_TRAIN[ACTIVITY_TRAIN["mol_id"].isin(intercept_mols) & ACTIVITY_TRAIN["uniprot_id"].isin(similar_prots.keys())]
subset

Unnamed: 0,uniprot_id,mol_id,activity
69494,P30968,CHEMBL366701,6
69495,P30968,CHEMBL175821,5
69499,P30968,CHEMBL360447,5
69500,P30968,CHEMBL180421,3
69501,P30968,CHEMBL368153,4
69508,P30968,CHEMBL179023,6
69511,P30968,CHEMBL175491,5
69513,P30968,CHEMBL360472,3
69514,P30968,CHEMBL177523,6
69517,P30968,CHEMBL175869,5


In [250]:
similar_prots

{'P30968': 1.0000000000000082}

In [251]:
count = {d : {"num": 0, "den":0, "weigth": similar_prots[d]} for d in subset["uniprot_id"].unique()} 
for i, row in subset.iterrows():
    count[row["uniprot_id"]]["num"] += knn[row["mol_id"]] * row["activity"]
    count[row["uniprot_id"]]["den"] += knn[row["mol_id"]]


In [252]:
final_num = 0
final_den = 0
for key in count.keys():
    count[key]["score"] = round(count[key]["num"] / count[key]["den"])
    final_num += count[key]["score"] * count[key]["weigth"]
    final_den += count[key]["weigth"]

final_score = round(final_num / final_den, 3)

In [253]:
final_score

5.0

PARA AVALIAR SE O MODELO FOI CORRIDO COM INTEIROS

In [1]:
# # Contando o número de linhas onde A == B
# num_iguais = (ACTIVITY_VAL['activity'] == ACTIVITY_VAL['predicted']).sum()

# # Contando o número de linhas onde A != B
# num_diferentes = (ACTIVITY_VAL['activity'] != ACTIVITY_VAL['predicted']).sum()

# num_diff_por_um = (abs(ACTIVITY_VAL['activity'] - ACTIVITY_VAL['predicted']) == 1).sum()
# num_diff_por_dois = (abs(ACTIVITY_VAL['activity'] - ACTIVITY_VAL['predicted']) == 2).sum()

# print(f"Número de linhas onde activity == predicted: {num_iguais/len(ACTIVITY_VAL) * 100:.2f}%")
# print(f"Número de linhas onde activity != predicted: {(num_diferentes)/len(ACTIVITY_VAL) * 100:.2f}%")

# print()

# print(f"Número de linhas onde |activity - predicted| == 1: {num_diff_por_um/len(ACTIVITY_VAL) * 100:.2f}%")
# print(f"Número de linhas onde |activity - predicted| == 2: {num_diff_por_dois/len(ACTIVITY_VAL) * 100:.2f}%")

# print()

# print(f"Retirando diferenças por 1, falhados: {(num_diferentes - num_diff_por_um)/len(ACTIVITY_VAL) * 100:.2f}%")
# print(f"Retirando diferenças por 1, acertados: {(num_iguais + num_diff_por_um)/len(ACTIVITY_VAL) * 100:.2f}%")

# print() 

# print(f"Retirando diferenças por 1 e 2, falhados: {(num_diferentes - num_diff_por_dois - num_diff_por_um)/len(ACTIVITY_VAL) * 100:.2f}%")
# print(f"Retirando diferenças por 1 e 2, acertados: {(num_iguais + num_diff_por_um + num_diff_por_dois)/len(ACTIVITY_VAL) * 100:.2f}%")