In [1]:
import numpy.typing as npt
import numpy as np
from pathlib import Path
import pandas as pd
import pickle
from rdkit import Chem  # type: ignore[import-not-found]
from tqdm import tqdm
tqdm.pandas(desc='Applying')
import polars as pl

# Add src to path
import os
os.chdir('../..')
import sys
# The working directory is now the project root (two levels above the
# notebook), so register that directory itself. A relative '../..' entry would
# be resolved against the *new* cwd and point two levels above the root, where
# the `src` package does not live.
sys.path.append(os.getcwd())

In [2]:
# Src imports
from src.setup.setup_data import setup_inference_data
from src.typing.xdata import XData, DataRetrieval

In [3]:
# Substructure query used to tell triazine-core molecules apart from the rest.
TRIAZINE_SMARTS = "c1ncncn1"
TRIAZINE = Chem.MolFromSmarts(TRIAZINE_SMARTS)

# Show where the notebook is running from, then the data directory beneath it.
working_dir = os.getcwd()
print(working_dir)
path = Path(working_dir) / "data"
print(path.resolve())

/home/schobbejak/competitions/q4-detect-protein
/home/schobbejak/competitions/q4-detect-protein/data


In [4]:
def combine_prediction_datasets(
    directory: Path,
    x: XData,
    preds_easy: npt.NDArray[np.float64],
    preds_medium: npt.NDArray[np.float64],
    preds_hard: npt.NDArray[np.float64],
) -> npt.NDArray[np.float64]:
    """Combine predictions on the easy, medium and hard dataset.

    Every sample keeps the prediction of exactly one model:

    * hard: the molecule does not contain a triazine core;
    * easy: triazine core and first building block seen in train;
    * medium: triazine core and first building block not seen in train.

    The input prediction arrays are not modified; copies are masked and summed.
    As a side effect ``x.retrieval`` is left set to ``DataRetrieval.SMILES_BB1``.

    :param directory: Raw path to the unique training blocks
    :param x: Input data containing building block information
    :param preds_easy: Predictions on the easy part of the test data
    :param preds_medium: Predictions on the medium part of the test data
    :param preds_hard: Predictions on the hard part of the test data
    :return: Array with, per sample, the prediction of the matching model
    """
    # Extract the first building blocks in train.
    # NOTE(security): pickle.load can execute arbitrary code; only point
    # `directory` at files produced by this project.
    with open(directory / "train_dicts/BBs_dict_reverse_1.p", "br") as f:
        # A set gives constant-time membership tests in the per-sample loop below.
        train_blocks = set(pickle.load(f).values())  # noqa: S301 (Security issue)

    # Work on copies so the caller's prediction arrays stay intact.
    easy = preds_easy.copy()
    medium = preds_medium.copy()
    hard = preds_hard.copy()

    # Pass 1: split on the molecule core (triazine vs. anything else).
    x.retrieval = DataRetrieval.SMILES_MOL
    for idx in tqdm(range(len(x)), desc="Replacing triazine"):
        mol = Chem.MolFromSmiles(x[idx])
        if mol.HasSubstructMatch(TRIAZINE):
            hard[idx] = 0
        else:
            easy[idx] = 0
            medium[idx] = 0

    # Pass 2: split easy/medium on whether the first building block is known.
    x.retrieval = DataRetrieval.SMILES_BB1
    for idx in range(len(x)):
        if x[idx] in train_blocks:
            # Only medium is cleared here. `hard` was already decided by the
            # triazine check; clearing it again would leave non-triazine
            # samples with a known building block at exactly zero.
            medium[idx] = 0
        else:
            easy[idx] = 0

    # Exactly one of the three is non-masked per sample, so the sum selects it.
    return easy + medium + hard

In [5]:
# Read the shrunken test set with polars and hand it to pandas (pyarrow-backed).
shrunken_dir = path / "shrunken/"
inference_data = pl.read_parquet(path / "shrunken/test.parquet").to_pandas(
    use_pyarrow_extension_array=True,
)
X = setup_inference_data(shrunken_dir, inference_data)

In [6]:
# Read the raw test set with polars and convert it to a pyarrow-backed pandas frame.
raw_test_data = pl.read_parquet(path / "raw/test.parquet").to_pandas(
    use_pyarrow_extension_array=True,
)

In [7]:
# Submission files that provide the easy / medium / hard predictions.
# NOTE(review): all three currently point at the same file — confirm this is intended.
submissions_dir = path / "submissions"
easy_path = submissions_dir / "submission_446.csv"
medium_path = submissions_dir / "submission_446.csv"
hard_path = submissions_dir / "submission_446.csv"

In [8]:
# Load the three prediction tables, one per difficulty level
easy_preds, medium_preds, hard_preds = (
    pd.read_csv(csv_path) for csv_path in (easy_path, medium_path, hard_path)
)

In [9]:
# Attach each model's 'binds' column to the raw test frame under its own name.
predictions_by_column = {
    'easy_binds': easy_preds,
    'medium_binds': medium_preds,
    'hard_binds': hard_preds,
}
for column, predictions in predictions_by_column.items():
    raw_test_data[column] = predictions['binds']

In [None]:
# Raise the medium and hard predictions to the power 0.75.
# Re-running this cell applies the exponent again, so run it once per load.
for column in ('medium_binds', 'hard_binds'):
    raw_test_data[column] = raw_test_data[column] ** 0.75

In [10]:
# Show the test frame with the three prediction columns attached
# (the rich repr is truncated to the first and last rows).
raw_test_data

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,easy_binds,medium_binds,hard_binds
0,295246830,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,C=Cc1ccc(N)cc1,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,BRD4,0.000532,0.076762,0.076762
1,295246831,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,C=Cc1ccc(N)cc1,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,HSA,0.001148,0.069264,0.069264
2,295246832,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,C=Cc1ccc(N)cc1,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,sEH,0.000018,0.035902,0.035902
3,295246833,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,CC(O)Cn1cnc2c(N)ncnc21,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ncnc3c2...,BRD4,0.000310,0.080390,0.080390
4,295246834,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,CC(O)Cn1cnc2c(N)ncnc21,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ncnc3c2...,HSA,0.000972,0.076937,0.076937
...,...,...,...,...,...,...,...,...,...
1674891,296921721,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1noc2ccc(F)cc12,COC1CCC(CCN)CC1,COC1CCC(CCNc2nc(Nc3noc4ccc(F)cc34)nc(N[C@@H](C...,HSA,0.000035,0.003208,0.003208
1674892,296921722,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1noc2ccc(F)cc12,COC1CCC(CCN)CC1,COC1CCC(CCNc2nc(Nc3noc4ccc(F)cc34)nc(N[C@@H](C...,sEH,0.000063,0.002074,0.002074
1674893,296921723,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1noc2ccc(F)cc12,NCc1cccs1,[N-]=[N+]=NCCC[C@H](Nc1nc(NCc2cccs2)nc(Nc2noc3...,BRD4,0.000008,0.001314,0.001314
1674894,296921724,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1noc2ccc(F)cc12,NCc1cccs1,[N-]=[N+]=NCCC[C@H](Nc1nc(NCc2cccs2)nc(Nc2noc3...,HSA,0.000175,0.004564,0.004564


In [11]:
# A row is "easy" when its buildingblock1_smiles also occurs in the training
# building blocks stored in train_dicts.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# produced by this project.
with open(path / "shrunken/train_dicts/BBs_dict_reverse_1.p", "br") as f:
    train_blocks = list(pickle.load(f).values())  # noqa: S301 (Security issue)

# Split easy vs. medium on whether the first building block is known.
easy_mask = raw_test_data['buildingblock1_smiles'].isin(train_blocks)
raw_test_data['easy_binds'] = raw_test_data['easy_binds'] * easy_mask
raw_test_data['medium_binds'] = raw_test_data['medium_binds'] * ~easy_mask
# 'hard_binds' is intentionally left untouched here. "Hard" is defined purely
# by the missing triazine core and is selected by the triazine mask applied in
# a later cell. Masking it with ~easy_mask as well would zero all three columns
# for non-triazine molecules whose first building block is in train, giving
# those rows a final prediction of exactly 0.

In [12]:
# Build the "hard" mask: True for molecules without a triazine substructure.
def _lacks_triazine(smiles):
    """Return True when the parsed molecule has no match for TRIAZINE."""
    molecule = Chem.MolFromSmiles(smiles)
    return not molecule.HasSubstructMatch(TRIAZINE)


hard_mask = raw_test_data['molecule_smiles'].progress_apply(_lacks_triazine)

Applying: 100%|██████████| 1674896/1674896 [02:48<00:00, 9922.96it/s] 


In [13]:
# Keep easy/medium predictions only for triazine molecules and hard
# predictions only for the non-triazine ones.
triazine_mask = ~hard_mask
for column, keep in (
    ('easy_binds', triazine_mask),
    ('medium_binds', triazine_mask),
    ('hard_binds', hard_mask),
):
    raw_test_data[column] = raw_test_data[column] * keep

In [14]:
# The final prediction is the sum of the three masked prediction columns
easy_part = raw_test_data['easy_binds']
medium_part = raw_test_data['medium_binds']
hard_part = raw_test_data['hard_binds']
raw_test_data['binds'] = easy_part + medium_part + hard_part

In [15]:
# Sanity check: count predictions outside [0, 1]; both counts must be zero
final_binds = raw_test_data['binds']
n_above_one = (final_binds > 1).sum()
n_below_zero = (final_binds < 0).sum()
assert n_above_one == 0
assert n_below_zero == 0

In [16]:
# Write the id / binds pair out as the ensemble submission file
submission_columns = ['id', 'binds']
save_data = raw_test_data[submission_columns]

ensemble_path = path / "submissions/emh_ensemble.csv"
save_data.to_csv(ensemble_path, index=False)