# Common Variables

In [90]:
from pathlib import Path


# Root of the on-disk data and the sub-directory holding the shrunken dataset.
DATA_PATH = Path('../data/')
SHRUNKEN_PATH = DATA_PATH / 'shrunken/'

# Module-level cache slots. Each one is created as None only when it does not
# exist yet, so re-executing this cell keeps any value already stored in it.
globals().setdefault('_x_train_data', None)
globals().setdefault('_x_bb', None)


# Dataset Utils

In [91]:
import pyarrow as pa
import pyarrow.parquet as pq

def get_train_data() -> pq.ParquetFile:
    """Return the parquet handle for the shrunken training set.

    The file at ``SHRUNKEN_PATH / 'train.parquet'`` is opened the first time
    this is called; the resulting handle is stored in the module-level
    ``_x_train_data`` slot and handed back unchanged on every later call.

    Returns:
        The cached ``pq.ParquetFile`` for the training data.
    """
    global _x_train_data

    # Only touch the filesystem while the cache slot is still empty.
    if _x_train_data is None:
        _x_train_data = pq.ParquetFile(SHRUNKEN_PATH / 'train.parquet')
    return _x_train_data

def get_train_building_blocks() -> dict[str, pq.ParquetFile]:
    """Return parquet handles for the three building-block dictionary files.

    The files ``BBs_dict_reverse_1`` .. ``BBs_dict_reverse_3`` under
    ``SHRUNKEN_PATH / 'train_dicts'`` are opened on the first call and kept in
    the module-level ``_x_bb`` slot, so later calls reuse the same handles
    instead of re-opening all three files (the same caching scheme
    ``get_train_data`` uses for ``_x_train_data``).

    Returns:
        A new dict with keys ``"bb1"``, ``"bb2"`` and ``"bb3"`` mapping to the
        cached ``pq.ParquetFile`` handles.
    """
    global _x_bb

    if _x_bb is None:
        dict_dir = SHRUNKEN_PATH / 'train_dicts'
        _x_bb = {
            f"bb{idx}": pq.ParquetFile(dict_dir / f'BBs_dict_reverse_{idx}')
            for idx in (1, 2, 3)
        }

    # Hand out a shallow copy so a caller that adds or removes keys cannot
    # corrupt the cached mapping; the file handles themselves are shared.
    return dict(_x_bb)

# Exploration

In [92]:
from tqdm import tqdm

# Open the training set and report how many row groups it is split into.
data = get_train_data()
print(f"Groups: {data.num_row_groups}")

# Upper bound on the molecules collected per category from one row group.
MAX_MOLS_PER_GROUP = 10

# For every category: one inner list per row group, plus a running total that
# is shown in the progress-bar description.
smiles_bind_brd4: list[list[str]] = []
smiles_bind_brd4_cnt = 0
smiles_bind_hsa: list[list[str]] = []
smiles_bind_hsa_cnt = 0
smiles_bind_seh: list[list[str]] = []
smiles_bind_seh_cnt = 0
smiles_bind_none: list[list[str]] = []
smiles_bind_none_cnt = 0

def update_pbar_desc(pbar: tqdm):
    """Set the progress-bar description to the current per-category totals."""
    desc = f"BRD4: {smiles_bind_brd4_cnt}, HSA: {smiles_bind_hsa_cnt}, SEH: {smiles_bind_seh_cnt}, NONE: {smiles_bind_none_cnt}"
    pbar.set_description(desc)

pbar = tqdm(range(data.num_row_groups))
for group_idx in pbar:
    group = data.read_row_group(group_idx)

    # Columns are addressed by position; look them up once per row group
    # instead of once per row.
    # NOTE(review): presumably column 3 is the molecule SMILES and columns
    # 4/5/6 are the BRD4/HSA/sEH binding flags — confirm against the schema.
    smiles_col, brd4_col, hsa_col, seh_col = group[3], group[4], group[5], group[6]

    # Fresh buckets for this row group, registered in the per-category lists.
    group_brd4: list[str] = []
    group_hsa: list[str] = []
    group_seh: list[str] = []
    group_none: list[str] = []
    smiles_bind_brd4.append(group_brd4)
    smiles_bind_hsa.append(group_hsa)
    smiles_bind_seh.append(group_seh)
    smiles_bind_none.append(group_none)

    for row_idx in range(group.num_rows):
        mol_smiles = smiles_col[row_idx]
        binds_brd4 = brd4_col[row_idx].as_py() == 1
        binds_hsa = hsa_col[row_idx].as_py() == 1
        binds_seh = seh_col[row_idx].as_py() == 1
        binds_none = not (binds_brd4 or binds_hsa or binds_seh)

        # A molecule may bind several targets, so the three "binds" checks are
        # independent rather than an if/elif chain.
        if binds_brd4 and len(group_brd4) < MAX_MOLS_PER_GROUP:
            group_brd4.append(str(mol_smiles))
            smiles_bind_brd4_cnt += 1
            update_pbar_desc(pbar)
        if binds_hsa and len(group_hsa) < MAX_MOLS_PER_GROUP:
            group_hsa.append(str(mol_smiles))
            smiles_bind_hsa_cnt += 1
            update_pbar_desc(pbar)
        if binds_seh and len(group_seh) < MAX_MOLS_PER_GROUP:
            group_seh.append(str(mol_smiles))
            smiles_bind_seh_cnt += 1
            update_pbar_desc(pbar)
        if binds_none and len(group_none) < MAX_MOLS_PER_GROUP:
            group_none.append(str(mol_smiles))
            smiles_bind_none_cnt += 1
            update_pbar_desc(pbar)

        # Stop scanning this row group once every bucket is full.
        if all(
            len(bucket) == MAX_MOLS_PER_GROUP
            for bucket in (group_brd4, group_hsa, group_seh, group_none)
        ):
            break

Groups: 94


BRD4: 940, HSA: 940, SEH: 940, NONE: 940: 100%|██████████| 94/94 [00:21<00:00,  4.38it/s]


In [93]:
from pprint import pprint
from sklearn.utils import shuffle

# How many SMILES strings to show per category, and the seed that makes the
# random selection reproducible across kernel restarts.
SAMPLES_PER_CATEGORY = 3
SAMPLE_SEED = 42


def _flatten_smiles(groups: list) -> list[str]:
    """Flatten per-row-group lists of SMILES strings into one flat list.

    Entries that are already plain strings are kept whole, so re-running this
    cell after the ``smiles_bind_*`` names have been overwritten with flat
    samples does not split each SMILES string into single characters.
    """
    flat: list[str] = []
    for entry in groups:
        if isinstance(entry, str):
            flat.append(entry)
        else:
            flat.extend(entry)
    return flat


def _sample_smiles(groups: list) -> list[str]:
    """Return up to SAMPLES_PER_CATEGORY randomly chosen SMILES strings."""
    pool = _flatten_smiles(groups)
    if not pool:
        # Nothing collected for this category; skip the shuffle entirely.
        return []
    return shuffle(pool, random_state=SAMPLE_SEED)[:SAMPLES_PER_CATEGORY]


# Randomly sample a few molecules from each binding category (pooled across
# all row groups). The names are rebound to the flat samples.
smiles_bind_brd4 = _sample_smiles(smiles_bind_brd4)
smiles_bind_hsa = _sample_smiles(smiles_bind_hsa)
smiles_bind_seh = _sample_smiles(smiles_bind_seh)
smiles_bind_none = _sample_smiles(smiles_bind_none)

for label, sample in (
    ("BRD4", smiles_bind_brd4),
    ("HSA", smiles_bind_hsa),
    ("SEH", smiles_bind_seh),
    ("NONE", smiles_bind_none),
):
    print(label)
    pprint(sample)

BRD4
['CC12CCC(CNc3nc(NCc4ccc(CN5CCCC5=O)cc4)nc(Nc4c(C(=O)N[Dy])ccc5ccccc45)n3)(C1)OC2',
 'Cn1cc(Nc2nc(Nc3ncncc3C#N)nc(N[C@H](CC(=O)N[Dy])c3cccs3)n2)ccc1=O',
 'CC(=O)c1ccc(Nc2nc(NCCS(=O)(=O)Nc3ccccc3)nc(Nc3ccc(C(=O)N[Dy])nc3)n2)c(F)c1']
HSA
['O=C(N[Dy])C1c2ccccc2CN1c1nc(NCCS(=O)(=O)C2CCOCC2)nc(Nc2ncnc3[nH]cnc23)n1',
 'CS(=O)(=O)c1cccc(Nc2nc(Nc3cc(Cl)nc(Cl)c3[N+](=O)[O-])nc(N[C@@H](Cc3cccnc3)C(=O)N[Dy])n2)c1',
 'CCn1cc(Nc2nc(Nc3ccc4c(c3)COC4=O)nc(N[C@@H](CC(=O)N[Dy])Cc3cccs3)n2)c(C)n1']
SEH
['Cc1cc2cc(CNc3nc(NCc4cccnc4OC(F)F)nc(N[C@H](Cc4cn(C)c5ccccc45)C(=O)N[Dy])n3)ccc2[nH]1',
 'CCOC(=O)c1ncccc1Nc1nc(Nc2cccc(-n3cncn3)c2)nc(Nc2nc3cc(C(=O)N[Dy])ccc3[nH]2)n1',
 'COC(=O)c1cncc(Nc2nc(NCC3CCCn4ccnc43)nc(Nc3c(Br)cccc3C(=O)N[Dy])n2)c1']
NONE
['CC(C)(C)OC(=O)n1ncc2cc(Nc3nc(NCc4nnc(-c5ccncc5)[nH]4)nc(N[C@H](Cc4ccc(Cl)cc4)C(=O)N[Dy])n3)ccc21',
 'O=C(N[Dy])c1cc(Nc2nc(NC[C@@H]3C[C@@H]4O[C@H]3[C@H]3C[C@H]34)nc(NCC3(N4CCOCC4)CC3)n2)ccc1[N+](=O)[O-]',
 'C#Cc1cccc(Nc2nc(Nc3nc(C(F)(F)F)c(C(=O)N[Dy])s3)n

In [89]:
# Print how many entries smiles_bind_brd4 currently holds.
# NOTE(review): this cell's execution count (In [89]) is lower than the cells
# above it, so it was run out of order — confirm the expected value on a fresh
# Restart & Run All.
print(len(smiles_bind_brd4))

3
