In [15]:
import numpy as np
import pandas as pd 
import pickle
from pathlib import Path
import polars as pl

In [16]:
# Compact dtypes keyed by column name: the three building-block columns map to
# np.int16 and the three binding-label columns map to np.byte.
# NOTE(review): `dtypes` is not referenced by any other cell visible here.
dtypes = {
    **dict.fromkeys(
        ('buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles'),
        np.int16,
    ),
    **dict.fromkeys(('binds_BRD4', 'binds_HSA', 'binds_sEH'), np.byte),
}

# Folder that holds the parquet files and pickled lookups read further down.
directory = Path("../data/shrunken/")

## What is in the train data?

In [None]:
# Read the training parquet with polars, then convert it to a pandas frame
# backed by pyarrow extension arrays.
train_data = (
    pl.read_parquet(directory / "train.parquet")
    .to_pandas(use_pyarrow_extension_array=True)
)

In [None]:
# Load building blocks
# One pickled lookup per building-block position; later cells use len() and
# .values() on them.
# Each file is opened in a `with` block so its handle is closed as soon as the
# object has been read, instead of being left open for the kernel's lifetime.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file -
# only load lookups that come from a trusted source.
with open(directory / 'train_dicts/BBs_dict_reverse_1.p', 'br') as pickle_file:
    BBs_dict_reverse_1 = pickle.load(pickle_file)
with open(directory / 'train_dicts/BBs_dict_reverse_2.p', 'br') as pickle_file:
    BBs_dict_reverse_2 = pickle.load(pickle_file)
with open(directory / 'train_dicts/BBs_dict_reverse_3.p', 'br') as pickle_file:
    BBs_dict_reverse_3 = pickle.load(pickle_file)

In [19]:
# Preview the first 10 rows of the training frame.
train_data.head(10)

Unnamed: 0,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,binds_BRD4,binds_HSA,binds_sEH
0,0,0,0,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,0,0,0
1,0,0,1,C#CCOc1ccc(CNc2nc(NCc3cccc(Br)n3)nc(N[C@@H](CC...,0,0,0
2,0,0,2,C#CCOc1ccc(CNc2nc(NCc3ccc(OCC#C)cc3)nc(N[C@@H]...,0,0,0
3,0,0,6,C#CCOc1ccc(CNc2nc(NCCNC(=O)C(=C)C)nc(N[C@@H](C...,0,0,0
4,0,0,10,C#CCOc1ccc(CNc2nc(NCC(=O)NCC=C)nc(N[C@@H](CC#C...,0,0,0
5,0,0,11,C#CCOc1ccc(CNc2nc(NCC(C)OCC=C)nc(N[C@@H](CC#C)...,0,0,0
6,0,0,12,C#CCOc1ccc(CNc2nc(NCCCOCC=C)nc(N[C@@H](CC#C)CC...,0,0,0
7,0,0,13,C#CCOc1ccc(CNc2nc(NCCOCC=C)nc(N[C@@H](CC#C)CC(...,0,0,0
8,0,0,21,C#CCOc1ccc(CNc2nc(Nc3cnn(C4CCN(C(=O)OC(C)(C)C)...,0,0,0
9,0,0,23,C#CCOc1ccc(CNc2nc(Nc3n[nH]c4c3CN(C(=O)OC(C)(C)...,0,0,0


### Counts and unique values

In [23]:
# Row count of the training frame, followed by the size of each lookup
# (the "unique" counts are the lengths of the lookups, not computed from the frame).
print(f"Length of train data: {len(train_data)}")
train_lookups = (BBs_dict_reverse_1, BBs_dict_reverse_2, BBs_dict_reverse_3)
for position, lookup in enumerate(train_lookups, start=1):
    print(f"Number of unique building blocks {position}: {len(lookup)}")

Length of train data: 98415610
Number of unique building blocks 1: 271
Number of unique building blocks 2: 693
Number of unique building blocks 3: 872


In [25]:
# Frequency of each value in the three binding-label columns of the training frame.
for label_column in ('binds_BRD4', 'binds_HSA', 'binds_sEH'):
    label_counts = train_data[label_column].value_counts()
    print(f"Value counts of {label_column}: {label_counts}")

Value counts of binds_BRD4: binds_BRD4
0    97958646
1      456964
Name: count, dtype: int64[pyarrow]
Value counts of binds_HSA: binds_HSA
0    98007200
1      408410
Name: count, dtype: int64[pyarrow]
Value counts of binds_sEH: binds_sEH
0    97691078
1      724532
Name: count, dtype: int64[pyarrow]


In [28]:
# How often each building-block-1 code occurs in the training frame.
train_bb1_counts = train_data['buildingblock1_smiles'].value_counts()
print(f"Value counts of buildingblock1_smiles: {train_bb1_counts}")

Value counts of buildingblock1_smiles: buildingblock1_smiles
120    363469
266    363460
101    363456
81     363454
215    363443
        ...  
218    362487
76     362476
75     362464
96     362458
73     362325
Name: count, Length: 271, dtype: int64[pyarrow]


In [29]:
# How often each building-block-2 code occurs in the training frame.
train_bb2_counts = train_data['buildingblock2_smiles'].value_counts()
print(f"Value counts of buildingblock2_smiles: {train_bb2_counts}")

Value counts of buildingblock2_smiles: buildingblock2_smiles
524    235364
421    234792
510    234610
48     234440
632    234067
        ...  
91      49734
180     49388
653     49121
415     48917
438     48904
Name: count, Length: 693, dtype: int64[pyarrow]


In [30]:
# How often each building-block-3 code occurs in the training frame.
train_bb3_counts = train_data['buildingblock3_smiles'].value_counts()
print(f"Value counts of buildingblock3_smiles: {train_bb3_counts}")

Value counts of buildingblock3_smiles: buildingblock3_smiles
613    187397
216    187388
656    187366
87     187341
836    187335
        ...  
618      1354
809       995
500       792
57        542
640       266
Name: count, Length: 872, dtype: int64[pyarrow]


In [46]:
# Distinct values stored in each training lookup, one set per position.
train_BB1, train_BB2, train_BB3 = (
    set(lookup.values())
    for lookup in (BBs_dict_reverse_1, BBs_dict_reverse_2, BBs_dict_reverse_3)
)

# Position pairs, listed in the order they are reported.
train_sets_by_position = {1: train_BB1, 2: train_BB2, 3: train_BB3}
train_position_pairs = ((1, 2), (1, 3), (2, 3))

# Number of values shared by both positions of each pair.
for first, second in train_position_pairs:
    shared = train_sets_by_position[first] & train_sets_by_position[second]
    print(f"Overlap between building blocks {first} and {second}: {len(shared)}")

# One-directional difference: values of the first position absent from the second.
for first, second in train_position_pairs:
    only_in_first = train_sets_by_position[first] - train_sets_by_position[second]
    print(f"Difference between building blocks {first} and {second}: {len(only_in_first)}")

Overlap between building blocks 1 and 2: 0
Overlap between building blocks 1 and 3: 0
Overlap between building blocks 2 and 3: 691
Difference between building blocks 1 and 2: 271
Difference between building blocks 1 and 3: 271
Difference between building blocks 2 and 3: 2


In [47]:
# Display the set of values taken from BBs_dict_reverse_1.
# NOTE(review): this renders every element of the set; consider showing only a
# small slice (e.g. a few sorted entries) to keep the notebook output short.
train_BB1

{'C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21',
 'C#CC[C@@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O',
 'C#CC[C@@](C)(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O',
 'C#CC[C@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21',
 'C#CC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O',
 'C=CCC(CC=C)(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O',
 'C=CCC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O',
 'C=CCC[C@@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O',
 'C=CC[C@@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O',
 'C=CC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O',
 'CC(=O)c1ccc(C[C@H](NC(=O)OCC2c3ccccc3-c3ccccc32)C(=O)O)cc1',
 'CC(C)(C)OC(=O)CC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O',
 'CC(C)(C)OC(=O)CCC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O',
 'CC(C)(C)OC(=O)N1CCN(C(=O)OCC2c3ccccc3-c3ccccc32)C1C(=O)O',
 'CC(C)(C)OC(=O)N1C[C@@H](NC(=O)OCC2c3ccccc3-c3ccccc32)[C@H](C(=O)O)C1',
 'CC(C)(C)OCC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O',
 'CC(C)CC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O',
 'CC(OC(C)(C)C)C(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O',
 'CCC(C)C(NC(=O)OCC1c2ccc

## What is in the test data?

In [31]:
# Read the test parquet with polars, then convert it to a pandas frame backed
# by pyarrow extension arrays.
test_data = (
    pl.read_parquet(directory / "test.parquet")
    .to_pandas(use_pyarrow_extension_array=True)
)

In [33]:
# Load the pickled test-set lookups, one per building-block position; later
# cells use len() and .values() on them.
# Each file is opened in a `with` block so its handle is closed as soon as the
# object has been read, instead of being left open for the kernel's lifetime.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file -
# only load lookups that come from a trusted source.
with open(directory / 'test_dicts/BBs_dict_reverse_1_test.p', 'br') as pickle_file:
    test_BBS_dict_reverse_1 = pickle.load(pickle_file)
with open(directory / 'test_dicts/BBs_dict_reverse_2_test.p', 'br') as pickle_file:
    test_BBS_dict_reverse_2 = pickle.load(pickle_file)
with open(directory / 'test_dicts/BBs_dict_reverse_3_test.p', 'br') as pickle_file:
    test_BBS_dict_reverse_3 = pickle.load(pickle_file)

In [34]:
# Preview the first 10 rows of the test frame.
test_data.head(10)

Unnamed: 0,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,is_BRD4,is_HSA,is_sEH
0,0,17,17,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,True,True,True
1,0,17,87,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ncnc3c2...,True,True,True
2,0,17,99,C#CCCC[C@H](Nc1nc(NCC2(O)CCCC2(C)C)nc(Nc2ccc(C...,True,True,True
3,0,17,244,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2sc(Cl)c...,True,True,True
4,0,17,394,C#CCCC[C@H](Nc1nc(NCC2CCC(SC)CC2)nc(Nc2ccc(C=C...,True,True,True
5,0,17,499,C#CCCC[C@H](Nc1nc(NCc2ccc(C)cc2N2CCCC2)nc(Nc2c...,True,True,True
6,0,17,518,C#CCCC[C@H](Nc1nc(NCCc2ccc(OCC(=O)Nc3cccc(C)c3...,True,True,True
7,0,17,536,C#CCCC[C@H](Nc1nc(NCCCNC(=O)c2occc2C)nc(Nc2ccc...,True,True,True
8,0,17,651,C#CCCC[C@H](Nc1nc(NCc2nnc(N3CCCC3)o2)nc(Nc2ccc...,True,True,True
9,0,17,731,C#CCCC[C@H](Nc1nc(NCCc2csc3ccccc23)nc(Nc2ccc(C...,True,True,True


### Counts and unique values

In [35]:
# Row count of the test frame, followed by the size of each test lookup
# (the "unique" counts are the lengths of the lookups, not computed from the frame).
print(f"Length of test data: {len(test_data)}")
test_lookups = (test_BBS_dict_reverse_1, test_BBS_dict_reverse_2, test_BBS_dict_reverse_3)
for position, lookup in enumerate(test_lookups, start=1):
    print(f"Number of unique building blocks {position}: {len(lookup)}")

Length of test data: 878022
Number of unique building blocks 1: 341
Number of unique building blocks 2: 1140
Number of unique building blocks 3: 1389


In [36]:
# Frequency of each value in the is_BRD4 column of the test frame.
is_brd4_counts = test_data['is_BRD4'].value_counts()
print(f"Value counts of is_BRD4: {is_brd4_counts}")

Value counts of is_BRD4: is_BRD4
True     558859
False    319163
Name: count, dtype: int64[pyarrow]


In [37]:
# Frequency of each value in the is_HSA column of the test frame.
is_hsa_counts = test_data['is_HSA'].value_counts()
print(f"Value counts of is_HSA: {is_hsa_counts}")

Value counts of is_HSA: is_HSA
True     557895
False    320127
Name: count, dtype: int64[pyarrow]


In [38]:
# Frequency of each value in the is_sEH column of the test frame.
is_seh_counts = test_data['is_sEH'].value_counts()
print(f"Value counts of is_sEH: {is_seh_counts}")

Value counts of is_sEH: is_sEH
True     558142
False    319880
Name: count, dtype: int64[pyarrow]


In [39]:
# How often each building-block-1 code occurs in the test frame.
test_bb1_counts = test_data['buildingblock1_smiles'].value_counts()
print(f"Value counts of buildingblock1_smiles: {test_bb1_counts}")

Value counts of buildingblock1_smiles: buildingblock1_smiles
136    13708
106    13665
316    13650
16     13637
339    13631
       ...  
212      663
229      663
265      663
292      663
309      663
Name: count, Length: 341, dtype: int64[pyarrow]


In [40]:
# How often each building-block-2 code occurs in the test frame.
test_bb2_counts = test_data['buildingblock2_smiles'].value_counts()
print(f"Value counts of buildingblock2_smiles: {test_bb2_counts}")

Value counts of buildingblock2_smiles: buildingblock2_smiles
1025    4879
991     3387
264     3359
1019    2677
952     2434
        ... 
321       68
80        51
515       51
719       34
277       17
Name: count, Length: 1140, dtype: int64[pyarrow]


In [41]:
# How often each building-block-3 code occurs in the test frame.
test_bb3_counts = test_data['buildingblock3_smiles'].value_counts()
print(f"Value counts of buildingblock3_smiles: {test_bb3_counts}")

Value counts of buildingblock3_smiles: buildingblock3_smiles
621     3441
576     2827
882     2787
1046    2238
1135    2142
        ... 
935        5
693        3
658        2
1085       1
913        1
Name: count, Length: 1389, dtype: int64[pyarrow]


In [52]:
# Distinct values stored in each test lookup, one set per position.
test_BB1, test_BB2, test_BB3 = (
    set(lookup.values())
    for lookup in (test_BBS_dict_reverse_1, test_BBS_dict_reverse_2, test_BBS_dict_reverse_3)
)

# Number of values shared by both positions of each pair, in reporting order.
test_sets_by_position = {1: test_BB1, 2: test_BB2, 3: test_BB3}
for first, second in ((1, 2), (1, 3), (2, 3)):
    shared = test_sets_by_position[first] & test_sets_by_position[second]
    print(f"Overlap between building blocks {first} and {second}: {len(shared)}")

Overlap between building blocks 1 and 2: 0
Overlap between building blocks 1 and 3: 0
Overlap between building blocks 2 and 3: 760


## Overlap between train set and test set

In [53]:
# Check for any overlap between the training and test set
# Every train position is compared with every test position (3 x 3 pairs),
# train position varying slowest.
train_sets = (train_BB1, train_BB2, train_BB3)
test_sets = (test_BB1, test_BB2, test_BB3)
for train_pos, train_set in enumerate(train_sets, start=1):
    for test_pos, test_set in enumerate(test_sets, start=1):
        shared = train_set & test_set
        print(f"Overlap between train BB{train_pos} and test BB{test_pos}: {len(shared)}")

Overlap between train BB1 and test BB1: 271
Overlap between train BB1 and test BB2: 0
Overlap between train BB1 and test BB3: 0
Overlap between train BB2 and test BB1: 0
Overlap between train BB2 and test BB2: 693
Overlap between train BB2 and test BB3: 690
Overlap between train BB3 and test BB1: 0
Overlap between train BB3 and test BB2: 691
Overlap between train BB3 and test BB3: 871


In [56]:
# Overlap in molecule_smiles
# Number of distinct molecule_smiles values present in both frames.
# Only the much smaller test column is materialised as a Python set; the
# training column is streamed through set.intersection(), which accepts any
# iterable. The count is identical to intersecting two sets, but avoids first
# building a set out of every training molecule.
test_molecules = set(test_data['molecule_smiles'])
len(test_molecules.intersection(train_data['molecule_smiles']))

0