In [14]:
import pandas as pd
import polars as pl
import pickle
from tqdm import tqdm

In [15]:
CSV_FILE = '../../data/submissions/xgb_398.csv'

In [16]:
# Read the CSV file
predictions = pd.read_csv(CSV_FILE)

# Print the first 5 rows
print(predictions.head())

          id     binds
0  295246830  0.000247
1  295246831  0.000668
2  295246832  0.000067
3  295246833  0.000099
4  295246834  0.000443


In [17]:
# Read test data
test_data = pl.read_parquet('../../data/raw/test.parquet')
test_data = test_data.to_pandas()
test_data.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name
0,295246830,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,C=Cc1ccc(N)cc1,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,BRD4
1,295246831,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,C=Cc1ccc(N)cc1,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,HSA
2,295246832,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,C=Cc1ccc(N)cc1,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,sEH
3,295246833,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,CC(O)Cn1cnc2c(N)ncnc21,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ncnc3c2...,BRD4
4,295246834,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,CC(O)Cn1cnc2c(N)ncnc21,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ncnc3c2...,HSA


In [18]:
# Get unique values of buildingblock1_smiles
buildingblock1_smiles = test_data['buildingblock1_smiles'].unique()

In [19]:
# Add buildingblock1_smiles column to predictions
predictions['buildingblock1_smiles'] = test_data['buildingblock1_smiles']


predictions.head()

Unnamed: 0,id,binds,buildingblock1_smiles
0,295246830,0.000247,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O
1,295246831,0.000668,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O
2,295246832,6.7e-05,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O
3,295246833,9.9e-05,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O
4,295246834,0.000443,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O


In [20]:
# Extract first building blocks in train
with open("../../data/shrunken/train_dicts/BBs_dict_reverse_1.p", "br") as f:
    blocks = list(pickle.load(f).values())    

blocks[:10]

['C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21',
 'C#CC[C@@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O',
 'C#CC[C@@](C)(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O',
 'C#CC[C@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21',
 'C#CC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O',
 'C=CCC(CC=C)(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O',
 'C=CCC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O',
 'C=CCC[C@@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O',
 'C=CC[C@@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O',
 'C=CC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O']

In [21]:
# Extract building blocks in test
with open("../../data/shrunken/test_dicts/BBs_dict_reverse_1_test.p", "br") as f:
    blocks_test = pickle.load(f)

In [22]:
predictions

Unnamed: 0,id,binds,buildingblock1_smiles
0,295246830,0.000247,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O
1,295246831,0.000668,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O
2,295246832,0.000067,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O
3,295246833,0.000099,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O
4,295246834,0.000443,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O
...,...,...,...
1674891,296921721,0.000064,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...
1674892,296921722,0.000154,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...
1674893,296921723,0.000023,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...
1674894,296921724,0.000508,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...


In [23]:
# Loop through dataframe if buildingblock1_smiles is in blocks, set binds to 0
count = 0
for i in tqdm(range(len(predictions)), desc='Processing'):
    if predictions['buildingblock1_smiles'][i] in blocks:
        predictions.loc[i, 'binds'] = 0
        count += 1

print(f"Number of binds set to 0: {count}")

Processing: 100%|██████████| 1674896/1674896 [00:59<00:00, 28253.56it/s] 

Number of binds set to 0: 1107117





In [24]:
predictions.drop(columns=['buildingblock1_smiles'], inplace=True)

In [25]:
predictions.head()

Unnamed: 0,id,binds
0,295246830,0.000247
1,295246831,0.000668
2,295246832,6.7e-05
3,295246833,9.9e-05
4,295246834,0.000443


In [26]:
# Save the new predictions
predictions.to_csv('../../data/submissions/submission_unknown_only.csv', index=False)