In [1]:
import pandas as pd
import pubchempy as pcp

from tqdm.notebook import tqdm

In [2]:
binding_data = pd.read_csv('../data/extra_albumin_binding_data.csv', header=None)

In [3]:
binding_data

Unnamed: 0,0,1
0,Glipizide,0
1,Halofenate,0
2,Haloperidol,0
3,Flunitrazepam,0
4,Indoprofen,0
...,...,...
139,Avanafil,0
140,Asenapine,0
141,Acalabrutinib,0
142,Alpelisib,0


In [4]:
binding_data = binding_data.rename(columns={0: 'name', 1: 'nonbinding'})

In [5]:
smiles = []
ligands = tqdm(list(binding_data.name))
broken_names = []
for name in ligands:
    try:
        cs = pcp.get_compounds(name, 'name')
        smiles.append(cs[0].isomeric_smiles)
    except Exception as e:
        broken_names.append(name)
        smiles.append(None)

  0%|          | 0/144 [00:00<?, ?it/s]

In [6]:
binding_data['smiles'] = pd.Series(smiles)

In [7]:
binding_data.isna().sum()

name          0
nonbinding    0
smiles        0
dtype: int64

In [10]:
binding_data

Unnamed: 0,name,nonbinding,smiles
0,Glipizide,0,CC1=CN=C(C=N1)C(=O)NCCC2=CC=C(C=C2)S(=O)(=O)NC...
1,Halofenate,0,CC(=O)NCCOC(=O)C(C1=CC=C(C=C1)Cl)OC2=CC=CC(=C2...
2,Haloperidol,0,C1CN(CCC1(C2=CC=C(C=C2)Cl)O)CCCC(=O)C3=CC=C(C=...
3,Flunitrazepam,0,CN1C(=O)CN=C(C2=C1C=CC(=C2)[N+](=O)[O-])C3=CC=...
4,Indoprofen,0,CC(C1=CC=C(C=C1)N2CC3=CC=CC=C3C2=O)C(=O)O
...,...,...,...
139,Avanafil,0,COC1=C(C=C(C=C1)CNC2=NC(=NC=C2C(=O)NCC3=NC=CC=...
140,Asenapine,0,CN1C[C@@H]2[C@@H](C1)C3=C(C=CC(=C3)Cl)OC4=CC=C...
141,Acalabrutinib,0,CC#CC(=O)N1CCC[C@H]1C2=NC(=C3N2C=CN=C3N)C4=CC=...
142,Alpelisib,0,CC1=C(SC(=N1)NC(=O)N2CCC[C@H]2C(=O)N)C3=CC(=NC...


In [12]:
binding_data['active'] = binding_data['nonbinding'] == 0

In [14]:
binding_data.active = [int(i) for i in binding_data.active]

In [15]:
binding_data = binding_data[['smiles', 'active']]

In [16]:
binding_data

Unnamed: 0,smiles,active
0,CC1=CN=C(C=N1)C(=O)NCCC2=CC=C(C=C2)S(=O)(=O)NC...,1
1,CC(=O)NCCOC(=O)C(C1=CC=C(C=C1)Cl)OC2=CC=CC(=C2...,1
2,C1CN(CCC1(C2=CC=C(C=C2)Cl)O)CCCC(=O)C3=CC=C(C=...,1
3,CN1C(=O)CN=C(C2=C1C=CC(=C2)[N+](=O)[O-])C3=CC=...,1
4,CC(C1=CC=C(C=C1)N2CC3=CC=CC=C3C2=O)C(=O)O,1
...,...,...
139,COC1=C(C=C(C=C1)CNC2=NC(=NC=C2C(=O)NCC3=NC=CC=...,1
140,CN1C[C@@H]2[C@@H](C1)C3=C(C=CC(=C3)Cl)OC4=CC=C...,1
141,CC#CC(=O)N1CCC[C@H]1C2=NC(=C3N2C=CN=C3N)C4=CC=...,1
142,CC1=C(SC(=N1)NC(=O)N2CCC[C@H]2C(=O)N)C3=CC(=NC...,1


In [17]:
binding_data.to_csv('../data/extra_albumin_binding_data_preprocessed.csv', index=False)

## Merging datasets

In [18]:
baseline_dataset = pd.read_csv('../data/lexa_with_broken_fixed.csv')

In [19]:
baseline_dataset

Unnamed: 0,smiles,active
0,CC(=O)C1=CC2=CC=CC=C2OC1=O,1
1,C1COC2=CC=CC=C2C1O,1
2,C1=CC=C2C(=C1)C(=CC(=O)O2)O,1
3,C1=CC(=CC=C1C2=COC3=C(C2=O)C=CC(=C3O)O)O,1
4,C1CC1NC2=C3C(=NC(=N2)N)N(C=N3)[C@@H]4C[C@@H](C...,1
...,...,...
337,CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl,1
338,CC(=O)CC(C1=CC=CC=C1)C2=C(C3=CC=CC=C3OC2=O)O,1
339,CC(C1=CC=C(C=C1)C(=O)C2=CC=CS2)C(=O)OC,1
340,C1=CC=C2C(=C1)C(=CC=N2)CO,0


In [22]:
concat_data = pd.concat([baseline_dataset, binding_data]).reset_index()

In [23]:
concat_data

Unnamed: 0,index,smiles,active
0,0,CC(=O)C1=CC2=CC=CC=C2OC1=O,1
1,1,C1COC2=CC=CC=C2C1O,1
2,2,C1=CC=C2C(=C1)C(=CC(=O)O2)O,1
3,3,C1=CC(=CC=C1C2=COC3=C(C2=O)C=CC(=C3O)O)O,1
4,4,C1CC1NC2=C3C(=NC(=N2)N)N(C=N3)[C@@H]4C[C@@H](C...,1
...,...,...,...
481,139,COC1=C(C=C(C=C1)CNC2=NC(=NC=C2C(=O)NCC3=NC=CC=...,1
482,140,CN1C[C@@H]2[C@@H](C1)C3=C(C=CC(=C3)Cl)OC4=CC=C...,1
483,141,CC#CC(=O)N1CCC[C@H]1C2=NC(=C3N2C=CN=C3N)C4=CC=...,1
484,142,CC1=C(SC(=N1)NC(=O)N2CCC[C@H]2C(=O)N)C3=CC(=NC...,1


In [24]:
concat_data.active.value_counts()

1    374
0    112
Name: active, dtype: int64

In [25]:
concat_data.to_csv('../data/merged_final_dataset.csv', index=False)