In [1]:
import pandas as pd
import pubchempy as pcp

from tqdm.notebook import tqdm

In [2]:
# Load the raw binding table; the path is resolved relative to the working directory.
binding_data = pd.read_csv(filepath_or_buffer='../data/binding_data.csv')

In [3]:
# Display the raw table as loaded from binding_data.csv.
binding_data

Unnamed: 0,ligand,Binder/\nNonbinder,Average HSA score,SD HSA score
0,2-(R)-phenylproionamides 1,0,99.80,0.00
1,2-(R)-phenylproionamides 10,0,70.10,0.14
2,2-(R)-phenylproionamides 2 (Reparixin),0,99.90,0.00
3,2-(R)-phenylproionamides 3,0,91.98,0.03
4,2-(R)-phenylproionamides 4,0,70.00,0.01
...,...,...,...,...
427,L-tryptophan,2,73.37,39.66
428,neostigmine,2,62.30,28.17
429,salicylic acid,2,66.75,34.29
430,sulfaphenazole,2,68.66,43.15


In [4]:
# Keep only the class label and the ligand name; the score columns are not used below.
binding_data_names = binding_data.loc[:, ['Binder/\nNonbinder', 'ligand']]

In [5]:
# Replace the label header (which contains an embedded newline) with the short name `binder`.
binding_data_names = binding_data_names.rename({'Binder/\nNonbinder': 'binder'}, axis='columns')

In [6]:
# Preview the first rows of the label/name table.
binding_data_names.head()

Unnamed: 0,binder,ligand
0,0,2-(R)-phenylproionamides 1
1,0,2-(R)-phenylproionamides 10
2,0,2-(R)-phenylproionamides 2 (Reparixin)
3,0,2-(R)-phenylproionamides 3
4,0,2-(R)-phenylproionamides 4


In [91]:
# Resolve every ligand name to an isomeric SMILES string through PubChem.
#
# Results:
#   smiles        - one entry per ligand, in the same order as
#                   `binding_data_names.ligand`; None where no SMILES was obtained.
#   broken_names  - the ligand names for which the lookup failed.
#   lookup_errors - name -> reason for the failure, so that "PubChem has no such
#                   name" can be told apart from other errors raised by the lookup
#                   (which may be worth retrying).
smiles = []
broken_names = []
lookup_errors = {}
ligands = tqdm(list(binding_data_names.ligand))
for name in ligands:
    try:
        cs = pcp.get_compounds(name, 'name')
        # An empty result means the name is unknown; handle it explicitly
        # instead of relying on an IndexError from cs[0].
        smi = cs[0].isomeric_smiles if cs else None
        failure = None if cs else 'no PubChem match'
    except Exception as e:
        # Best-effort on purpose: one failing lookup must not abort the whole
        # run, but the reason is kept rather than silently discarded.
        smi, failure = None, repr(e)
    if failure is not None:
        lookup_errors[name] = failure
        broken_names.append(name)
    smiles.append(smi)

  0%|          | 0/432 [00:00<?, ?it/s]

In [92]:
# Turn the list of unresolved ligand names into a one-column table and save it.
# `broken_names` is rebound from a list to a DataFrame here; the isinstance guard
# keeps the cell safe to re-run, which would otherwise pass the already-built
# DataFrame back into the constructor instead of the list of names.
if not isinstance(broken_names, pd.DataFrame):
    broken_names = pd.DataFrame({'name': broken_names})
broken_names.to_csv('../data/lexa_broken_names')

In [93]:
# Show the ligand names for which no SMILES was obtained.
broken_names

Unnamed: 0,name
0,2-(R)-phenylproionamides 1
1,2-(R)-phenylproionamides 10
2,2-(R)-phenylproionamides 2 (Reparixin)
3,2-(R)-phenylproionamides 3
4,2-(R)-phenylproionamides 4
...,...
113,"R-,S-Warfarin"
114,Suprofen ester
115,4-hydroxylmethyl-quinolone
116,Moxisylate


In [15]:
# Attach the looked-up SMILES by position. `smiles` was built in the row order of
# `binding_data_names`; assigning the plain list is length-checked by pandas,
# whereas wrapping it in a Series would align on index labels and silently
# produce NaN (or drop values) if the two ever got out of sync.
binding_data_names['smiles'] = smiles

In [18]:
# Count missing values per column; `smiles` is None wherever the PubChem lookup failed.
binding_data_names.isna().sum()

binder      0
ligand      0
smiles    118
dtype: int64

## NaNs among nonbinders

In [78]:
# Summary statistics of the numeric columns before any rows are dropped.
binding_data_names.describe()

Unnamed: 0,binder
count,432.0
mean,0.229167
std,0.477602
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,2.0


In [83]:
# Number of rows per `binder` label before any filtering.
binding_data_names.binder.value_counts()

0    344
1     77
2     11
Name: binder, dtype: int64

## Data without NA smiles

In [86]:
# Drop every row that has a missing value in any column (how='any' is the default).
binding_data_wo_na = binding_data_names.dropna(axis='index', how='any')

In [87]:
# Keep only rows whose label is not 2.
# NOTE(review): the meaning of label 2 is not documented in this notebook —
# presumably an ambiguous/inconclusive class; confirm against the data source.
# The explicit .copy() makes the result an independent frame, so adding the
# `active` column later does not trigger SettingWithCopyWarning.
binding_data_wo_na = binding_data_wo_na[binding_data_wo_na.binder != 2].copy()

In [88]:
# Number of rows per `binder` label after dropping missing values and label 2.
binding_data_wo_na.binder.value_counts()

0    229
1     75
Name: binder, dtype: int64

In [89]:
# Summary statistics of the numeric columns of the filtered table.
binding_data_wo_na.describe()

Unnamed: 0,binder
count,304.0
mean,0.246711
std,0.431808
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [90]:
# Display the filtered table (label, ligand name, SMILES).
binding_data_wo_na

Unnamed: 0,binder,ligand,smiles
11,0,3-acetylcoumarin,CC(=O)C1=CC2=CC=CC=C2OC1=O
13,0,4-chromanol,C1COC2=CC=CC=C2C1O
14,0,4-hydroxycoumarin,C1=CC=C2C(=C1)C(=CC(=O)O2)O
23,0,8-hydroxydaidzein,C1=CC(=CC=C1C2=COC3=C(C2=O)C=CC(=C3O)O)O
24,0,Abacavir,C1CC1NC2=C3C(=NC(=N2)N)N(C=N3)[C@@H]4C[C@@H](C...
...,...,...,...
416,1,Tranexamic acid,C1CC(CCC1CN)C(=O)O
417,1,Venlafaxine,CN(C)CC(C1=CC=C(C=C1)OC)C2(CCCCC2)O
418,1,Zalcitabine,C1C[C@@H](O[C@@H]1CO)N2C=CC(=NC2=O)N
419,1,Zanamivir,CC(=O)N[C@@H]1[C@H](C=C(O[C@H]1[C@@H]([C@@H](C...


In [54]:
# Truthiness of each `binder` label: 0 -> False, anything else -> True.
binding = list(map(bool, binding_data_wo_na.binder))

In [57]:
# Inverted label: True where `binder` is 0, False otherwise.
# Computed from the `binder` column rather than from `binding` itself, so that
# re-running this cell cannot flip the labels back to their un-inverted state.
# NOTE(review): presumably 0 in the source column encodes the class that should
# count as "active" — confirm against the data source.
binding = [not bool(x) for x in binding_data_wo_na.binder]

In [59]:
# Convert the boolean flags to 0/1 integers.
binding = list(map(int, binding))

In [67]:
# Wrap the 0/1 labels in a Series; no index is passed, so pandas assigns a default one.
binding = pd.Series(data=binding)

0      1
1      1
2      1
3      1
4      1
      ..
299    0
300    0
301    0
302    0
303    0
Length: 304, dtype: int64

In [68]:
# Attach the labels by position, not by index label.
# `binding` was built row by row from `binding_data_wo_na` and carries a fresh
# 0..n-1 index, while `binding_data_wo_na` still has the original row labels left
# over after dropna/filtering. Assigning the Series directly would align on those
# labels and give rows the wrong value or NaN. Passing a plain list assigns in
# row order and makes pandas raise if the lengths differ.
binding_data_wo_na['active'] = list(binding)

In [69]:
# Display the table with the new `active` column for a visual check against `binder`.
binding_data_wo_na

Unnamed: 0,binder,ligand,smiles,active
11,1.0,3-acetylcoumarin,CC(=O)C1=CC2=CC=CC=C2OC1=O,1
13,1.0,4-chromanol,C1COC2=CC=CC=C2C1O,1
14,1.0,4-hydroxycoumarin,C1=CC=C2C(=C1)C(=CC(=O)O2)O,1
23,1.0,8-hydroxydaidzein,C1=CC(=CC=C1C2=COC3=C(C2=O)C=CC(=C3O)O)O,1
24,1.0,Abacavir,C1CC1NC2=C3C(=NC(=N2)N)N(C=N3)[C@@H]4C[C@@H](C...,1
...,...,...,...,...
416,,Tranexamic acid,C1CC(CCC1CN)C(=O)O,0
417,,Venlafaxine,CN(C)CC(C1=CC=C(C=C1)OC)C2(CCCCC2)O,0
418,,Zalcitabine,C1C[C@@H](O[C@@H]1CO)N2C=CC(=NC2=O)N,0
419,,Zanamivir,CC(=O)N[C@@H]1[C@H](C=C(O[C@H]1[C@@H]([C@@H](C...,0


In [70]:
# Reduce the table to the two columns that are exported: structure and label.
binding_data_wo_na = binding_data_wo_na.loc[:, ['smiles', 'active']]

In [71]:
# Display the final two-column table before it is written to disk.
binding_data_wo_na

Unnamed: 0,smiles,active
11,CC(=O)C1=CC2=CC=CC=C2OC1=O,1
13,C1COC2=CC=CC=C2C1O,1
14,C1=CC=C2C(=C1)C(=CC(=O)O2)O,1
23,C1=CC(=CC=C1C2=COC3=C(C2=O)C=CC(=C3O)O)O,1
24,C1CC1NC2=C3C(=NC(=N2)N)N(C=N3)[C@@H]4C[C@@H](C...,1
...,...,...
416,C1CC(CCC1CN)C(=O)O,0
417,CN(C)CC(C1=CC=C(C=C1)OC)C2(CCCCC2)O,0
418,C1C[C@@H](O[C@@H]1CO)N2C=CC(=NC2=O)N,0
419,CC(=O)N[C@@H]1[C@H](C=C(O[C@H]1[C@@H]([C@@H](C...,0


In [72]:
# Write the preprocessed table to CSV; index=False leaves the row labels out of the file.
binding_data_wo_na.to_csv(path_or_buf='../data/lexa_preprocessed.csv', index=False)