# Goal of phase 1:
Build descriptors for a set of compounds retrieved from the Lazar web service (the same compounds can alternatively be loaded from `compounds.csv`).

## Import libraries needed

In [11]:
import importlib
import subprocess
import sys


def pip(args):
    """Run pip with the given arguments inside the kernel's own interpreter.

    This replaces the former ``from pip._internal import main as pip``:
    ``pip._internal`` is a private API and pip does not support being called
    in-process. Running ``<kernel python> -m pip ...`` as a subprocess is the
    supported route and installs into the environment this notebook runs in.

    Args:
        args: list of pip command-line arguments, e.g. ``['install', 'requests']``.

    Returns:
        The exit code of the pip process (0 on success).
    """
    exit_code = subprocess.call([sys.executable, '-m', 'pip'] + list(args))
    # Make packages installed a moment ago visible to the import system.
    importlib.invalidate_caches()
    return exit_code


try:
    import requests
except ImportError:
    pip(['install', 'requests'])
    import requests

try:
    import pandas as pd
except ImportError:
    pip(['install', 'pandas'])
    import pandas as pd

try:
    from rdkit import Chem
except ImportError as exc:
    # Chem is required further down to parse SMILES; stop here with the
    # installation hint instead of continuing and hitting a NameError later.
    raise ImportError("Run the following from command line:\n\tconda install -c conda-forge rdkit") from exc

try:
    from mordred import Calculator, descriptors
except ImportError:
    pip(['install', 'mordred'])
    from mordred import Calculator, descriptors

# urllib ships with the Python standard library, so no install fallback is
# needed (and the urllib3 package would not provide a module named urllib).
import urllib

from IPython.display import display, SVG, HTML, Image

## Communicate with Lazar to obtain the dataset

In [13]:

# Ask the Lazar service for the list of available endpoints.
url = 'https://lazar.prod.openrisknet.org/endpoint'
headers = {'accept': 'application/json',
           'Content-Type': 'application/x-www-form-urlencoded'}

r1 = requests.get(url, headers=headers)

print("LAZAR Status code GET endpoints: {0}".format(r1.status_code))
if r1.status_code != 200:
    # Without a successful reply `endpoints` would stay undefined and the next
    # cell would fail with an unrelated NameError; report the real cause here.
    raise RuntimeError(
        "Could not retrieve the LAZAR endpoint list from {0} (HTTP {1})".format(url, r1.status_code))

# Decoded JSON body; the next cell iterates over it and appends each item to a URL.
endpoints = r1.json()

LAZAR Status code GET endpoints: 200


In [14]:
# Accumulator for [endpoint, species, url] rows; later turned into a DataFrame.
data = []

def extractSpecies(speciesList):
    """Flatten an iterable of mappings into ``(key, value)`` pairs.

    Args:
        speciesList: iterable of dict-like objects; each one is walked with
            ``.items()``.

    Yields:
        One ``(species, url)`` tuple per entry of every mapping, in the order
        the mappings and their items are encountered.
    """
    for entry in speciesList:
        yield from entry.items()
            
# Query every endpoint and record one row per (species, model URL) pair it reports.
for endpoint in endpoints:
    response = requests.get('https://lazar.prod.openrisknet.org/endpoint/' + endpoint,
                            headers=headers)

    if response.status_code != 200:
        # Endpoints that do not answer with HTTP 200 contribute no rows.
        continue

    data.extend([endpoint, species_name, model_link]
                for species_name, model_link in extractSpecies(response.json()))

# One row per available model: which endpoint, which species, and where to fetch it.
dfLAZAR = pd.DataFrame(data, columns=["Endpoint", "Species", "URL"])

dfLAZAR
# Row of dfLAZAR whose model is used; 0 selects the first model listed.
selector = 0
model_url = dfLAZAR.loc[selector, 'URL']
headers = {'accept': 'application/json',
           'Content-Type': 'application/x-www-form-urlencoded'}

r1 = requests.get(model_url, headers=headers)

print("LAZAR Status code GET model details: {0}".format(r1.status_code))
if r1.status_code != 200:
    # `model` is needed unconditionally below; fail with the real cause
    # instead of a NameError on the next statement.
    raise RuntimeError(
        "Could not retrieve LAZAR model details from {0} (HTTP {1})".format(model_url, r1.status_code))
model = r1.json()

# The training dataset is requested as CSV text rather than JSON.
headers = {'accept': 'text/csv'}

training_dataset = 'https://lazar.prod.openrisknet.org/dataset/' + model['training_dataset']
r3 = requests.get(training_dataset, headers=headers)

if r3.status_code != 200:
    # `csv` is parsed by a later cell; do not leave it undefined on failure.
    raise RuntimeError(
        "Could not retrieve the LAZAR training dataset from {0} (HTTP {1})".format(training_dataset, r3.status_code))

# Raw CSV text of the training dataset, parsed into a DataFrame later on.
csv = r3.text

LAZAR Status code GET model details: 200


#### The dataset has been retrieved in CSV format; convert it into a pandas DataFrame

In [15]:

from io import StringIO

# Show cell contents in full (long SMILES strings are otherwise truncated).
# None means "no limit"; the older spelling -1 was deprecated in pandas 1.0
# and is rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

# Wrap the downloaded CSV text in a file-like object so read_csv can parse it.
data = StringIO(csv)
df = pd.read_csv(data, index_col=False)
df

Unnamed: 0,SMILES,Blood-Brain-Barrier Penetration
0,OC[C@](c1onc(n1)c1ncn2-c3cccc(c3C(=O)N(Cc12)C)Cl)(O)C,non-penetrating
1,NCCc1nc2n(c1)cccc2,non-penetrating
2,NCCc1nc2n(c1)cccc2,non-penetrating
3,CCCN(CCC)CCc1ccc(c2c1CC(=C)N2)O,penetrating
4,Fc1ccc2c(c1)onc2C1CCN(CC1)CCc1c(C)nc2n(c1=O)CCC[C@H]2O,penetrating
...,...,...
410,CSCc1cnc(c(c1CNCC)O)C,penetrating
411,O=C(C1=C(O)c2sccc2S(=O)(=O)N1C)Nc1ccccn1,non-penetrating
412,O[C@@H](c1ccc(cc1)C(C)(C)C)CCCN1CCC(CC1)C(c1ccccc1)(c1ccccc1)O,non-penetrating
413,O=C1CC[C@]2(C(=C1)CC[C@@H]1[C@@H]2CC[C@]2([C@H]1CC[C@@H]2O)C)C,penetrating


##### Alternatively, the set of compounds can be loaded from a CSV file (remove every `#` in the next cell to enable it)

In [29]:
#df = pd.read_csv('compounds.csv')
#df

## Convert Smiles to Mol representation

In [30]:
# Parse each SMILES string into an RDKit molecule object, keeping row order.
mols = [Chem.MolFromSmiles(smiles_string) for smiles_string in df['SMILES']]

df['Mol'] = mols

# Discard every row that holds a missing value in any column; this is how
# compounds whose SMILES could not be processed are removed.
df = df.dropna()

## Calculate Mordred descriptors

In [31]:
# Build a Mordred calculator from the imported `descriptors` module.
calc = Calculator(descriptors)

# Compute the descriptors for every molecule in df['Mol']; the result has one
# row per molecule and one column per descriptor.
dfMord = calc.pandas(df['Mol'])

# Preview the first rows only; the full table is very wide.
dfMord.head()

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,21.47408,17.978542,0,0,34.5534,2.54198,4.93359,34.5534,1.27976,4.25118,...,10.428837,78.871649,389.089082,9.048583,1727,52,152.0,187.0,9.67361,5.763889
1,9.151948,8.206878,0,1,15.659,2.37835,4.57188,15.659,1.30491,3.42249,...,9.190852,56.587917,161.095297,7.004143,197,14,60.0,69.0,3.33333,2.777778
2,9.151948,8.206878,0,1,15.659,2.37835,4.57188,15.659,1.30491,3.42249,...,9.190852,56.587917,161.095297,7.004143,197,14,60.0,69.0,3.33333,2.777778
3,14.946702,13.14067,0,1,25.0359,2.45245,4.79766,25.0359,1.2518,3.90305,...,9.742908,67.137495,274.204513,5.960968,862,28,98.0,113.0,7.16667,4.666667
4,24.862776,17.808737,0,1,40.9336,2.46674,4.9288,40.9336,1.32044,4.38836,...,10.513824,81.350168,426.206719,7.348392,3047,54,172.0,208.0,9.08333,6.638889


### Curate Mordred descriptors:
- convert from int64, float64 to int32, float32
- drop columns that are of object type, because they contain text

In [32]:
# Narrow 64-bit numeric columns to their 32-bit counterparts; columns of any
# other dtype are left exactly as they are.
for column_name, column_dtype in dfMord.dtypes.items():
    if column_dtype == 'float64':
        narrowed = 'float32'
    elif column_dtype == 'int64':
        narrowed = 'int32'
    else:
        continue
    dfMord[column_name] = dfMord[column_name].astype(narrowed)

dfMord.dtypes

ABC         float32
ABCGG       float32
nAcid       int32  
nBase       int32  
SpAbs_A     object 
             ...   
WPol        int32  
Zagreb1     float32
Zagreb2     float32
mZagreb1    object 
mZagreb2    float32
Length: 1826, dtype: object

In [33]:
# Remove every column whose dtype is object, reporting the shape before and after.

print('Dataframe shape before dropping:', dfMord.shape)

# Names of all columns stored with the generic object dtype.
toDrop = [column_name
          for column_name, column_dtype in dfMord.dtypes.items()
          if column_dtype == 'object']

dfMord = dfMord.drop(columns=toDrop)

print('Dataframe shape after dropping:', dfMord.shape)

Dataframe shape before dropping: (388, 1826)
Dataframe shape after dropping: (388, 888)


## Join the compounds with descriptors

In [34]:
# The descriptor rows were computed from df['Mol'] in order, so row i of dfMord
# belongs to row i of df. Guard that assumption before gluing them together.
if len(df) != len(dfMord):
    raise ValueError(
        "Row count mismatch: {0} compounds vs {1} descriptor rows".format(len(df), len(dfMord)))

# pd.concat(axis=1) aligns on index labels, not on position. After dropna() the
# compound frame keeps its original, gappy labels, so if the descriptor frame
# carries a fresh 0..n-1 index (NOTE(review): this appears to depend on the
# Mordred version) the join would attach descriptors to the wrong compounds
# and add NaN rows. Resetting both indexes makes the join positional.
df = pd.concat([df.reset_index(drop=True), dfMord.reset_index(drop=True)], axis=1)

# Rename the label column to 'True' and drop the molecule objects before export.
df = df.rename(columns={'Blood-Brain-Barrier Penetration': 'True'})
df = df.drop('Mol', axis=1)

df.to_csv('compounds_descriptors.csv', index=False)