In [None]:
import numpy as np
import pandas as pd

from bokeh.io import output_notebook
from bokeh.plotting import figure, output_file, show
# Configure Bokeh so that show() renders figures inline in the notebook.
output_notebook()


# Let pandas display up to 25 columns before it truncates wide DataFrames.
pd.set_option('display.max_columns', 25)

In [None]:
# Load the huskinDB records; the path is relative to the notebook's working directory.
huskin= pd.read_csv('data/huskinDB.csv')

In [None]:
# Preview the first rows of the loaded table to check the columns.
huskin.head()

### Publications

In [None]:
# Which publications contribute the most records, plus overall counts.
references = huskin['reference']
top_references = references.value_counts().head(10)

print('Top 10 publications with the highest number of reported values: \n \n', top_references, '\n')
print('Total records: \t \t \t', references.size)
print('Total unique publications: \t', references.unique().size)

### Compounds

In [None]:
# Frequencies are tallied by compound name; distinct compounds are counted by SMILES string.
compound_names = huskin['Compound name']
distinct_smiles = huskin['Smiles'].unique()

print('Top 10 most frequently assessed compounds: \n \n', compound_names.value_counts().head(10), '\n')
print('Total compounds: \t \t', compound_names.size)
print('Total unique compounds: \t', distinct_smiles.size)

### Skin source type

In [None]:
# Frequency table and counts for the skin source type column.
skin_source_type = huskin['skin source type']
skin_source_type_counts = skin_source_type.value_counts()

print('Top skin source types: \n \n', skin_source_type_counts, '\n')
print('Total source_type: \t \t', skin_source_type.size)
print('Total unique skin source types: \t', skin_source_type.unique().size)

### Skin source sites

In [None]:
# Frequency table and counts for the skin source site column.
skin_source_site = huskin['skin source site']

# Records whose site is literally recorded as 'unknown' are excluded from the total.
known_source_sites = len(skin_source_site) - (skin_source_site == 'unknown').sum()

print('Top skin source sites: \n \n', skin_source_site.value_counts(), '\n')
# Label corrected: this cell reports source sites (not skin types) with a known value.
print('Total skin source sites (known): \t', known_source_sites)
print('Total unique skin source sites: \t', len(skin_source_site.unique()))

### Skin preparation techniques

In [None]:
# Frequency table and counts for the skin preparation column.
skin_preparation = huskin['skin preparation']

# Records whose preparation is literally recorded as 'unknown' are excluded from the total.
known_preparations = len(skin_preparation) - (skin_preparation == 'unknown').sum()

print('Skin preparation: \n \n', skin_preparation.value_counts(), '\n')
# The two lines below previously printed the same label although they report
# different quantities (known records vs. distinct values).
print('Total skin preparation types (known): \t', known_preparations)
print('Total unique skin preparation types: \t', len(skin_preparation.unique()))

### Layer types

In [None]:
# Frequency table and counts for the used layer column.
used_layer = huskin['used layer']

# Records whose layer is literally recorded as 'unknown' are excluded from the total.
known_layers = len(used_layer) - (used_layer == 'unknown').sum()

print('Layers used: \n \n', used_layer.value_counts(), '\n')
print('Total layers used (known): \t', known_layers)
print('Total unique layers used: \t', used_layer.unique().size)

### Storage temperatures

In [None]:
# Frequency table and counts for the storage temperature column.
storage_temperature = huskin['storage temperature (°C)']

# Entries equal to the string 'unknown' are excluded from the total.
known_storage_temperatures = len(storage_temperature) - (storage_temperature == 'unknown').sum()

print('Storage temperatures (°C): \n \n', storage_temperature.value_counts(), '\n')
print('Total storage temperatures: \t \t', known_storage_temperatures)
print('Total unique storage temperatures: \t', storage_temperature.unique().size)

### Storage duration

In [None]:
# Frequency table and counts for the storage duration column.
storage_duration = huskin['storage duration (days)']

# Entries equal to the string 'unknown' are excluded from the total.
known_storage_durations = len(storage_duration) - (storage_duration == 'unknown').sum()

print('Storage durations (days): \n \n', storage_duration.value_counts(), '\n')
print('Total storage durations: \t \t', known_storage_durations)
print('Total unique storage durations: \t', storage_duration.unique().size)

### Whether the compound was tested neat or in a solution

In [None]:
# Frequency table and counts for the 'neat' column.
neat_flag = huskin['neat']
neat_counts = neat_flag.value_counts()

print('Neat: \n \n', neat_counts, '\n')
print('Total neat: \t \t', neat_flag.size)
print('Total unique neat: \t', neat_flag.unique().size)

### Donor solution/skin surface temperature

In [None]:
# Frequency table and counts for the donor/skin surface temperature column.
donor_temperature = huskin['donor/skin surface temperature (°C)']

# Entries equal to the string 'unknown' are excluded from the total.
known_donor_temperatures = len(donor_temperature) - (donor_temperature == 'unknown').sum()

print('Donor temperatures (°C): \n \n', donor_temperature.value_counts(), '\n')
print('Total donor temperatures: \t \t', known_donor_temperatures)
print('Total unique donor temperatures: \t', donor_temperature.unique().size)

### Donor pH

In [None]:
# Frequency table and counts for the donor pH column.
donor_ph = huskin['donor pH']

# Entries equal to the string 'unknown' are excluded from the total.
known_donor_ph = len(donor_ph) - (donor_ph == 'unknown').sum()

print('Donor pH: \n \n', donor_ph.value_counts(), '\n')
print('Total donor pH: \t \t', known_donor_ph)
print('Total unique donor pH values: \t', donor_ph.unique().size)

### Donor types

In [None]:
# Frequency table and counts for the donor type column.
donor_type = huskin['donor type']

# Entries equal to the string 'unknown' are excluded from the total.
known_donor_types = len(donor_type) - (donor_type == 'unknown').sum()

print('Donor types: \n \n', donor_type.value_counts(), '\n')
print('Total donor types: \t \t', known_donor_types)
print('Total unique donor types: \t', donor_type.unique().size)

### Acceptor temperatures

In [None]:
# Frequency table and counts for the acceptor temperature column.
acceptor_temperature = huskin['acceptor temperature (°C)']

# Entries equal to the string 'unknown' are excluded from the total.
known_acceptor_temperatures = len(acceptor_temperature) - (acceptor_temperature == 'unknown').sum()

print('Acceptor temperatures (°C): \n \n', acceptor_temperature.value_counts(), '\n')
print('Total acceptor temperatures: \t', known_acceptor_temperatures)
print('Total unique acceptor temperatures: \t', acceptor_temperature.unique().size)

### Acceptor pH

In [None]:
# Frequency table and counts for the acceptor pH column.
acceptor_ph = huskin['acceptor pH']

# Entries equal to the string 'unknown' are excluded from the total.
known_acceptor_ph = len(acceptor_ph) - (acceptor_ph == 'unknown').sum()

print('Acceptor pH: \n \n', acceptor_ph.value_counts(), '\n')
print('Total acceptor pH types: \t \t', known_acceptor_ph)
print('Total unique acceptor pH values: \t', acceptor_ph.unique().size)

### Acceptor types

In [None]:
# Frequency table and counts for the acceptor type column.
acceptor_type = huskin['acceptor type']

# Entries equal to the string 'unknown' are excluded from the total.
known_acceptor_types = len(acceptor_type) - (acceptor_type == 'unknown').sum()

print('Acceptor types: \n \n', acceptor_type.value_counts(), '\n')
print('Total acceptor types: \t', known_acceptor_types)
print('Total unique acceptor types: \t', acceptor_type.unique().size)

### Measurement chamber types

In [None]:
# Frequency table and counts for the measurement chamber (cell type) column.
cell_type = huskin['cell type']

# Entries equal to the string 'unknown' are excluded from the total.
known_cell_types = len(cell_type) - (cell_type == 'unknown').sum()

print('Cell types: \n \n', cell_type.value_counts(), '\n')
print('Total cell types: \t', known_cell_types)
print('Total unique cell types: \t', cell_type.unique().size)

### Skin permeation values

In [None]:
# Summary statistics for the 'logkp (cm/s)' column.
huskin['logkp (cm/s)'].describe()

### 5 Records with the highest skin permeation values

In [None]:
# Sort by logkp in descending order and show the first five rows.
huskin.sort_values(by='logkp (cm/s)', ascending=False).head(5)

### 5 Records with the lowest skin permeation values

In [None]:
# Sort by logkp in ascending order and show the first five rows.
huskin.sort_values(by='logkp (cm/s)', ascending=True).head(5)

### Molecular weight of the compounds
#### Molecular weights and logP values for the huskinDB and DrugBank molecules were calculated using [RDKit](https://www.rdkit.org) version 2019.09.3.0

In [None]:
# Summary statistics for the 'molecular weight' column.
huskin['molecular weight'].describe()

### 5 Records with the highest molecular weight values

In [None]:
# Sort by molecular weight in descending order and show the first five rows.
huskin.sort_values(by='molecular weight', ascending=False).head(5)

### 16 Records with the lowest molecular weight values

In [None]:
# Sort by molecular weight in ascending order and show the first sixteen rows.
huskin.sort_values(by='molecular weight', ascending=True).head(16)

### Figure 1a

In [None]:
# Figure 1a: histogram of skin permeation values (logkp) over all huskinDB records.
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# removed from `figure` in Bokeh 3.0.
p = figure(width=600, height=600, y_axis_label='Value count', x_axis_label='Skin permeation logkp (cm/s)',
           background_fill_alpha=0.5)

# 40 equal-width bins with raw counts. Missing values are dropped first because
# np.histogram cannot derive a finite bin range from data containing NaN.
hist, edges = np.histogram(huskin['logkp (cm/s)'].dropna(), density=False, bins=40)

# Consecutive edges give the left and right side of each bar.
left_edges = edges[:-1]
right_edges = edges[1:]

p.quad(top=hist, bottom=0, left=left_edges, right=right_edges,
       fill_color='#1f78b4', line_color='black')

font_size_labels = '28pt'
font_size_major = '22pt'

# Apply the same font sizes to both axes.
for axis in (p.xaxis, p.yaxis):
    axis.axis_label_text_font_size = font_size_labels
    axis.major_label_text_font_size = font_size_major

# Render as SVG so the figure can be exported as vector graphics.
p.output_backend = "svg"

show(p)

### Figure 1b

In [None]:
# Figure 1b: histogram of molecular weights over all huskinDB records.
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# removed from `figure` in Bokeh 3.0.
p3 = figure(width=600, height=600, y_axis_label='Value count', x_axis_label='Molecular weight (g/mol)',
            background_fill_alpha=0.5)

# 40 equal-width bins with raw counts. Missing values are dropped first because
# np.histogram cannot derive a finite bin range from data containing NaN.
hist, edges = np.histogram(huskin['molecular weight'].dropna(), density=False, bins=40)

# Consecutive edges give the left and right side of each bar.
left_edges = edges[:-1]
right_edges = edges[1:]

p3.quad(top=hist, bottom=0, left=left_edges, right=right_edges,
        fill_color='#1f78b4', line_color='black')

font_size_labels = '28pt'
font_size_major = '22pt'

# Apply the same font sizes to both axes.
for axis in (p3.xaxis, p3.yaxis):
    axis.axis_label_text_font_size = font_size_labels
    axis.major_label_text_font_size = font_size_major

# Render as SVG so the figure can be exported as vector graphics.
p3.output_backend = "svg"

show(p3)

### Figure 1c

In [None]:
# Figure 1c: skin permeation (logkp) against molecular weight, one marker per record.
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# removed from `figure` in Bokeh 3.0.
p = figure(width=600, height=600,
           x_axis_label='Molecular weight (g/mol)', y_axis_label='Skin permeation logkp (cm/s)',
           background_fill_alpha=0.5)

# scatter() draws circle markers by default; it replaces circle(size=...),
# which is deprecated in recent Bokeh releases.
p.scatter(huskin['molecular weight'], huskin['logkp (cm/s)'], fill_color='#1f78b4', line_color="black",
          size=8, alpha=0.8)

font_size_labels = '28pt'
font_size_major = '22pt'

# Apply the same font sizes to both axes.
for axis in (p.xaxis, p.yaxis):
    axis.axis_label_text_font_size = font_size_labels
    axis.major_label_text_font_size = font_size_major

# Render as SVG so the figure can be exported as vector graphics.
p.output_backend = "svg"
show(p)

### Figure 2
#### [DrugBank 5.0](https://www.drugbank.ca/releases/5-0) data were used for Figure 2
#### The following compounds were excluded:
- Oxaliplatin
- 5,10,15,20-Tetrakis(4-Sulfonatophenyl)-21h,23h-Porphine
- Temoporfin
- Nedaplatin

#### only DrugBank molecules with molecular weight <=2500 and -20 <= calculated logP <=20 are used
#### additionally, one huskinDB molecule with calculated logP less than -20 was removed (octaborate tetrahydrate)

In [None]:
# Load the DrugBank table; the first CSV column is used as the DataFrame index.
drugbank=pd.read_csv('data/DrugBank.csv', index_col=0)

In [None]:
# clip DrugBank and huskinDB data
plot_drugbank_data= drugbank[(drugbank['mw'] <= 2500) &(drugbank['logp'] >= -20) & (drugbank['logp'] <=20)]
plot_dhuskin_data= huskin[huskin['LogP'] >= -20]

In [None]:
# Figure 2: calculated logP against molecular weight, huskinDB molecules drawn on top
# of the DrugBank molecules.
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# removed from `figure` in Bokeh 3.0. The y-axis label spelling is corrected to
# "Wildman-Crippen" (it previously read "Widmann-Crippen").
p = figure(width=600, height=600,
           x_axis_label='Molecular weight (g/mol)', y_axis_label='Wildman-Crippen LogP',
           background_fill_alpha=0.5)

# scatter() draws circle markers by default; it replaces circle(size=...),
# which is deprecated in recent Bokeh releases.
# DrugBank is drawn first so the huskinDB markers stay visible on top.
p.scatter(plot_drugbank_data['mw'], plot_drugbank_data['logp'], legend_label="DrugBank molecules",
          fill_color='#a6cee3', line_color="#1f78b4", size=3)
p.scatter(plot_dhuskin_data['molecular weight'], plot_dhuskin_data['LogP'], legend_label="huskinDB molecules",
          fill_color="#e41a1c", line_color="black", size=4)

font_size_labels = '16pt'
font_size_major = '14pt'

# Apply the same font sizes to both axes.
for axis in (p.xaxis, p.yaxis):
    axis.axis_label_text_font_size = font_size_labels
    axis.major_label_text_font_size = font_size_major
p.legend.label_text_font_size = font_size_major

# Render as SVG so the figure can be exported as vector graphics.
p.output_backend = "svg"
show(p)

### Figure 3
#### a subset of huskinDB data points

In [None]:
# Load the steroid records used for Figure 3 (path relative to the working directory).
steroids= pd.read_csv('data/steroids.csv')

In [None]:
# Split the steroid records by publication and, for Anderson 1988, by skin layer.
steroid_reference = steroids['reference']
steroid_layer = steroids['used layer']
from_anderson = steroid_reference == 'Anderson, 1988'

anderson_e_d = steroids[from_anderson & (steroid_layer == 'epidermis, dermis')]
anderson_sc = steroids[from_anderson & (steroid_layer == 'stratum corneum')]
johnson_e = steroids[steroid_reference == 'Johnson, 1995']
scheuplein_e = steroids[steroid_reference == 'Scheuplein, 1969']

In [None]:
# Figure 3: skin permeation (logkp) against molecular weight for three steroid subsets.
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# removed from `figure` in Bokeh 3.0.
p = figure(width=600, height=600,
           x_axis_label='Molecular weight (g/mol)', y_axis_label='Skin permeation logkp (cm/s)',
           background_fill_alpha=0.5)

# scatter(marker=...) replaces the marker-specific square()/diamond() methods,
# which are deprecated in recent Bokeh releases.
# Green squares: Scheuplein 1969; blue squares: Johnson 1995;
# red diamonds: Anderson 1988 stratum corneum records.
p.scatter(scheuplein_e['molecular weight'], scheuplein_e['logkp (cm/s)'], marker="square",
          fill_color="#b2df8a", line_color="black", size=10, alpha=0.8)
p.scatter(johnson_e['molecular weight'], johnson_e['logkp (cm/s)'], marker="square",
          fill_color="#1f78b4", line_color="black", size=10, alpha=0.8)
p.scatter(anderson_sc['molecular weight'], anderson_sc['logkp (cm/s)'], marker="diamond",
          fill_color="#e31a1c", line_color="black", size=16, alpha=0.8)

font_size_labels = '28pt'
font_size_major = '22pt'

# Apply the same font sizes to both axes.
for axis in (p.xaxis, p.yaxis):
    axis.axis_label_text_font_size = font_size_labels
    axis.major_label_text_font_size = font_size_major

# Render as SVG so the figure can be exported as vector graphics.
p.output_backend = "svg"
show(p)