In [None]:
import numpy as np
import pandas as pd

from bokeh.io import output_notebook
from bokeh.plotting import figure, output_file, show
# Configure Bokeh so that later show() calls render inside the notebook.
output_notebook()


from rdkit.Chem import PandasTools

In [None]:
# Read the HuskinDB CSV export into a DataFrame. The path is relative to the
# directory the notebook is run from.
huskin_csv_path = 'data/huskinDB.csv'
huskin = pd.read_csv(huskin_csv_path)

In [None]:
# Preview the first five rows of the loaded table.
huskin.head(n=5)

In [None]:
# Replace every missing value, in every column, with the sentinel string
# 'unknown' so that the value_counts() summaries further down list missing
# entries explicitly.
# NOTE(review): this is applied to all columns, including numeric ones and
# 'Smiles' -- confirm that no column needed later as numbers contains gaps.
huskin = huskin.fillna('unknown')

### Publications

In [None]:
# Publication overview: the ten most frequent references, the number of
# records, and the number of distinct references.
reference_col = huskin['reference']
print('Top 10 publications with the highest number of reported values: \n \n', reference_col.value_counts().head(10), '\n')
print('Total records: \t \t \t', len(reference_col))
print('Total unique publications: \t', len(reference_col.unique()))

### Compounds

In [None]:
# Compound overview. Frequencies are counted by compound name, whereas the
# distinct-compound count is taken over the 'Smiles' column.
compound_name_col = huskin['Compound name']
print('Top 10 most frequently assessed compounds: \n \n', compound_name_col.value_counts().head(10), '\n')
print('Total compounds: \t \t', len(compound_name_col))
print('Total unique compounds: \t', len(huskin['Smiles'].unique()))

### Skin source type

In [None]:
# Skin source type overview: the 14 most frequent values, the number of
# records, and the number of distinct values.
source_type_col = huskin['source_type']
print('Top 14 source_type: \n \n', source_type_col.value_counts().head(14), '\n')
print('Total source_type: \t \t', len(source_type_col))
print('Total unique source_type: \t', len(source_type_col.unique()))

### Skin source sites

In [None]:
# Skin source site overview. The "total" line counts only records whose value
# is not the 'unknown' sentinel; the unique count is taken over all values.
skin_site_col = huskin['skin source site']
skin_site_known = len(skin_site_col) - np.sum(skin_site_col == 'unknown')
print('Top skin types: \n \n', skin_site_col.value_counts(), '\n')
print('Total skin types: \t \t', skin_site_known)
print('Total unique skin types: \t', len(skin_site_col.unique()))

### Skin preparation techniques

In [None]:
# Skin preparation overview. The first "total" line counts records whose value
# is not the 'unknown' sentinel; the last line is the number of distinct
# values (sentinel included) and is labelled as such.
skin_prep_col = huskin['skin preparation']
skin_prep_known = len(skin_prep_col) - np.sum(skin_prep_col == 'unknown')
print('Skin preparation: \n \n', skin_prep_col.value_counts(), '\n')
print('Total skin preparation types: \t', skin_prep_known)
print('Total unique skin preparation types: \t', len(skin_prep_col.unique()))

### Layer types

In [None]:
# Layer type overview. The "known" line excludes records holding the
# 'unknown' sentinel; the unique count is taken over all values.
used_layer_col = huskin['used layer']
used_layer_known = len(used_layer_col) - np.sum(used_layer_col == 'unknown')
print('Layers used: \n \n', used_layer_col.value_counts(), '\n')
print('Total layers used (known): \t', used_layer_known)
print('Total unique layers used: \t', len(used_layer_col.unique()))

### Storage temperatures

In [None]:
# Storage temperature overview. The "total" line excludes records holding the
# 'unknown' sentinel; the unique count is taken over all values.
storage_temp_col = huskin['storage temperature (°C)']
storage_temp_known = len(storage_temp_col) - np.sum(storage_temp_col == 'unknown')
print('Storage temperatures: \n \n', storage_temp_col.value_counts(), '\n')
print('Total storage temperatures: \t \t', storage_temp_known)
print('Total unique storage temperatures: \t', len(storage_temp_col.unique()))

### Storage duration

In [None]:
# Storage duration overview. The "total" line excludes records holding the
# 'unknown' sentinel; the unique count is taken over all values.
storage_duration_col = huskin['storage duration (days)']
storage_duration_known = len(storage_duration_col) - np.sum(storage_duration_col == 'unknown')
print('Storage durations: \n \n', storage_duration_col.value_counts(), '\n')
print('Total storage durations: \t \t', storage_duration_known)
print('Total unique storage durations: \t', len(storage_duration_col.unique()))

### Whether the compound was tested neat or in a solution

In [None]:
# Overview of the 'neat' column: value frequencies, number of records, and
# number of distinct values.
neat_col = huskin['neat']
print('Neat: \n \n', neat_col.value_counts(), '\n')
print('Total neat: \t \t', len(neat_col))
print('Total unique neat: \t', len(neat_col.unique()))

### Donor solution/skin surface temperature

In [None]:
# Donor / skin surface temperature overview. The "total" line excludes records
# holding the 'unknown' sentinel; the unique count is taken over all values.
donor_temp_col = huskin['donor/skin surface temperature (°C)']
donor_temp_known = len(donor_temp_col) - np.sum(donor_temp_col == 'unknown')
print('Donor temperatures: \n \n', donor_temp_col.value_counts(), '\n')
print('Total donor temperatures: \t \t', donor_temp_known)
print('Total unique donor temperatures: \t', len(donor_temp_col.unique()))

### Donor pH

In [None]:
# Donor pH overview. The "total" line excludes records holding the 'unknown'
# sentinel; the unique count is taken over all values.
donor_ph_col = huskin['donor pH']
donor_ph_known = len(donor_ph_col) - np.sum(donor_ph_col == 'unknown')
print('Donor pH: \n \n', donor_ph_col.value_counts(), '\n')
print('Total donor pH: \t \t', donor_ph_known)
print('Total unique donor pH values: \t', len(donor_ph_col.unique()))

### Donor types

In [None]:
# Donor type overview. The "total" line excludes records holding the 'unknown'
# sentinel; the unique count is taken over all values.
donor_type_col = huskin['donor type']
donor_type_known = len(donor_type_col) - np.sum(donor_type_col == 'unknown')
print('Donor types: \n \n', donor_type_col.value_counts(), '\n')
print('Total donor types: \t \t', donor_type_known)
print('Total unique donor types: \t', len(donor_type_col.unique()))

### Acceptor temperatures

In [None]:
# Acceptor temperature overview. The "total" line excludes records holding the
# 'unknown' sentinel; the unique count is taken over all values.
acceptor_temp_col = huskin['acceptor temperature (°C)']
acceptor_temp_known = len(acceptor_temp_col) - np.sum(acceptor_temp_col == 'unknown')
print('Acceptor temperatures: \n \n', acceptor_temp_col.value_counts(), '\n')
print('Total acceptor temperatures: \t', acceptor_temp_known)
print('Total unique acceptor temperatures: \t', len(acceptor_temp_col.unique()))

### Acceptor pH

In [None]:
# Acceptor pH overview. The "total" line excludes records holding the
# 'unknown' sentinel; the unique count is taken over all values.
acceptor_ph_col = huskin['acceptor pH']
acceptor_ph_known = len(acceptor_ph_col) - np.sum(acceptor_ph_col == 'unknown')
print('Acceptor pH: \n \n', acceptor_ph_col.value_counts(), '\n')
print('Total acceptor pH types: \t \t', acceptor_ph_known)
print('Total unique acceptor pH values: \t', len(acceptor_ph_col.unique()))

### Acceptor types

In [None]:
# Acceptor type overview. The "total" line excludes records holding the
# 'unknown' sentinel; the unique count is taken over all values.
acceptor_type_col = huskin['acceptor type']
acceptor_type_known = len(acceptor_type_col) - np.sum(acceptor_type_col == 'unknown')
print('Acceptor types: \n \n', acceptor_type_col.value_counts(), '\n')
print('Total acceptor types: \t', acceptor_type_known)
print('Total unique acceptor types: \t', len(acceptor_type_col.unique()))

### Measurement chamber types

In [None]:
# Measurement chamber (cell type) overview. The "total" line excludes records
# holding the 'unknown' sentinel; the unique count is taken over all values.
cell_type_col = huskin['cell type']
cell_type_known = len(cell_type_col) - np.sum(cell_type_col == 'unknown')
print('Cell types: \n \n', cell_type_col.value_counts(), '\n')
print('Total cell types: \t', cell_type_known)
print('Total unique cell types: \t', len(cell_type_col.unique()))

In [None]:
# Summary statistics of the logkp column.
logkp_col = huskin['logkp (cm/s)']
logkp_col.describe()

In [None]:
# Five records with the highest logkp. The whole table is sorted first and
# only then truncated, so the rows shown are the top five of the full dataset
# rather than a reordering of the first five rows of the file.
huskin.sort_values(by='logkp (cm/s)', ascending=False).head(5)

In [None]:
# Five records with the lowest logkp. The whole table is sorted first and
# only then truncated, so the rows shown are the bottom five of the full
# dataset rather than a reordering of the first five rows of the file.
huskin.sort_values(by='logkp (cm/s)', ascending=True).head(5)

In [None]:
# Add a 'Mol' column to `huskin`, built by RDKit from the 'Smiles' column.
# NOTE(review): rows whose SMILES cannot be parsed (e.g. the 'unknown'
# placeholder) presumably end up without a usable molecule -- confirm before
# computing descriptors on this column.
PandasTools.AddMoleculeColumnToFrame(huskin, smilesCol='Smiles', molCol='Mol')

In [None]:
from rdkit import Chem

def descriptor_calculator(mol_array, module):
    '''Apply a descriptor function to each molecule and collect the results.

    Parameters
    ----------
    mol_array : iterable
        Items to evaluate; each one is passed to ``module`` unchanged.
    module : callable
        Function taking a single item and returning its descriptor value.

    Returns
    -------
    list
        One value per item of ``mol_array``, in iteration order.
    '''
    return list(map(module, mol_array))

def add_desc(df, mol_array, d_dictionary):
    '''Add one descriptor column per dictionary entry to a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the new columns; it is modified in place.
    mol_array : iterable
        Molecules handed to every descriptor function.
    d_dictionary : dict
        Maps the name of the column to create to the descriptor function
        used to fill it.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the descriptor columns added.
    '''
    for column_name, descriptor_fn in d_dictionary.items():
        # Report progress, one line per descriptor.
        print('calculating: \t', column_name)
        df[column_name] = descriptor_calculator(mol_array, descriptor_fn)
    return df

from rdkit.Chem import AllChem
from rdkit.Chem import Crippen


# Dictionary of descriptors to compute: each key is the name of the column to
# create, each value the RDKit function called on every molecule.
# Add other descriptors here for further processing.
# More information: https://www.rdkit.org/docs/GettingStartedInPython.html
# and: https://www.rdkit.org/docs/source/rdkit.Chem.Descriptors.html
descriptor_dict = {
    'Crippen_MolLogP': Crippen.MolLogP,
    'ExactMolWt': AllChem.CalcExactMolWt,
}

# Compute the descriptors on a copy of the table and rebind `huskin` to the
# result, which carries the additional descriptor columns.
huskin = add_desc(huskin.copy(), huskin['Mol'], descriptor_dict)

In [None]:
# Summary statistics of the computed exact molecular weight.
exact_mol_wt_col = huskin['ExactMolWt']
exact_mol_wt_col.describe()

In [None]:
# Five records with the highest exact molecular weight. The whole table is
# sorted first and only then truncated, so the rows shown are the top five of
# the full dataset rather than a reordering of the first five rows.
huskin.sort_values(by='ExactMolWt', ascending=False).head(5)

In [None]:
# Five records with the lowest exact molecular weight. The whole table is
# sorted first and only then truncated, so the rows shown are the bottom five
# of the full dataset rather than a reordering of the first five rows.
huskin.sort_values(by='ExactMolWt', ascending=True).head(5)

In [None]:
# Scatter plot of logkp against exact molecular weight, one marker per record.
font_size_labels = '28pt'
font_size_major = '22pt'

p = figure(
    plot_width=600,
    plot_height=600,
    x_axis_label='Molecular weight',
    y_axis_label='logkp [cm/s]',
    background_fill_alpha=0.5,
)

p.circle(
    huskin['ExactMolWt'],
    huskin['logkp (cm/s)'],
    fill_color='#1f78b4',
    line_color="black",
    size=8,
    alpha=0.8,
)

# Use the same axis-label and tick-label font sizes on both axes.
for axis in (p.xaxis, p.yaxis):
    axis.axis_label_text_font_size = font_size_labels
    axis.major_label_text_font_size = font_size_major

p.output_backend = "svg"
show(p)

In [None]:
# Histogram of logkp values: 40 equal-width bins, raw counts on the y axis.
font_size_labels = '28pt'
font_size_major = '22pt'

hist, edges = np.histogram(huskin['logkp (cm/s)'], density=False, bins=40)
# Each bar spans from one bin edge to the next.
left_edges, right_edges = edges[:-1], edges[1:]

p = figure(
    plot_width=600,
    plot_height=600,
    y_axis_label='Value count',
    x_axis_label='logkp',
    background_fill_alpha=0.5,
)

p.quad(
    top=hist,
    bottom=0,
    left=left_edges,
    right=right_edges,
    fill_color='#1f78b4',
    line_color='black',
)

# Use the same axis-label and tick-label font sizes on both axes.
for axis in (p.xaxis, p.yaxis):
    axis.axis_label_text_font_size = font_size_labels
    axis.major_label_text_font_size = font_size_major

p.output_backend = "svg"

show(p)

In [None]:
# Histogram of exact molecular weights: 40 equal-width bins, raw counts on the
# y axis.
font_size_labels = '28pt'
font_size_major = '22pt'

hist, edges = np.histogram(huskin['ExactMolWt'], density=False, bins=40)
# Each bar spans from one bin edge to the next.
left_edges, right_edges = edges[:-1], edges[1:]

p3 = figure(
    plot_width=600,
    plot_height=600,
    y_axis_label='Value count',
    x_axis_label='Molecular weight',
    background_fill_alpha=0.5,
)

p3.quad(
    top=hist,
    bottom=0,
    left=left_edges,
    right=right_edges,
    fill_color='#1f78b4',
    line_color='black',
)

# Use the same axis-label and tick-label font sizes on both axes.
for axis in (p3.xaxis, p3.yaxis):
    axis.axis_label_text_font_size = font_size_labels
    axis.major_label_text_font_size = font_size_major

p3.output_backend = "svg"

show(p3)