In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
from rdkit import Chem
from rdkit.Chem.Descriptors import MolLogP
from tqdm.auto import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from umap import UMAP

ModuleNotFoundError: No module named 'umap'

Apply Seaborn's default theme so that plots made through Pandas (which draws with matplotlib) pick up Seaborn styling

In [None]:
# Apply Seaborn's default theme to matplotlib, which Pandas' `.plot` also draws with.
# `set_theme()` is the preferred spelling; `sns.set()` is only an alias for it.
sns.set_theme()

Enable Pandas progress_apply

In [None]:
# Register tqdm's pandas integration so that `progress_apply` becomes available.
tqdm.pandas()

A few settings to make plots look better. Here's a link to [my gist](https://gist.github.com/PatWalters/1b7600dd6d195e2cb8dded8454e1777e) with a bunch of tricks for making Seaborn plots look better. 

In [None]:
# Configure the plotting theme in one call instead of set() + set_style() + set_context():
#   - context "talk"     -> larger fonts and line widths
#   - style "whitegrid"  -> white background with grid lines
#   - rc figure.figsize  -> 10 x 10 inch default figure size
sns.set_theme(
    context="talk",
    style="whitegrid",
    rc={"figure.figsize": (10, 10)},
)

Examine solubility data from https://www.nature.com/articles/s41597-019-0151-1

In [None]:
# Load the curated solubility dataset; the path is relative to the working directory.
df = pd.read_csv("curated-solubility-dataset.csv")

In [None]:
# Show the loaded frame as the cell's rich output for a quick visual check.
df

- G1 - occurs once in the dataset
- G2 - occurs twice in the dataset, SD > 0.5
- G3 - occurs twice in the dataset, SD <= 0.5
- G4 - occurs three or more times in the dataset, SD > 0.5
- G5 - occurs three or more times in the dataset, SD <= 0.5

In [None]:
# Tally how many records fall into each reliability group (G1-G5).
df["Group"].value_counts()

In [None]:
# Same tally, expressed as a fraction of all records.
df["Group"].value_counts(normalize=True)

In [None]:
# Bar chart of the per-group record counts.
df["Group"].value_counts().to_frame().plot.bar()

In [None]:
# Keep only groups G3 and G5 (repeated measurements with SD <= 0.5).
# .copy() gives an independent frame so columns can be added to it later.
df_ok = (
    df
    .query("Group in ['G3','G5']")
    .copy()
)
# (rows, columns) of the filtered frame
df_ok.shape

Plot a frequency distribution for the solubility data using Seaborn's [displot](https://seaborn.pydata.org/generated/seaborn.displot.html)

Experiment with
- kind = "kde"
- kind = "hist"
- kind = "ecdf"

In [None]:
# Histogram of the Solubility column with a KDE curve overlaid.
sns.displot(data=df_ok, x="Solubility", kind="hist", kde=True, height=8)

Let's bin the data
- &gt;200 µM: "High" (green)
- 30-200 µM: "Medium" (yellow)
- &lt;30 µM: "Low" (red)

In [None]:
# Bin edges: the 30 µM and 200 µM thresholds converted to log10(mol/L), padded with
# -100 / +100 sentinels so every value falls inside some bin.
# NOTE(review): assumes the Solubility column is already log10 molar — confirm.
bins = [-100] + [np.log10(conc_uM * 1e-6) for conc_uM in (30, 200)] + [100]
# Bin the filtered frame's own column. The original passed `df.Solubility` (the
# unfiltered frame) and only produced the right rows through index alignment.
df_ok['bin'] = pd.cut(df_ok.Solubility, bins=bins, labels=["Low", "Medium", "High"])

In [None]:
# Colour for each solubility bin label.
color_map_3 = dict(Low="red", Medium="yellow", High="green")
# Solubility histogram + KDE, split and coloured by bin.
g = sns.displot(
    data=df_ok,
    x="Solubility",
    hue="bin",
    palette=color_map_3,
    kind="hist",
    kde=True,
    height=8,
)
# The hue legend is attached to the figure; give it a readable title.
g.fig.legends[0].set_title("Solubility Bin")

In [None]:
# One box per solubility bin showing the spread of Solubility within it.
ax = sns.boxplot(
    data=df_ok,
    x="bin",
    y="Solubility",
)
ax.set_xlabel("Solubility Bin")

In [None]:
# Boolean "soluble" flag: True only for rows in the "High" bin.
# A vectorized comparison replaces the per-row `True if ... else False` list
# comprehension; rows with no bin compare as False, exactly as before.
df_ok['is_sol'] = df_ok['bin'] == "High"

In [None]:
# Colour for each value of the boolean `is_sol` flag.
color_map_2 = {
    False: "red",
    True: "green",
}
# Solubility histogram + KDE, split by the soluble / not-soluble flag.
g = sns.displot(
    data=df_ok,
    x="Solubility",
    hue="is_sol",
    palette=color_map_2,
    kind="hist",
    kde=True,
    height=8,
)
# NOTE(review): the legend title is still "Solubility Bin" although the hue is the
# boolean is_sol column — confirm whether a different title is wanted.
g.fig.legends[0].set_title("Solubility Bin")

In [None]:
# Standardize the descriptor columns with StandardScaler before embedding.
scaler = StandardScaler()
# Descriptor columns = every int/float column after the first three numeric ones.
# NOTE(review): the positional [3:] presumably skips non-descriptor numeric columns
# (e.g. the solubility value itself) — confirm against the CSV header.
desc_columns = df_ok.select_dtypes(include=[int, float]).columns[3:]
scaled_descriptors = scaler.fit_transform(df_ok.loc[:, desc_columns])

In [None]:
# Display the standardized descriptor matrix produced by the scaler.
scaled_descriptors

Use t-distributed Stochastic Neighbor Embedding ([t-SNE](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html)) to view the relationship between solubility and our descriptors.

In [None]:
# Embed the standardized descriptors with t-SNE (columns 0 and 1 are plotted below).
# t-SNE is stochastic: without a fixed seed every Restart & Run All yields a
# different layout, so pin random_state for reproducible figures.
tsne = TSNE(random_state=42)
tsne_crds = tsne.fit_transform(scaled_descriptors)

In [None]:
# Scatter of the two t-SNE coordinates, coloured by solubility bin.
ax = sns.scatterplot(
    x=tsne_crds[:, 0],
    y=tsne_crds[:, 1],
    hue=df_ok["bin"],
    palette=color_map_3,
)
ax.get_legend().set_title("Solubility Bin")

Some will argue that [Uniform Manifold Approximation and Projection](https://umap-learn.readthedocs.io/en/latest/) (UMAP) is a better way to do this. I'm not particularly partial to either, but here's how to do the same thing with UMAP. As you can see, the APIs are very similar.

In [None]:
# Embed the standardized descriptors with UMAP (columns 0 and 1 are plotted below).
# UMAP is stochastic as well; pin random_state so the embedding is reproducible
# across re-runs.
umap = UMAP(random_state=42)
umap_crds = umap.fit_transform(scaled_descriptors)

In [None]:
# Scatter of the two UMAP coordinates, coloured by solubility bin.
ax = sns.scatterplot(
    x=umap_crds[:, 0],
    y=umap_crds[:, 1],
    hue=df_ok["bin"],
    palette=color_map_3,
)
ax.get_legend().set_title("Solubility Bin")

Note that we are only using 17 descriptors here.  In this case, we're ok running TSNE on our data.  If we have more than 50 dimensions, it's usually a good idea to run [PCA](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html) on the dataset before running TSNE. 

In [None]:
# Save the filtered frame (with the added `bin` and `is_sol` columns) to CSV,
# leaving out the index.
df_ok.to_csv("solubility_data_ok.csv",index=False)