In [3]:
import pandas as pd
from rdkit import Chem
from tqdm.auto import tqdm
from rdkit.Chem.Draw import MolsToGridImage
import seaborn as sns

Import data from a CSV file

In [4]:
# Path to the solubility dataset. A relative path is resolved against the
# kernel's current working directory, so adjust this single constant if the
# CSV is stored somewhere else.
DATA_PATH = "curated-solubility-dataset.csv"

df = pd.read_csv(DATA_PATH)

FileNotFoundError: [Errno 2] No such file or directory: 'curated-solubility-dataset.csv'

Read the first few lines of the dataframe

In [None]:
# Preview the first five rows (the default row count, spelled out).
df.head(n=5)

Examine the datatypes for the dataframe

In [None]:
# Bare last expression: the notebook shows the dtype of each column.
df.dtypes

The Pandas **describe** function provides summary statistics for numeric dataframe columns

In [None]:
# Called with default arguments; the returned summary table is shown as the
# cell output because it is the last expression in the cell.
df.describe()

The **columns** attribute of a dataframe provides the column names

In [None]:
# Index of column labels; handy for checking the exact names ("SMILES",
# "Group", "Solubility", "ID") that later cells rely on.
df.columns

The **sort_values** function sorts a dataframe.  Note that this returns a new dataframe and leaves the original unchanged.  To sort the dataframe in place, set **inplace=True**.

In [None]:
# The sorted frame is displayed but not assigned, so df keeps its row order.
df.sort_values(by="Group", ascending=False)

I like to have a progress bar for operations that take more than a couple of seconds.  The tqdm library provides a simple way of adding a progress bar to a Python program.

In [None]:
# Registers tqdm with pandas so that the progress_apply method used in the
# next cell is available on Series/DataFrame objects.
tqdm.pandas()

In [None]:
# Parse every SMILES string into an RDKit molecule, showing a progress bar.
df["Mol"] = df["SMILES"].progress_apply(Chem.MolFromSmiles)

In [None]:
# SMARTS for a six-membered aromatic ring made of five carbons and one
# nitrogen, i.e. a pyridine ring; used below as the substructure query.
query = Chem.MolFromSmarts("c1ccccn1")

In [None]:
# Bare expression: displays the parsed query as this cell's output, a quick
# check that the SMARTS pattern was parsed into a query object.
query

In [None]:
# Flag every row whose molecule contains the pyridine query.
# Chem.MolFromSmiles returns None when a SMILES string cannot be parsed, so
# such rows are recorded as False instead of raising AttributeError here.
df["pyridine"] = [
    mol is not None and mol.HasSubstructMatch(query)
    for mol in tqdm(df["Mol"])
]

In [None]:
# Keep the first ten rows flagged as containing the query substructure.
hits = df.query("pyridine").iloc[:10]
# Draw those molecules in a grid, five per row, each labelled with its ID.
MolsToGridImage(
    hits["Mol"],
    legends=hits["ID"].to_list(),
    molsPerRow=5,
)

In [None]:
# For each hit, the atoms matched by the query (used for highlighting below).
match_list = [mol.GetSubstructMatch(query) for mol in hits["Mol"]]

In [None]:
# Redraw the hits, three per row in 400x400 tiles, highlighting the atoms
# stored in match_list.
MolsToGridImage(
    hits["Mol"],
    legends=hits["ID"].to_list(),
    molsPerRow=3,
    subImgSize=(400, 400),
    highlightAtomLists=match_list,
)

In [None]:
# Per-group summary: number of rows and mean Solubility for each Group,
# computed with a single named aggregation instead of a manual loop.
# "size" counts every row in the group (the same as len() of the sub-frame).
res_df = (
    df.groupby("Group")
    .agg(Size=("Solubility", "size"), Mean=("Solubility", "mean"))
    .reset_index()  # turn the Group index back into an ordinary column
    .round(decimals=2)
)
res_df

In [None]:
# pandas box plot: distribution of Solubility within each Group.
df.boxplot(by="Group", column="Solubility", figsize=(8, 6))

In [None]:
# The same Group-vs-Solubility comparison drawn with seaborn.
sns.boxplot(data=df, x="Group", y="Solubility")