# Synutils repo walk-through 7th Sep 2023

This walk-through series primarily aims to explore Synple's Python utility module, providing an in-depth understanding of the technical details of computational services, with a primary focus on cheminformatics functions.

Because we will also delve into the fundamental programming practices behind cheminformatics functions, non-Synple members are welcome to join the series as well.

## Git usage
There are tons of great tutorials on the internet; here is one (thanks, Julian!): https://codingforchemists.com/vcs-basics/

For a more practical understanding of commits:
![git_snapshot](../resources/figs/git_snapshot.png)

## Installation
```
% git clone https://github.com/Synple-Chem/synple-utils.git
% cd synple-utils
% make env
% conda activate ./env
```




In [9]:
# import packages
import pandas as pd
import numpy as np
from rdkit.Chem import MolFromSmiles
from synutils.featurizers import AVAILABLE_FEATURIZERS, get_featurizer
from synutils.path import ROOT_PATH

In [6]:
# Read the sample CSV located under ROOT_PATH/data into a DataFrame
# and preview its first rows.
data_path = ROOT_PATH / "data" / "data_230907.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,parent_id,smiles
0,492995,CC(C)(C)C(=O)CC#N
1,2729170,Clc1ccc2c(CCc3cccnc3C2=O)c1
2,2724050,CCOC(=O)c1ccccc1S(N)(=O)=O
3,6871863,CCOC(=O)c1cc(Br)c[nH]1
4,526873,COc1ccccc1CO


In [7]:
# Report how many featurizers are registered in AVAILABLE_FEATURIZERS
# and what they are called.
avalable_feature_names = AVAILABLE_FEATURIZERS.keys()
n_featurizers = len(avalable_feature_names)
print(f"There are {n_featurizers} available featurizers: {avalable_feature_names}")

There are 9 available featurizers: dict_keys(['rdkit_2d', 'morgan', 'morgan_count', 'rdkit', 'rdkit_count', 'topological_torsion', 'topological_torsion_count', 'morgan_count_rdkit_2d', 'morgan_rdkit_2d'])


In [8]:
# Look up the "rdkit_2d" featurizer and print the descriptor names
# exposed through its `desc_list` attribute.
rdkit_2d_featurizer = get_featurizer("rdkit_2d")
descriptor_names = rdkit_2d_featurizer.desc_list
print(f"rdkit 2D featurizer returns you rdkit 2D features includes {descriptor_names}")

rdkit 2D featurizer returns you rdkit 2D features includes ['MaxAbsEStateIndex', 'MaxEStateIndex', 'MinAbsEStateIndex', 'MinEStateIndex', 'qed', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA9', 'SlogP_V

In [10]:
# Convert every SMILES string to an RDKit Mol object, then featurize each
# Mol with the rdkit_2d featurizer and store both as new columns.
df["mol"] = df["smiles"].apply(MolFromSmiles)
df["rdkit_2d"] = df["mol"].apply(rdkit_2d_featurizer.get_feat)
df.head()

Unnamed: 0,parent_id,smiles,mol,rdkit_2d
0,492995,CC(C)(C)C(=O)CC#N,<rdkit.Chem.rdchem.Mol object at 0x14a35e730>,"[10.842083, 10.842083, 0.0046296297, -0.351458..."
1,2729170,Clc1ccc2c(CCc3cccnc3C2=O)c1,<rdkit.Chem.rdchem.Mol object at 0x14a35e650>,"[12.333679, 12.333679, 0.00441358, 0.00441358,..."
2,2724050,CCOC(=O)c1ccccc1S(N)(=O)=O,<rdkit.Chem.rdchem.Mol object at 0x14a35e810>,"[11.369954, 11.369954, 0.039953705, -3.8976412..."
3,6871863,CCOC(=O)c1cc(Br)c[nH]1,<rdkit.Chem.rdchem.Mol object at 0x14a35e7a0>,"[10.983704, 10.983704, 0.3211111, -0.3211111, ..."
4,526873,COc1ccccc1CO,<rdkit.Chem.rdchem.Mol object at 0x14a35e880>,"[8.769537, 8.769537, 0.03175926, 0.03175926, 0..."


In [11]:
# Look up the "morgan" featurizer and add its per-molecule output
# as a "morgan" column.
morgan_featurizer = get_featurizer("morgan")
df["morgan"] = df["mol"].apply(morgan_featurizer.get_feat)
df.head()

Unnamed: 0,parent_id,smiles,mol,rdkit_2d,morgan
0,492995,CC(C)(C)C(=O)CC#N,<rdkit.Chem.rdchem.Mol object at 0x14a35e730>,"[10.842083, 10.842083, 0.0046296297, -0.351458...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2729170,Clc1ccc2c(CCc3cccnc3C2=O)c1,<rdkit.Chem.rdchem.Mol object at 0x14a35e650>,"[12.333679, 12.333679, 0.00441358, 0.00441358,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2724050,CCOC(=O)c1ccccc1S(N)(=O)=O,<rdkit.Chem.rdchem.Mol object at 0x14a35e810>,"[11.369954, 11.369954, 0.039953705, -3.8976412...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,6871863,CCOC(=O)c1cc(Br)c[nH]1,<rdkit.Chem.rdchem.Mol object at 0x14a35e7a0>,"[10.983704, 10.983704, 0.3211111, -0.3211111, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,526873,COc1ccccc1CO,<rdkit.Chem.rdchem.Mol object at 0x14a35e880>,"[8.769537, 8.769537, 0.03175926, 0.03175926, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
