# Exercises with RDKit

## Configuration

In [None]:
# Magic commands: load IPython's autoreload extension and enable it in mode 2
# (mode 2 reloads all modules before executing code, per the IPython documentation)
%load_ext autoreload
%autoreload 2

In [None]:
# Imports
# 1. Standard library imports
from pathlib import Path
import sys
sys.path.append('../my_modules') # so that Python can find the local modules

# 2. Third-party library imports
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem import (
    Descriptors,
    Draw,
    PandasTools
)
PandasTools.RenderImagesInAllDataFrames(images=True) # to show molecules as images in DataFrames
from rdkit.Chem.Draw import IPythonConsole # needed to show molecules
# from rdkit.Chem.Draw.MolDrawing import MolDrawing, DrawingOptions # only needed if modifying defaults

# 3. Local application imports
import kernel_infos

In [None]:
# Show information about the kernel, using the local kernel_infos module
kernel_infos.show_kernel_info()

In [None]:
# Global variables: the current working directory (HERE), its parent (ROOT)
# and the data directory located under ROOT (DATA)
HERE = Path().resolve()
ROOT = HERE.parent
DATA = ROOT / 'data'

# Echo the three paths so that a wrong working directory is spotted immediately
print(f"HERE: {HERE}")
print(f"ROOT: {ROOT}")
print(f"DATA: {DATA}")

## Exercise: rdkit version

### Questions

Replace XXX with the correct code to complete the notebook.

In [None]:
# What is the installed version of RDKit?
print(f"RDKit version is:{XXX}")

## Exercise: molecular weights of aspirin

In [None]:
# SMILES string of aspirin
aspirin_smiles = 'CC(=O)Oc1ccccc1C(=O)O'

### Questions

Replace XXX with the correct code to complete the notebook.

In [None]:
# Generate the molecule object from the SMILES string
aspirin_mol = XXX

In [None]:
# Display the aspirin molecule
XXX

In [None]:
# Compute aMW (average molecular weight) and eMW (exact, i.e. monoisotopic, molecular weight)
# with functions from the Descriptors module, print them with 3 digits after the decimal point,
# then compare the two values and explain the difference
aMW = XXX
eMW = XXX

print(f"Aspirin average molecular weight : {XXX} g/mol")
print(f"Aspirin monoisotopic molecular weight: {XXX} g/mol")

## Exercise: several molecules
### Questions

Replace XXX with the correct code to complete the notebook.

In [None]:
# Initialize three lists: one for the molecules, one for their names, one for their Ambinter IDs
mols_l = XXX
names_l = XXX
ambinter_ids_l = XXX

In [None]:
# Read the SDF file molecule by molecule; store the molecule objects in mols_l
# and the Ambinter_reference values in ambinter_ids_l
sdf_file_path = DATA/ "J1/ambinter_example.sdf"

suppl = Chem.SDMolSupplier(XXX)

for mol in suppl:
    if mol is not None:
        # append here the mol in mols_l
        if mol.HasProp('Ambinter_reference'):
            # append here the Ambinter_reference property in ambinter_ids_l
        else:
            # append here None in ambinter_ids_l

In [None]:
# Append the right name at the right index in names_l
# 'Amb17757514' <-> 'Aspirin'
# 'Amb1209347' <-> 'Paracetamol'

for id in ambinter_ids_l:
    if id == 'Amb17757514':
        # append 'Aspirin' to names_l
    elif id == 'Amb1209347':
        # append 'Paracetamol' to names_l
    else:
        names_l.append('Unknown')

In [None]:
# Print names_l and ambinter_ids_l element by element, side by side, using zip()
for name, ambinter_id in XXX:
    print(f"Name: {name}, Ambinter ID: {ambinter_id}")

In [None]:
# Create a DataFrame from the three lists and check it
molecules_df = pd.DataFrame({
    'Name': XXX,
    'Ambinter_ID': XXX,
    'ROMol': XXX
})

# Check the DataFrame: print its shape, then display it
print(f"molecules_df shape: {XXX}")
molecules_df

In [None]:
# Draw the 2 molecules, with their IDs as legends, in a MolsGridImage
# (the image is written to a file in a later cell)
grid_image = Draw.XXX(
    mols=XXX,
    legends=XXX,
    molsPerRow=XXX,
    subImgSize=(200,200)
)

In [None]:
# Display the image
XXX

In [None]:
# Save the image in a PNG file
png_file_path = DATA / 'J1/ambinter_grid.png'

# Opened in binary mode ('wb'); presumably the .data attribute holds the raw image bytes - confirm
with open(png_file_path, 'wb') as f:
    f.write(XXX.data)

## Exercise: isomers
### Questions

Replace XXX with the correct code to complete the notebook.

In [None]:
# SMILES strings for nerol and geraniol
# (raw strings, so that the backslash bond symbol is not read as an escape character)
nerol_smi = r"CC(C)=CCC/C(C)=C\CO"
geraniol_smi = r"CC(C)=CCC/C(C)=C/CO"

In [None]:
# Compare the SMILES strings. What is the difference between them?

In [None]:
# Generate the molecules from their SMILES strings
nerol_mol = Chem.XXX(nerol_smi)
geraniol_mol = Chem.XXX(geraniol_smi)  

In [None]:
# Show them in a MolsGridImage with their names as legends
# (the legends must be listed in the same order as the molecules)
Draw.MolsToGridImage(
    mols=[XXX],
    legends=[XXX],
    molsPerRow=2,
    subImgSize=(200,200)
)

In [None]:
# What kind of isomers are they ?

### Solutions

In [None]:
# Compare the SMILES strings. What is the difference between them?
# (plain string: there is nothing to interpolate, so no f-prefix is needed)
print("The 2 SMILES strings differ by the direction of the slash before CO:\n \\ for nerol and / for geraniol.")

In [None]:
# Build one RDKit molecule per isomer by parsing its SMILES string
nerol_mol, geraniol_mol = (
    Chem.MolFromSmiles(smiles) for smiles in (nerol_smi, geraniol_smi)
)

In [None]:
# Show them in a MolsGridImage with their names as legends
# The molecules and the legends are listed in the same order (nerol first, then geraniol),
# so that each structure is drawn above its own name
Draw.MolsToGridImage(
    mols=[nerol_mol, geraniol_mol],
    legends=['Nerol', 'Geraniol'],
    molsPerRow=2,
    subImgSize=(200,200)
)

In [None]:
# What kind of isomers are they?
# The answer is assembled from one string literal per output line
soluce = (
    "Nerol is a cis-isomer, which means that CH2OH group and the bigger part of the chain are on the same side of double C=C bond.\n"
    "Geraniol is a trans-isomer so CH2OH group and bigger part of the chain are on the opposite sides of C=C bond.\n"
    "This slight difference changes their smell."
)
print(soluce)

## Exercise: from lists to dataframe
### Questions

Replace XXX with the correct code to complete the notebook.

In [None]:
# Initialize three lists: one for the molecules, one for their names, one for their Ambinter IDs
mols_l = XXX
names_l = XXX
ambinter_ids_l = XXX



In [None]:
# Read the SDF file molecule by molecule; store the molecule objects in mols_l
# and the Ambinter_reference IDs in ambinter_ids_l
sdf_file_path = DATA/ "J1/ambinter_example.sdf"

suppl = Chem.XXX

for mol in suppl:
    if mol is not None:
        mols_l.XXX # append here the mol in mols_l
        if mol.HasProp('Ambinter_reference'):
            ambinter_ids_l.append(mol.GetProp('Ambinter_reference'))
        else:
            # None keeps ambinter_ids_l the same length as mols_l when the property is missing
            ambinter_ids_l.append(None)

In [None]:
# Append the right name at the right index in names_l
# 'Amb17757514' : 'Aspirin'
# 'Amb1209347' : 'Paracetamol'

# The loop variable is deliberately not called `id`: that name would shadow
# Python's built-in id() for the rest of the notebook session
for ambinter_id in XXX:
    if ambinter_id == XXX:
        names_l.append('Aspirin')
    elif ambinter_id == XXX:
        names_l.append('Paracetamol')
    else:
        names_l.append('Unknown')

In [None]:
# Print names_l and ambinter_ids_l element by element, side by side, using zip()
for name, ambinter_id in zip(XXX, XXX):
    print(f"Name: {name}, Ambinter ID: {ambinter_id}")

In [None]:
# Create a DataFrame from the three lists (one column per list)
molecules_df = pd.DataFrame({
    'Name': XXX,
    'Ambinter_ID': XXX,
    'ROMol': XXX
})

In [None]:
# Check the DataFrame: print its shape, then display it
print(f"molecules_df shape: {XXX}")
molecules_df

### Solutions

In [None]:
# Initialize three lists: three distinct empty lists, created with list literals
mols_l, names_l, ambinter_ids_l = [], [], []

In [None]:
# Read the SDF file molecule by molecule; store the molecule objects in mols_l
# and the Ambinter_reference IDs in ambinter_ids_l
sdf_file_path = DATA / "J1/ambinter_example.sdf"

# Pass the path as a string: SDMolSupplier is a wrapped C++ class and not every
# RDKit release accepts a pathlib.Path for the file name
suppl = Chem.SDMolSupplier(str(sdf_file_path))

for mol in suppl:
    if mol is not None:  # the supplier yields None for records it cannot parse
        mols_l.append(mol)
        if mol.HasProp('Ambinter_reference'):
            ambinter_ids_l.append(mol.GetProp('Ambinter_reference'))
        else:
            # None keeps ambinter_ids_l the same length as mols_l when the property is missing
            ambinter_ids_l.append(None)

In [None]:
# Check the collected Ambinter IDs
ambinter_ids_l

In [None]:
# Append the right name at the right index in names_l
# 'Amb17757514' : 'Aspirin'
# 'Amb1209347' : 'Paracetamol'

# The loop variable is deliberately not called `id`: that name would shadow
# Python's built-in id() for the rest of the notebook session
for ambinter_id in ambinter_ids_l:
    if ambinter_id == 'Amb17757514':
        names_l.append('Aspirin')
    elif ambinter_id == 'Amb1209347':
        names_l.append('Paracetamol')
    else:
        # unknown or missing (None) ID: still append, so names_l stays aligned with ambinter_ids_l
        names_l.append('Unknown')

In [None]:
# Check the collected names
names_l

In [None]:
# Print names_l and ambinter_ids_l element by element, side by side, using zip()
# (zip pairs the i-th name with the i-th ID and stops at the end of the shorter list)
for name, ambinter_id in zip(names_l, ambinter_ids_l):
    print(f"Name: {name}, Ambinter ID: {ambinter_id}")

In [None]:
# Create a DataFrame from the three lists: one column per list, in the order Name, Ambinter_ID, ROMol
molecules_df = pd.DataFrame(
    dict(Name=names_l, Ambinter_ID=ambinter_ids_l, ROMol=mols_l)
)

In [None]:
# Check the DataFrame: print its shape (rows, columns), then display it
print(f"molecules_df shape: {molecules_df.shape}")
molecules_df