# Enriching a SMILES Dataset with RDKit & PubChemPy

This notebook demonstrates how I enriched a dataset of chemical compounds by computing canonical SMILES representations with RDKit and by looking up each compound's IUPAC name on PubChem with PubChemPy.

## Objective
Standardize the chemical structure representation of each compound to avoid duplicates and ensure consistency when training machine learning models.

In [2]:
import pandas as pd

# Read the raw compound table from the working directory.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which suggests
# the CSV was exported together with its index -- confirm whether it should be
# read with index_col=0 instead.
RAW_DATASET_PATH = "adr_dataset.csv"
df = pd.read_csv(RAW_DATASET_PATH)
df.head()

Unnamed: 0,Chemical Compound,Hepatobiliary disorders,Metabolism and nutrition disorders,Eye disorders,Musculoskeletal and connective tissue disorders,Gastrointestinal disorders,Immune system disorders,Reproductive system and breast disorders,"Neoplasms benign, malignant and unspecified (incl cysts and polyps)",General disorders and administration site conditions,...,"Congenital, familial and genetic disorders",Infections and infestations,"Respiratory, thoracic and mediastinal disorders",Psychiatric disorders,Renal and urinary disorders,"Pregnancy, puerperium and perinatal conditions",Ear and labyrinth disorders,Cardiac disorders,Nervous system disorders,"Injury, poisoning and procedural complications"
0,CC12CCC3C(C1CCC2=O)CC=C4C3(CCC(C4)O)C,0,1,0,1,1,1,1,1,1,...,0,1,1,1,1,0,1,1,1,1
1,C[N+](C)(C)CC(CC(=O)O)O,0,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,0,1,1,1
2,C(CC(=O)O)CN,0,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,C1C(N(C2=C(N1)NC(=NC2=O)N)C=O)CNC3=CC=C(C=C3)C...,0,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,CCCCCC(C=CC1C(CC(=O)C1CC=CCCCC(=O)O)O)O,0,0,0,1,1,1,0,0,1,...,1,1,0,1,1,1,0,1,1,1


## RDKit Installation

In [3]:
!pip install rdkit-pypi

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


## Generate Canonical SMILES

In [5]:
from rdkit import Chem
import numpy as np

# Function to get canonical SMILES from raw SMILES
def get_canonical_smiles(smiles):
    """Return the RDKit canonical SMILES for ``smiles``, or ``None`` if unusable.

    Parameters
    ----------
    smiles : str
        Raw SMILES string. Missing values (``None`` / ``NaN``), other
        non-string objects and blank strings are accepted and yield ``None``.

    Returns
    -------
    str or None
        The string produced by ``Chem.MolToSmiles(mol, canonical=True)``, or
        ``None`` when the input is not a non-blank string, RDKit cannot parse
        it, or the conversion raises.
    """
    # Missing cells arrive here as None / float NaN when the function is applied
    # to a pandas column; reject them (and blank strings) explicitly instead of
    # relying on an exception being raised and swallowed further down.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit signals an unparsable SMILES by returning None.
            return None
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Best-effort enrichment: one bad record must not abort the whole
        # column. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate so the run can be stopped.
        return None

# Canonicalize every raw SMILES and keep the result in a new column
raw_smiles = df["Chemical Compound"]
df["Canonical_SMILES"] = raw_smiles.apply(get_canonical_smiles)
df.head()

Unnamed: 0,Chemical Compound,Hepatobiliary disorders,Metabolism and nutrition disorders,Eye disorders,Musculoskeletal and connective tissue disorders,Gastrointestinal disorders,Immune system disorders,Reproductive system and breast disorders,"Neoplasms benign, malignant and unspecified (incl cysts and polyps)",General disorders and administration site conditions,...,Infections and infestations,"Respiratory, thoracic and mediastinal disorders",Psychiatric disorders,Renal and urinary disorders,"Pregnancy, puerperium and perinatal conditions",Ear and labyrinth disorders,Cardiac disorders,Nervous system disorders,"Injury, poisoning and procedural complications",Canonical_SMILES
0,CC12CCC3C(C1CCC2=O)CC=C4C3(CCC(C4)O)C,0,1,0,1,1,1,1,1,1,...,1,1,1,1,0,1,1,1,1,CC12CCC3C(CC=C4CC(O)CCC43C)C1CCC2=O
1,C[N+](C)(C)CC(CC(=O)O)O,0,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,1,1,1,C[N+](C)(C)CC(O)CC(=O)O
2,C(CC(=O)O)CN,0,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,NCCCC(=O)O
3,C1C(N(C2=C(N1)NC(=NC2=O)N)C=O)CNC3=CC=C(C=C3)C...,0,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,Nc1nc(=O)c2c([nH]1)NCC(CNc1ccc(C(=O)NC(CCC(=O)...
4,CCCCCC(C=CC1C(CC(=O)C1CC=CCCCC(=O)O)O)O,0,0,0,1,1,1,0,0,1,...,1,0,1,1,1,0,1,1,1,CCCCCC(O)C=CC1C(O)CC(=O)C1CC=CCCCC(=O)O


In [7]:
pip install pubchempy

Collecting pubchempy
  Downloading PubChemPy-1.0.4.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pubchempy
  Building wheel for pubchempy (setup.py) ... [?25l[?25hdone
  Created wheel for pubchempy: filename=PubChemPy-1.0.4-py3-none-any.whl size=13819 sha256=737cd0469f4177128911f83ed568884ffabe58fc49aca51c6cec068fa987e4bd
  Stored in directory: /root/.cache/pip/wheels/8b/e3/6c/3385b2db08b0985a87f5b117f98d0cb61a3ae3ca3bcbbd8307
Successfully built pubchempy
Installing collected packages: pubchempy
Successfully installed pubchempy-1.0.4


In [8]:
import pubchempy as pcp
import time

# Pause between PubChem requests, in seconds. 0.2 s caps the loop at five
# requests per second.
# NOTE(review): chosen to stay within PubChem's request-rate policy -- confirm
# against the current usage guidelines.
PUBCHEM_REQUEST_DELAY_S = 0.2

# Query each distinct raw SMILES only once, even if it appears on several rows
smiles_list = df["Chemical Compound"].drop_duplicates().tolist()

# Maps raw SMILES -> IUPAC name, or a sentinel string ("Sin nombre",
# "No encontrado", "Error") when no name could be obtained.
# NOTE: despite the "Nombre_Comun" (common name) label, the value stored is the
# compound's `iupac_name`.
nombre_comun_dict = {}

# Maps raw SMILES -> repr of the exception raised by its lookup, so failures
# recorded as "Error" can be diagnosed afterwards.
pubchem_errors = {}

# Look up every SMILES on PubChem
for smi in smiles_list:
    try:
        result = pcp.get_compounds(smi, namespace='smiles')
        if result:
            # Only the first match is used; fall back to a sentinel when it
            # carries no IUPAC name.
            nombre_comun_dict[smi] = result[0].iupac_name or "Sin nombre"
        else:
            nombre_comun_dict[smi] = "No encontrado"
    except Exception as exc:
        # Best-effort: keep going, but remember why this lookup failed.
        nombre_comun_dict[smi] = "Error"
        pubchem_errors[smi] = repr(exc)
    time.sleep(PUBCHEM_REQUEST_DELAY_S)  # throttle to avoid being blocked by the API

if pubchem_errors:
    print(
        f"PubChem lookup failed for {len(pubchem_errors)} of {len(smiles_list)} "
        "SMILES; inspect `pubchem_errors` for details"
    )

# Attach the looked-up names to every row via its raw SMILES
df["Nombre_Comun"] = df["Chemical Compound"].map(nombre_comun_dict)


## Save Enriched Dataset


In [9]:
# Write the enriched table to the working directory, leaving out the DataFrame index
ENRICHED_DATASET_PATH = "data_sample_enriched.csv"
df.to_csv(ENRICHED_DATASET_PATH, index=False)
print(" Enriched dataset saved")

 Enriched dataset saved
