# Sanitizing salts in SMILES

## Import Pandas
- Pandas is the library for managing DataFrames and CSV files, so it is the only one we need.

In [1]:
import pandas as pd

### Read the original Compound CSV

In [2]:
# Column dtypes to enforce when reading the linked-drugs CSV.
# Keys must match the CSV header exactly: the ChEMBL identifier column is
# spelled with an underscore ('chembl_id'), unlike the other hyphenated columns,
# so a 'chembl-id' key would never be applied.
data_types_dictionary = {
    'compound-id': 'int',
    'chembl_id': 'str',
    'chembl-name': 'str',
    'smiles': 'str',
    'drugbank-id': 'str',
    'drugbank-name': 'str'
}

In [3]:
# Relative path of the linked-drugs table.
path_similarities = '../data/linked_drugs.csv'
# Load the table, applying the dtypes declared in data_types_dictionary.
drugs_csv = pd.read_csv(
    filepath_or_buffer=path_similarities,
    dtype=data_types_dictionary,
)

In [4]:
# Preview the loaded table (the notebook renders a truncated view of the rows).
drugs_csv

Unnamed: 0,compound-id,chembl_id,chembl-name,smiles,drugbank-id,drugbank-name
0,0,CHEMBL6329,,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl,,
1,1,CHEMBL6328,,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1,,
2,2,CHEMBL265667,,Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1,,
3,3,CHEMBL6362,,Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1,,
4,4,CHEMBL267864,,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1,,
...,...,...,...,...,...,...
1952292,1952292,,,NCC1=CC(=NC(OC2=CC(=CC=C2)C(=O)N2C[C@@H](O)[C@...,DB16735,PAT-1251
1952293,1952293,,,CC(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC1...,DB16737,MM3122
1952294,1952294,,,CC(C)C1=CC=C2N(CC3=CC=C(Cl)C=C3)C(CC(C)(C)C(O)...,DB16739,MK-886
1952295,1952295,,,[H][C@@]1(OB(O[C@]1([H])[C@H](O)CO)[C@H](CC(C)...,DB16741,Bortezomib D-mannitol


## Process to sanitize the SMILES for each drug
1. We define a dictionary to ensure that there are no repeated SMILES after salt removal.
2. Iterate over all CSV rows, converting each one to a dictionary to reduce access time.
3. As we know, salts are distinguished in a SMILES through the "." character, so we tokenize the SMILES string to obtain all the items of each compound.
4. Check if the SMILES has a salt (it has more than one element).
5. If it has only one element, it is already a pure SMILES; otherwise, we take the longest item (by string length) as the pure SMILES.
6. Check if this SMILES is already part of the dictionary as a key.
7. If it exists, add the new ID to the ID array. Otherwise, we create a new record for that SMILES.

In [7]:
# Maps each salt-free SMILES string to the list of compound IDs that reduce to it.
sanitize_smiles = {}

In [8]:
# Group compound IDs by their salt-free SMILES.
# Empty the mapping first so that re-running this cell does not append every
# compound ID a second time.
sanitize_smiles.clear()

for drug in drugs_csv.to_dict('records'):
    raw_smiles = drug['smiles']

    # A missing SMILES is read as NaN; str(NaN) would create a bogus 'nan'
    # entry collecting unrelated compounds, so such rows are skipped.
    if pd.isna(raw_smiles):
        continue

    # Salts / counter-ions appear as '.'-separated fragments. Keep the longest
    # fragment; for a SMILES without '.', that is the SMILES itself. On ties,
    # max() returns the first of the longest fragments.
    pure_smiles = max(str(raw_smiles).split('.'), key=len)

    # The first occurrence creates the ID list; later occurrences extend it.
    sanitize_smiles.setdefault(pure_smiles, []).append(drug['compound-id'])

## Create the new CSV File

In [2]:
# Relative path of the sanitized-SMILES CSV (written, re-read and overwritten below).
path_sanitize = '../data/sanitize_smiles.csv'

### Processing the dictionary of Sanitize SMILES
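Each dictionary entry maps a sanitized SMILES (the key) to its list of compound IDs (the value). To store it in the CSV, every list is laid out as one row (conceptually a pandas Series per SMILES), so lists of different lengths are padded with NaN.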

In [10]:
# Build the table row-wise: one row per sanitized SMILES (used as the index),
# with its compound IDs in columns 0..k-1, padded with NaN when a SMILES has
# fewer IDs than the longest list.
# from_dict(orient='index') creates these rows directly, which avoids building
# a frame with one column per SMILES and transposing it afterwards.
# The float64 cast writes every ID in float form (e.g. `0.0`), the format that
# NaN padding forces on the ID columns.
sanitize_df = pd.DataFrame.from_dict(sanitize_smiles, orient='index').astype('float64')
sanitize_df.to_csv(path_sanitize, sep=';', index=True, encoding='utf-8')

### Additional processing to CSV
We need a new ID for each sanitized SMILES.

In [3]:
# Reload the file written above; the SMILES index was saved without a header,
# so it comes back as the column 'Unnamed: 0'.
sanitize_csv = pd.read_csv(path_sanitize, delimiter=';')
sanitize_csv

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,28,29,30,31,32,33,34,35,36,37
0,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl,0.0,,,,,,,,,...,,,,,,,,,,
1,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1,1.0,,,,,,,,,...,,,,,,,,,,
2,Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1,2.0,,,,,,,,,...,,,,,,,,,,
3,Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1,3.0,,,,,,,,,...,,,,,,,,,,
4,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1,4.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860502,NCC1=CC(=NC(OC2=CC(=CC=C2)C(=O)N2C[C@@H](O)[C@...,1952292.0,,,,,,,,,...,,,,,,,,,,
1860503,CC(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC1...,1952293.0,,,,,,,,,...,,,,,,,,,,
1860504,CC(C)C1=CC=C2N(CC3=CC=C(Cl)C=C3)C(CC(C)(C)C(O)...,1952294.0,,,,,,,,,...,,,,,,,,,,
1860505,[H][C@@]1(OB(O[C@]1([H])[C@H](O)CO)[C@H](CC(C)...,1952295.0,,,,,,,,,...,,,,,,,,,,


In [4]:
# Give the header-less SMILES column a proper name.
column_renames = {'Unnamed: 0': 'smiles'}
sanitize_csv = sanitize_csv.rename(column_renames, axis='columns')
sanitize_csv

Unnamed: 0,smiles,0,1,2,3,4,5,6,7,8,...,28,29,30,31,32,33,34,35,36,37
0,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl,0.0,,,,,,,,,...,,,,,,,,,,
1,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1,1.0,,,,,,,,,...,,,,,,,,,,
2,Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1,2.0,,,,,,,,,...,,,,,,,,,,
3,Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1,3.0,,,,,,,,,...,,,,,,,,,,
4,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1,4.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860502,NCC1=CC(=NC(OC2=CC(=CC=C2)C(=O)N2C[C@@H](O)[C@...,1952292.0,,,,,,,,,...,,,,,,,,,,
1860503,CC(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC1...,1952293.0,,,,,,,,,...,,,,,,,,,,
1860504,CC(C)C1=CC=C2N(CC3=CC=C(Cl)C=C3)C(CC(C)(C)C(O)...,1952294.0,,,,,,,,,...,,,,,,,,,,
1860505,[H][C@@]1(OB(O[C@]1([H])[C@H](O)CO)[C@H](CC(C)...,1952295.0,,,,,,,,,...,,,,,,,,,,


In [5]:
# Number the sanitized SMILES consecutively, starting at 0.
row_count = len(sanitize_csv)
sanitize_csv['sanitize-id'] = range(row_count)
sanitize_csv

Unnamed: 0,smiles,0,1,2,3,4,5,6,7,8,...,29,30,31,32,33,34,35,36,37,sanitize-id
0,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl,0.0,,,,,,,,,...,,,,,,,,,,0
1,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1,1.0,,,,,,,,,...,,,,,,,,,,1
2,Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1,2.0,,,,,,,,,...,,,,,,,,,,2
3,Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1,3.0,,,,,,,,,...,,,,,,,,,,3
4,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1,4.0,,,,,,,,,...,,,,,,,,,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860502,NCC1=CC(=NC(OC2=CC(=CC=C2)C(=O)N2C[C@@H](O)[C@...,1952292.0,,,,,,,,,...,,,,,,,,,,1860502
1860503,CC(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC1...,1952293.0,,,,,,,,,...,,,,,,,,,,1860503
1860504,CC(C)C1=CC=C2N(CC3=CC=C(Cl)C=C3)C(CC(C)(C)C(O)...,1952294.0,,,,,,,,,...,,,,,,,,,,1860504
1860505,[H][C@@]1(OB(O[C@]1([H])[C@H](O)CO)[C@H](CC(C)...,1952295.0,,,,,,,,,...,,,,,,,,,,1860505


In [6]:
# Move 'sanitize-id' to the front; all other columns keep their relative order.
remaining_columns = [name for name in sanitize_csv.columns if name != 'sanitize-id']
sanitize_csv = sanitize_csv.reindex(columns=['sanitize-id', *remaining_columns])
sanitize_csv

Unnamed: 0,sanitize-id,smiles,0,1,2,3,4,5,6,7,...,28,29,30,31,32,33,34,35,36,37
0,0,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl,0.0,,,,,,,,...,,,,,,,,,,
1,1,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1,1.0,,,,,,,,...,,,,,,,,,,
2,2,Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1,2.0,,,,,,,,...,,,,,,,,,,
3,3,Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1,3.0,,,,,,,,...,,,,,,,,,,
4,4,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1,4.0,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860502,1860502,NCC1=CC(=NC(OC2=CC(=CC=C2)C(=O)N2C[C@@H](O)[C@...,1952292.0,,,,,,,,...,,,,,,,,,,
1860503,1860503,CC(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC1...,1952293.0,,,,,,,,...,,,,,,,,,,
1860504,1860504,CC(C)C1=CC=C2N(CC3=CC=C(Cl)C=C3)C(CC(C)(C)C(O)...,1952294.0,,,,,,,,...,,,,,,,,,,
1860505,1860505,[H][C@@]1(OB(O[C@]1([H])[C@H](O)CO)[C@H](CC(C)...,1952295.0,,,,,,,,...,,,,,,,,,,


In [7]:
# Overwrite the intermediate file with the final table; index=False drops the
# pandas row index, since 'sanitize-id' is now an explicit column.
sanitize_csv.to_csv(path_sanitize, sep=';', index=False, encoding='utf-8')