# Molecule processing


In this notebook, the processing of the molecules will be carried out in order to obtain a similarity table of all the molecules belonging to the chosen data set.


In [2]:
import pandas as pd
from drugSimilarityTable import SimilarityTable
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

Database access

In [7]:
path_similarities = 'sanitize_smiles.csv'
drugs_csv = pd.read_csv(path_similarities)

# Data analysis

The null values ​​of all the data are verified, there would exist errors if any of the smiles values ​​of the compounds is null

In [10]:
drugs_csv.isnull().sum()

sanitize-id;smiles;0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;21;22;23;24;25;26;27;28;29;30;31;32;33;34;35;36;37;38    0
dtype: int64

In this case, the null values ​​that are found refer to the id of the databases from which the compounds were acquired.
The type of data and the names of the tables are also verified.


In [8]:
drugs_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1998972 entries, 0 to 1998971
Data columns (total 1 columns):
 #   Column                                                                                                                         Dtype 
---  ------                                                                                                                         ----- 
 0   sanitize-id;smiles;0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;21;22;23;24;25;26;27;28;29;30;31;32;33;34;35;36;37;38  object
dtypes: object(1)
memory usage: 15.3+ MB


# Calculation of the similarity table
This similarity table consists of the comparison of a molecule with all the rest of the molecules that belong to the database, with the aim of obtaining the top of molecules that contain the most similarity with said molecule.

To achieve this, first, the value of the molecule is obtained from its "smiles" found in the database, then it is necessary to calculate the "finger print" for that molecule. To finish it is necessary to calculate the similarity between molecules using their fingerpaints.

In [9]:
for drug in drugs_csv.iterrows():
    # drug_id (id de la molecula actual)
    drug_id = drug[0]
    # drug_similarity_table (tabla de similaridad para la molecula actual)
    drug_similarity_table = SimilarityTable(drug_id)
    # drug_smiles (smiles de la molecula actual)
    drug_smiles = drug[1]['smiles']
    drug_molecule = Chem.MolFromSmiles(drug_smiles)
    drug_fingerprint = AllChem.GetMorganFingerprint(drug_molecule, 2)

    for drug_to_test in drugs_csv.iterrows():
        # drug_to_test_id (id de la molecula a comparar)
        drug_to_test_id= drug_to_test[0]
        try:
            if drug_to_test_id !=drug_id:
                # drug_to_test_smiles (smiles de la molecula a comparar)
                drug_to_test_smiles = drug_to_test[1]['smiles']
                drug_to_test_molecule = Chem.MolFromSmiles(drug_to_test_smiles)
                drug_to_test_fingerprint = AllChem.GetMorganFingerprint(drug_to_test_molecule, 2)
                drugs_similarity = DataStructs.DiceSimilarity(drug_fingerprint, drug_to_test_fingerprint)

                drug_similarity_table.add_item(drug_to_test_id, drugs_similarity)
        except:
            print("Failed Smiles of Drug", drug_to_test_id)
            continue

    drug_similarity_table.print_dictionary()
    break

KeyError: 'smiles'

In [None]:
for drug in drugs_csv.iterrows():
    # drug_id (id de la molecula actual)
    drug_id = drug[0]
    drug_similarity_table = SimilarityTable(drug_id)
    # drug_smiles (smiles de la molecula actual)
    drug_smiles = drug[1]['smiles']
    drug_molecule = Chem.MolFromSmiles(drug_smiles)
    drug_fingerprint = AllChem.GetMorganFingerprint(drug_molecule, 4)

    for drug_to_test in drugs_csv.iterrows():
        # drug_to_test_id (id de la molecula a comparar)
        drug_to_test_id= drug_to_test[0]
        try:
            if drug_to_test_id !=drug_id:
                # drug_to_test_smiles (smiles de la molecula a comparar)
                drug_to_test_smiles = drug_to_test[1]['smiles']
                drug_to_test_molecule = Chem.MolFromSmiles(drug_to_test_smiles)
                drug_to_test_fingerprint = AllChem.GetMorganFingerprint(drug_to_test_molecule, 4)
                drugs_similarity = DataStructs.DiceSimilarity(drug_fingerprint, drug_to_test_fingerprint)

                drug_similarity_table.add_item(drug_to_test_id, drugs_similarity)
        except:
            print("Failed Smiles of Drug", drug_to_test_id)
            continue

    drug_similarity_table.print_dictionary()
    break

In [7]:
#Anterior
for drug in drugs_csv.iterrows():
    drug_similarity_table = SimilarityTable(drug[0])
    drug_molecule = Chem.MolFromSmiles(drug[1]['smiles'])
    drug_fingerprint = AllChem.GetMorganFingerprint(drug_molecule, 4)

    for drug_to_test in drugs_csv.iterrows():
        try:
            if drug_to_test[0] != drug[0]:
                drug_to_test_molecule = Chem.MolFromSmiles(drug_to_test[1]['smiles'])
                drug_to_test_fingerprint = AllChem.GetMorganFingerprint(drug_to_test_molecule, 4)
                drugs_similarity = DataStructs.DiceSimilarity(drug_fingerprint, drug_to_test_fingerprint)

                drug_similarity_table.add_item(drug_to_test[0], drugs_similarity)
        except:
            print("Failed Smiles of Drug", drug_to_test[0])
            continue

    drug_similarity_table.print_dictionary()
    break

RDKit ERROR: [12:37:16] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 10 11 15 16 17 19 20 21
[12:37:16] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 10 11 15 16 17 19 20 21

RDKit ERROR: 


Failed Smiles of Drug 1833727


RDKit ERROR: [12:39:31] Explicit valence for atom # 0 N, 4, is greater than permitted
[12:39:31] Explicit valence for atom # 0 N, 4, is greater than permitted
RDKit ERROR: [12:39:31] Explicit valence for atom # 0 N, 4, is greater than permitted
[12:39:31] Explicit valence for atom # 0 N, 4, is greater than permitted
RDKit ERROR: [12:39:31] Explicit valence for atom # 0 N, 4, is greater than permitted
[12:39:31] Explicit valence for atom # 0 N, 4, is greater than permitted


Failed Smiles of Drug 2085105
Failed Smiles of Drug 2085115
Failed Smiles of Drug 2085513


RDKit ERROR: [12:39:31] Explicit valence for atom # 13 Cl, 5, is greater than permitted
[12:39:31] Explicit valence for atom # 13 Cl, 5, is greater than permitted
RDKit ERROR: [12:39:31] SMILES Parse Error: syntax error while parsing: OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]
[12:39:31] SMILES Parse Error: syntax error while parsing: OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]
RDKit ERROR: [12:39:31] SMILES Parse Error: Failed parsing SMILES 'OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]' for input: 'OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S(

Failed Smiles of Drug 2086328
Failed Smiles of Drug 2086395


RDKit ERROR: [12:39:31] Explicit valence for atom # 19 O, 3, is greater than permitted
[12:39:31] Explicit valence for atom # 19 O, 3, is greater than permitted


Failed Smiles of Drug 2086994


RDKit ERROR: [12:39:32] Explicit valence for atom # 6 N, 4, is greater than permitted
[12:39:32] Explicit valence for atom # 6 N, 4, is greater than permitted
RDKit ERROR: [12:39:32] Explicit valence for atom # 0 O, 3, is greater than permitted
[12:39:32] Explicit valence for atom # 0 O, 3, is greater than permitted
RDKit ERROR: [12:39:32] Explicit valence for atom # 3 N, 4, is greater than permitted
[12:39:32] Explicit valence for atom # 3 N, 4, is greater than permitted


Failed Smiles of Drug 2087733
Failed Smiles of Drug 2088277
Failed Smiles of Drug 2088397
Failed Smiles of Drug 2088581


RDKit ERROR: [12:39:32] Explicit valence for atom # 4 F, 2, is greater than permitted
[12:39:32] Explicit valence for atom # 4 F, 2, is greater than permitted
RDKit ERROR: [12:39:33] Explicit valence for atom # 13 Be, 3, is greater than permitted
[12:39:33] Explicit valence for atom # 13 Be, 3, is greater than permitted


Failed Smiles of Drug 2091090


RDKit ERROR: [12:39:33] Explicit valence for atom # 84 N, 4, is greater than permitted
[12:39:33] Explicit valence for atom # 84 N, 4, is greater than permitted
RDKit ERROR: [12:39:34] SMILES Parse Error: syntax error while parsing: OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1
[12:39:34] SMILES Parse Error: syntax error while parsing: OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1
RDKit ERROR: [12:39:34] SMILES Parse Error: Failed parsing SMILES 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1' for input: 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1'
[12:39:34] SMILES Parse Error: Failed parsing SMILES 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2

Failed Smiles of Drug 2092021
Failed Smiles of Drug 2092388


RDKit ERROR: [12:39:34] Explicit valence for atom # 1 Cl, 4, is greater than permitted
[12:39:34] Explicit valence for atom # 1 Cl, 4, is greater than permitted


Failed Smiles of Drug 2093064


RDKit ERROR: [12:39:34] Explicit valence for atom # 0 N, 4, is greater than permitted
[12:39:34] Explicit valence for atom # 0 N, 4, is greater than permitted


Failed Smiles of Drug 2093616


RDKit ERROR: [12:39:35] Explicit valence for atom # 5 K, 2, is greater than permitted
[12:39:35] Explicit valence for atom # 5 K, 2, is greater than permitted


Failed Smiles of Drug 2094466
33 0.8216216216216217
5 0.7821229050279329
57 0.7783783783783784
4 0.7582417582417582
6 0.7243243243243244
1 0.7204301075268817
59 0.6927374301675978
18 0.6666666666666666
7 0.6486486486486487
11 0.6483516483516484




In [8]:

for row in drugs_csv.itertuples(index=True, name='Pandas'):
    print (row)
    print (getattr(row, "_1")) # getattr(row, "smiles")

Pandas(Index=0, _1=0, _2='CHEMBL6329', _3=nan, smiles='Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl', _5=nan, _6=nan)
0
Pandas(Index=1, _1=1, _2='CHEMBL6328', _3=nan, smiles='Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1', _5=nan, _6=nan)
1
Pandas(Index=2, _1=2, _2='CHEMBL265667', _3=nan, smiles='Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1', _5=nan, _6=nan)
2
Pandas(Index=3, _1=3, _2='CHEMBL6362', _3=nan, smiles='Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1', _5=nan, _6=nan)
3
Pandas(Index=4, _1=4, _2='CHEMBL267864', _3=nan, smiles='Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1', _5=nan, _6=nan)
4
Pandas(Index=5, _1=5, _2='CHEMBL6363', _3=nan, smiles='Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1', _5=nan, _6=nan)
5
Pandas(Index=6, _1=6, _2='CHEMBL6352', _3=nan, smiles='Cc1cc(Br)ccc1C(=O)c1ccc(-n2ncc(=O)[nH]c2=O)cc1Cl', _5=nan, _6=nan)
6
Pandas(Index=7, _1=7, _2='CHEMBL268097', _3=nan, smiles='O=C(c1ccc(Cl)cc1Cl)c1ccc(-n2ncc(=O)[nH]c2=O)cc1Cl', _5=nan, _6=nan)
7
Pandas(Index=8, _1

KeyboardInterrupt: 

In [9]:
for drug in drugs_csv.itertuples():
    # drug_id (id de la molecula actual)
    drug_id = getattr(drug, "_1")
    drug_similarity_table = SimilarityTable(drug_id)
    # drug_smiles (smiles de la molecula actual)
    drug_smiles = getattr(drug, "smiles")
    drug_molecule = Chem.MolFromSmiles(drug_smiles)
    drug_fingerprint = AllChem.GetMorganFingerprint(drug_molecule, 4)

    for drug_to_test in drugs_csv.itertuples():
        # drug_to_test_id (id de la molecula a comparar)
        drug_to_test_id= getattr(drug_to_test, "_1")
        try:
            if drug_to_test_id !=drug_id:
                # drug_to_test_smiles (smiles de la molecula a comparar)
                drug_to_test_smiles = getattr(drug_to_test, "smiles")
                drug_to_test_molecule = Chem.MolFromSmiles(drug_to_test_smiles)
                drug_to_test_fingerprint = AllChem.GetMorganFingerprint(drug_to_test_molecule, 4)
                drugs_similarity = DataStructs.DiceSimilarity(drug_fingerprint, drug_to_test_fingerprint)

                drug_similarity_table.add_item(drug_to_test_id, drugs_similarity)
        except:
            print("Failed Smiles of Drug", drug_to_test_id)
            continue

    drug_similarity_table.print_dictionary()
    break

Failed Smiles of Drug 2028214
33 0.8216216216216217
5 0.7821229050279329
57 0.7783783783783784
4 0.7582417582417582
6 0.7243243243243244
1 0.7204301075268817
59 0.6927374301675978
18 0.6666666666666666
7 0.6486486486486487
11 0.6483516483516484
