# STEP 2
## Data Exploration
## Featurization

In [12]:
import sqlite3
import csv
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Lipinski
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Crippen

In [13]:
conn = sqlite3.connect('sabs_moonshot.db')

# Extract the compounds with missing descriptors

In [14]:
df = pd.read_sql_query("""
SELECT 
    compounds.id,
    compounds.MW,
    rdkit_descriptors.HBA,
    rdkit_descriptors.HBD, rdkit_descriptors.cLogP,
    compounds.smiles
FROM rdkit_descriptors 
INNER JOIN compounds on compounds.id = rdkit_descriptors.compound_id
WHERE HBA = "" OR HBD = "" OR MW = "" OR cLogP = "" AND assayed = TRUE;
""",
conn)
print(df)

                     id MW HBA HBD cLogP  \
0    DAR-DIA-6a508060-1                    
1    DAR-DIA-6a508060-2                    
2    DAR-DIA-6a508060-3                    
3    DAR-DIA-6a508060-4                    
4    DAR-DIA-6a508060-5                    
5    DAR-DIA-6a508060-7                    
6    DAR-DIA-6a508060-8                    
7    DAR-DIA-6a508060-9                    
8   DAR-DIA-6a508060-10                    
9   DAR-DIA-6a508060-11                    
10  DAR-DIA-6a508060-12                    
11  DAR-DIA-6a508060-13                    
12  DAR-DIA-6a508060-14                    
13  DAR-DIA-6a508060-15                    
14  DAR-DIA-6a508060-16                    
15   ALP-UNI-b33a865d-1                    
16   ALP-UNI-b33a865d-1                    
17   ALP-UNI-b33a865d-1                    
18   DAR-DIA-6a508060-3                    
19   DAR-DIA-6a508060-4                    
20   PRA-UNK-2c426785-1                    
21   EDG-MED-0c930815-1         

# Using RDKit to compute molecular descriptors and Lipinski's rule violations

In [35]:
class Molecule:
    """ A molecule. In particular, either the R1 or R2 group, or the scaffold
    and one or two groups.
    There are methods which tell you the properties of the molecule and if it
    passes the Lipsinki test
    """

    def __init__(self, mol_smiles):
        """Constructor for Molecule class. Initialises Molecule instance from
        smile string.
        :param mol_smiles: smile string of molecule
        :type mol_smiles: String
        """
        self.__mol_smiles = mol_smiles

    # @property
    # def get_smile_string(self):
    #     """Returns molecule's smile string
    #     :return: smile string of molecule
    #     :rtype: String
    #     """
    #     return self.__mol_smiles

    def descriptors(self):
        """Calculate molecule descriptor metrics as dict:
        | mol - smile string
        | MW - molecular weight
        | logP - logP
        | TPSA - topological polar surface area
        | HA - heavy atom count
        | h_acc - H acceptor count
        | h_don - H donator count
        | rings - ring count
        :return: molecule descriptor metrics
        :rtype: dict
        """
        mol = Chem.MolFromSmiles(self.__mol_smiles)
        mw = Descriptors.ExactMolWt(mol)
        log_p = Crippen.MolLogP(mol)
        tpsa = rdMolDescriptors.CalcTPSA(mol)  # topological polar surface area
        ha = Lipinski.HeavyAtomCount(mol)  # heavy atom count
        h_acceptors = Lipinski.NumHAcceptors(mol)
        h_donors = Lipinski.NumHDonors(mol)
        rings = Lipinski.RingCount(mol)
        desc_dict = {'smiles': self.__mol_smiles, #changed from mol to smiles so consistent with database we downloaded
                     'MW': mw,
                     'logP': log_p,
                     'TPSA': tpsa,
                     'HA': ha,
                     'h_acc': h_acceptors,
                     'h_don': h_donors,
                     'rings': rings
                     }
        return desc_dict

    def lipinski(self, desc_dict):
        """Calculate Lipinski from the descriptor dictionary. Returns the
        number of rules broken and whether the molecule passes.
        :param desc_dict: molecule descriptor metrics
        :type desc_dict: dict
        :return: violations
        :rtype: dict
        """
        violations = {'MW': desc_dict['MW'] < 500.0,
                      'h_acc': desc_dict['h_acc'] <= 10,
                      'h_don': desc_dict['h_don'] <= 5,
                      'logP': desc_dict['logP'] < 5}
        return violations

Select smiles strings from database

In [19]:
df = pd.read_sql_query("""
SELECT smiles
FROM compounds;
""",
conn)
print(df[:10])

                                              smiles
0                 N#Cc1ccccc1NC(=O)Cc1c[nH]c2ncccc12
1                         N#Cc1ccccc1NC(=O)Cc1cccnc1
2          CCNc1ccc(C#N)c(NC(=O)Cc2c[nH]c3ncccc23)c1
3                 CS(=O)(=O)Cc1ccc(C(=O)Nc2cccnc2)o1
4                 O=C(Nc1cccnc1)c1ccc(N2CCC(O)CC2)o1
5                      CCNc1ccc(C#N)cc1CCNS(C)(=O)=O
6   CS(=O)(=O)NCCc1c[nH]c2c(CCNS(C)(=O)=O)cc(Cl)cc12
7  CCn1cc(CCNS(C)(=O)=O)c2cc(C#N)cc(CCNS(C)(=O)=O...
8       CC(=O)NCCc1c[nH]c2c(CCNS(C)(=O)=O)cc(Cl)cc12
9     CCn1cc(CCNC(C)=O)c2cc(C#N)cc(CCNS(C)(=O)=O)c21


# Converting compounds database to pandas dataframe

In [68]:
df = pd.read_sql_query("""
SELECT *
FROM compounds;
""",
conn)
print(df[:10])

                   id                                             smiles  \
0  ANT-DIA-3c79be55-1                 N#Cc1ccccc1NC(=O)Cc1c[nH]c2ncccc12   
1  ANT-DIA-3c79be55-2                         N#Cc1ccccc1NC(=O)Cc1cccnc1   
2  ANT-DIA-3c79be55-3          CCNc1ccc(C#N)c(NC(=O)Cc2c[nH]c3ncccc23)c1   
3  ANT-DIA-3c79be55-4                 CS(=O)(=O)Cc1ccc(C(=O)Nc2cccnc2)o1   
4  ANT-DIA-3c79be55-5                 O=C(Nc1cccnc1)c1ccc(N2CCC(O)CC2)o1   
5  ROB-UNI-b2e39629-1                      CCNc1ccc(C#N)cc1CCNS(C)(=O)=O   
6  ROB-UNI-b2e39629-2   CS(=O)(=O)NCCc1c[nH]c2c(CCNS(C)(=O)=O)cc(Cl)cc12   
7  ROB-UNI-b2e39629-3  CCn1cc(CCNS(C)(=O)=O)c2cc(C#N)cc(CCNS(C)(=O)=O...   
8  ROB-UNI-b2e39629-4       CC(=O)NCCc1c[nH]c2c(CCNS(C)(=O)=O)cc(Cl)cc12   
9  ROB-UNI-b2e39629-5     CCn1cc(CCNC(C)=O)c2cc(C#N)cc(CCNS(C)(=O)=O)c21   

        MW NMR_std_ratio  assayed  
0  276.299                      0  
1  237.262                      0  
2  319.368                      0  
3  280.305         

In [38]:

# new_df=pd.DataFrame()
# descriptors={'a': 65, 'b': 66}
# new_df=new_df.append(descriptors, ignore_index=True)
# print(new_df)

In [37]:
# for i in range(len(df)):
#     cur_smile=df.iloc[i]['smiles']
#     print(cur_smile)
#     cur_mol=Molecule(cur_smile)
#     cur_mol.descriptors

# Creating dataframe with smile and molecular descriptors

In [41]:
new_df=pd.DataFrame()
for i in range(4): #change to range(len(df))
    cur_smile=df.iloc[i]['smiles']
    # print(cur_smile)
    cur_mol=Molecule(cur_smile)
    descriptors=cur_mol.descriptors()
    # print(descriptors)
    new_df=new_df.append(descriptors, ignore_index=True)

print(new_df)

     HA          MW   TPSA  h_acc  h_don     logP  rings  \
0  21.0  276.101111  81.57    3.0    2.0  2.61578    3.0   
1  18.0  237.090212  65.78    3.0    1.0  2.13448    2.0   
2  24.0  319.143310  93.60    4.0    3.0  3.04758    3.0   
3  19.0  280.051778  89.27    5.0    1.0  1.47150    2.0   

                                      smiles  
0         N#Cc1ccccc1NC(=O)Cc1c[nH]c2ncccc12  
1                 N#Cc1ccccc1NC(=O)Cc1cccnc1  
2  CCNc1ccc(C#N)c(NC(=O)Cc2c[nH]c3ncccc23)c1  
3         CS(=O)(=O)Cc1ccc(C(=O)Nc2cccnc2)o1  


# Creating dataframe with molecular descriptors and finding the Lipinski's rule of five violations for each compound

In [60]:
new_df=pd.DataFrame()
for i in range(2): #change to range(len(df))
    cur_smile=df.iloc[i]['smiles']
    cur_mol=Molecule(cur_smile)
    descriptors=cur_mol.descriptors()
    lipinski=cur_mol.lipinski(descriptors)
    print(lipinski)
    new_df=new_df.append(descriptors, ignore_index=True)
    
    new_df=new_df.append(lipinski, ignore_index=True)

print(new_df)

{'MW': True, 'h_acc': True, 'h_don': True, 'logP': True}
{'MW': True, 'h_acc': True, 'h_don': True, 'logP': True}
     HA          MW   TPSA  h_acc  h_don     logP  rings  \
0  21.0  276.101111  81.57    3.0    2.0  2.61578    3.0   
1   NaN    1.000000    NaN    1.0    1.0  1.00000    NaN   
2  18.0  237.090212  65.78    3.0    1.0  2.13448    2.0   
3   NaN    1.000000    NaN    1.0    1.0  1.00000    NaN   

                               smiles  
0  N#Cc1ccccc1NC(=O)Cc1c[nH]c2ncccc12  
1                                 NaN  
2          N#Cc1ccccc1NC(=O)Cc1cccnc1  
3                                 NaN  


# Trying to discard compounds that fail Lipinski's rule of 5

In [66]:
lipinski_list = ['MW', 'logP', 'h_acc', 'h_don']
    # r_group_1_id = request.args.get('r1')
    # r_group_2_id = request.args.get('r2')
    # molecule_key = tuple2str((r_group_1_id, r_group_2_id))
    # drug_mol = FinalMolecule(r_group_1_id, r_group_2_id)
    # drug_lipinski = drug_mol.lipinski(drug_mol.descriptors())
    # lipinski_dict = {cur_mol: lipinski}
for label in lipinski_list:
    if "False" not in lipinski:
        pass
    # else:
    #     # delete smiles/compound row

# Trying to discard compounds that violate more than one of Lipinski's rule of 5

In [69]:

# for label in lipinski_list:
#     if "False" appears twice  in lipinski:
#         #delete it
#     # else:
#         pass