# Library Filtering
[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Stef0916/chemoinformatics-bioinformatics/blob/main/cheminformatics-workflow/notebooks/2-Library_filtering.ipynb)

## Content

1. [Load DataSet from PubChem](#1)
2. [Remove Nan](#2)
3. [Remove Duplicates](#3)
4. [Molecues Visualization](#4)
5. [Remove Inconclusive Results](#5)
6. [Remove Active Agonist = Toxic](#6)
7. [Remove Weak Compounds](#7)
8. [Save the DataSet](#8)

In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.5/30.5 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.1


In [None]:
!pip install mols2grid

Collecting mols2grid
  Downloading mols2grid-2.0.0-py3-none-any.whl (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.0/107.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets<8,>=7->mols2grid)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi, mols2grid
Successfully installed jedi-0.19.1 mols2grid-2.0.0


In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw, PandasTools
import copy
import mols2grid

## 1. Load DataSet from PubChem<a name = 1></a>

In [151]:
data = pd.read_csv('AID_1259247_datatable.csv')

In [152]:
data

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity Summary,Antagonist Activity,Antagonist Potency (uM),Antagonist Efficacy (%),Viability Activity,Viability Potency (uM),Viability Efficacy (%),Sample Source
0,RESULT_TYPE,,,,,,,,STRING,STRING,FLOAT,FLOAT,STRING,FLOAT,FLOAT,STRING
1,RESULT_DESCR,,,,,,,,Type of compound activity based on both the CA...,Type of compound activity in the CAR antagonis...,The concentration of sample yielding half-maxi...,Percent inhibition of CAR.,Type of compound activity in the cell viabilit...,The concentration of sample yielding half-maxi...,Percent inhibition of cell viability.,Where sample was obtained.
2,RESULT_UNIT,,,,,,,,,,MICROMOLAR,PERCENT,,MICROMOLAR,PERCENT,
3,RESULT_IS_ACTIVE_CONCENTRATION,,,,,,,,,,TRUE,,,,,
4,1,144211407.0,91754.0,CC(C)(C)C1=CC=C(C=C1)CSC2=C(C(=O)N(N=C2)C(C)(C...,Active,100.0,,,active antagonist,active antagonist,0.0200875,-109.405,active antagonist,0.0660073,-45.9524,SIGMA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9666,9663,251919788.0,8304.0,CC(C)OP(OC(C)C)OC(C)C,Inactive,0.0,,,inactive,inactive,,0,inactive,,0,TCI America
9667,9664,251919825.0,1127.0,C1CCSC1,Inactive,0.0,,,inactive,inactive,,0,inactive,,0,TCI America
9668,9665,251919641.0,104856.0,CN(CCCC(C1=CN=CC=C1)O)N=O,Inactive,0.0,,,inactive,inactive,,0,inactive,,0,"Toronto Research Chemicals, Inc."
9669,9666,251919642.0,47289.0,CN(CCCC(=O)C1=CN=CC=C1)N=O,Inactive,0.0,,,inactive,inactive,,0,inactive,,0,"Toronto Research Chemicals, Inc."


In [153]:
# SMILES are in a column named 'PUBCHEM_EXT_DATASOURCE_SMILES'
data.rename(columns={'PUBCHEM_EXT_DATASOURCE_SMILES': 'PUBCHEM_SMILES'}, inplace=True)
data

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_SMILES,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity Summary,Antagonist Activity,Antagonist Potency (uM),Antagonist Efficacy (%),Viability Activity,Viability Potency (uM),Viability Efficacy (%),Sample Source
0,RESULT_TYPE,,,,,,,,STRING,STRING,FLOAT,FLOAT,STRING,FLOAT,FLOAT,STRING
1,RESULT_DESCR,,,,,,,,Type of compound activity based on both the CA...,Type of compound activity in the CAR antagonis...,The concentration of sample yielding half-maxi...,Percent inhibition of CAR.,Type of compound activity in the cell viabilit...,The concentration of sample yielding half-maxi...,Percent inhibition of cell viability.,Where sample was obtained.
2,RESULT_UNIT,,,,,,,,,,MICROMOLAR,PERCENT,,MICROMOLAR,PERCENT,
3,RESULT_IS_ACTIVE_CONCENTRATION,,,,,,,,,,TRUE,,,,,
4,1,144211407.0,91754.0,CC(C)(C)C1=CC=C(C=C1)CSC2=C(C(=O)N(N=C2)C(C)(C...,Active,100.0,,,active antagonist,active antagonist,0.0200875,-109.405,active antagonist,0.0660073,-45.9524,SIGMA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9666,9663,251919788.0,8304.0,CC(C)OP(OC(C)C)OC(C)C,Inactive,0.0,,,inactive,inactive,,0,inactive,,0,TCI America
9667,9664,251919825.0,1127.0,C1CCSC1,Inactive,0.0,,,inactive,inactive,,0,inactive,,0,TCI America
9668,9665,251919641.0,104856.0,CN(CCCC(C1=CN=CC=C1)O)N=O,Inactive,0.0,,,inactive,inactive,,0,inactive,,0,"Toronto Research Chemicals, Inc."
9669,9666,251919642.0,47289.0,CN(CCCC(=O)C1=CN=CC=C1)N=O,Inactive,0.0,,,inactive,inactive,,0,inactive,,0,"Toronto Research Chemicals, Inc."


In [154]:
data.columns

Index(['PUBCHEM_RESULT_TAG', 'PUBCHEM_SID', 'PUBCHEM_CID', 'PUBCHEM_SMILES',
       'PUBCHEM_ACTIVITY_OUTCOME', 'PUBCHEM_ACTIVITY_SCORE',
       'PUBCHEM_ACTIVITY_URL', 'PUBCHEM_ASSAYDATA_COMMENT', 'Activity Summary',
       'Antagonist Activity', 'Antagonist Potency (uM)',
       'Antagonist Efficacy (%)', 'Viability Activity',
       'Viability Potency (uM)', 'Viability Efficacy (%)', 'Sample Source'],
      dtype='object')

In [155]:
keep_columns = ['PUBCHEM_SMILES', 'PUBCHEM_ACTIVITY_OUTCOME', 'Antagonist Activity', 'Viability Activity', 'Antagonist Efficacy (%)']

In [156]:
data = data.loc[:, keep_columns]
data

Unnamed: 0,PUBCHEM_SMILES,PUBCHEM_ACTIVITY_OUTCOME,Antagonist Activity,Viability Activity,Antagonist Efficacy (%)
0,,,STRING,STRING,FLOAT
1,,,Type of compound activity in the CAR antagonis...,Type of compound activity in the cell viabilit...,Percent inhibition of CAR.
2,,,,,PERCENT
3,,,,,
4,CC(C)(C)C1=CC=C(C=C1)CSC2=C(C(=O)N(N=C2)C(C)(C...,Active,active antagonist,active antagonist,-109.405
...,...,...,...,...,...
9666,CC(C)OP(OC(C)C)OC(C)C,Inactive,inactive,inactive,0
9667,C1CCSC1,Inactive,inactive,inactive,0
9668,CN(CCCC(C1=CN=CC=C1)O)N=O,Inactive,inactive,inactive,0
9669,CN(CCCC(=O)C1=CN=CC=C1)N=O,Inactive,inactive,inactive,0


In [157]:
len(data)

9671

## 2. Remove NaN<a name = 2></a>

In [158]:
data_non_nan = data.loc[data['PUBCHEM_SMILES'].notna()]
len(data_non_nan)

9524

In [159]:
data_non_nan.loc[data['PUBCHEM_ACTIVITY_OUTCOME'].isna()]

Unnamed: 0,PUBCHEM_SMILES,PUBCHEM_ACTIVITY_OUTCOME,Antagonist Activity,Viability Activity,Antagonist Efficacy (%)


In [160]:
data_non_nan.loc[data['Viability Activity'].isna()]

Unnamed: 0,PUBCHEM_SMILES,PUBCHEM_ACTIVITY_OUTCOME,Antagonist Activity,Viability Activity,Antagonist Efficacy (%)


In [163]:
data_non_nan.loc[data['Antagonist Efficacy (%)'].isna()]

Unnamed: 0,PUBCHEM_SMILES,PUBCHEM_ACTIVITY_OUTCOME,Antagonist Activity,Viability Activity,Antagonist Efficacy (%)
1459,COC1=C(C=CC(=C1)C2=NNC(=O)C=C2)OC(F)F,Inconclusive,inconclusive antagonist,inactive,
1460,C[C@@]1(C(=O)N2[C@H](C(=O)N3CCC[C@H]3[C@@]2(O1...,Inconclusive,inconclusive antagonist,inactive,
1461,CCC(COC(=O)C1=CC(=C(C(=C1)OC)OC)OC)(C2=CC=CC=C...,Inconclusive,inconclusive antagonist,inactive,
1462,C1=CC=C(C=C1)CC2=C(C=CC(=C2)Cl)O,Inconclusive,inconclusive antagonist,inactive,
1463,CC(=O)[C@]1(CC[C@@H]2[C@@]1(CC[C@H]3[C@H]2CCC4...,Inconclusive,inconclusive antagonist,inactive,
...,...,...,...,...,...
3095,CC1=CC(=CC=C1)C,Inconclusive,inconclusive,inactive,
3096,C1=CC(=C(C=C1C2=CC(=C(C=C2)N)Cl)Cl)N,Inconclusive,inconclusive,inactive,
3097,CCCCCCCCCCCCCCCCCC(=O)OCC(CO)O,Inconclusive,inconclusive,inactive,
3098,CCOC(=O)CS,Inconclusive,inconclusive,inactive,


In [165]:
data_non_nan = data_non_nan.loc[data['Antagonist Efficacy (%)'].notna()]

In [167]:
data_non_nan.sort_values(by='Antagonist Efficacy (%)')

Unnamed: 0,PUBCHEM_SMILES,PUBCHEM_ACTIVITY_OUTCOME,Antagonist Activity,Viability Activity,Antagonist Efficacy (%)
2220,C1=CC=C2C(=C1)C(=O)C(=C(C2=O)Cl)Cl,Inconclusive,inconclusive antagonist,active antagonist,-10.5135
636,C1=CC=C(C(=C1)CCl)CCl,Active,active antagonist,inactive,-100.025
488,CN(C(=O)NC1=CC(=C(C=C1)Br)Cl)OC,Active,active antagonist,inactive,-100.03
2044,C/C=C/C(=O)OC1=C(C=C(C=C1[N+](=O)[O-])[N+](=O)...,Inconclusive,active antagonist,active antagonist,-100.048
2043,CN(C)C(=O)C(CCN1CCC(CC1)(C2=CC=C(C=C2)Cl)O)(C3...,Inconclusive,active antagonist,active antagonist,-100.118
...,...,...,...,...,...
2350,C1CCOC(C1)N2C=NC3=C(N=CN=C32)NCC4=CC=CC=C4,Inconclusive,active agonist,inactive,97.4415
2718,C=CC(=O)OCCC(=O)O,Inconclusive,inconclusive agonist,inactive,97.8793
2351,CC1=CC(=C(C=C1)N)N,Inconclusive,active agonist,inactive,99.046
2352,CC(C)OC1=CC2=C(C=C1)C(=O)C(=CO2)C3=CC=CC=C3,Inconclusive,active agonist,inactive,99.467


## 3. Remove Duplicates<a name = 3></a>

In [168]:
len(data_non_nan['PUBCHEM_SMILES'].unique())

7376

In [169]:
data_non_nan.loc[data_non_nan['PUBCHEM_SMILES'].duplicated()].sort_values(by='PUBCHEM_SMILES')

Unnamed: 0,PUBCHEM_SMILES,PUBCHEM_ACTIVITY_OUTCOME,Antagonist Activity,Viability Activity,Antagonist Efficacy (%)
8671,B(O)(O)O,Inactive,inactive,inactive,0
8310,B(OC(C)C)(OC(C)C)OC(C)C,Inactive,inactive,inactive,0
7052,C(#N)C(Br)Br,Inactive,inactive,inactive,0
90,C(#N)C1=C(C(=C(C(=C1Cl)Cl)Cl)C#N)Cl,Active,active antagonist,active antagonist,-102.965
8309,C(#N)N=C(N)N,Inactive,inactive,inactive,0
...,...,...,...,...,...
9322,[NH2-].[NH2-].Cl[Pt+2]Cl,Inactive,inactive,inactive,0
6925,[NH4+].NS(=O)(=O)[O-],Inactive,inactive,inactive,0
8727,[NH4+].[N+](=O)([O-])[O-],Inactive,inactive,inactive,0
8744,[NH4+].[O-]Cl(=O)(=O)=O,Inactive,inactive,inactive,0


In [170]:
data_clean = data_non_nan.loc[~data_non_nan['PUBCHEM_SMILES'].duplicated()]
len(data_clean)

7376

## 4. Molecule Visualization<a name = 4></a>

In [171]:
data_clean = data_clean.copy()
data_clean['Molecule'] = data_clean['PUBCHEM_SMILES'].apply(lambda x: Chem.MolFromSmiles(x) if x is not None else None)

[16:27:31] Explicit valence for atom # 1 Si, 8, is greater than permitted


In [173]:
data_clean.sort_values(by='Antagonist Efficacy (%)')

Unnamed: 0,PUBCHEM_SMILES,PUBCHEM_ACTIVITY_OUTCOME,Antagonist Activity,Viability Activity,Antagonist Efficacy (%),Molecule
636,C1=CC=C(C(=C1)CCl)CCl,Active,active antagonist,inactive,-100.025,<rdkit.Chem.rdchem.Mol object at 0x7ec59728a030>
488,CN(C(=O)NC1=CC(=C(C=C1)Br)Cl)OC,Active,active antagonist,inactive,-100.03,<rdkit.Chem.rdchem.Mol object at 0x7ec59729ea40>
2044,C/C=C/C(=O)OC1=C(C=C(C=C1[N+](=O)[O-])[N+](=O)...,Inconclusive,active antagonist,active antagonist,-100.048,<rdkit.Chem.rdchem.Mol object at 0x7ec5972c28f0>
2043,CN(C)C(=O)C(CCN1CCC(CC1)(C2=CC=C(C=C2)Cl)O)(C3...,Inconclusive,active antagonist,active antagonist,-100.118,<rdkit.Chem.rdchem.Mol object at 0x7ec5972c2880>
487,COC1CN(CCC1NC(=O)C2=CC(=C(C=C2OC)N)Cl)CCCOC3=C...,Active,active antagonist,inactive,-100.191,<rdkit.Chem.rdchem.Mol object at 0x7ec59729e9d0>
...,...,...,...,...,...,...
2350,C1CCOC(C1)N2C=NC3=C(N=CN=C32)NCC4=CC=CC=C4,Inconclusive,active agonist,inactive,97.4415,<rdkit.Chem.rdchem.Mol object at 0x7ec5972c9070>
2718,C=CC(=O)OCCC(=O)O,Inconclusive,inconclusive agonist,inactive,97.8793,<rdkit.Chem.rdchem.Mol object at 0x7ec597284660>
2351,CC1=CC(=C(C=C1)N)N,Inconclusive,active agonist,inactive,99.046,<rdkit.Chem.rdchem.Mol object at 0x7ec5972c90e0>
2352,CC(C)OC1=CC2=C(C=C1)C(=O)C(=CO2)C3=CC=CC=C3,Inconclusive,active agonist,inactive,99.467,<rdkit.Chem.rdchem.Mol object at 0x7ec5972c9150>


In [176]:
data_clean['Antagonist Efficacy (%)'].describe()

count     7376
unique    2080
top          0
freq      5286
Name: Antagonist Efficacy (%), dtype: object

In [177]:
data = data_clean.copy()
data_clean['Antagonist Efficacy (%)'] = data_clean['Antagonist Efficacy (%)'].astype('float64')

In [178]:
data_clean['Antagonist Efficacy (%)'].dtype

dtype('float64')

In [179]:
mols2grid.display(data_clean, mol_col = 'Molecule', subset=['PUBCHEM_ACTIVITY_OUTCOME', 'Antagonist Activity', 'Antagonist Efficacy (%)'],
                  transform={"Antagonist Efficacy (%)": lambda x: f"{x:.2f}"})

Output hidden; open in https://colab.research.google.com to view.

## 5. Remove Inconclusive Results<a name = 5></a>

In [180]:
data_clean['PUBCHEM_ACTIVITY_OUTCOME'].unique()

array(['Active', 'Inconclusive', 'Inactive'], dtype=object)

In [181]:
data_clean.loc[data_clean['PUBCHEM_ACTIVITY_OUTCOME'] == 'Inconclusive']

Unnamed: 0,PUBCHEM_SMILES,PUBCHEM_ACTIVITY_OUTCOME,Antagonist Activity,Viability Activity,Antagonist Efficacy (%),Molecule
1112,CCC1=CC(=C(C(=C1)C(C)(C)C)O)C(C)(C)C,Inconclusive,inconclusive antagonist,inactive,-158.4640,<rdkit.Chem.rdchem.Mol object at 0x7ec5972a9700>
1113,CN(C1CCCCC1)C(=O)CCCOC2=CC3=C(C=C2)NC(=O)C=C3,Inconclusive,inconclusive antagonist,inactive,-151.7800,<rdkit.Chem.rdchem.Mol object at 0x7ec5972a9770>
1114,C1CCN(CC1)C2=NC(=NC3=C2N=C(N=C3N4CCCCC4)N(CCO)...,Inconclusive,inconclusive antagonist,active agonist,-144.6710,<rdkit.Chem.rdchem.Mol object at 0x7ec5972a97e0>
1115,C(CS)C(=O)OCC(COC(=O)CCS)(COC(=O)CCS)COC(=O)CCS,Inconclusive,inconclusive antagonist,inactive,-143.0470,<rdkit.Chem.rdchem.Mol object at 0x7ec5972a9850>
1116,C1=CC=C(C=C1)NC2=CC=C(C=C2)N.Cl,Inconclusive,inconclusive antagonist,inconclusive antagonist,-138.3270,<rdkit.Chem.rdchem.Mol object at 0x7ec5972a98c0>
...,...,...,...,...,...,...
3060,C1=CC(=CC=C1N=NC2=CC(=C(C=C2)O)C(=O)O)[N+](=O)...,Inconclusive,active agonist,inconclusive antagonist,194.8110,<rdkit.Chem.rdchem.Mol object at 0x7ec5972852a0>
3062,C1=CC=C(C=C1)N=NC2=C(N=C(C=C2)N)N.Cl,Inconclusive,inconclusive agonist,active antagonist,34.0911,<rdkit.Chem.rdchem.Mol object at 0x7ec597285310>
3063,CN(C)C1=CC=C(C=C1)C(=O)C2=CC=C(C=C2)N(C)C,Inconclusive,inconclusive agonist,active antagonist,58.4712,<rdkit.Chem.rdchem.Mol object at 0x7ec597285380>
3064,C1=CSC(=C1)C(=O)NC2=NC=C(S2)[N+](=O)[O-],Inconclusive,inconclusive agonist,active antagonist,71.1967,<rdkit.Chem.rdchem.Mol object at 0x7ec5972853f0>


In [182]:
data_clean = data_clean.loc[data_clean['PUBCHEM_ACTIVITY_OUTCOME'] != 'Inconclusive']
data_clean.sort_values(by='Antagonist Efficacy (%)')

Unnamed: 0,PUBCHEM_SMILES,PUBCHEM_ACTIVITY_OUTCOME,Antagonist Activity,Viability Activity,Antagonist Efficacy (%),Molecule
191,CC1=CC(=CC(=C1O)C)C(C)(C)C2=CC(=C(C(=C2)C)O)C,Active,active antagonist,inactive,-177.383,<rdkit.Chem.rdchem.Mol object at 0x7ec597273df0>
171,C1=CC(=CC2=NC3=C(C=CC(=C3)N)C=C21)N.C1=CC(=CC2...,Active,active antagonist,active agonist,-142.931,<rdkit.Chem.rdchem.Mol object at 0x7ec597273680>
68,C1=CC(=C2C(=C1NCCNCCO)C(=O)C3=C(C=CC(=C3C2=O)O...,Active,active antagonist,active antagonist,-142.058,<rdkit.Chem.rdchem.Mol object at 0x7ec597271310>
193,CN1C2=NC(=NC=C2CN(C1=O)C3=CC=CC=C3Br)NC4=CC5=C...,Active,active antagonist,inconclusive antagonist,-135.678,<rdkit.Chem.rdchem.Mol object at 0x7ec597273e60>
347,CC1=C(C(C(=C(N1)C)C(=O)OC/C=C/C2=CC=CC=C2)C3=C...,Active,active antagonist,inactive,-135.491,<rdkit.Chem.rdchem.Mol object at 0x7ec5972974c0>
...,...,...,...,...,...,...
4981,CC1CC(OC(O1)C)OC(=O)C,Inactive,inactive,inactive,0.000,<rdkit.Chem.rdchem.Mol object at 0x7ec5972f9cb0>
4980,C1=C(C(=CC(=C1Cl)Cl)Cl)Cl,Inactive,inactive,inactive,0.000,<rdkit.Chem.rdchem.Mol object at 0x7ec5972f9c40>
4979,CC(=C)C(=O)OCCC(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)...,Inactive,inactive,inactive,0.000,<rdkit.Chem.rdchem.Mol object at 0x7ec5972f9bd0>
4977,CCCCC(CC)COC(=O)CCCCC(=O)OCC1=CC=CC=C1,Inactive,inactive,inactive,0.000,<rdkit.Chem.rdchem.Mol object at 0x7ec5972f9af0>


## 6. Removing Active Agonist = Toxic<a name = 6></a>

In [183]:
data_clean.loc[data_clean['Viability Activity'] == 'active agonist']

Unnamed: 0,PUBCHEM_SMILES,PUBCHEM_ACTIVITY_OUTCOME,Antagonist Activity,Viability Activity,Antagonist Efficacy (%),Molecule
21,CC[C@@]1(C2=C(COC1=O)C(=O)N3CC4=CC5=C(C=CC(=C5...,Active,active antagonist,active agonist,-108.742,<rdkit.Chem.rdchem.Mol object at 0x7ec597270120>
171,C1=CC(=CC2=NC3=C(C=CC(=C3)N)C=C21)N.C1=CC(=CC2...,Active,active antagonist,active agonist,-142.931,<rdkit.Chem.rdchem.Mol object at 0x7ec597273680>
288,C[N+]1=C2C=C(C=CC2=CC3=C1C=C(C=C3)N)N.C1=CC(=C...,Active,active antagonist,active agonist,-80.1402,<rdkit.Chem.rdchem.Mol object at 0x7ec597296110>
810,CC1=C(C(=O)OC2=C1C=CC(=C2)O)Cl,Active,active antagonist,active agonist,-55.1457,<rdkit.Chem.rdchem.Mol object at 0x7ec5972a6260>
3103,CC1=CC2=C(C=C1C)N(C3=NC(=O)NC(=O)C3=N2)C[C@@H]...,Inactive,inactive,active agonist,0.0,<rdkit.Chem.rdchem.Mol object at 0x7ec5972855b0>
3119,C1=CC=C2C(=C1)C(=O)C3=CC=CC=C3N2CC(=O)O,Inactive,inactive,active agonist,0.0,<rdkit.Chem.rdchem.Mol object at 0x7ec597285a10>
3135,C1=CC=C2C(=C1)C(=O)OC23C4=C(C=C(C=C4)O)OC5=C3C...,Inactive,inactive,active agonist,0.0,<rdkit.Chem.rdchem.Mol object at 0x7ec597285f50>


In [184]:
len(data_clean.loc[data_clean['Viability Activity'] == 'active agonist'])

7

In [185]:
data_clean = data_clean.loc[data_clean['Viability Activity'] != 'active agonist']

In [186]:
len(data_clean)

6214

## 7. Remove Weak Compounds<a name = 7></a>

In [187]:
data_clean.sort_values(by='Antagonist Efficacy (%)')

Unnamed: 0,PUBCHEM_SMILES,PUBCHEM_ACTIVITY_OUTCOME,Antagonist Activity,Viability Activity,Antagonist Efficacy (%),Molecule
191,CC1=CC(=CC(=C1O)C)C(C)(C)C2=CC(=C(C(=C2)C)O)C,Active,active antagonist,inactive,-177.383,<rdkit.Chem.rdchem.Mol object at 0x7ec597273df0>
68,C1=CC(=C2C(=C1NCCNCCO)C(=O)C3=C(C=CC(=C3C2=O)O...,Active,active antagonist,active antagonist,-142.058,<rdkit.Chem.rdchem.Mol object at 0x7ec597271310>
193,CN1C2=NC(=NC=C2CN(C1=O)C3=CC=CC=C3Br)NC4=CC5=C...,Active,active antagonist,inconclusive antagonist,-135.678,<rdkit.Chem.rdchem.Mol object at 0x7ec597273e60>
347,CC1=C(C(C(=C(N1)C)C(=O)OC/C=C/C2=CC=CC=C2)C3=C...,Active,active antagonist,inactive,-135.491,<rdkit.Chem.rdchem.Mol object at 0x7ec5972974c0>
306,C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@]2(C)O)CC[C@@...,Active,active antagonist,inactive,-135.449,<rdkit.Chem.rdchem.Mol object at 0x7ec5972966c0>
...,...,...,...,...,...,...
4983,C(C(C(C(C(F)(F)S(=O)(=O)[O-])(F)F)(F)F)(F)F)(C...,Inactive,inactive,inactive,0.000,<rdkit.Chem.rdchem.Mol object at 0x7ec5972f9d90>
4982,C(=O)(C(C(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)(F)F...,Inactive,inactive,inactive,0.000,<rdkit.Chem.rdchem.Mol object at 0x7ec5972f9d20>
4981,CC1CC(OC(O1)C)OC(=O)C,Inactive,inactive,inactive,0.000,<rdkit.Chem.rdchem.Mol object at 0x7ec5972f9cb0>
4979,CC(=C)C(=O)OCCC(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)...,Inactive,inactive,inactive,0.000,<rdkit.Chem.rdchem.Mol object at 0x7ec5972f9bd0>


In [147]:
mask = ~((data_clean['Antagonist Efficacy (%)'] > -40) &
         (data_clean['PUBCHEM_ACTIVITY_OUTCOME'] == 'Active'))
data_active_inactive_clean = data_clean.loc[mask]

In [148]:
len(data_clean)

6047

In [150]:
data_clean.sort_values(by='Antagonist Efficacy (%)')

Unnamed: 0,PUBCHEM_SMILES,PUBCHEM_ACTIVITY_OUTCOME,Antagonist Activity,Viability Activity,Antagonist Efficacy (%),Molecule
191,CC1=CC(=CC(=C1O)C)C(C)(C)C2=CC(=C(C(=C2)C)O)C,Active,active antagonist,inactive,-177.383,<rdkit.Chem.rdchem.Mol object at 0x7ec597490a50>
68,C1=CC(=C2C(=C1NCCNCCO)C(=O)C3=C(C=CC(=C3C2=O)O...,Active,active antagonist,active antagonist,-142.058,<rdkit.Chem.rdchem.Mol object at 0x7ec5976105f0>
193,CN1C2=NC(=NC=C2CN(C1=O)C3=CC=CC=C3Br)NC4=CC5=C...,Active,active antagonist,inconclusive antagonist,-135.678,<rdkit.Chem.rdchem.Mol object at 0x7ec597490ac0>
347,CC1=C(C(C(=C(N1)C)C(=O)OC/C=C/C2=CC=CC=C2)C3=C...,Active,active antagonist,inactive,-135.491,<rdkit.Chem.rdchem.Mol object at 0x7ec5974a4120>
306,C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@]2(C)O)CC[C@@...,Active,active antagonist,inactive,-135.449,<rdkit.Chem.rdchem.Mol object at 0x7ec597493290>
...,...,...,...,...,...,...
4979,CC(=C)C(=O)OCCC(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)...,Inactive,inactive,inactive,0.000,<rdkit.Chem.rdchem.Mol object at 0x7ec597519cb0>
4978,CC(CC1=CC=C(C=C1)OCC(=O)O)NCC(C2=CSC(=N2)C(F)(...,Inactive,inactive,inactive,0.000,<rdkit.Chem.rdchem.Mol object at 0x7ec597519c40>
4977,CCCCC(CC)COC(=O)CCCCC(=O)OCC1=CC=CC=C1,Inactive,inactive,inactive,0.000,<rdkit.Chem.rdchem.Mol object at 0x7ec597519bd0>
4984,CC(=CCCC(=C)C=C)C,Inactive,inactive,inactive,0.000,<rdkit.Chem.rdchem.Mol object at 0x7ec597519ee0>


## 8. Save the DataSet<a name = 8></a>

In [None]:
data_clean.to_csv('AID_1259247_final.csv', index=False)

In [None]:
len(data_clean)

5135

In [None]:
data_clean = data_clean.loc[data_clean['Molecule'].notna()]
len(data_clean)

5134

In [None]:
PandasTools.WriteSDF(data_clean, "AID_1259247_final.sdf", molColName='Molecule',
                     properties=['PUBCHEM_SMILES', 'PUBCHEM_ACTIVITY_OUTCOME', 'Antagonist Activity',
                                 'Viability Activity', 'Antagonist Efficacy (%)'])