In [21]:
from chembl_webresource_client.new_client import new_client
import pandas as pd
import numpy as np
from rdkit.Chem import PandasTools
from chembl_webresource_client.new_client import new_client
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Handle on the ChEMBL client's `molecule` resource; reused by later cells.
molecule = new_client.molecule

# Only these fields are requested for each molecule record.
approved_drug_fields = [
    'indication_class',
    'molecule_chembl_id',
    'molecule_type',
    'pref_name',
]

# Molecules with max_phase == 4, ordered by molecule type.
approved_drugs = (
    molecule
    .filter(max_phase=4)
    .order_by('molecule_type')
    .only(approved_drug_fields)
)

# Turn the query results into a table, one row per molecule record.
df = pd.DataFrame(approved_drugs)

In [17]:
# Show the table built from the max_phase == 4 query (all molecule types).
df

Unnamed: 0,indication_class,molecule_chembl_id,molecule_type,pref_name
0,,CHEMBL4297774,Antibody,AMIVANTAMAB
1,,CHEMBL1743007,Antibody,DARATUMUMAB
2,,CHEMBL3301582,Antibody,POLATUZUMAB VEDOTIN
3,,CHEMBL3301587,Antibody,DURVALUMAB
4,,CHEMBL3301589,Antibody,ENFORTUMAB VEDOTIN
...,...,...,...,...
4187,Anti-Eczematic (topical),CHEMBL2108232,Unknown,COAL TAR
4188,Carminative; Stomachic; Counterirritant (exter...,CHEMBL2108236,Unknown,CAPSICUM
4189,,CHEMBL2108245,Unknown,CREOSOTE CARBONATE
4190,Hemostatic (local),CHEMBL2108268,Unknown,"CELLULOSE, OXIDIZED"


In [18]:
# Keep only the rows labelled exactly 'Small molecule' and renumber them 0..n-1
# so the old positions from the unfiltered table do not linger in the index.
is_small_molecule = df['molecule_type'].eq('Small molecule')
df = df.loc[is_small_molecule].reset_index(drop=True)
df

Unnamed: 0,indication_class,molecule_chembl_id,molecule_type,pref_name
0,Anticholinergic,CHEMBL1240,Small molecule,PROPANTHELINE BROMIDE
1,Antihistaminic,CHEMBL1241,Small molecule,TRIPELENNAMINE
2,Analgesic (urinary tract),CHEMBL1242,Small molecule,PHENAZOPYRIDINE
3,,CHEMBL141305,Small molecule,CYCLOFENIL
4,,CHEMBL139877,Small molecule,SULFACARBAMIDE
...,...,...,...,...
3587,,CHEMBL3989678,Small molecule,PIPERAZINE CITRATE
3588,,CHEMBL3989691,Small molecule,ELTROMBOPAG OLAMINE
3589,,CHEMBL3989693,Small molecule,LEVALBUTEROL TARTRATE
3590,,CHEMBL3989694,Small molecule,OLANZAPINE PAMOATE


In [19]:
# ChEMBL IDs of the small molecules kept in `df`.
small_molecule_ids = list(df["molecule_chembl_id"])

# Query the same `molecule` resource again, this time by ID, asking only for
# the ID and the structure payload of each record.
compounds_provider = (
    molecule
    .filter(molecule_chembl_id__in=small_molecule_ids)
    .only("molecule_chembl_id", "molecule_structures")
)

In [22]:
# Pull every record out of the query while showing a progress bar.
# The recorded run of this cell took roughly 12.5 minutes for 3215 records.
compounds = [record for record in tqdm(compounds_provider)]

100%|██████████| 3215/3215 [12:33<00:00,  4.27it/s]


In [23]:
# One row per downloaded record; columns come from the record keys.
drug_approved_df = pd.DataFrame.from_records(compounds)
print(f"DataFrame shape: {drug_approved_df.shape}")

DataFrame shape: (3215, 2)


In [24]:
# Inspect the raw download: `molecule_structures` holds a dict per record
# (or is empty when the record carries no structure).
drug_approved_df

Unnamed: 0,molecule_chembl_id,molecule_structures
0,CHEMBL2,{'canonical_smiles': 'COc1cc2nc(N3CCN(C(=O)c4c...
1,CHEMBL3,"{'canonical_smiles': 'CN1CCC[C@H]1c1cccnc1', '..."
2,CHEMBL4,{'canonical_smiles': 'CC1COc2c(N3CCN(C)CC3)c(F...
3,CHEMBL5,{'canonical_smiles': 'CCn1cc(C(=O)O)c(=O)c2ccc...
4,CHEMBL6,{'canonical_smiles': 'COc1ccc2c(c1)c(CC(=O)O)c...
...,...,...
3210,CHEMBL5095049,{'canonical_smiles': 'C1=C/COCc2cc(ccc2OCCN2CC...
3211,CHEMBL5095050,{'canonical_smiles': 'CCCCCCCCCCCCOS(=O)(=O)O....
3212,CHEMBL5095051,{'canonical_smiles': 'COc1ccc(C(CN(C)C)C2(O)CC...
3213,CHEMBL5095496,


In [27]:
# Discard rows with a missing value in any column, i.e. records that came back
# without a `molecule_structures` entry. Rows are the default axis for dropna.
drug_approved_df.dropna(how="any", inplace=True)
print(f"DataFrame shape: {drug_approved_df.shape}")

DataFrame shape: (3012, 2)


In [28]:
# Extract the canonical SMILES string from each record's `molecule_structures`
# dict. A record whose dict lacks the "canonical_smiles" key (or whose value is
# not a dict at all) yields None, so it can be removed by a later dropna.
#
# The iteration variable is deliberately NOT called `compounds`: that name
# holds the downloaded record list, and overwriting it would break a re-run of
# the cell that builds `drug_approved_df` from it.
canonical_smiles = [
    structures.get("canonical_smiles") if isinstance(structures, dict) else None
    for structures in drug_approved_df["molecule_structures"]
]

# Swap the nested structure column for the flat `smiles` column. Building a
# new frame (instead of assigning into the existing one) avoids pandas'
# chained-assignment warnings on frames derived from a filter.
drug_approved_df = (
    drug_approved_df
    .assign(smiles=canonical_smiles)
    .drop(columns="molecule_structures")
)
print(f"DataFrame shape: {drug_approved_df.shape}")

DataFrame shape: (3012, 2)


In [30]:
# Remove rows left without a SMILES string (extraction stores None when the
# structure dict has no "canonical_smiles" key). Rows are the default axis.
drug_approved_df.dropna(how="any", inplace=True)
print(f"DataFrame shape: {drug_approved_df.shape}")

DataFrame shape: (3012, 2)


In [31]:
# Check the column names of the small-molecule table before merging.
df.columns

Index(['indication_class', 'molecule_chembl_id', 'molecule_type', 'pref_name'], dtype='object')

In [32]:
# Check the column names of the structure table; `molecule_chembl_id` is the
# column it shares with `df`.
drug_approved_df.columns

Index(['molecule_chembl_id', 'smiles'], dtype='object')

In [36]:
# Join the small-molecule IDs with their SMILES on the shared ID column
# (pandas' default join type), then remove exact duplicate rows and renumber.
output_df = (
    df[['molecule_chembl_id']]
    .merge(drug_approved_df, on='molecule_chembl_id')
    .drop_duplicates()
    .reset_index(drop=True)
)

print(f"Dataset with {output_df.shape[0]} entries.")


Dataset with 3012 entries.


In [37]:
# Show the merged result: one ChEMBL ID and its SMILES string per row.
output_df

Unnamed: 0,molecule_chembl_id,smiles
0,CHEMBL1240,CC(C)[N+](C)(CCOC(=O)C1c2ccccc2Oc2ccccc21)C(C)...
1,CHEMBL1241,CN(C)CCN(Cc1ccccc1)c1ccccn1
2,CHEMBL1242,Nc1ccc(/N=N/c2ccccc2)c(N)n1
3,CHEMBL141305,CC(=O)Oc1ccc(C(=C2CCCCC2)c2ccc(OC(C)=O)cc2)cc1
4,CHEMBL139877,NC(=O)NS(=O)(=O)c1ccc(N)cc1
...,...,...
3007,CHEMBL3989678,C1CNCCN1.C1CNCCN1.C1CNCCN1.O.O=C(O)CC(O)(CC(=O...
3008,CHEMBL3989691,CC1=NN(c2ccc(C)c(C)c2)C(=O)/C1=N\Nc1cccc(-c2cc...
3009,CHEMBL3989693,CC(C)(C)NC[C@H](O)c1ccc(O)c(CO)c1.CC(C)(C)NC[C...
3010,CHEMBL3989694,Cc1cc2c(s1)Nc1ccccc1N=C2N1CCN(C)CC1.O.O=C(O)c1...


In [38]:
# Load a bioactivity table from a CSV in the current working directory.
# NOTE(review): the file's provenance is not shown in this notebook — document
# where 'pIC50.csv' comes from so the cell is reproducible.
df2 = pd.read_csv('pIC50.csv')
df2

Unnamed: 0,cid,smiles,bioactivity,type,value (nM),pIC50
0,127024762,C[C@@H](CO)Nc1nc2ccccc2nc1N1CCN(Cc2cc(Cl)ccc2C...,Active,IC50,1570.0,5.804100
1,127024157,C[C@@H]1C[C@H]1Nc1nc2cnncc2nc1N1CCC([C@@H](F)c...,Active,IC50,145.0,6.838632
2,155525744,C[C@H](CO)Nc1nc2ccccc2nc1N1CCN(Cc2cc(Cl)ccc2Cl...,Active,IC50,1570.0,5.804100
3,90038419,c1ccc2nc(N3CCN(Cc4coc5ccccc45)CC3)c(NC3CC3)nc2c1,Active,IC50,3960.0,5.402305
4,90038865,CC(=O)c1ccc(OC2CCN(c3nc4cnccc4nc3NC3CC3)CC2)cc...,Active,IC50,1740.0,5.759451
...,...,...,...,...,...,...
401,127024707,O=C(O)C(F)(F)F.O=C(c1nccc2nc(NC3CC3)c(N3CCC([C...,Active,IC50,6.0,8.221849
402,127024451,O=C(O)C(F)(F)F.Oc1ccc(OC2CCN(c3nc4ccncc4nc3NC3...,Active,IC50,54.0,7.267606
403,90038663,O=S(=O)(c1ccccc1)N1CCN(c2nc3cc(F)ccc3nc2NC2CC2...,Active,IC50,9440.0,5.025028
404,90037553,O=S(=O)(c1ccccc1)N1CCN(c2nc3ccccc3nc2NC2CC2)CC1,Active,IC50,5660.0,5.247184


In [None]:
# NOTE(review): unfinished cell (never executed). The column key is an empty
# string, and the table displayed after loading shows no such column, so this
# will presumably raise KeyError. Fill in the intended column name(s) before
# running, or delete the cell.
df2 = df2['']