In [1]:
import os
import numpy as np
import pandas as pd
import math
import re
from rdkit import Chem

import deepchem as dc

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/zhanghanwen/anaconda3/envs/molnet/lib/python3.7/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'pytorch_lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [2]:
def mksure_path(dirs_or_files):
    """Make sure the directory `dirs_or_files` exists, creating parents as needed.

    Args:
        dirs_or_files: path of the directory to create. If something already
            exists at that path (directory or file) nothing is done.

    Returns:
        None.
    """
    if not os.path.exists(dirs_or_files):
        # exist_ok=True closes the window between the existence check and the
        # creation: another process creating the same directory in between no
        # longer makes this call raise FileExistsError.
        os.makedirs(dirs_or_files, exist_ok=True)

In [3]:
# file path
# Base folder that holds both the raw ChEMBL exports and the generated datasets.
root = "./ChEMBL_data/"
# Raw CSV exports that get walked to build the datasets.
chembl_data_path = f"{root}in_vitro/"
# Output folders: one for pValue labels, one for min-max normalised pValue labels.
chembl_pvalue_dataset_path = f"{root}Datasets/pValue/"
chembl_normpvalue_dataset_path = f"{root}Datasets/norm_pValue/"

In [4]:
# function
def pvalue_computer(x):
    """Return -log10(x * 10**-9), rounded to two decimals.

    Args:
        x: activity value to convert (presumably a concentration in nM, so the
            10**-9 factor rescales it to molar - confirm against the data source).

    Returns:
        The negative base-10 logarithm of the rescaled value, rounded to 2 places.
    """
    rescaled = x * math.pow(10, -9)
    negative_log = -np.log10(rescaled)
    return round(negative_log, 2)
# Shared DeepChem CircularFingerprint featurizer configured with size=1024;
# CreateDatasets calls its featurize() on the canonical SMILES column.
featurizer = dc.feat.CircularFingerprint(size=1024)  # 'ECFP'


In [5]:
# read SMILES and bioactivities and shape them into a dataset
def loadBiasedData(smiles_path, pro_path):
    """Read line-aligned SMILES and activity files into two parallel lists.

    Every SMILES is re-written through RDKit as a non-isomeric SMILES string.
    Entries that RDKit cannot parse or convert are printed and skipped together
    with their activity value, so the two returned lists stay aligned.

    Args:
        smiles_path: text file with one SMILES string per line.
        pro_path: text file with one numeric value per line, in the same order
            as `smiles_path`.

    Returns:
        (smiless, pros): list of converted SMILES strings and list of floats.

    Raises:
        AssertionError: if the two files do not have the same number of lines.
        ValueError: if a line of `pro_path` cannot be converted to float.
    """
    with open(smiles_path) as f:
        smiles = f.readlines()
    with open(pro_path) as f:
        pro = f.readlines()
    assert len(smiles) == len(pro)

    smiless = []
    pros = []
    for smiles_line, pro_line in zip(smiles, pro):
        cur_smi = smiles_line.strip()
        cur_pro = float(pro_line.strip())
        try:
            mol = Chem.MolFromSmiles(cur_smi)
            # MolFromSmiles signals an unparsable SMILES by returning None
            # rather than raising, so test for it explicitly.
            canonical_smi = None if mol is None else Chem.MolToSmiles(mol, isomericSmiles=False)
        except Exception:
            # Only swallow ordinary errors; KeyboardInterrupt / SystemExit must
            # still be able to stop a long-running load.
            canonical_smi = None
        if canonical_smi is None:
            print(cur_smi)
            continue
        smiless.append(canonical_smi)
        pros.append(cur_pro)

    return smiless, pros

In [5]:
# create a dataset
def CreateDatasets(csv_file_path, pvalue_dir, norm_pvalue_dir):
    """Build a pValue dataset and a min-max normalised pValue dataset from one CSV.

    The CSV is filtered to complete rows with a finite, strictly positive
    `standard_value`; `pvalue_computer` turns that value into the pValue label;
    the normalised label is (pValue - min) / (max - min) rounded to 2 decimals,
    computed over the rows that are actually written. Features come from the
    module-level `featurizer`, ids are the `canonical_smiles` strings, and the
    single task is named "<assay_type>_<target_chembl_id>_<standard_type>" from
    the first unique value of each column.

    Args:
        csv_file_path: CSV with the columns assay_type, canonical_smiles,
            molecule_chembl_id, pchembl_value, standard_type, standard_value
            and target_chembl_id.
        pvalue_dir: directory passed as `data_dir` for the pValue dataset.
        norm_pvalue_dir: directory passed as `data_dir` for the normalised dataset.

    Returns:
        None. Nothing is written (and a message is printed) when fewer than 100
        usable rows remain or when every pValue is identical.
    """
    activity_table = pd.read_csv(csv_file_path, usecols=['assay_type', 'canonical_smiles', 'molecule_chembl_id', 'pchembl_value', 'standard_type', 'standard_value', 'target_chembl_id'])
    # delete nan first, so discarded rows cannot influence the normalisation range
    activity_table = activity_table.dropna(axis=0, how='any')
    # log10 is undefined for values <= 0: such rows would yield inf/NaN pValues
    # and make the min-max range infinite, flattening every normalised label to 0.
    raw_values = activity_table["standard_value"]
    activity_table = activity_table[np.isfinite(raw_values) & (raw_values > 0)]
    if len(activity_table) < 100:
        print(csv_file_path + ", no 100 number!")
        return

    # log it
    p_values = activity_table["standard_value"].apply(pvalue_computer)
    # normalize it
    lowest = p_values.min()
    denominator = p_values.max() - lowest
    if denominator == 0:
        # A constant label cannot be min-max scaled (0/0); previously this
        # produced NaN for every row and the file was dropped by the NaN filter.
        print(csv_file_path + ", constant pValue, cannot normalize!")
        return
    norm_p_values = ((p_values - lowest) / denominator).round(2)

    # task name
    task_name = [activity_table["assay_type"].unique()[0] + "_" + activity_table["target_chembl_id"].unique()[0] + "_" + activity_table["standard_type"].unique()[0]]

    # create the datasets (both share the same features and ids)
    smiles = activity_table["canonical_smiles"]
    features = featurizer.featurize(smiles)
    print(pvalue_dir)
    dc.data.DiskDataset.from_numpy(X=features, y=p_values.to_numpy(), ids=smiles.to_numpy(),
                                   tasks=task_name,
                                   data_dir=pvalue_dir)
    print(norm_pvalue_dir)
    dc.data.DiskDataset.from_numpy(X=features, y=norm_p_values.to_numpy(), ids=smiles.to_numpy(),
                                   tasks=task_name,
                                   data_dir=norm_pvalue_dir)


In [6]:
def get_files(chembl_data_path):
    """Walk `chembl_data_path` and build the datasets for every CSV file found.

    For each "<name>.csv" the output folders are
    chembl_pvalue_dataset_path / chembl_normpvalue_dataset_path followed by
    "<assay>/<target>/<bio>/<name>/", where the three middle components are the
    last three '/'-separated parts of the folder containing the CSV. Both
    folders are created, then CreateDatasets is called on the CSV.

    Files that do not end in ".csv" are ignored.

    Args:
        chembl_data_path: root folder to walk.

    Returns:
        List of the CSV paths that were handed to CreateDatasets, in walk order.
        NOTE(review): a path is listed even when CreateDatasets decides to skip
        the file; its output folders then exist but stay empty.
    """
    files = []
    for filepath, dirnames, filenames in os.walk(chembl_data_path):
        # Only real CSV exports are processed; stray files (editor backups,
        # .DS_Store, ...) used to crash the regex-based name extraction.
        csv_names = [name for name in filenames if name.endswith(".csv")]
        if csv_names:
            # NOTE(review): assumes '/'-separated paths at least three levels deep.
            path_parts = filepath.split('/')
            sub_dir = path_parts[-3] + "/" + path_parts[-2] + "/" + path_parts[-1] + "/"
            pvalue_data_dir = chembl_pvalue_dataset_path + sub_dir
            norm_pvalue_data_dir = chembl_normpvalue_dataset_path + sub_dir
        for filename in csv_names:
            cur_target = os.path.splitext(filename)[0]
            source_csv_path = os.path.join(filepath, filename)
            goal_pvalue_dir = pvalue_data_dir + cur_target + "/"
            goal_norm_pvalue_dir = norm_pvalue_data_dir + cur_target + "/"
            mksure_path(goal_pvalue_dir)
            mksure_path(goal_norm_pvalue_dir)
            # begin to create
            CreateDatasets(source_csv_path, goal_pvalue_dir, goal_norm_pvalue_dir)
            files.append(source_csv_path)
        print("-"*8)
    return files

In [8]:
def read_chembl_datasets():
    """Load the pValue / norm_pValue datasets that belong to each source CSV.

    Walks the module-level `chembl_data_path`; for every "<name>.csv" the
    dataset folders are chembl_pvalue_dataset_path /
    chembl_normpvalue_dataset_path followed by "<assay>/<target>/<bio>/<name>/",
    where the middle components are the last three '/'-separated parts of the
    folder containing the CSV. Progress information is printed along the way.

    Files not ending in ".csv" are ignored. A CSV whose dataset folders are
    missing or empty (get_files leaves them empty when CreateDatasets skips a
    file) is reported and skipped, so the three returned lists stay aligned.
    This function only reads: it no longer creates folders.

    Returns:
        (files, pvalue_datasets_list, norm_pvalue_datasets_list): the CSV paths
        and, index-aligned with them, the loaded dc.data.DiskDataset objects.
    """
    files = []
    pvalue_datasets_list = []
    norm_pvalue_datasets_list = []
    add_filenames = 0
    for filepath, dirnames, filenames in os.walk(chembl_data_path):
        # Ignore stray non-CSV files; they used to crash the regex-based
        # name extraction with an IndexError.
        csv_names = [name for name in filenames if name.endswith(".csv")]
        if csv_names:
            # NOTE(review): assumes '/'-separated paths at least three levels deep.
            path_parts = filepath.split('/')
            sub_dir = path_parts[-3] + "/" + path_parts[-2] + "/" + path_parts[-1] + "/"
            pvalue_data_dir = chembl_pvalue_dataset_path + sub_dir
            norm_pvalue_data_dir = chembl_normpvalue_dataset_path + sub_dir
            print(norm_pvalue_data_dir, len(csv_names))
            add_filenames += len(csv_names)
        for filename in csv_names:
            cur_target = os.path.splitext(filename)[0]
            print(cur_target)
            source_csv_path = os.path.join(filepath, filename)
            goal_pvalue_dir = pvalue_data_dir + cur_target + "/"
            goal_norm_pvalue_dir = norm_pvalue_data_dir + cur_target + "/"
            # A folder that is absent or empty holds no saved dataset.
            if not all(os.path.isdir(d) and os.listdir(d)
                       for d in (goal_pvalue_dir, goal_norm_pvalue_dir)):
                print(source_csv_path + ", no saved dataset, skipped!")
                continue
            pvalue_dataset = dc.data.DiskDataset(goal_pvalue_dir)
            # Load the normalised dataset once and reuse it for the log line.
            norm_pvalue_dataset = dc.data.DiskDataset(goal_norm_pvalue_dir)
            pvalue_datasets_list.append(pvalue_dataset)
            norm_pvalue_datasets_list.append(norm_pvalue_dataset)
            print(filename, norm_pvalue_dataset.get_task_names())
            files.append(source_csv_path)
        print("-"*8)
    print(add_filenames)

    return files, pvalue_datasets_list, norm_pvalue_datasets_list

In [11]:
# Number of source CSV paths collected.
# NOTE(review): `files` is not assigned in any cell shown here; presumably it comes
# from read_chembl_datasets() (or get_files()) in a cell that was not kept - confirm.
print(len(files))

1395


In [12]:
# Sanity check: the CSV list and both dataset lists should have the same length.
# NOTE(review): presumably all three come from read_chembl_datasets(), which appends
# to them together; the assigning cell is not visible - confirm.
len(files), len(pvalue_datasets_list), len(norm_pvalue_datasets_list)

(1395, 1395, 1395)

In [13]:
# Task names stored with the pValue dataset at index 8.
pvalue_datasets_list[8].get_task_names()

array(['F_CHEMBL3371_IC50'], dtype='<U17')

In [14]:
# Prefix the first dataset's first task name with its list index.
"0: " + pvalue_datasets_list[0].get_task_names()[0]

'0: F_CHEMBL4333_IC50'

In [15]:
# Labels (y) of the pValue dataset at index 8.
# NOTE(review): this displays the whole array; consider slicing before sharing.
pvalue_datasets_list[8].y

array([6.1 , 6.52, 9.1 , 7.95, 7.85, 7.51, 8.05, 6.77, 8.39, 8.01, 7.47,
       9.4 , 8.77, 7.71, 8.07, 7.86, 9.15, 8.41, 8.03, 7.37, 7.83, 7.37,
       8.06, 8.57, 8.89, 9.1 , 7.85, 8.8 , 7.67, 7.59, 7.68, 7.98, 8.96,
       8.14, 8.12, 9.22, 7.09, 7.12, 7.02, 8.09, 7.31, 7.25, 7.61, 8.14,
       9.1 , 9.  , 6.95, 7.96, 7.27, 7.28, 7.07, 7.36, 7.81, 6.97, 7.01,
       7.68, 6.91, 7.6 , 7.21, 6.96, 6.96, 7.3 , 8.  , 7.59, 6.19, 7.96,
       6.92, 6.88, 7.25, 7.64, 7.02, 6.85, 7.04, 6.67, 6.73, 6.87, 6.47,
       7.59, 8.1 , 6.31, 7.3 , 6.47, 7.15, 7.55, 6.17, 7.76, 8.02, 6.21,
       6.9 , 6.79, 7.96, 7.59, 7.28, 6.59, 6.77, 8.1 , 8.  , 6.92, 6.61,
       6.98, 7.11, 7.02, 6.93, 7.  , 5.8 , 7.28, 6.24, 7.02, 7.24, 7.33,
       6.62, 7.27, 6.75, 7.39, 6.41, 6.53, 6.08, 6.71, 8.21, 8.08, 7.82,
       8.27, 5.8 , 5.62, 7.67, 7.27, 6.49, 6.06, 5.94, 5.98, 6.01, 5.63,
       4.68, 5.95, 5.64, 6.16, 6.18, 5.39, 6.01, 6.56, 6.8 , 5.43, 5.7 ,
       5.82, 5.25, 5.53, 5.84, 5.76, 6.  , 6.26, 5.

In [30]:
# Distinct ids of the first pValue dataset.
# NOTE(review): presumably these are the canonical_smiles strings that CreateDatasets
# passes as ids - confirm; the full set is displayed, consider limiting it.
set(pvalue_datasets_list[0].ids)

{'C=C1C(=O)C=C2CN(C(=O)c3ccccc3)[C@@](Cc3ccc(F)cc3)(C(=O)OC)[C@@H]12',
 'C=CCn1c(O)nnc1Sc1ncc([N+](=O)[O-])s1',
 'CC(=O)Cn1ncc(Br)c(Br)c1=O',
 'CC(=O)N(C1=C(Cl)C(=O)c2ccccc2C1=O)C1CCCCC1',
 'CC(=O)n1cc(N(C(=O)CCl)c2ccc(C)cc2)c2ccccc21',
 'CC(=O)n1cc(N(C(=O)CCl)c2ccc(Cl)cc2)c2ccccc21',
 'CC(C)(C)C(=O)/C(=C\\c1cccc([N+](=O)[O-])c1)n1cncn1',
 'CC(C)(C)c1ccc(-n2nc(C#N)c(Cl)cc2=O)cc1',
 'CC(C)Oc1ccc(COc2ccc3c(c2)c(Cl)c2n3CCOC2CC(=O)O)cc1C#N',
 'CC(C)c1nnc(NC(=O)CCS(=O)(=O)c2nc(-c3cccs3)cc(C(F)(F)F)n2)s1',
 'CC1(C)C[C@H]2C=C(C=O)[C@]3(C=O)C[C@]3(C)[C@H]2C1',
 'CC1=CC2/C=C(\\C)CCC(O)C(O)/C=C/C(=O)C23C(=O)NC(CC(C)C)C3C1C',
 'CC1=NNC(=O)/C1=C\\c1cn(-c2ccccc2)nc1-c1ccccc1',
 'CCC(Nc1cc(C)cc(CN2CC(C(=O)O)C2)c1)c1ccc(Cl)c(C)c1',
 'CCC(Nc1cc(C)cc(CN2CC[C@@H](C(=O)O)C2)c1)c1ccc(Cl)c(C)c1',
 'CCC(Nc1ccc(C)c(C(C)N2CC(C(=O)O)C2)c1)c1ccc(Cl)c(C)c1',
 'CCC(Nc1ccc(C)c(CN2CC(C(=O)O)C2)c1)c1ccc(Cl)c(C)c1',
 'CCC(Nc1cccc(C(C)N2CC(C(=O)O)C2)c1)c1ccc(Cl)c(C)c1',
 'CCC(Nc1cccc(CN2CC(C(=O)O)C2)c1)c1ccc(Cl)c(C)c1