In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KernelDensity
from ifeatpro.features import get_feature
import multiprocessing as mp

1. Get training data
2. Create DDE encoding for protein sequences
3. Get morgan_r2 for substrates
4. Concatenate into a single vector using dti data
5. Train KDE model
6. Evaluate model

In [None]:
# Training data files. The three CSVs share one directory, so the directory is
# spelled once and the dataset can be relocated by editing a single line.
TRAIN_DATA_DIR = "../../DeepConv-DTI/epp_examples/training_dataset"

esitrain_file = TRAIN_DATA_DIR + "/training_dti.csv"        # enzyme-substrate pair table
prottrain_file = TRAIN_DATA_DIR + "/training_protein.csv"   # protein table
substrain_file = TRAIN_DATA_DIR + "/training_compound.csv"  # compound table

# Sequence Encoding

**Encode the protein sequences with the iFeature DDE descriptor (computed with `ifeatpro`).**

In [None]:
# Interactive lookup of the get_feature documentation; nothing below depends on
# the output of this cell.
help(get_feature)

In [None]:
# Load the protein table, keeping only the CSV columns at positions 1 and 2.
# These two columns are written out as FASTA records for ifeatpro further down.
prot_train_df = pd.read_csv(prottrain_file, usecols=[1, 2])

In [None]:
def create_fasta(df, fastafilename):
    """Write the first two columns of ``df`` to ``fastafilename`` in FASTA format.

    Every row becomes one record: a ``>`` header line carrying the value of the
    first column, followed by one line carrying the value of the second column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose first column holds the record identifier and whose second
        column holds the sequence. Both values must be strings.
    fastafilename : str or path-like
        Destination file; it is created or overwritten.

    Returns
    -------
    None
    """
    with open(fastafilename, "w") as f:
        # itertuples(name=None) yields plain tuples, so [0] and [1] are genuine
        # positional lookups. Indexing an iterrows() Series with an integer key
        # relied on a deprecated positional fallback when the columns carry
        # string labels.
        for record in df.itertuples(index=False, name=None):
            f.write(">" + record[0] + "\n")
            f.write(record[1] + "\n")

In [None]:
# Write the training proteins as FASTA. create_fasta opens the path with
# open(..., "w"), which does not create missing parent directories, so ./data
# must already exist.
create_fasta(prot_train_df, "./data/brenda_train_fasta.fa")

In [None]:
# Create the DDE encoding for the BRENDA training enzymes.
# Arguments: input FASTA file, descriptor name, output directory. The result is
# read back from "<output directory>/dde.csv" in the next step.
get_feature("./data/brenda_train_fasta.fa", "dde", "./data/brenda_trainenz_ifeatures/")

In [None]:
# The file is read without a header row; its first column becomes the index so
# that encodings can be selected by identifier with .loc.
prot_trainenc_df = pd.read_csv("./data/brenda_trainenz_ifeatures/dde.csv", header=None, index_col=0) 

In [None]:
# Show the first two encoded proteins as a quick check of the loaded table.
prot_trainenc_df.head(2)

# Substrate Encoding

In [None]:
# Keep only CSV columns 1 and 3; the first of the two is used as the index.
# The remaining column holds one tab-separated string per compound (it is split
# on "\t" when the feature matrix is built) -- presumably the morgan_r2
# fingerprint; confirm against the CSV.
subs_trainenc_df = pd.read_csv(substrain_file, usecols=[1,3], index_col=0)

# Concatenation of vectors

In [2]:
# Training concatenation of sequences and substrates
# Reads CSV columns 1-3 of the pair table (Protein_ID, Compound_ID, Label).
# `esitrain_file` comes from the training-file paths cell, which must be run
# first; otherwise this cell raises NameError.

dti_train_df = pd.read_csv(esitrain_file, usecols=[1,2,3])

NameError: name 'esitrain_file' is not defined

In [12]:
# Show the first two training pairs to confirm the column names.
dti_train_df.head(2)

Unnamed: 0,Protein_ID,Compound_ID,Label
0,P20960,SUB-7737,0
1,O43915,SUB-5159,0


In [13]:
# keep=False removes every row whose (Protein_ID, Compound_ID) pair occurs more
# than once -- no representative of a duplicated pair is retained.
dti_train_df = dti_train_df.drop_duplicates(subset=["Protein_ID", "Compound_ID"], keep=False)

In [14]:
# Positive (Label == 1) enzyme-substrate pairs only.
dtipos_train_df = dti_train_df.loc[dti_train_df.Label==1]

# One protein-encoding row per positive pair, in pair order.
prot_enctrain_array = prot_trainenc_df.loc[dtipos_train_df.Protein_ID.values, :].values

# One raw compound row per positive pair; its single column is a tab-separated string.
subs_enctrain_array_raw = subs_trainenc_df.loc[dtipos_train_df.Compound_ID.values, :].values

# Parse every tab-separated string into its own float32 row and stack the rows.
# np.apply_along_axis allocates its output with the dtype of the FIRST row's
# result, so string tokens longer than those of the first row were silently
# truncated before the float conversion; converting row by row avoids that.
subs_enctrain_array = np.stack(
    [np.array(fingerprint.split("\t"), dtype="float32") for fingerprint in subs_enctrain_array_raw[:, 0]]
)

# Final feature vector per pair: protein encoding followed by compound encoding.
enctrain_array = np.concatenate([prot_enctrain_array, subs_enctrain_array], axis = 1)

enctrain_array.shape

(8783, 2448)

# Model training

In [15]:
class EnzymeRanker:
    """Kernel-density model fitted on positive enzyme-substrate feature vectors.

    The model is fitted once, at construction time. A sample counts as predicted
    positive when the log-density returned by ``score_samples`` is at or above a
    threshold (0 by default) and as predicted negative when it is below it.
    """

    def __init__(self, X_tr_pos, kern="gaussian", bw=0.35):
        """
        X_tr_pos: An array of concatenated enzyme and substrate feature vectors required for model training
        kern: kde model kernel
        bw: bandwidth of the kde model
        """
        self.kern = kern
        self.bw = bw
        self.model = self._get_model()
        self.model.fit(X_tr_pos)

    def _get_model(self):
        """Return an unfitted KernelDensity using this instance's kernel and bandwidth."""
        return KernelDensity(kernel=self.kern, bandwidth=self.bw)

    def get_pos_acc(self, X_va_pos, threshold=0.0):
        """Score known-positive samples.

        X_va_pos: array of feature vectors that are all expected to be positive
        threshold: log-density cut-off; scores >= threshold count as correct

        Returns (fraction of samples scored >= threshold, per-sample scores).
        """
        scores_valid = self.model.score_samples(X_va_pos)
        n_correct = int(np.count_nonzero(scores_valid >= threshold))
        return n_correct / len(scores_valid), scores_valid

    def get_neg_acc(self, X_va_neg, threshold=0.0):
        """Score known-negative samples.

        X_va_neg: array of feature vectors that are all expected to be negative
        threshold: log-density cut-off; scores < threshold count as correct

        Returns (fraction of samples scored < threshold, per-sample scores).
        """
        scores_valid = self.model.score_samples(X_va_neg)
        n_correct = int(np.count_nonzero(scores_valid < threshold))
        return n_correct / len(scores_valid), scores_valid
    

In [16]:
%%time
# Fit the KDE on the positive training vectors with an explicit bandwidth
# (overrides the class default of 0.35).
kde = EnzymeRanker(enctrain_array, bw=0.39290333420962714)

CPU times: user 1.71 s, sys: 5.62 ms, total: 1.72 s
Wall time: 1.72 s


In [17]:
# Total log-likelihood of the first 40 training vectors under the fitted model
# (sanity check on data the model has already seen).
kde.model.score(enctrain_array[:40])

1136.386128978344

# Evaluate model

1. Get validation data
2. Create DDE encoding for protein sequences
3. Get morgan_r2 for substrates
4. Concatenate into a single vector using dti data
5. Predict using KDE model
6. Evaluate

In [18]:
# Validation files. The three CSVs share one directory, so the directory is
# spelled once and the dataset can be relocated by editing a single line.
VALID_DATA_DIR = "../../DeepConv-DTI/epp_examples/validation_dataset"

esivalid_file = VALID_DATA_DIR + "/validation_dti.csv"        # enzyme-substrate pair table
protvalid_file = VALID_DATA_DIR + "/validation_protein.csv"   # protein table
subsvalid_file = VALID_DATA_DIR + "/validation_compound.csv"  # compound table

## Sequence encoding for validation data

In [19]:
# Load the validation protein table, keeping only the CSV columns at positions
# 1 and 2; they are written out as FASTA records for ifeatpro further down.
prot_valid_df = pd.read_csv(protvalid_file, usecols=[1, 2])

In [20]:
# Write the validation proteins as FASTA. create_fasta opens the path with
# open(..., "w"), which does not create missing parent directories, so ./data
# must already exist.
create_fasta(prot_valid_df, "./data/brenda_valid_fasta.fa")

In [21]:
# Create the DDE encoding for the BRENDA validation enzymes.
# Arguments: input FASTA file, descriptor name, output directory. The call
# returns the path of the CSV it wrote ("<output directory>/dde.csv").
get_feature("./data/brenda_valid_fasta.fa", "dde", "./data/brenda_validenz_ifeatures/")

Descriptor type: dde


'./data/brenda_validenz_ifeatures/dde.csv'

In [22]:
# The file is read without a header row; its first column (the protein
# identifier) becomes the index so encodings can be selected with .loc.
prot_validenc_df = pd.read_csv("./data/brenda_validenz_ifeatures/dde.csv", header=None, index_col=0) 

In [23]:
# Show the first two encoded validation proteins (400 DDE columns each).
prot_validenc_df.head(2)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,391,392,393,394,395,396,397,398,399,400
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P04335,1.672597,1.18143,0.119057,2.243803,0.119057,-1.335491,-0.943317,-1.155945,-0.943317,0.821345,...,-0.471278,0.834948,0.119057,-0.666667,1.44773,0.579839,-0.943317,-0.943317,1.651754,0.834948
P0A111,0.053169,-0.982475,1.057586,1.057586,1.057586,1.497267,0.037556,0.462671,2.077617,-1.115187,...,-0.490841,0.747424,0.037556,2.189189,1.295971,-0.370629,-0.982475,0.037556,-0.490841,-0.694341


## Substrate encoding

In [24]:
# Keep only CSV columns 1 and 3; the first of the two is used as the index.
# The remaining column holds one tab-separated string per compound (it is split
# on "\t" when the feature matrix is built) -- presumably the morgan_r2
# fingerprint; confirm against the CSV.
subs_validenc_df = pd.read_csv(subsvalid_file, usecols=[1,3], index_col=0)

## Concatenation of validation vectors

In [25]:
# Validation concatenation of sequences and substrates
# Reads CSV columns 1-3 of the pair table (Protein_ID, Compound_ID, Label).

dti_valid_df = pd.read_csv(esivalid_file, usecols=[1,2,3])

In [26]:
# Show the first two validation pairs to confirm the column names.
dti_valid_df.head(2)

Unnamed: 0,Protein_ID,Compound_ID,Label
0,P49012,SUB-3806,1
1,P31658,SUB-3515,1


In [27]:
# keep=False removes every row whose (Protein_ID, Compound_ID) pair occurs more
# than once -- no representative of a duplicated pair is retained.
dti_valid_df = dti_valid_df.drop_duplicates(subset=["Protein_ID", "Compound_ID"], keep=False)

In [28]:
# Positive (Label == 1) validation pairs only.
dtipos_valid_df = dti_valid_df.loc[dti_valid_df.Label==1]

# One protein-encoding row per positive pair, in pair order.
protpos_encvalid_array = prot_validenc_df.loc[dtipos_valid_df.Protein_ID.values, :].values

# One raw compound row per positive pair; its single column is a tab-separated string.
subspos_encvalid_array_raw = subs_validenc_df.loc[dtipos_valid_df.Compound_ID.values, :].values

# Parse every tab-separated string into its own float32 row and stack the rows.
# np.apply_along_axis allocates its output with the dtype of the FIRST row's
# result, so string tokens longer than those of the first row were silently
# truncated before the float conversion; converting row by row avoids that.
subspos_encvalid_array = np.stack(
    [np.array(fingerprint.split("\t"), dtype="float32") for fingerprint in subspos_encvalid_array_raw[:, 0]]
)

# Final feature vector per pair: protein encoding followed by compound encoding.
pos_encvalid_array = np.concatenate([protpos_encvalid_array, subspos_encvalid_array], axis = 1)

pos_encvalid_array.shape

(3060, 2448)

In [29]:
# Negative (Label == 0) validation pairs only.
dtineg_valid_df = dti_valid_df.loc[dti_valid_df.Label==0]

# One protein-encoding row per negative pair, in pair order.
protneg_encvalid_array = prot_validenc_df.loc[dtineg_valid_df.Protein_ID.values, :].values

# One raw compound row per negative pair; its single column is a tab-separated string.
subsneg_encvalid_array_raw = subs_validenc_df.loc[dtineg_valid_df.Compound_ID.values, :].values

# Parse every tab-separated string into its own float32 row and stack the rows.
# np.apply_along_axis allocates its output with the dtype of the FIRST row's
# result, so string tokens longer than those of the first row were silently
# truncated before the float conversion; converting row by row avoids that.
subsneg_encvalid_array = np.stack(
    [np.array(fingerprint.split("\t"), dtype="float32") for fingerprint in subsneg_encvalid_array_raw[:, 0]]
)

# Final feature vector per pair: protein encoding followed by compound encoding.
neg_encvalid_array = np.concatenate([protneg_encvalid_array, subsneg_encvalid_array], axis = 1)

neg_encvalid_array.shape

(3147, 2448)

## Model evaluation

In [30]:
%%time
# Returns (fraction of validation positives whose log-density is >= 0,
# per-sample log-density scores).
kde.get_pos_acc(pos_encvalid_array)

CPU times: user 1min 4s, sys: 269 ms, total: 1min 4s
Wall time: 1min 3s


(0.5013071895424837,
 array([ 28.2590965 ,   0.47388854,   2.08813599, ..., -55.95250443,
        -61.33164654,  -0.88953706]))

In [31]:
%%time
# Returns (fraction of validation negatives whose log-density is < 0,
# per-sample log-density scores).
kde.get_neg_acc(neg_encvalid_array)

CPU times: user 1min 5s, sys: 343 ms, total: 1min 5s
Wall time: 1min 5s


(0.5684779154750557,
 array([ -72.14704307, -117.49175126, -272.95932222, ...,   -4.12997586,
           0.98658036, -143.36307275]))

In [32]:
# Number of CPUs reported by the system; multi_func sizes its worker pool with
# this same call.
mp.cpu_count()

24

In [33]:
# 50 log-spaced candidate bandwidths between 10**-0.41 (~0.389) and
# 10**-0.405 (~0.394), i.e. a narrow sweep around a single region.
bws_ = np.logspace(-0.41, -0.405, 50)

In [34]:
def bandwidth_hpopt(bw):
    """Fit a ranker at bandwidth ``bw`` and report its validation accuracies.

    The training matrix ``enctrain_array`` and the validation matrices
    ``pos_encvalid_array`` / ``neg_encvalid_array`` are read from module scope.

    Returns
    -------
    tuple
        ``(bw, acc_pos, acc_neg)``: the bandwidth that was evaluated, the
        fraction returned by ``get_pos_acc`` on the positive validation set and
        the fraction returned by ``get_neg_acc`` on the negative validation set.
    """
    ranker = EnzymeRanker(enctrain_array, bw=bw)
    # Both accessors return (fraction, scores); only the fraction is reported.
    positive_fraction = ranker.get_pos_acc(pos_encvalid_array)[0]
    negative_fraction = ranker.get_neg_acc(neg_encvalid_array)[0]
    return bw, positive_fraction, negative_fraction


def multi_func(bws):
    """Evaluate ``bandwidth_hpopt`` for every bandwidth in ``bws`` in parallel.

    Parameters
    ----------
    bws : iterable of float
        Candidate bandwidths.

    Returns
    -------
    list
        One ``bandwidth_hpopt`` result per bandwidth, in the order of ``bws``.
    """
    # The context manager shuts the worker processes down once map() has
    # returned. The pool was previously never closed, so every call left
    # cpu_count() worker processes behind.
    with mp.Pool(mp.cpu_count()) as pool:
        return pool.map(bandwidth_hpopt, bws)


In [35]:
%%time
# Run the bandwidth sweep across worker processes (one model fit and two
# validation passes per candidate bandwidth).
scores = multi_func(bws_)

CPU times: user 76.6 ms, sys: 95.8 ms, total: 172 ms
Wall time: 9min 8s


In [36]:
# Each entry is (bandwidth, positive accuracy, negative accuracy), as returned
# by bandwidth_hpopt.
scores

[(0.3890451449942806, 0.7143790849673203, 0.4617095646647601),
 (0.3891365648716548, 0.7189542483660131, 0.46266285351128056),
 (0.3892280062313534, 0.7029411764705882, 0.47823323800444867),
 (0.3893194690784243, 0.7183006535947712, 0.43946615824594853),
 (0.3894109534179167, 0.7127450980392157, 0.47569113441372735),
 (0.38950245925488114, 0.7450980392156863, 0.4277089291388624),
 (0.3895939865943691, 0.6892156862745098, 0.4899904671115348),
 (0.3896855354414334, 0.6892156862745098, 0.46647600889736257),
 (0.389777105801128, 0.696078431372549, 0.47251350492532573),
 (0.38986869767850807, 0.7019607843137254, 0.4613918017159199),
 (0.3899603110786299, 0.7176470588235294, 0.4429615506831903),
 (0.390051946006551, 0.7016339869281045, 0.4489990467111535),
 (0.3901436024673302, 0.6767973856209151, 0.4547187797902765),
 (0.3902352804660273, 0.6823529411764706, 0.4632983794089609),
 (0.3903269800077034, 0.6748366013071896, 0.45662535748331745),
 (0.39041870109742083, 0.6898692810457516, 0.4356

# Performance on test data

1. Train the model using the best bandwidth
2. Evaluate it on the KEGG data

In [34]:
# KEGG test data input. The three CSVs share one directory, so the directory is
# spelled once and the dataset can be relocated by editing a single line.
TEST_DATA_DIR = "../../DeepConv-DTI/epp_examples/test_dataset"

dti_file = TEST_DATA_DIR + "/kegg_dti.csv"            # enzyme-substrate pair table
protein_file = TEST_DATA_DIR + "/kegg_protein.csv"    # protein table
compound_file = TEST_DATA_DIR + "/kegg_compound.csv"  # compound table

In [35]:
# Pair table: only CSV columns 1 and 2 are read (used below as Protein_ID and
# Compound_ID); no label column is loaded for the test set.
dti_df = pd.read_csv(dti_file, usecols=[1, 2])
# Protein table: CSV columns 1 and 2.
prot_df = pd.read_csv(protein_file, usecols=[1, 2])
# Compound table: CSV columns 1 and 3, with the first of the two as the index.
subs_df = pd.read_csv(compound_file, usecols=[1, 3], index_col=0)

In [36]:
# Protein encodings for the test set, read from a file that is not produced
# anywhere in this notebook (presumably a precomputed DDE encoding -- confirm).
# NOTE(review): this read lets pandas treat the first CSV row as the header,
# whereas the training and validation dde.csv files are read with header=None.
# If this file has the same header-less layout, the first protein's row is
# consumed as column names -- confirm against the file.
prot_enctest_file = "./data/enz_ifeatures/dde.csv"
prot_enctest_df_raw = pd.read_csv(prot_enctest_file, index_col=0)

In [37]:
# Select the encoding rows for the proteins listed in prot_df, in prot_df order.
# .loc raises KeyError if any listed Protein_ID is missing from the encodings.
prot_enctest_df = prot_enctest_df_raw.loc[prot_df.Protein_ID.values, :]

In [38]:
# One protein-encoding row per test pair, in pair order (a protein that occurs
# in several pairs is repeated).
prot_enctest_array = prot_enctest_df.loc[dti_df.Protein_ID.values, :].values

In [39]:
# One raw compound row per test pair; its single column is a tab-separated string.
subs_enctest_array_raw = subs_df.loc[dti_df.Compound_ID.values, :].values

# Parse every tab-separated string into its own float32 row and stack the rows.
# np.apply_along_axis allocates its output with the dtype of the FIRST row's
# result, so string tokens longer than those of the first row were silently
# truncated before the float conversion; converting row by row avoids that.
subs_enctest_array = np.stack(
    [np.array(fingerprint.split("\t"), dtype="float32") for fingerprint in subs_enctest_array_raw[:, 0]]
)

# Final feature vector per pair: protein encoding followed by compound encoding.
enctest_array = np.concatenate([prot_enctest_array, subs_enctest_array], axis = 1)

enctest_array.shape

(69409, 2448)

In [40]:
# Training matrix shape, shown for comparison with the test matrix: the column
# counts have to match for the fitted model to score the test vectors.
enctrain_array.shape

(8783, 2448)

In [41]:
# Rebuild the ranker on the training vectors with the same bandwidth as the
# earlier fit; this rebinds the global name `kde`.
kde = EnzymeRanker(enctrain_array, bw=0.39290333420962714)

In [42]:
%%time
# Every test pair is scored as if it were a positive: dti_df was loaded without
# a label column, so the returned fraction is the share of all test pairs whose
# log-density is >= 0.
kde.get_pos_acc(enctest_array)

CPU times: user 26min 5s, sys: 2.43 s, total: 26min 8s
Wall time: 26min 4s


(0.3579795127433042,
 array([ 2.44022821e+00,  4.86108182e-01,  2.43961014e+00, ...,
        -2.95347252e+03, -2.97047195e+03, -2.94693927e+03]))