In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KernelDensity
from ifeatpro.features import get_feature
import multiprocessing as mp
import tempfile
import os
from sklearn.metrics import accuracy_score, matthews_corrcoef

In [2]:
# training data files

esitrain_file = "../../DeepConv-DTI/epp_examples/training_dataset/training_dti.csv"
prottrain_file = "../../DeepConv-DTI/epp_examples/training_dataset/training_protein.csv"
substrain_file = "../../DeepConv-DTI/epp_examples/training_dataset/training_compound.csv"


# Validation files

esivalid_file = "../../DeepConv-DTI/epp_examples/validation_dataset/validation_dti.csv"
protvalid_file = "../../DeepConv-DTI/epp_examples/validation_dataset/validation_protein.csv"
subsvalid_file = "../../DeepConv-DTI/epp_examples/validation_dataset/validation_compound.csv"


# Kegg file

dti_file = "../../DeepConv-DTI/epp_examples/test_dataset/kegg_dti.csv"
protein_file = "../../DeepConv-DTI/epp_examples/test_dataset/kegg_protein.csv"
compound_file = "../../DeepConv-DTI/epp_examples/test_dataset/kegg_compound.csv"

In [3]:
class KDE:
    def __init__(self, train_dti, train_enz, train_sub,
                valid_dti, valid_enz, valid_sub,
                with_label=True, kern="gaussian", bw=0.35,
                start=0, end=-1):

        
        # read training data
        self.train_dti_df_ = self._read_csv(train_dti, columns=[0,1,2,3])
        self.train_enz_df = self._read_csv(train_enz, columns=[1,2])
        self.train_sub_df = self._read_csv(train_sub, columns=[1,2,3])
        
        # get rid of illegal sequences from training
        illegal_proteins = list(self.train_enz_df.loc[self.train_enz_df.Sequence.str.contains("B|J|X|Z|O|U", regex=True)].index)
        self.train_dti_df = self.train_dti_df_.loc[~self.train_dti_df_.Protein_ID.isin(illegal_proteins)]
        
        # read validation data
        if with_label:
            self.valid_dti_df_ = self._read_csv(valid_dti, columns=[0,1,2,3]).iloc[start:end, :]
        else:
            self.valid_dti_df_ = self._read_csv(valid_dti, columns=[0,1,2]).iloc[start:end, :]

        self.valid_enz_df = self._read_csv(valid_enz, columns=[1,2])
        self.valid_sub_df = self._read_csv(valid_sub, columns=[1,2,3])
        
        # get rid of illegal sequences from validation
        illegal_proteins = list(self.valid_enz_df.loc[self.valid_enz_df.Sequence.str.contains("B|J|X|Z|O|U", regex=True)].index)
        self.valid_dti_df = self.valid_dti_df_.loc[~self.valid_dti_df_.Protein_ID.isin(illegal_proteins)]
        
        # modify training interaction data to include only positive data
        self.pos_train_dti_df = self.train_dti_df.loc[self.train_dti_df.Label==1].drop_duplicates()
        
        # create features for train and validation enzymes
        self.train_enz_encdf = self._create_enzyme_features(self.train_enz_df)
        self.valid_enz_encdf = self._create_enzyme_features(self.valid_enz_df)
        
        # get the substrate encodings
        self.train_sub_encdf = self.train_sub_df.iloc[:, [1]]
        self.valid_sub_encdf = self.valid_sub_df.iloc[:, [1]]
        
        # concatenate the two enzyme and substrate features
        self.train_enc = self._concat_features(self.pos_train_dti_df, self.train_enz_encdf, self.train_sub_encdf)
        self.valid_enc = self._concat_features(self.valid_dti_df, self.valid_enz_encdf, self.valid_sub_encdf)
        
        # define hyperparameters
        self.kern = kern
        self.bw = bw
        
        pass
        
        
    def _read_csv(self, filename, columns):
        df = pd.read_csv(filename, index_col=0, usecols=columns)
        return df

    
    def _create_fasta(self, df, fastafilename):
        with open(fastafilename, "w") as f:
            for index, row in df.iterrows():
                f.write(">" + index + "\n")
                f.write(row[0] + "\n")
        return 

    
    def _create_enzyme_features(self, enz_df, feat_type="dde"):
        # create temporary fastafile
        self._create_fasta(enz_df, "temp.fa")
        
        # read filename created as fasta file
        get_feature("temp.fa", feat_type, "./")
        
        enz_enc_df = pd.read_csv(f"./{feat_type}.csv", header=None, index_col=0)
        
        os.remove(os.path.join("./", "temp.fa"))
        os.remove(os.path.join("./", f"{feat_type}.csv"))

        return enz_enc_df
    
    
    def _concat_features(self, dti_df, enz_encdf, sub_encdf):
        enz_encarr = enz_encdf.loc[dti_df.Protein_ID.values, :].values
        sub_encarr_ = sub_encdf.loc[dti_df.Compound_ID.values, :].values
        sub_encarr = np.apply_along_axis(lambda x: x[0].split("\t"), axis=1 , arr=sub_encarr_).astype("float32")
        enc_arr = np.concatenate([enz_encarr, sub_encarr], axis=1)
        return enc_arr
    
    
    def train(self, kern="gaussian", bw="0.35"):
        self.model = KernelDensity(kernel=kern, bandwidth=bw)
        self.model.fit(self.train_enc)
        return
    
    
    def predict(self, X_va):
        return self.model.score_samples(X_va)
    
    
    def evaluate(self, scores):
        preds = scores>=0
        preds = preds.astype(int)
        trues = self.valid_dti_df.Label.values
        acc = accuracy_score(trues, preds)
        mcc = matthews_corrcoef(trues, preds)
        pos_idx = np.where(trues==1)[0]
        pacc = sum(preds[pos_idx])/len(pos_idx)
        neg_idx = np.where(trues==0)[0]
        nacc = sum(preds[pos_idx]==0)/len(neg_idx)
        return acc, mcc, pacc, len(pos_idx), nacc, len(neg_idx)
    
    
    def _opt_bw_helper(self, bw):
        kde = KernelDensity(kernel=self.kern, bandwidth=bw)
        kde.fit(self.train_enc)
        scores = kde.score_samples(self.valid_enc)
        preds_bool = scores>=0
        preds = preds_bool.astype(int)
        trues = self.valid_dti_df.Label.values
        return matthews_corrcoef(trues, preds)
        
        
        
    def opt_bw(self, bws):
        pool = mp.Pool(mp.cpu_count())
        res = pool.map(self._opt_bw_helper, bws)
        return zip(bws, res) 

In [4]:
kde = KDE(esitrain_file, prottrain_file, substrain_file, esivalid_file, protvalid_file, subsvalid_file)

Descriptor type: dde
Descriptor type: dde


In [5]:
bws = np.logspace(-0.41, -0.405, 48)

In [6]:
%%time
opt_res = list(kde.opt_bw(bws))

CPU times: user 7.67 s, sys: 11.2 s, total: 18.8 s
Wall time: 4min 12s


In [7]:
best_opt = sorted(list(opt_res), key=lambda x:x[1], reverse=True)[0][0]

In [8]:
kde.train(bw=best_opt)

In [9]:
%%time
scores = kde.predict(kde.valid_enc)

CPU times: user 1min 36s, sys: 117 ms, total: 1min 36s
Wall time: 1min 36s


In [10]:
kde.evaluate(scores)

(0.6217371737173717,
 0.3498292539141778,
 0.23399355135882083,
 2171,
 0.7316322041355038,
 2273)

In [12]:
kde_kegg = KDE(esitrain_file, prottrain_file, substrain_file, dti_file, protein_file, compound_file, with_label=False)

Descriptor type: dde
Descriptor type: dde


In [13]:
kde_kegg.train(bw=best_opt)

In [14]:
%%time
test_scores = kde_kegg.predict(kde_kegg.valid_enc)

CPU times: user 23min 45s, sys: 2.76 s, total: 23min 48s
Wall time: 23min 43s


In [15]:
sum(test_scores>=0)

604

In [16]:
sum(test_scores>=0)/len(test_scores)

0.008997065526641146