In [1]:
# modules
import os
import re
import sys
import tqdm
import itertools
import numpy as np
import pandas as pd
import multiprocessing as mp
from sklearn.metrics import precision_score,accuracy_score,f1_score,recall_score
from sklearn.model_selection import train_test_split

sys.path.append('../')
from ensemble.model import Ensemble
from baseModels.SVM.model import SVM
from baseModels.GBC.model import GBC
from baseModels.NN.model import NN
from featEngg.online.kmerMethods.models import ngModel,gaangModel

In [2]:
class Base:
    def __init__(self,SVM=True,GBC=False,NN=False,pca_components=55,regCparam=5,
        kernparam='rbf',nestparam=300,lrateparam=0.005,mdepthparam=5,ssampleparam=1,hlayer=(100,50,10),
        lrateinit=0.01,regparam=0.005,random_seed=None,optimizeQ=False,verboseQ=False):
        
        self.pca_components=pca_components
        self.optimizeQ=optimizeQ
        self.verboseQ=verboseQ
        self.rs=random_seed
        
        if SVM:
            self.regCparam=regCparam
            self.kernparam=kernparam
            
        elif GBC:
            self.nestparam=nestparam
            self.lrateparam=lrateparam
            self.mdepthparam=mdepthparam


        elif NN:
            self.hlayer=hlayer
            self.lrateparam=lrateinit
            self.reg=regparam

            
        else:
            raise ValueError('No model initiated')
            
    def get_SVM(self,Xtrain,Xvalid,ytrain,yvalid,Xtest=None):
        return SVM(Xtrain,Xvalid,ytrain,yvalid,Xtest,pca_comp=self.pca_components,regC=self.regCparam,kern=self.kernparam,optimize=self.optimizeQ,verbose=self.verboseQ,random_seed=self.rs,classweight=None)
    
    def get_GBC(self,Xtrain,Xvalid,ytrain,yvalid,Xtest=None):
        return GBC(Xtrain,Xvalid,ytrain,yvalid,Xtest,pca_comp=self.pca_components,nest=self.nestparam,lrate=self.lrateparam,mdepth=self.mdepthparam,optimize=self.optimizeQ,verbose=self.verboseQ,random_seed=self.rs)

    def get_NN(self,Xtrain,Xvalid,ytrain,yvalid,Xtest=None):
        return NN(Xtrain,Xvalid,ytrain,yvalid,Xtest,pca_comp=self.pca_components,hlayers=self.hlayer,lrateinit=self.lrateparam,regparam=self.reg,optimize=self.optimizeQ,verbose=self.verboseQ,random_seed=self.rs)


class EClassification(Base):
    
    def __init__(self,enzseqdata,testenzseqdata,labelfile,trainfeaturefiledirs,testfeaturefiledirs,model='SVM',random_seed=17,pca_components=55,n_models=17,validation_fraction=0.25):
        
        self.random_seed = random_seed
        self.model=model
        self.default_pca_components = pca_components
        self.n_models = n_models
        self.validation_fraction = validation_fraction
        self.test = True if testfeaturefiledirs else False
        
        
        #initialize super class
        if self.model=='SVM':
            super().__init__(optimizeQ=False)
        else:
            if self.model=='GBC':
                super().__init__(SVM=False,GBC=True)
            elif self.model=='NN':
                super().__init__(SVM=False,NN=True)
            else:
                raise ValueError('Wrong Model Assigned')
        
        self.object_map = {'SVM':self.get_SVM,'NN':self.get_NN,'GBC':self.get_GBC}
        
        # original data based on which everything is obtained
        df1 = pd.read_csv(enzseqdata,header=None)
        df2 = pd.read_csv(labelfile,header=None)
        self.df = df1.merge(df2,on=0)
        
        '''Change train file to a proper file with semi-balanced split'''
        self.df = self.df.loc[self.df['1_x'].apply(self.filter_by_len)]
        df_0 = self.df.loc[self.df.iloc[:,-1]==0]
        df_1 = self.df.loc[self.df.iloc[:,-1]==1]
        train_df0 = df_0.sample(n=int(0.75*(len(df_0))),random_state=self.random_seed)
        valid_df0 = df_0.loc[~df_0.isin(train_df0)[0]]
        train_df1 = df_1.sample(n=int(0.6*(len(df_1))),random_state=self.random_seed)
        valid_df1 = df_1.loc[~df_1.isin(train_df1)[0]]
        train_df = pd.concat([train_df0,train_df1])
        valid_df = pd.concat([valid_df0,valid_df1])
        self.train_df = train_df.sample(frac=1,random_state=self.random_seed)
        self.valid_df = valid_df.sample(frac=1,random_state=self.random_seed)
        
        self.enz_train = self.train_df[0].values
        self.X_train = self.train_df.iloc[:,1].values
        self.y_train = self.train_df.iloc[:,-1].values
        
        self.enz_valid = self.valid_df[0].values
        self.X_valid = self.valid_df.iloc[:,1].values
        self.y_valid = self.valid_df.iloc[:,-1].values
        
        
        self.label_file = labelfile
        
        # test data
        if self.test:
            self.test_df = pd.read_csv(testenzseqdata,header=None)
            self.testenz_names = self.test_df[0].values
            self.X_test = self.test_df.iloc[:,1].values
        else:
            self.X_test=None
        
        self.make_tempdir()
            
        # gaakmer #kmer gives too many nan values
        gaang = gaangModel(self.X_train,self.X_valid,self.X_test)
        kmernames = ['gaakmer']
        kmerObjs = [self.get_model_online(gaang.Xtrain,gaang.Xvalid,self.y_train,self.y_valid,gaang.Xtest)]

        
        #generate a list of names from the directories
        trainfeatfiles = [d+f.name for d in trainfeaturefiledirs for f in os.scandir(d) if f.name.endswith('.csv.gz')]            
        self.featnames = [f.name.replace('.csv.gz','') for d in trainfeaturefiledirs for f in os.scandir(d) if f.name.endswith('.csv.gz')]
        
        feat_pool = mp.Pool(mp.cpu_count())
        if self.test:
            testfeatfiles = [d+f.name for d in testfeaturefiledirs for f in os.scandir(d) if f.name.endswith('.csv.gz')]
            func_iter = list(zip(trainfeatfiles,testfeatfiles))
            assert [f.name for d in trainfeaturefiledirs for f in os.scandir(d) if f.name.endswith('.csv.gz')]==[f.name for d in testfeaturefiledirs for f in os.scandir(d) if f.name.endswith('.csv.gz')]
            self.objects=list(itertools.starmap(self.get_model_offline,func_iter))

        else:
            # getting all objects together
            self.objects = list(feat_pool.map(self.get_model_offline,trainfeatfiles))
            
        self.featnames.extend(kmernames)
        self.objects.extend(kmerObjs)
            
        
        # select only the best models based on training or validation
        self.best_idx,self.best_models = self.select_top_models(self.objects)
        self.best_model_names = np.array(self.featnames)[self.best_idx]
        
        # getting all model predictions together for ensemble
        if not self.test:
            self.all_model_preds = [o.ypredvalid for o in self.best_models]
            self.en = Ensemble(self.all_model_preds,self.y_valid)
            self.precision = precision_score(self.y_valid,self.en.preds,pos_label=0,average='binary')
            
        else:
            self.all_model_preds = [o.yhattest for o in self.best_models]
            self.en = Ensemble(self.all_model_preds)
        
        self.del_dir()
        
        self.write_best_model_stats('../data/sim_results/results.csv')
        pass
    
    def filter_by_len(self,seq):
        val = len(seq)
        if val>200 and val<600:
            return True
        return False

    
    def make_tempdir(self):
        !mkdir -p tmp
        return 
    
    def write_model_stats(self,featname,obj):
        with open('tmp/'+featname+'.txt','w') as f:
            f.write(str(obj.acc_train))
            f.write('\n')
            f.write(str(obj.acc_valid))
            f.write('\n')
        return
    
    def del_dir(self):
        !rm -rf tmp
        return
    
    def get_model_online(self,X_train,X_valid,y_train,y_valid,X_test=None):

        if X_train.shape[1]<self.default_pca_components:
            self.pca_components = int(0.75*X_train.shape[1])
        else:
            self.pca_components=self.default_pca_components
            
        if self.test:
            obj = self.object_map[self.model](X_train,X_valid,y_train,y_valid,X_test)
        else:
            obj = self.object_map[self.model](X_train,X_valid,y_train,y_valid)
        return obj
    
    
    def get_model_offline(self,featfilename,testfeatfilename=None):
        
        df1 = pd.read_csv(featfilename,header=None)
        df2 = pd.read_csv(self.label_file,header=None)
        df_feat = df1.merge(df2,on=0).set_index(0)
        df_feat_train = df_feat.loc[self.enz_train]
        df_feat_valid = df_feat.loc[self.enz_valid]
        X_train_feat,y_train_feat = df_feat_train.iloc[:,0:-1].values,df_feat_train.iloc[:,-1].values
        X_valid_feat,y_valid_feat = df_feat_valid.iloc[:,0:-1].values,df_feat_valid.iloc[:,-1].values
        
        if X_train_feat.shape[1]<self.default_pca_components:
            self.pca_components = int(0.75*X_train_feat.shape[1])
        else:
            self.pca_components=self.default_pca_components
            
        if self.test:
            df_feat_test = pd.read_csv(testfeatfilename,header=None).set_index(0)
            X_test_feat = df_feat_test.loc[self.testenz_names].values
            if X_train_feat.shape[1] != X_test_feat.shape[1]:
                print(featfilename)
            obj = self.object_map[self.model](X_train_feat,X_valid_feat,y_train_feat,y_valid_feat,X_test_feat)
        else:
            obj = self.object_map[self.model](X_train_feat,X_valid_feat,y_train_feat,y_valid_feat)
            
        pattern = re.compile('.+/(.+)\.csv\.gz$')
        m = pattern.match(featfilename)
        feat_name = m.group(1)
        self.write_model_stats(feat_name,obj)
                
        return obj
    
        
    def select_top_models(self,Os):
        o_valid_accs = [o.acc_valid for o in Os] if self.test else [o.acc_train for o in Os] 
        sorted_idx = np.argsort(o_valid_accs)[::-1]
        best_idx = sorted_idx[:self.n_models]
        return best_idx,np.array(Os)[best_idx]
    
    def get_precision(self,y,yhat):
        return round(precision_score(y,yhat,pos_label=0,average='binary'),2)


    def get_recall(self,y,yhat):
        return round(recall_score(y,yhat,pos_label=0,average='binary'),2)


    def get_f1_score(self,y,yhat):
        return round(f1_score(y,yhat),2)

    def get_accuracy(self,y,yhat):
        return round(accuracy_score(y,yhat),2)

    def write_best_model_stats(self,filename):
        datadict = {'precision':[],'recall':[],'f1_score':[],'accuracy':[]}
        index_names = ['ensemble']
        # ensemble_model_stats
        ensemble_y = self.y_valid
        ensemble_yhat = self.en.preds
        datadict['precision'].append(self.get_precision(ensemble_y,ensemble_yhat))
        datadict['recall'].append(self.get_recall(ensemble_y,ensemble_yhat))
        datadict['f1_score'].append(self.get_f1_score(ensemble_y,ensemble_yhat))
        datadict['accuracy'].append(self.get_accuracy(ensemble_y,ensemble_yhat))


        for idx,name in zip(self.best_idx,self.best_model_names):
            index_names.append(name)
            yi = self.objects[idx].yvalid
            yhati = self.objects[idx].ypredvalid
            datadict['precision'].append(self.get_precision(yi,yhati))
            datadict['recall'].append(self.get_recall(yi,yhati))
            datadict['f1_score'].append(self.get_f1_score(yi,yhati))
            datadict['accuracy'].append(self.get_accuracy(yi,yhati))

        df = pd.DataFrame(datadict,index=index_names)
        return df.to_csv(filename,index=True,header=True)       

In [21]:
%%time
if __name__=='__main__':
    enz_file = '../data/enz_sequence.csv'
    label_file = '../data/enz_labels.csv'
    


    # Feature files for iFeature, pssmMethods 
    ifeatdatadir = '../featEngg/offline/ifeatMethods/data/featvec/trainfiles/'
    pssmdatadir = '../featEngg/offline/pssmMethods/data/featvec/trainfiles/'
    
    trainfeatdirs = [ifeatdatadir,pssmdatadir]
    
    ec = EClassification(enz_file,None,label_file,trainfeatdirs,None,model='SVM',validation_fraction=0.34,pca_components=75,n_models=17)
    

CPU times: user 28.9 s, sys: 4.84 s, total: 33.8 s
Wall time: 53.9 s


In [33]:
ec.valid_df.shape

(1541, 3)

In [35]:
ec.valid_df.loc[ec.valid_df['1_y']==0]

Unnamed: 0,0,1_x,1_y
5265,Q13VI0@UniProt,MTEDVRMERDTFGEIAVPNARLWGAQTQRSLQNFRISTEKQSPELI...,0
2413,Q11205@SwissProt,MKCSLRVWFLSMAFLLVFIMSLLFTYSHHSMATLPYLDSGTLGGTH...,0
5327,A9S014@UniProt,MAVPVSNLPLRAIPGGYGISYLGAIKDRLDYFWIQGEEEFYRSRVE...,0
2770,P08839@UniProt,MISGILASPGIAFGKALLLKEDEIVIDRKKISADQVDQEVERFLSG...,0
5529,E7DDH2@UniProt,MQKRMLGGMVAGALACFQVQAAQFQCQDDVKPAAISAEEQKLVDQF...,0
...,...,...,...
550,Q0VH42@UniProt,MTVKISHTAEVQDLIKEAAGFNSDQGSPRLKQLMHRLISDAFKIIE...,0
4677,O67848@UniProt,MNELIEKAKVLQEALPYIREFHGKVFVIKYGGSAMHDEELRESFAR...,0
3780,Q8A186@UniProt,MKSTFLFLVTTTMMTCTALGQPSNDKKNVLPDWAFGGFERPQGANP...,0
5859,Q59054@UniProt,MRFDIKKVLELAEKDFETAWRETRALIKDKHIDNKYPRLKPVYGKP...,0


In [125]:
#write_best_model_stats(ec,None)

Unnamed: 0,precision,recall,f1_score,accuracy
ensemble,0.361596,0.347722,0.766784,0.657365
CTriad,0.313149,0.434053,0.696694,0.589228
KSCTriad,0.312281,0.426859,0.698807,0.590526
DPC,0.316602,0.393285,0.71728,0.6061
DDE,0.318102,0.434053,0.70229,0.595068
Moran,0.314501,0.40048,0.712277,0.601557
TPC,0.315789,0.402878,0.712611,0.602206
Geary,0.309568,0.395683,0.709193,0.597664
gaakmer,0.297762,0.414868,0.68714,0.576898
CKSAAP,0.336842,0.460432,0.712172,0.608696
