In [1]:
# modules
import os
import re
import sys
import tqdm
import itertools
import numpy as np
import pandas as pd
import multiprocessing as mp
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTETomek

sys.path.append('../')
from ensemble.model import Ensemble
from baseModels.SVM.model import SVM
from baseModels.GBC.model import GBC
from baseModels.NN.model import NN
from featEngg.online.kmerMethods.models import ngModel,gaangModel

In [2]:
class Base:
    """Store the hyper-parameters of one base learner and build it on request.

    Exactly one family of hyper-parameters is kept on the instance, chosen by
    the first truthy flag among ``SVM``, ``GBC`` and ``NN`` (checked in that
    order). The ``get_*`` factories read those attributes, so only the factory
    that matches the selected family can be called successfully.
    """

    def __init__(self, SVM=True, GBC=False, NN=False, pca_components=55, regCparam=5,
                 kernparam='rbf', nestparam=300, lrateparam=0.005, mdepthparam=5, ssampleparam=1,
                 hlayer=(100, 50, 10), lrateinit=0.01, regparam=0.005, random_seed=None,
                 optimizeQ=False, verboseQ=False):
        """Record the shared settings plus the settings of the selected learner.

        Args:
            SVM, GBC, NN: selection flags; the first truthy one wins.
            pca_components: forwarded to the learner as ``pca_comp``.
            regCparam, kernparam: kept only when ``SVM`` is selected.
            nestparam, lrateparam, mdepthparam: kept only when ``GBC`` is selected.
            ssampleparam: accepted but not stored or used by this class.
            hlayer, lrateinit, regparam: kept only when ``NN`` is selected
                (``lrateinit`` is stored as ``self.lrateparam``).
            random_seed: stored as ``self.rs`` and forwarded to the learner.
            optimizeQ, verboseQ: forwarded as ``optimize`` / ``verbose``.

        Raises:
            ValueError: if none of the three selection flags is truthy.
        """
        # Settings every learner shares.
        self.rs = random_seed
        self.verboseQ = verboseQ
        self.optimizeQ = optimizeQ
        self.pca_components = pca_components

        if not (SVM or GBC or NN):
            raise ValueError('No model initiated')

        if SVM:
            self.regCparam, self.kernparam = regCparam, kernparam
            return

        if GBC:
            self.nestparam, self.lrateparam, self.mdepthparam = nestparam, lrateparam, mdepthparam
            return

        # Only the neural network is left at this point.
        self.hlayer, self.lrateparam, self.reg = hlayer, lrateinit, regparam

    def get_SVM(self, Xtrain, Xvalid, ytrain, yvalid, Xtest=None):
        """Build an ``SVM`` object from the given data and the stored SVM settings."""
        settings = dict(
            pca_comp=self.pca_components,
            regC=self.regCparam,
            kern=self.kernparam,
            optimize=self.optimizeQ,
            verbose=self.verboseQ,
            random_seed=self.rs,
            classweight=None,
        )
        return SVM(Xtrain, Xvalid, ytrain, yvalid, Xtest, **settings)

    def get_GBC(self, Xtrain, Xvalid, ytrain, yvalid, Xtest=None):
        """Build a ``GBC`` object from the given data and the stored GBC settings."""
        settings = dict(
            pca_comp=self.pca_components,
            nest=self.nestparam,
            lrate=self.lrateparam,
            mdepth=self.mdepthparam,
            optimize=self.optimizeQ,
            verbose=self.verboseQ,
            random_seed=self.rs,
        )
        return GBC(Xtrain, Xvalid, ytrain, yvalid, Xtest, **settings)

    def get_NN(self, Xtrain, Xvalid, ytrain, yvalid, Xtest=None):
        """Build an ``NN`` object from the given data and the stored NN settings."""
        settings = dict(
            pca_comp=self.pca_components,
            hlayers=self.hlayer,
            lrateinit=self.lrateparam,
            regparam=self.reg,
            optimize=self.optimizeQ,
            verbose=self.verboseQ,
            random_seed=self.rs,
        )
        return NN(Xtrain, Xvalid, ytrain, yvalid, Xtest, **settings)


class EClassification(Base):
    """Train one base model per feature set and ensemble the best ``n_models``.

    All the work happens in the constructor:

    1. the sequence file and the label file are merged on their first column
       and split into a training and a validation part;
    2. one model is built on each of the two online k-mer encodings
       (``ngModel`` and ``gaangModel``) and one on every ``*.csv.gz`` feature
       file found in the training feature directories;
    3. the ``n_models`` best models are kept and their predictions are handed
       to ``Ensemble``.

    When test feature directories are supplied the ensemble is built from the
    models' ``yhattest`` predictions; otherwise it is built from their
    ``ypredvalid`` predictions and ``self.precision`` is computed against the
    validation labels.
    """

    def __init__(self,enzseqdata,testenzseqdata,labelfile,trainfeaturefiledirs,testfeaturefiledirs,model='SVM',random_seed=None,pca_components=55,n_models=17,validation_fraction=0.25):
        """Load the data, fit every base model and build the ensemble.

        Args:
            enzseqdata: header-less CSV; column 0 is the identifier, column 1
                is used as the model input for the online k-mer encodings.
            testenzseqdata: header-less CSV with the same layout for the test
                set; only read when ``testfeaturefiledirs`` is truthy.
            labelfile: header-less CSV joined on column 0; its last column is
                used as the label.
            trainfeaturefiledirs: directories scanned for ``*.csv.gz`` files.
            testfeaturefiledirs: matching directories for the test set, or a
                falsy value to run in validation-only mode.
            model: ``'SVM'``, ``'GBC'`` or ``'NN'``.
            random_seed: seed for the train/validation split and for the base
                models.
            pca_components: upper bound for the number of PCA components;
                feature sets with fewer columns use 75% of their column count.
            n_models: number of best models passed to the ensemble.
            validation_fraction: share of the data held out for validation.

        Raises:
            ValueError: if ``model`` is not one of the three supported names.
            AssertionError: in test mode, if the training and test directories
                do not contain the same feature file names.
        """
        self.random_seed = random_seed
        self.model = model
        self.default_pca_components = pca_components
        self.n_models = n_models
        self.validation_fraction = validation_fraction
        self.test = bool(testfeaturefiledirs)

        # Initialise the hyper-parameters of the selected learner. The seed is
        # forwarded so the learners do not silently fall back to
        # random_seed=None when the caller asked for a reproducible run.
        if self.model == 'SVM':
            super().__init__(optimizeQ=False, pca_components=pca_components, random_seed=random_seed)
        elif self.model == 'GBC':
            super().__init__(SVM=False, GBC=True, pca_components=pca_components, random_seed=random_seed)
        elif self.model == 'NN':
            super().__init__(SVM=False, NN=True, pca_components=pca_components, random_seed=random_seed)
        else:
            raise ValueError('Wrong Model Assigned')

        self.object_map = {'SVM':self.get_SVM,'NN':self.get_NN,'GBC':self.get_GBC}

        # Original data from which every split and feature lookup is derived.
        df1 = pd.read_csv(enzseqdata,header=None)
        df2 = pd.read_csv(labelfile,header=None)
        self.train_df = df1.merge(df2,on=0)

        self.enz_names = self.train_df[0].values
        self.X = self.train_df.iloc[:,1].values
        self.y = self.train_df.iloc[:,-1].values

        # The identifiers are split alongside X and y so that the offline
        # feature files can be sliced into exactly the same two parts.
        self.X_train, self.X_valid, self.y_train, self.y_valid,self.enz_train,self.enz_valid = train_test_split(self.X, self.y,self.enz_names, test_size=self.validation_fraction, random_state=self.random_seed)

        self.label_file = labelfile

        # Test data
        if self.test:
            self.test_df = pd.read_csv(testenzseqdata,header=None)
            self.testenz_names = self.test_df[0].values
            self.X_test = self.test_df.iloc[:,1].values
        else:
            self.X_test = None

        self.make_tempdir()

        # Online encodings: kmer and gaakmer
        ng = ngModel(self.X_train,self.X_valid,self.X_test)
        gaang = gaangModel(self.X_train,self.X_valid,self.X_test)
        kmernames = ['kmer','gaakmer']
        kmerObjs = [
            self.get_model_online(ng.Xtrain,ng.Xvalid,self.y_train,self.y_valid,ng.Xtest),
            self.get_model_online(gaang.Xtrain,gaang.Xvalid,self.y_train,self.y_valid,gaang.Xtest),
        ]

        # Offline encodings: one model per feature file. Paths and names come
        # from a single scan so the two lists cannot drift apart.
        trainfeatfiles, trainfilenames = self._list_feature_files(trainfeaturefiledirs)
        self.featnames = [name.replace('.csv.gz','') for name in trainfilenames]

        if self.test:
            testfeatfiles, testfilenames = self._list_feature_files(testfeaturefiledirs)
            assert trainfilenames == testfilenames, 'training and test feature files do not match'
            self.objects = list(itertools.starmap(self.get_model_offline, zip(trainfeatfiles,testfeatfiles)))
        else:
            # The pool is only created when it is used, and the context
            # manager shuts the workers down once map() has returned.
            with mp.Pool(mp.cpu_count()) as feat_pool:
                self.objects = list(feat_pool.map(self.get_model_offline,trainfeatfiles))

        self.featnames.extend(kmernames)
        self.objects.extend(kmerObjs)

        # Keep only the best models (see select_top_models for the criterion).
        self.best_idx,self.best_models = self.select_top_models(self.objects)
        self.best_model_names = np.array(self.featnames)[self.best_idx]

        # Collect the selected models' predictions for the ensemble.
        if not self.test:
            self.all_model_preds = [o.ypredvalid for o in self.best_models]
            self.en = Ensemble(self.all_model_preds,self.y_valid)
            self.precision = precision_score(self.y_valid,self.en.preds,pos_label=0,average='binary')
        else:
            self.all_model_preds = [o.yhattest for o in self.best_models]
            self.en = Ensemble(self.all_model_preds)

        self.del_dir()

    @staticmethod
    def _list_feature_files(featuredirs):
        """Return ``(paths, file_names)`` of all ``*.csv.gz`` files in ``featuredirs``.

        Entries are sorted by name inside each directory because
        ``os.scandir`` yields them in arbitrary order; sorting keeps the
        training and test lists aligned file by file.
        """
        paths, names = [], []
        for directory in featuredirs:
            with os.scandir(directory) as entries:
                found = sorted(e.name for e in entries if e.name.endswith('.csv.gz'))
            names.extend(found)
            # os.path.join also works when the directory has no trailing slash.
            paths.extend(os.path.join(directory, name) for name in found)
        return paths, names

    def make_tempdir(self):
        """Create the ``tmp`` directory in the working directory if it is missing."""
        os.makedirs('tmp', exist_ok=True)
        return

    def write_model_stats(self,featname,obj):
        """Write ``obj.acc_train`` and ``obj.acc_valid`` to ``tmp/<featname>.txt``."""
        with open('tmp/'+featname+'.txt','w') as f:
            f.write(str(obj.acc_train))
            f.write('\n')
            f.write(str(obj.acc_valid))
            f.write('\n')
        return

    def del_dir(self):
        """Remove the ``tmp`` directory and everything in it; a missing directory is not an error."""
        import shutil  # local import: the notebook's import cell does not provide shutil

        shutil.rmtree('tmp', ignore_errors=True)
        return

    def _balance_and_set_pca(self, X_train, y_train):
        """Resample the training data with SMOTETomek and set ``self.pca_components`` for it.

        Returns the resampled ``(X_train, y_train)``. Feature sets with fewer
        columns than ``self.default_pca_components`` use 75% of their columns.
        """
        X_train, y_train = SMOTETomek(random_state=0).fit_resample(X_train, y_train)

        n_features = X_train.shape[1]
        if n_features < self.default_pca_components:
            self.pca_components = int(0.75*n_features)
        else:
            self.pca_components = self.default_pca_components
        return X_train, y_train

    def _build_model(self, X_train, X_valid, y_train, y_valid, X_test):
        """Call the factory of the selected learner; ``X_test`` is only passed in test mode."""
        build = self.object_map[self.model]
        if self.test:
            return build(X_train,X_valid,y_train,y_valid,X_test)
        return build(X_train,X_valid,y_train,y_valid)

    def get_model_online(self,X_train,X_valid,y_train,y_valid,X_test=None):
        """Fit the selected learner on an already encoded feature matrix.

        The training part is resampled with SMOTETomek first; ``X_test`` is
        ignored unless the instance runs in test mode.
        """
        X_train, y_train = self._balance_and_set_pca(X_train, y_train)
        return self._build_model(X_train, X_valid, y_train, y_valid, X_test)

    def get_model_offline(self,featfilename,testfeatfilename=None):
        """Fit the selected learner on one feature file and log its accuracies.

        Args:
            featfilename: header-less CSV of training features whose column 0
                holds the identifiers used in the sequence file.
            testfeatfilename: matching file for the test set; required in
                test mode.

        Returns:
            The fitted model object.
        """
        df1 = pd.read_csv(featfilename,header=None)
        df2 = pd.read_csv(self.label_file,header=None)
        df_feat = df1.merge(df2,on=0).set_index(0)

        # Reuse the split made in __init__ by looking the identifiers up.
        df_feat_train = df_feat.loc[self.enz_train]
        df_feat_valid = df_feat.loc[self.enz_valid]
        X_train_feat,y_train_feat = df_feat_train.iloc[:,0:-1].values,df_feat_train.iloc[:,-1].values
        X_valid_feat,y_valid_feat = df_feat_valid.iloc[:,0:-1].values,df_feat_valid.iloc[:,-1].values

        X_train_feat,y_train_feat = self._balance_and_set_pca(X_train_feat,y_train_feat)

        X_test_feat = None
        if self.test:
            df_feat_test = pd.read_csv(testfeatfilename,header=None).set_index(0)
            X_test_feat = df_feat_test.loc[self.testenz_names].values
            # Report feature files whose test matrix has a different width.
            if X_train_feat.shape[1] != X_test_feat.shape[1]:
                print(featfilename)

        obj = self._build_model(X_train_feat,X_valid_feat,y_train_feat,y_valid_feat,X_test_feat)

        # Feature name = file name without directory and '.csv.gz' suffix.
        feat_name = os.path.basename(featfilename)
        if feat_name.endswith('.csv.gz'):
            feat_name = feat_name[:-len('.csv.gz')]
        self.write_model_stats(feat_name,obj)

        return obj

    def select_top_models(self,Os):
        """Return ``(indices, models)`` of the ``n_models`` highest-scoring models.

        Models are ranked by ``acc_valid`` in test mode and by ``acc_train``
        otherwise, best first.
        """
        if self.test:
            scores = [o.acc_valid for o in Os]
        else:
            scores = [o.acc_train for o in Os]
        best_idx = np.argsort(scores)[::-1][:self.n_models]
        return best_idx,np.array(Os)[best_idx]
        

In [3]:
%%time
if __name__=='__main__':
    enz_file = '../data/enz_sequence.csv'
    label_file = '../data/enz_labels.csv'
    


    # Feature files for iFeature, pssmMethods 
    ifeatdatadir = '../featEngg/offline/ifeatMethods/data/featvec/trainfiles/'
    pssmdatadir = '../featEngg/offline/pssmMethods/data/featvec/trainfiles/'
    
    trainfeatdirs = [ifeatdatadir,pssmdatadir]
    
    ec = EClassification(enz_file,None,label_file,trainfeatdirs,None,model='SVM',validation_fraction=0.5,pca_components=175,n_models=17)
    

CPU times: user 40min 25s, sys: 15 s, total: 40min 40s
Wall time: 44min 1s


In [13]:
# `acc` of the ensemble, which was built from the selected models' validation
# predictions and y_valid (presumably its validation accuracy — confirm in ensemble.model).
ec.en.acc

0.6062313556513093

In [14]:
# `acc_valid` of every selected base model; note the list is named `test_accs`.
test_accs = [model.acc_valid for model in ec.best_models]

In [15]:
# `acc_train` of every selected base model.
train_accs = [model.acc_train for model in ec.best_models]

In [16]:
# Best, worst and mean `acc_train` across the selected models.
max(train_accs),min(train_accs),np.mean(train_accs)

(0.9981313400961025, 0.9650852878464818, 0.9883404064780009)

In [17]:
# Best, worst and mean of `test_accs`, which holds the models' `acc_valid` values.
max(test_accs),min(test_accs),np.mean(test_accs)

(0.6158435531985416, 0.5575074577394763, 0.580826298036616)

In [18]:
# Precision of the ensemble predictions against y_valid, computed with pos_label=0.
ec.precision

0.44011976047904194

In [19]:
# Distinct labels predicted by the ensemble and how often each one occurs.
np.unique(ec.en.preds,return_counts=True)

(array([0, 1]), array([ 334, 2683]))

In [20]:
# Distinct labels in the validation split and how often each one occurs,
# for comparison with the distribution of the ensemble predictions.
np.unique(ec.y_valid,return_counts=True)

(array([0, 1]), array([1148, 1869]))

In [21]:
# Names of the feature sets whose models were selected for the ensemble, best first.
ec.best_model_names

array(['KSCTriad', 'CTriad', 'DPC', 'DDE', 'Moran', 'CKSAAP', 'Geary',
       'NMBroto', 's_fpssm', 'TPC', 'rpm_pssm', 'smoothed_pssm', 'GTPC',
       'pssm_composition', 'ab_pssm', 'pssm_ac', 'CTDD'], dtype='<U24')