In [1]:
# modules
import os
import sys
sys.path.append('../')
import pandas as pd
from sklearn.model_selection import train_test_split
from IndModels.ModelsEdited import NGModel,GAACModel
import numpy as np
from mySVM.model import SVM
from GBClassifier.model import GBC
from Ensemble.model import Ensemble
import multiprocessing as mp
from sklearn.metrics import precision_score

In [2]:
class EClassification:
    """Train one classifier per feature representation and combine them in an ensemble.

    The constructor does all the work: it loads and samples the enzyme dataset,
    splits it into train/test halves, fits an ``NGModel`` and a ``GAACModel`` on
    the raw second-column values, fits one ``GBC`` model per iFeature file,
    ranks all models by training accuracy and feeds their test predictions to
    ``Ensemble``.

    Parameters
    ----------
    enzymedata : str
        Path of a header-less CSV. Column 0 holds the enzyme names, column 1 the
        model input and the last column the label.
    ifeaturefilenames : list of str
        Paths of header-less iFeature CSV files whose column 0 is the enzyme name.
    ifeaturelabelfile : str
        Path of a header-less CSV mapping enzyme name (column 0) to its label.
    random_seed : int or None, optional
        Seed used for the row sampling, the train/test split and the models.
        With ``None`` the row sampling draws from NumPy's global random state.

    Attributes
    ----------
    SVMobjects : list
        All fitted per-feature models followed by the ``NGModel`` and
        ``GAACModel`` classifiers. The name is historical; the objects are
        currently ``GBC`` instances.
    model_names : list of str
        Names aligned with ``SVMobjects``.
    best_idx, best_models, best_model_names
        Model ranking by descending training accuracy.
    en : Ensemble
        Ensemble built from the ranked models' test predictions.
    precision : float
        ``precision_score`` of the ensemble predictions on the test split.
    """

    # Number of rows drawn from the enzyme dataset and the exclusive upper bound
    # of the row positions that may be drawn.
    _N_SAMPLED_ROWS = 6000
    _MAX_SOURCE_ROWS = 6052
    # Upper limit for the number of PCA components handed to every model.
    _MAX_PCA_COMPONENTS = 75

    def __init__(self,enzymedata,ifeaturefilenames,ifeaturelabelfile,random_seed=None):

        self.random_seed = random_seed

        # original data based on which everything is obtained
        # NOTE(review): rows are drawn WITH replacement, so the same enzyme can
        # end up in both the train and the test split - confirm this is intended.
        raw_df = pd.read_csv(enzymedata,header=None)
        self.df = self._sample_rows(raw_df,self.random_seed,self._N_SAMPLED_ROWS,self._MAX_SOURCE_ROWS)
        self.enz_names = self.df[0].values
        self.X = self.df.iloc[:,1].values
        self.y = self.df.iloc[:,-1].values

        # training and testing data for kmer and gaakmer - will be used for others as well
        (self.X_train, self.X_test,
         self.y_train, self.y_test,
         self.enz_train, self.enz_test) = train_test_split(
            self.X, self.y, self.enz_names, test_size=0.5, random_state=self.random_seed)

        # ng and gaang model (gradient boosting variant).
        # The earlier SVM variant used: pca_components=75, kernparam='rbf', regCparam=5
        shared_model_kwargs = dict(random_seed=self.random_seed,pca_components=self._MAX_PCA_COMPONENTS,
                                   SVM=False,GBC=True,nestparam=250,lrateparam=0.1,mdepthparam=5,ssampleparam=1)
        self.ngmodel = NGModel(self.X_train,self.X_test,self.y_train,self.y_test,**shared_model_kwargs)
        self.gmodel = GAACModel(self.X_train,self.X_test,self.y_train,self.y_test,**shared_model_kwargs)

        self.ifeat_label_file = ifeaturelabelfile

        # getting all model objects together: one per iFeature file, then the two sequence models
        self.SVMobjects = [self.get_model_ifeat(ifeatname) for ifeatname in ifeaturefilenames]
        self.SVMobjects.extend([self.ngmodel.GBCobject,self.gmodel.GBCobject])

        self.model_names = [self._model_name_from_path(ifeatname) for ifeatname in ifeaturefilenames]
        self.model_names.extend(['ngmodel','gmodel'])

        # rank the models based on training accuracy
        self.best_idx,self.best_models = self.select_top_models(self.SVMobjects)
        self.best_model_names = np.array(self.model_names)[self.best_idx]

        # getting all model predictions together
        self.all_model_preds = [model.ypredtest for model in self.best_models]
        self.en = Ensemble(self.all_model_preds,self.y_test)
        self.precision = precision_score(self.y_test,self.en.preds)

    @staticmethod
    def _sample_rows(df,random_seed=None,n_samples=6000,max_rows=6052):
        """Draw ``n_samples`` rows (with replacement) from the first ``max_rows`` rows of ``df``.

        The draw honours ``random_seed`` so that a seeded run is reproducible;
        with ``random_seed=None`` NumPy's global random state is used. The
        upper bound is clamped to ``len(df)`` so a shorter file cannot produce
        out-of-range positions.

        Raises
        ------
        ValueError
            If there is no row to sample from.
        """
        n_available = min(max_rows,len(df))
        if n_available == 0:
            raise ValueError('enzyme dataset contains no rows to sample from')
        rng = np.random if random_seed is None else np.random.RandomState(random_seed)
        row_positions = rng.randint(low=0,high=n_available,size=n_samples)
        return df.iloc[row_positions,:]

    @staticmethod
    def _model_name_from_path(ifeatfilename):
        """Return the file name of ``ifeatfilename`` without directory and extension."""
        return os.path.splitext(os.path.basename(ifeatfilename))[0]

    def get_model_ifeat(self,ifeatfilename):
        """Fit a ``GBC`` model on the feature vectors stored in ``ifeatfilename``.

        The feature file is joined with the label file on the enzyme name
        (column 0) and then split with the enzyme names of the train/test
        partition created in ``__init__``.

        Returns
        -------
        GBC
            The model object constructed from the train and test feature matrices.
        """
        feat_df = pd.read_csv(ifeatfilename,header=None)
        label_df = pd.read_csv(self.ifeat_label_file,header=None)
        df_feat = feat_df.merge(label_df,on=0).set_index(0)

        df_feat_train = df_feat.loc[self.enz_train]
        df_feat_test = df_feat.loc[self.enz_test]

        # the label is the last column after the merge, everything before it is a feature
        X_train_feat,y_train_feat = df_feat_train.iloc[:,0:-1].values,df_feat_train.iloc[:,-1].values
        X_test_feat,y_test_feat = df_feat_test.iloc[:,0:-1].values,df_feat_test.iloc[:,-1].values

        # narrow feature sets keep 75% of their dimensions, wide ones are capped
        n_features = X_train_feat.shape[1]
        if n_features < self._MAX_PCA_COMPONENTS:
            n_comp = int(0.75*n_features)
        else:
            n_comp = self._MAX_PCA_COMPONENTS

        # The earlier SVM variant used: kern='rbf', regC=5
        return GBC(X_train_feat,X_test_feat,y_train_feat,y_test_feat,verbose=False,optimize=False,
                   pca_comp=n_comp,random_seed=self.random_seed,nest=250,lrate=0.1,mdepth=5,ssample=1)

    def select_top_models(self,SVMOs):
        """Rank ``SVMOs`` by descending ``acc_train``.

        All models are currently kept; only their order changes.

        Returns
        -------
        tuple
            ``(best_idx, best_models)`` - the positions into ``SVMOs`` in ranked
            order and a NumPy array of the models in that order.
        """
        train_accs = [model.acc_train for model in SVMOs]
        best_idx = np.argsort(train_accs)[::-1]
        return best_idx,np.array(SVMOs)[best_idx]
        

In [3]:
%%time
if __name__=='__main__':
    datadir = '../data/'
    enz_file = datadir + 'Enzyme_Dataset.csv'

    # Feature files for iFeature.
    # Only regular .csv files are used (the directory may also hold checkpoint
    # folders or other files), and they are sorted so the model order is the
    # same on every machine.
    ifeatdatadir = '../data/feat_vec/'
    with os.scandir(ifeatdatadir) as dir_entries:
        ifeature_files = sorted(
            ifeatdatadir + entry.name
            for entry in dir_entries
            if entry.is_file() and entry.name.endswith('.csv')
        )
    ifeatlabelfile = '../data/ifeat_labels.csv'

    # NOTE(review): no random_seed is passed, so sampling, split and models
    # differ between runs - pass a seed if the numbers below must be reproducible.
    ec = EClassification(enz_file,ifeature_files,ifeatlabelfile)
    

CPU times: user 9min 8s, sys: 11.3 s, total: 9min 20s
Wall time: 9min 19s


In [4]:
# Score of the ensemble built from all models' test predictions
# (presumably accuracy - `acc` is defined by Ensemble; confirm there).
ec.en.acc

0.787

In [5]:
# Model names ordered by descending training accuracy.
ec.best_model_names

array(['gmodel', 'TPC', 'CTDD', 'QSOrder', 'GTPC', 'CKSAAP', 'CKSAAGP',
       'APAAC', 'SOCNumber', 'CTDT', 'Moran', 'GDPC', 'CTriad', 'NMBroto',
       'CTDC', 'AAC', 'PAAC', 'KSCTriad', 'Geary', 'DPC', 'DDE',
       'ngmodel', 'GAAC'], dtype='<U9')

In [6]:
# Test accuracy of every model, in ranked order (same order as ec.best_model_names).
test_accs = [ec.SVMobjects[model_pos].acc_test for model_pos in ec.best_idx]

In [7]:
# Training accuracy of every model, in ranked order (same order as ec.best_model_names).
train_accs = [ec.SVMobjects[model_pos].acc_train for model_pos in ec.best_idx]

In [8]:
# (best, worst, mean) training accuracy across all models
tuple(summarise(train_accs) for summarise in (max, min, np.mean))

(1.0, 0.9266666666666666, 0.9927826086956525)

In [9]:
# (best, worst, mean) test accuracy across all models
tuple(summarise(test_accs) for summarise in (max, min, np.mean))

(0.777, 0.7346666666666667, 0.7583623188405796)