In [1]:
# modules
import os
import sys
sys.path.append('../')
import pandas as pd
from sklearn.model_selection import train_test_split
from IndModels.ModelsEdited import NGModel,GAACModel
import numpy as np
from mySVM.model import SVM
from Ensemble.model import Ensemble
import multiprocessing as mp
from sklearn.metrics import precision_score

In [12]:
class EClassification:
    
    def __init__(self,enzymedata,ifeaturefilenames,ifeaturelabelfile,random_seed=None):
        
        self.random_seed = random_seed
        
        # original data based on which everything is obtained
        self.df = pd.read_csv(enzymedata,header=None).iloc[np.random.randint(low=0,high=6052,size=1500),:]
        self.enz_names = self.df[0].values
        self.X = self.df.iloc[:,1].values
        self.y = self.df.iloc[:,-1].values
        
        # training and testing data for kmer and gaakmer - will be used for others as well
        self.X_train, self.X_test, self.y_train, self.y_test,self.enz_train,self.enz_test = train_test_split(self.X, self.y,self.enz_names, test_size=0.5, random_state=self.random_seed)
        
        #ng and gaang model
        self.ngmodel = NGModel(self.X_train,self.X_test,self.y_train,self.y_test,random_seed=self.random_seed,pca_components=50)
        self.gmodel = GAACModel(self.X_train,self.X_test,self.y_train,self.y_test,random_seed=self.random_seed,pca_components=50)
        
        self.ifeat_label_file = ifeaturelabelfile
        
        #multiprocessing 
        pool = mp.Pool(mp.cpu_count())
        
        # getting all SVM objects together 
        self.SVMobjects = list(pool.map(self.get_model_ifeat,ifeaturefilenames))
        self.SVMobjects.extend([self.ngmodel.SVMobject,self.gmodel.SVMobject])
        
        # select only the best models based on training
        self.best_idx,self.best_models = self.select_top_models(self.SVMobjects)
        
        # getting all model predictions together
        self.all_model_preds = [svmo.ypredtest for svmo in self.best_models]
        self.en = Ensemble(self.all_model_preds,self.y_test)
        self.precision = precision_score(self.y_test,self.en.preds)
        
        pass
    
    
    def get_model_ifeat(self,ifeatfilename):
        df1 = pd.read_csv(ifeatfilename,header=None)
        df2 = pd.read_csv(self.ifeat_label_file,header=None)
        df_feat = df1.merge(df2,on=0).set_index(0)
        enz_name_train = self.enz_train
        enz_name_test = self.enz_test
        df_feat_train = df_feat.loc[enz_name_train]
        df_feat_test = df_feat.loc[enz_name_test]
        X_train_feat,y_train_feat = df_feat_train.iloc[:,0:-1].values,df_feat_train.iloc[:,-1].values
        X_test_feat,y_test_feat = df_feat_test.iloc[:,0:-1].values,df_feat_test.iloc[:,-1].values
        if X_train_feat.shape[1]<50:
            n_comp = int(0.75*X_train_feat.shape[1])
        else:
            n_comp=50
        svm = SVM(X_train_feat,X_test_feat,y_train_feat,y_test_feat,verbose=False,optimize=False, pca_comp=n_comp,random_seed=self.random_seed)
        return svm
        
    def select_top_models(self,SVMOs):
        svm_train_accs = [svmo.acc_train for svmo in SVMOs]
        sorted_idx = np.argsort(svm_train_accs)[::-1]
        best_idx = sorted_idx[:11]
        return best_idx,np.array(SVMOs)[best_idx]
        

In [13]:
%%time
if __name__=='__main__':
    datadir = '../data/'
    enz_file = datadir + 'Enzyme_Dataset.csv'
    


    # Feature files for iFeature 
    ifeatdatadir = '../data/feat_vec/'
    ifeature_files = [ifeatdatadir+f.name for f in os.scandir(ifeatdatadir)]
    ifeatlabelfile = '../data/ifeat_labels.csv'

    ec = EClassification(enz_file,ifeature_files,ifeatlabelfile)
    

CPU times: user 22.6 s, sys: 17.8 s, total: 40.4 s
Wall time: 1min 2s


In [14]:
ec.gmodel.SVMobject.acc_test,ec.gmodel.SVMobject.acc_train

(0.62, 0.6426666666666667)

In [15]:
ec.ngmodel.SVMobject.acc_test,ec.ngmodel.SVMobject.acc_train

(0.6266666666666667, 0.6613333333333333)

In [16]:
ec.en.acc

0.644