In [1]:
#import modules
import os
import re
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import multiprocessing as mp
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score,accuracy_score
from sklearn.feature_selection import SelectKBest,f_classif
from sklearn.preprocessing import StandardScaler
import tqdm
import matplotlib.pyplot as plt
%matplotlib inline

# Import the data files and build the train/validation split

In [2]:
# Relative paths to the two input CSV files. Both are read with header=None
# and joined on their first column in the following cell.
seq_file = '../data/enz_sequence.csv'
# label_file is also re-read by Feat_Evaluation to attach labels to features.
label_file = '../data/enz_labels.csv'

In [3]:
# Load the two header-less CSVs and inner-join them on their first column
# (label 0). Column 1 is present in both inputs, so after the merge it is
# exposed as '1_x' (from seq_file) and '1_y' (from label_file).
df1 = pd.read_csv(seq_file, header=None)
df2 = pd.read_csv(label_file, header=None)
df = pd.merge(df1, df2, on=0)

def filter_by_len(seq):
    """Return True when ``len(seq)`` lies strictly between 200 and 600."""
    return 200 < len(seq) < 600

# Keep only the rows whose '1_x' entry has a length strictly between 200 and 600.
df = df.loc[df['1_x'].apply(filter_by_len)]
# create a smaller test-train
# The last column holds the label; each class is split separately so that a
# different fraction of each can be sent to training (75% of label 0, 60% of
# label 1).
df_0 = df.loc[df.iloc[:,-1]==0]
df_1 = df.loc[df.iloc[:,-1]==1]
train_df0 = df_0.sample(n=int(0.75*(len(df_0))),random_state=17)
# The hold-out part is every row that was not sampled. Dropping by index label
# states that directly; the previous element-wise DataFrame.isin comparison
# inspected every cell only to read one column, and would have misclassified
# a row whose id in column 0 was NaN.
valid_df0 = df_0.drop(index=train_df0.index)
train_df1 = df_1.sample(n=int(0.6*(len(df_1))),random_state=17)
valid_df1 = df_1.drop(index=train_df1.index)
train_df = pd.concat([train_df0,train_df1])
valid_df = pd.concat([valid_df0,valid_df1])
# Shuffle so the two classes are interleaved rather than stacked.
train_df = train_df.sample(frac=1,random_state=17)
valid_df = valid_df.sample(frac=1,random_state=17)

In [4]:
class Classifier:
    """Tune, fit and log a decision tree and an RBF SVM for one feature set.

    All work happens in the constructor: each model's hyper-parameter is chosen
    by its score on the validation split, the best model is refitted, and
    accuracy/precision for the train, validation and test splits are appended
    to a per-feature log file.

    Attributes set on the instance:
        X_train, X_valid, X_test, y_train, y_valid, y_test: the data passed in.
        dtclf: fitted pipeline (scaler + decision tree) with the best depth.
        svcclf: fitted pipeline (scaler + SVC) with the best C.
    """

    # Directory that receives one '<featname>.log' file per feature set.
    LOG_DIR = '../data/simResults/ind_feat_results/'
    # Hyper-parameter grids searched on the validation split.
    DT_DEPTHS = [10,14,19,21,25,55]
    SVC_C_VALUES = [1,1.25,1.5,3,5,7,10,20,25,35,100]

    def __init__(self,X_train,X_valid,X_test,y_train,y_valid,y_test,feat_vec):
        """Run the full evaluation for one feature file.

        Args:
            X_train, X_valid, X_test: feature matrices for the three splits.
            y_train, y_valid, y_test: matching label arrays.
            feat_vec: path of the feature file; it must contain at least one
                '/' and end in '.csv.gz'. The part between the last '/' and
                the suffix becomes the log-file name.

        Raises:
            ValueError: if ``feat_vec`` does not have the expected form.
        """
        self.X_train, self.X_valid, self.X_test = X_train, X_valid, X_test
        self.y_train, self.y_valid, self.y_test = y_train, y_valid, y_test

        # Raw string: '\.' in a normal string literal is an invalid escape.
        m = re.match(r'.+/+(.+)\.csv\.gz$', feat_vec)
        if m is None:
            raise ValueError(
                'feat_vec must look like "<dir>/<name>.csv.gz", got: %r' % (feat_vec,))
        featname = m.group(1)

        # Make sure the log directory exists before the first open().
        os.makedirs(self.LOG_DIR, exist_ok=True)
        filename = self.LOG_DIR + featname + '.log'

        # Mode 'w' truncates any log left over from a previous run.
        with open(filename,'w') as f:
            f.write('-'*5 + featname + ' model evaluation begins' + '-'*5 + '\n')

        best_depth = self._best_param(self.get_score_dt, self.DT_DEPTHS)
        self.dtclf = self.dt_clf(best_depth)
        self._write_section_header(filename, 'Decision Tree Stats')
        self.write_clf_stats(self.dtclf,filename)

        best_c = self._best_param(self.get_score_svc, self.SVC_C_VALUES)
        self.svcclf = self.svc_clf(best_c)
        self._write_section_header(filename, 'SVM Stats')
        self.write_clf_stats(self.svcclf,filename)

    @staticmethod
    def _best_param(score_fn, grid):
        """Score every value in ``grid`` in parallel and return the best one.

        The pool is used as a context manager so its worker processes are
        released as soon as the scores are in; ties go to the earliest value.
        """
        with mp.Pool(mp.cpu_count()) as pool:
            scores = pool.map(score_fn, grid)
        return grid[int(np.argmax(scores))]

    @staticmethod
    def _write_section_header(filename, title):
        """Append a blank line, ``title`` and another blank line to the log."""
        with open(filename,'a') as f:
            f.write('\n')
            f.write(title+'\n')
            f.write('\n')

    def dt_clf(self,max_depth):
        """Fit and return a StandardScaler + DecisionTreeClassifier pipeline.

        Args:
            max_depth: maximum depth passed to the tree.
        """
        # 'sqrt' is what the former 'auto' setting meant for classifiers;
        # 'auto' is rejected by scikit-learn >= 1.3.
        dt = DecisionTreeClassifier(max_features='sqrt',
                                    max_depth=max_depth,random_state=42)
        steps = [('scaler',StandardScaler()),('dt',dt)]
        pipe = Pipeline(steps)
        pipe.fit(self.X_train,self.y_train)
        return pipe

    def get_score_dt(self,max_depth):
        """Return the validation score of a tree fitted with ``max_depth``."""
        p = self.dt_clf(max_depth)
        return p.score(self.X_valid,self.y_valid)

    def svc_clf(self,regC):
        """Fit and return a StandardScaler + RBF-kernel SVC pipeline.

        Args:
            regC: value passed as the SVC ``C`` parameter.
        """
        svc = SVC(C = regC,
                kernel='rbf',
                decision_function_shape='ovo',
                random_state=42,
                class_weight=None)
        steps = [('scaler',StandardScaler()),('svc',svc)]
        pipe = Pipeline(steps)
        pipe.fit(self.X_train,self.y_train)
        return pipe

    def get_score_svc(self,regC):
        """Return the validation score of an SVC fitted with ``C=regC``."""
        p = self.svc_clf(regC)
        return p.score(self.X_valid,self.y_valid)

    @staticmethod
    def getHPOPTfigure(d,s):
        """Plot scores ``s`` against hyper-parameter values ``d``.

        Declared static because it takes no ``self``; this makes it callable
        from both the class and an instance.
        """
        plt.plot(d,s)
        return

    def get_uniqcount(self,arr):
        """Return ``(unique_values, counts)`` for ``arr``."""
        return np.unique(arr,return_counts=True)

    def write_clf_stats(self,clf,filename):
        """Append label counts, accuracy and precision for each split to the log.

        Precision is computed with label 0 as the positive class.

        Args:
            clf: fitted estimator exposing ``predict``.
            filename: log file to append to.
        """
        # (header word, metric word, features, labels) for each split; the
        # training split uses different words in its header and metric lines.
        splits = (
            ('Training', 'Train', self.X_train, self.y_train),
            ('Validation', 'Validation', self.X_valid, self.y_valid),
            ('Test', 'Test', self.X_test, self.y_test),
        )

        # Score every split before touching the file so that a failure while
        # predicting cannot leave a half-written section behind.
        stats = []
        for header, metric, X, y in splits:
            yhat = clf.predict(X)
            acc = accuracy_score(y,yhat)
            prec = precision_score(y,yhat,pos_label=0)
            stats.append((header, metric, y, acc, prec))

        with open(filename,'a') as f:
            for header, metric, y, acc, prec in stats:
                f.write(header+' Data Label Count\n')
                for label,count in zip(*self.get_uniqcount(y)):
                    f.write(str(label)+':'+str(count))
                    f.write('\n')
                f.write(metric+' Accuracy: '+str(acc)+'\n')
                f.write(metric+' Precision: '+str(prec)+'\n')

        return
    
    
class Feat_Evaluation(Classifier):
    """Load one feature file, split it, and run the Classifier evaluation on it.

    Relies on the notebook-level ``label_file``, ``train_df`` and ``valid_df``.
    """

    def __init__(self,feat_vec_file):
        """Build the train/validation/test arrays for ``feat_vec_file``.

        Rows whose id (column 0) appears in ``valid_df`` become the test set.
        Rows whose id appears in ``train_df`` are split 90/10 into training
        and validation sets. The last merged column is used as the label and
        the columns between the id and the label as features.
        """
        features = pd.read_csv(feat_vec_file,header=None)
        labels = pd.read_csv(label_file,header=None)
        merged = pd.merge(features, labels, on=0)

        in_train = merged[0].isin(train_df[0].values)
        in_heldout = merged[0].isin(valid_df[0].values)
        train_part = merged.loc[in_train]
        heldout_part = merged.loc[in_heldout]

        # Held-out rows: features and labels only.
        X_test = heldout_part.iloc[:,1:-1].values
        y_test = heldout_part.iloc[:,-1].values

        # Training pool: ids travel through the split alongside X and y.
        pool_ids = train_part.iloc[:,0].values
        pool_X = train_part.iloc[:,1:-1].values
        pool_y = train_part.iloc[:,-1].values

        # The id halves returned by the split are not used any further.
        X_train, X_valid, y_train, y_valid, _, _ = train_test_split(
            pool_X, pool_y, pool_ids, test_size=0.1, random_state=17)

        super().__init__(X_train, X_valid, X_test, y_train, y_valid,y_test,feat_vec_file)


In [5]:
def feat_eval_func(featfile):
    """Run the full evaluation for ``featfile``; the instance is discarded."""
    # Constructing the object performs the evaluation and writes the log.
    Feat_Evaluation(featfile)
    

In [6]:
if __name__=='__main__':
    # Directories holding the pre-computed '.csv.gz' feature vectors.
    ifeatdir = '../featEngg/offline/ifeatMethods/data/featvec/trainfiles/'
    pssmdir = '../featEngg/offline/pssmMethods/data/featvec/trainfiles/'

    # Collect every '.csv.gz' file: ifeat files first, then the pssm files.
    feat_vecfiles = []
    for entry in os.scandir(ifeatdir):
        if entry.name.endswith('.csv.gz'):
            feat_vecfiles.append(ifeatdir + entry.name)
    pssm_vecfiles = []
    for entry in os.scandir(pssmdir):
        if entry.name.endswith('.csv.gz'):
            pssm_vecfiles.append(pssmdir + entry.name)
    feat_vecfiles += pssm_vecfiles

    # Evaluate the feature files one by one, showing a progress bar.
    for featfile in tqdm.tqdm(feat_vecfiles):
        feat_eval_func(featfile)

100%|██████████| 42/42 [17:13<00:00, 24.61s/it]
