In [None]:
import numpy as np 
import pandas as pd 

class preparing_data:
    """Load the dataset and maintain the per-feature score table.

    The score table is a DataFrame with one row per feature (column
    ``'feature'``) to which each selection method appends its own columns.
    """

    def __init__(self, data_url="data/colon.csv"):
        """Remember where the CSV lives.

        Args:
            data_url: path or URL handed to ``pd.read_csv``. Defaults to the
                location the notebook has always used.
        """
        self.data_url = data_url

    def load_data(self):
        """Read the CSV and split it into features and target.

        Fields may be separated by ',' or ';'. A regex separator is only
        supported by pandas' Python parser, so that engine is requested
        explicitly instead of relying on the automatic fallback, which emits
        a ParserWarning on every load.

        Returns:
            tuple: ``(features, target)`` where ``features`` is every column
            except the last and ``target`` is the last column.
        """
        df = pd.read_csv(self.data_url, sep=',|;', engine='python')
        features = df.iloc[:, :-1]
        target = df.iloc[:, -1]

        return features, target

    def prepare_features_scores_list(self, features):
        """Create the empty score table: one row per column name of ``features``."""
        return pd.DataFrame({'feature': list(features)})

    def add_score_column(self, features_scores, col_name, col_value):
        """Store a 1-D sequence of per-feature values as column ``col_name``.

        The values are wrapped in a Series with a default 0..n-1 index, so
        they line up with the table rows by index label.

        Returns:
            The same ``features_scores`` object (modified in place).
        """
        features_scores[col_name] = pd.Series(col_value)

        return features_scores

    def filter_by_decision(self, features_scores):
        """Return only the rows whose ``'decision'`` column is True."""
        keep = features_scores['decision'].eq(True)

        return features_scores.loc[keep]

    def transform_new_features(self, X, features_scores):
        """Select from ``X`` the columns named in ``features_scores['feature']``."""
        selected_names = features_scores['feature'].values

        return X.loc[:, selected_names]

        


In [None]:
import numpy as np 
import pandas as pd 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif,mutual_info_classif
from sklearn import svm
from sklearn.feature_selection import SelectFromModel
from scipy import stats
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression,LinearRegression,Lasso,LassoCV,RidgeClassifierCV,RidgeClassifier
from sklearn.linear_model import RidgeCV,Ridge,ElasticNetCV,ElasticNet,LarsCV,Lars,LassoLarsCV
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from statistics import *

class features_selection:
    """Score every feature with several selection methods and flag which to keep.

    Each scoring method returns ``(scores, threshold, support)`` where
    ``support`` is a boolean mask with one entry per feature.

    ``threshold`` (constructor argument) controls how ``support`` is derived:

    * ``'self-evaluation'`` - every method uses its own criterion (p-value or
      MI cut-off passed by the caller, or the threshold SelectFromModel
      computed for the fitted estimator).
    * ``'median'`` / ``'mean'`` / ``'mode'`` - that statistic of the scores.
    * ``2`` - twice the mean of the scores.
    * any other number - used directly as the cut-off.
    """

    # (key in fs_methods, scoring method name, column that stores the raw score).
    # The tuple order fixes the column order of the score table.
    _MODEL_METHODS = (
        ('reg_svm_l2', 'regularization_svm_l2', 'reg_svm_l2_coef'),
        ('reg_lr_l2', 'regularization_lr_l2', 'reg_lr_l2_coef'),
        ('linear_reg', 'linear_reg', 'linear_reg_coef'),
        ('logistic_reg', 'logistic_reg', 'logistic_reg_coef'),
        ('lasso', 'lasso', 'lasso_coef'),
        ('elastic_net', 'ElasticNet', 'elastic_net_coef'),
        ('dec_tree', 'dec_tree', 'dec_tree_coef'),
    )

    def __init__(self, features, target, fs_methods, threshold):
        self.fs_methods = fs_methods
        self.features = features
        self.target = target
        self.threshold = threshold
        self.random_state = 42

    def get_features_scores(self, features_scores):
        """Append score and ``<method>_support`` columns for every requested method.

        Args:
            features_scores: score table with one row per feature.

        Returns:
            The score table with the new columns added.
        """
        pre_data = preparing_data()

        if 'anova' in self.fs_methods:
            f, p_val, anova_support = self.anova(0.05)
            features_scores = pre_data.add_score_column(features_scores, 'anova_f', f)
            features_scores = pre_data.add_score_column(features_scores, 'anova_p_val', p_val)
            features_scores = pre_data.add_score_column(features_scores, 'anova_support', anova_support)

        if 'mi' in self.fs_methods:
            mi, _, mi_support = self.mutual_info(0.05)
            features_scores = pre_data.add_score_column(features_scores, 'mi_value', mi)
            features_scores = pre_data.add_score_column(features_scores, 'mi_support', mi_support)

        for key, method_name, score_column in self._MODEL_METHODS:
            if key in self.fs_methods:
                scores, _, support = getattr(self, method_name)()
                features_scores = pre_data.add_score_column(features_scores, score_column, scores)
                features_scores = pre_data.add_score_column(features_scores, key + '_support', support)

        return features_scores

    def get_support(self, scores):
        """Return ``scores > cut-off`` using the cut-off implied by ``self.threshold``.

        Raises:
            ValueError: if ``self.threshold`` is a string other than
                ``'median'``, ``'mean'`` or ``'mode'`` (for example
                ``'self-evaluation'``, which each method must handle itself).
        """
        if self.threshold == 'median':
            threshold = np.median(scores)
        elif self.threshold == 'mean':
            threshold = np.mean(scores)
        elif self.threshold == 'mode':
            # Flatten first: statistics.mode needs hashable scalars, and
            # coefficient arrays may arrive with shape (1, n_features).
            threshold = mode(np.ravel(scores).tolist())
        elif self.threshold == 2:
            threshold = 2 * np.mean(scores)
        elif isinstance(self.threshold, str):
            raise ValueError(
                "threshold %r cannot be turned into a numeric cut-off" % (self.threshold,)
            )
        else:
            threshold = self.threshold  # numeric cut-off supplied by the user

        support = scores > threshold

        print(threshold)

        return support

    def anova(self, threshold):
        """ANOVA F-test per feature.

        With ``'self-evaluation'`` a feature is kept when its p-value is below
        ``threshold``; otherwise the F-scores go through ``get_support``.

        Returns:
            tuple: ``(f_scores, p_values, support)``.
        """
        selector = SelectKBest(f_classif, k='all').fit(self.features, self.target)
        f = selector.scores_
        p_value = selector.pvalues_
        if self.threshold == 'self-evaluation':
            support = p_value < threshold
        else:
            support = self.get_support(f)

        return f, p_value, support

    def mutual_info(self, threshold):
        """Mutual information between each feature and the target.

        With ``'self-evaluation'`` a feature is kept when its mutual
        information exceeds ``threshold`` (higher MI means more informative);
        otherwise the scores go through ``get_support``.

        Returns:
            tuple: ``(scores, threshold, support)``.
        """
        def seeded_mi(X, y):
            # The estimator is randomised; seed it so reruns give the same mask.
            return mutual_info_classif(X, y, random_state=self.random_state)

        selector = SelectKBest(seeded_mi, k='all').fit(self.features, self.target)
        scores = selector.scores_

        if self.threshold == 'self-evaluation':
            support = scores > threshold
        else:
            support = self.get_support(scores)

        return scores, threshold, support

    def _select_from_model(self, estimator, attr='coef_', first_row=False):
        """Fit ``estimator`` through SelectFromModel and derive the support mask.

        Args:
            estimator: unfitted scikit-learn estimator.
            attr: fitted attribute holding the per-feature scores.
            first_row: take only row 0 of a 2-D score array.

        Returns:
            tuple: ``(scores, threshold, support)`` - flattened raw scores
            (sign preserved), the threshold SelectFromModel computed, and a
            flat boolean mask.
        """
        model = SelectFromModel(estimator=estimator).fit(self.features, self.target)
        raw = getattr(model.estimator_, attr)
        if first_row:
            raw = raw[0]
        scores = np.ravel(raw)

        if self.threshold == 'self-evaluation':
            # SelectFromModel applies its threshold to absolute importances,
            # so a strongly negative coefficient is selected as well.
            support = model.get_support()
        else:
            # Rank by magnitude for the same reason.
            support = self.get_support(np.abs(scores))

        return scores, model.threshold_, np.ravel(support)

    def regularization_svm_l2(self):
        """Linear SVM with an L1 penalty (method name kept for compatibility)."""
        clf = LinearSVC(C=0.01, penalty="l1", dual=False, random_state=self.random_state)
        return self._select_from_model(clf)

    def regularization_lr_l2(self):
        """Logistic regression with an L1 penalty (method name kept for compatibility)."""
        # liblinear supports the L1 penalty; the solver is set explicitly so
        # the call does not depend on the library's default solver.
        clf = LogisticRegression(C=0.01, penalty="l1", dual=False, solver="liblinear",
                                 random_state=self.random_state)
        return self._select_from_model(clf)

    def linear_reg(self):
        """Ordinary least-squares coefficients."""
        return self._select_from_model(LinearRegression())

    def logistic_reg(self):
        """Logistic-regression coefficients (first row of ``coef_``)."""
        clf = LogisticRegression(random_state=self.random_state)
        return self._select_from_model(clf, first_row=True)

    def lasso(self):
        """Coefficients of a 3-fold cross-validated Lasso."""
        clf = LassoCV(cv=3, random_state=self.random_state)
        return self._select_from_model(clf)

    def ElasticNet(self):
        """Coefficients of a 3-fold cross-validated ElasticNet."""
        return self._select_from_model(ElasticNetCV(cv=3))

    def dec_tree(self):
        """Feature importances of a decision tree."""
        clf = DecisionTreeClassifier(random_state=self.random_state)
        return self._select_from_model(clf, attr='feature_importances_')

In [None]:
class data_fusion:
    """Combine the per-method ``<method>_support`` columns into one decision."""

    def __init__(self, fs_methods):
        """Derive the support column names (``<method>_support``) from ``fs_methods``."""
        self.fs_methods = fs_methods
        self.col_support = [fs + "_support" for fs in self.fs_methods]

    def prepare_decision(self, decision, features_scores):
        """Add a ``'decision'`` column using the requested fusion rule.

        Args:
            decision: ``'consensus'`` or ``'majority'``.
            features_scores: score table containing every column listed in
                ``self.col_support``.

        Returns:
            ``features_scores`` with the ``'decision'`` column set.

        Raises:
            ValueError: for any other ``decision`` value. Previously this
                returned None, which silently replaced the caller's table.
        """
        if decision == 'consensus':
            return self.decision_consensus(features_scores)
        if decision == 'majority':
            return self.decision_majority(features_scores)
        raise ValueError(
            "unknown decision rule %r; expected 'consensus' or 'majority'" % (decision,)
        )

    def decision_majority(self, features_scores):
        """Keep a feature when most methods voted for it.

        Tie handling: when every distinct vote occurs at most once (e.g. two
        methods that disagree) the first method's vote wins; any other tie
        resolves to the smaller value as ordered by ``np.unique``.
        """
        decisions = []
        for votes in features_scores[self.col_support].values:
            values, counts = np.unique(votes, return_counts=True)
            if np.all(counts <= 1):
                decisions.append(votes[0])
            else:
                decisions.append(values[np.argmax(counts)])

        features_scores['decision'] = decisions

        return features_scores

    def decision_consensus(self, features_scores):
        """Keep a feature only when every method voted True."""
        all_votes = features_scores[self.col_support]
        features_scores['decision'] = all_votes.eq(True, axis=0).all(axis=1)

        return features_scores

In [None]:
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report

from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
class classifiers:
    """Grid-search several classifiers and report cross-validated metrics."""

    def __init__(self, features, target, cv):
        """Store the data and the number of CV folds used everywhere below."""
        self.features = features
        self.target = target
        self.cv = cv
        self.random_state = 42

    def get_all_results(self):
        """Tune and report SVM, random forest and gradient boosting, in that order.

        ``df_KNN``, ``df_NB`` and ``df_LR`` are available but not part of the
        default run; add them to the tuple below to include them.
        """
        for build in (self.df_svm, self.df_RandomForest, self.df_GradientBoosting):
            search, grid = build()
            self.print_results(self.best_model(search, grid))

    def _fit_grid_search(self, estimator, grid):
        """Run an exhaustive grid search on the stored data.

        Returns:
            tuple: ``(fitted GridSearchCV, grid)``.
        """
        search = GridSearchCV(estimator=estimator, param_grid=grid, cv=self.cv,
                              n_jobs=-1, refit=True)
        search.fit(self.features, self.target)

        return search, grid

    # support vector machine
    def df_svm(self):
        """Grid-search an SVC over C, kernel and gamma."""
        grid = {
            'C': np.logspace(-3, 2, 6),
            'kernel': ["sigmoid", "linear", "rbf"],
            'gamma': np.logspace(-3, 2, 6),
        }
        return self._fit_grid_search(SVC(random_state=self.random_state), grid)

    # naive bayes
    def df_NB(self):
        """Grid-search GaussianNB over var_smoothing."""
        grid = {
            'var_smoothing': np.logspace(0, -9, num=100),
        }
        return self._fit_grid_search(GaussianNB(), grid)

    # logistic regression
    def df_LR(self):
        """Grid-search LogisticRegression over C and fit_intercept."""
        grid = {
            'C': np.logspace(-3, 2, 6),
            "fit_intercept": [False, True],
        }
        return self._fit_grid_search(LogisticRegression(random_state=self.random_state), grid)

    # k nearest neighbor
    def df_KNN(self):
        """Grid-search KNeighborsClassifier over n_neighbors and metric."""
        grid = {
            'n_neighbors': range(1, 30),
            "metric": ["manhattan", "euclidean", "minkowski"],
        }
        return self._fit_grid_search(KNeighborsClassifier(), grid)

    def df_RandomForest(self):
        """Grid-search RandomForestClassifier."""
        grid = {
            'n_estimators': [200, 300, 400, 500],
            # NOTE(review): 'auto' is not accepted by recent scikit-learn
            # releases - confirm against the installed version.
            'max_features': ['auto', 'sqrt', 'log2'],
            'max_depth': [4, 5, 6, 7, 8],
            'criterion': ['gini', 'entropy'],
        }
        return self._fit_grid_search(RandomForestClassifier(random_state=self.random_state), grid)

    def df_GradientBoosting(self):
        """Grid-search GradientBoostingClassifier (fixed at 10 estimators)."""
        grid = {
            # NOTE(review): "deviance" and "mae" are not accepted by recent
            # scikit-learn releases - confirm against the installed version.
            "loss": ["deviance"],
            "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1],
            "max_depth": [3, 5, 8],
            "max_features": ["log2", "sqrt"],
            "criterion": ["friedman_mse", "mae"],
            "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            "n_estimators": [10],
        }
        return self._fit_grid_search(GradientBoostingClassifier(random_state=self.random_state), grid)

    def best_model(self, clf, grid):
        """Print the winning value of every tuned parameter and return the refit estimator.

        Args:
            clf: fitted GridSearchCV.
            grid: the parameter grid that was searched (its keys are printed).
        """
        winner = clf.best_estimator_
        print("classifier name:", winner.__class__.__name__)

        chosen = winner.get_params()
        for param_name in sorted(grid.keys()):
            print("\t{0}: {1}".format(param_name, chosen[param_name]))

        return winner

    def print_results(self, best_model):
        """Cross-validate ``best_model`` and print mean (+/- 2 std) of each metric, in percent."""
        scoring = {"f1_macro": make_scorer(f1_score, average="macro"),
                   "accuracy": make_scorer(accuracy_score),
                   "precision": make_scorer(precision_score, average="macro"),
                   "recall": make_scorer(recall_score, average="macro")}

        res = cross_validate(best_model, self.features, self.target, cv=self.cv,
                             scoring=scoring, return_train_score=True)

        # The backslash is escaped so the printed text is unchanged while the
        # literal no longer contains the invalid escape sequence "\-".
        print("Accuracy of the best model :%.2f  (+\\-%0.2f)" % (res["test_accuracy"].mean() * 100, res["test_accuracy"].std() * 2 * 100))
        print("Macro precision  of the best model :%.2f (+\\-%0.2f) " % (res["test_precision"].mean() * 100, res["test_precision"].std() * 2 * 100))
        print("Macro recall  of the best model :%.2f (+\\-%0.2f) " % (res["test_recall"].mean() * 100, res["test_recall"].std() * 2 * 100))



### Loading the data and preparing the feature-score data frame for the selected feature-selection (FS) methods


In [None]:
from sklearn import preprocessing

# Feature-selection methods to fuse.
# Other combinations tried while exploring:
#   ["mi", "dec_tree", "logistic_reg", "lasso", "linear_reg"]
#   ["reg_lr_l2", "mi", "logistic_reg"]
#   ["linear_reg", "dec_tree", "lasso"], ["lasso", "linear_reg"]
#   ["linear_reg", "dec_tree"], ["lasso", "dec_tree"]
#   ["anova", "mi", "elastic_net"]
fs_methods = ["anova", "mi", "logistic_reg", "lasso", "dec_tree"]

# How each method turns scores into a keep/drop mask.
# Alternatives: 'median', 'mean'.
threshold = 'self-evaluation'

# How the per-method masks are fused. Alternative: 'consensus'.
decision = 'majority'

pre_data_cls = preparing_data()
features, target = pre_data_cls.load_data()

# Encode the class labels as integers.
le = preprocessing.LabelEncoder()
target = le.fit_transform(target)

features_scores = pre_data_cls.prepare_features_scores_list(features)
features_scores

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,feature
0,V1
1,V2
2,V3
3,V4
4,V5
...,...
1995,V1996
1996,V1997
1997,V1998
1998,V1999


### Filling the data frame with the scores from each FS method


In [None]:
# Score every feature with each configured method and record its keep/drop vote.
fs = features_selection(
    features=features,
    target=target,
    fs_methods=fs_methods,
    threshold=threshold,
)
features_scores = fs.get_features_scores(features_scores=features_scores)
features_scores



Unnamed: 0,feature,anova_f,anova_p_val,anova_support,mi_value,mi_support,logistic_reg_coef,logistic_reg_support,lasso_coef,lasso_support,dec_tree_coef,dec_tree_support
0,V1,0.641398,0.426365,False,0.000000,True,-0.004209,False,-0.0,False,0.0,False
1,V2,0.002387,0.961194,False,0.093211,False,0.008659,False,-0.0,False,0.0,False
2,V3,0.386330,0.536590,False,0.015255,True,0.019149,False,-0.0,False,0.0,False
3,V4,0.145885,0.703849,False,0.004555,True,-0.005497,False,-0.0,False,0.0,False
4,V5,0.056955,0.812188,False,0.037575,True,-0.006177,False,-0.0,False,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,V1996,0.482037,0.490181,False,0.000000,True,0.011926,False,-0.0,False,0.0,False
1996,V1997,2.439861,0.123546,False,0.000000,True,0.006985,False,0.0,False,0.0,False
1997,V1998,10.313773,0.002124,True,0.093592,False,0.043417,True,0.0,False,0.0,False
1998,V1999,0.899225,0.346793,False,0.000000,True,0.038369,True,0.0,False,0.0,False


### Preparing the decision for each feature

In [None]:
# Fuse the per-method votes into a single 'decision' column.
d_fusion = data_fusion(fs_methods=fs_methods)
features_scores = d_fusion.prepare_decision(
    decision=decision,
    features_scores=features_scores,
)
features_scores

Unnamed: 0,feature,anova_f,anova_p_val,anova_support,mi_value,mi_support,logistic_reg_coef,logistic_reg_support,lasso_coef,lasso_support,dec_tree_coef,dec_tree_support,decision
0,V1,0.641398,0.426365,False,0.000000,True,-0.004209,False,-0.0,False,0.0,False,False
1,V2,0.002387,0.961194,False,0.093211,False,0.008659,False,-0.0,False,0.0,False,False
2,V3,0.386330,0.536590,False,0.015255,True,0.019149,False,-0.0,False,0.0,False,False
3,V4,0.145885,0.703849,False,0.004555,True,-0.005497,False,-0.0,False,0.0,False,False
4,V5,0.056955,0.812188,False,0.037575,True,-0.006177,False,-0.0,False,0.0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,V1996,0.482037,0.490181,False,0.000000,True,0.011926,False,-0.0,False,0.0,False,False
1996,V1997,2.439861,0.123546,False,0.000000,True,0.006985,False,0.0,False,0.0,False,False
1997,V1998,10.313773,0.002124,True,0.093592,False,0.043417,True,0.0,False,0.0,False,False
1998,V1999,0.899225,0.346793,False,0.000000,True,0.038369,True,0.0,False,0.0,False,False


### Filtering the rows based on the decision

In [None]:
# Keep only the features whose fused decision is True.
filtered_fs_scores = pre_data_cls.filter_by_decision(features_scores=features_scores)
filtered_fs_scores


Unnamed: 0,feature,anova_f,anova_p_val,anova_support,mi_value,mi_support,logistic_reg_coef,logistic_reg_support,lasso_coef,lasso_support,dec_tree_coef,dec_tree_support,decision
14,V15,5.281545,0.025054,True,0.000000,True,0.040616,True,-0.000000,False,0.0,False,True
53,V54,7.138158,0.009701,True,0.000000,True,0.028357,True,0.000000,False,0.0,False,True
117,V118,4.810661,0.032171,True,0.038136,True,0.032416,True,0.000000,False,0.0,False,True
163,V164,7.554419,0.007896,True,0.000000,True,0.080933,True,0.018958,True,0.0,False,True
187,V188,6.142825,0.016025,True,0.003084,True,0.043472,True,0.000000,False,0.0,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1910,V1911,5.216268,0.025931,True,0.046194,True,0.027090,True,0.000000,False,0.0,False,True
1915,V1916,7.124551,0.009767,True,0.082293,False,0.070215,True,0.014223,True,0.0,False,True
1934,V1935,11.719212,0.001120,True,0.082222,False,0.098063,True,0.043574,True,0.0,False,True
1951,V1952,0.254423,0.615825,False,0.000000,True,0.047659,True,0.003510,True,0.0,False,True


In [None]:
filtered_fs_scores.shape

(65, 13)

### Transforming selected features into the training data X

In [None]:
# Reduce the feature matrix to the selected columns.
new_features = pre_data_cls.transform_new_features(
    X=features,
    features_scores=filtered_fs_scores,
)
new_features

Unnamed: 0,V15,V54,V118,V164,V188,V213,V226,V228,V237,V251,...,V1804,V1810,V1859,V1891,V1896,V1911,V1916,V1935,V1952,V1954
0,2.035089,1.538324,0.969836,1.012554,1.201727,0.670643,1.356519,1.425127,0.729774,0.987932,...,-0.196252,-1.272061,-0.898932,-0.181223,-2.400446,-1.158136,-1.275139,-1.977302,0.171296,-1.523087
1,2.184510,1.109248,0.826933,0.830204,1.647886,0.184877,1.355272,0.248032,0.437548,0.459324,...,-2.168849,-0.601746,-0.981504,-0.426691,-1.846559,-0.220970,-0.432712,-2.367909,-0.011954,-0.216916
2,2.865767,1.458204,1.868558,1.718855,0.223534,1.016268,0.898474,0.932580,1.248726,1.040705,...,-0.382179,-1.246477,-1.188001,-0.973996,-0.829717,-1.419053,-1.574573,-0.909947,-0.984897,-1.065977
3,2.255694,1.112147,1.444674,1.545227,0.725323,0.820482,2.304778,1.609751,1.077711,1.205307,...,-1.386044,3.542825,-0.921261,-0.087963,-1.149286,-0.968686,-1.384379,-1.972659,-0.080658,-0.758213
4,2.383546,2.070685,1.415645,1.324904,0.206027,0.761945,1.085062,1.481545,0.955313,0.904904,...,0.106405,-1.845355,-1.319766,-0.523970,-1.323563,-0.968795,-1.567167,-0.976367,-0.840013,-1.208997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,2.576927,1.908137,1.445814,1.497633,1.327282,0.565840,2.188200,1.376605,1.009719,2.419516,...,-0.293738,-0.661786,-1.090999,-1.178725,-1.730223,-0.410779,-0.289293,-1.161223,0.751380,-0.535171
58,2.773353,2.247273,1.478023,1.950222,1.110105,1.341779,0.776916,1.473735,1.679681,2.312786,...,-0.144757,0.679870,-0.934598,-0.827484,-1.124957,-0.035577,-1.017295,-0.563905,0.091013,0.165929
59,2.104167,1.577702,2.002133,2.190752,1.916171,0.850630,2.085853,1.156246,1.002147,0.919267,...,-0.426977,-0.524605,-0.691800,0.029404,-2.148282,-0.585751,-1.018359,-1.438532,-0.724705,-1.381443
60,2.304205,1.741622,1.938732,1.494403,1.316519,1.615623,2.255276,1.828747,1.756576,1.648343,...,-0.278328,-1.241291,-0.361736,-1.649490,-1.226807,0.315564,-0.543700,-1.132150,-0.317939,-1.395705


In [None]:
# NOTE(review): `sfs_features` is first assigned in a later cell
# (`sfs_features = sfs1.transform(new_features)`), so this cell raises
# NameError on Restart Kernel -> Run All. Run it after that cell, or move it
# below; confirm which feature set the "base" run is meant to use.
clf_base10 = classifiers(sfs_features,target,10)
clf_base10.get_all_results()



classifier name: SVC
	C: 0.1
	gamma: 0.001
	kernel: linear
Accuracy of the best model :90.00  (+\-26.67)
Macro precision  of the best model :92.33 (+\-20.40) 
Macro recall  of the best model :91.25 (+\-22.50) 




classifier name: RandomForestClassifier
	criterion: gini
	max_depth: 4
	max_features: auto
	n_estimators: 300


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Accuracy of the best model :83.57  (+\-25.86)
Macro precision  of the best model :78.83 (+\-48.12) 
Macro recall  of the best model :80.83 (+\-35.59) 
classifier name: GradientBoostingClassifier
	criterion: mae
	learning_rate: 0.1
	loss: deviance
	max_depth: 5
	max_features: log2
	n_estimators: 10
	subsample: 0.6
Accuracy of the best model :84.05  (+\-24.75)
Macro precision  of the best model :83.92 (+\-38.64) 
Macro recall  of the best model :80.42 (+\-30.29) 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [None]:
# Tune and evaluate the classifiers on the fused feature subset with 10-fold CV.
clf_fs10 = classifiers(features=new_features, target=target, cv=10)
clf_fs10.get_all_results()

In [None]:
# clf_fs5 = classifiers(new_features,target,5)
# clf_fs5.get_all_results()

In [None]:
from sklearn import svm
from sklearn.model_selection import cross_val_score
# Quick check: untuned linear SVM, 5-fold CV on the fused feature subset.
clf = svm.SVC(C=1, kernel='linear')
scores = cross_val_score(clf, new_features, target, cv=5)

mean_accuracy = scores.mean()
spread = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_accuracy, spread))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
# Quick check: untuned random forest, 10-fold CV on the fused feature subset.
clf = RandomForestClassifier(random_state=0, n_estimators=200)
scores = cross_val_score(clf, new_features, target, cv=10)

mean_accuracy = scores.mean()
spread = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_accuracy, spread))

In [None]:
import nltk
import sklearn
import numpy 
import pandas


In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LogisticRegression
# Forward sequential selection down to half of the fused features,
# scored by 5-fold CV accuracy of a logistic regression.
lr = LogisticRegression()
half_of_selected = new_features.shape[1] // 2

sfs1 = SFS(
    lr,
    k_features=half_of_selected,
    forward=True,
    floating=False,
    verbose=2,
    scoring='accuracy',
    cv=5,
).fit(new_features, target)

In [None]:
# Keep only the columns picked by the sequential selector fitted above.
sfs_features = sfs1.transform(new_features)

In [None]:
sfs_features

array([[ 1.53832377e+00,  9.69836495e-01,  1.01255421e+00,
         9.71362225e-01, -2.52599607e-01,  8.13738000e-04,
        -9.18247370e-02, -1.36399873e+00, -1.19347943e+00,
        -1.52308659e+00],
       [ 1.10924778e+00,  8.26932667e-01,  8.30203574e-01,
         3.05376281e-01, -4.29750201e-01, -2.01189190e-01,
        -1.32568184e+00, -1.13132549e+00, -1.36290797e+00,
        -2.16915702e-01],
       [ 1.45820429e+00,  1.86855818e+00,  1.71885483e+00,
         6.48836324e-01,  4.41602891e-01,  1.25633106e-01,
        -7.01685884e-01, -3.76903565e-01, -2.95681244e-01,
        -1.06597741e+00],
       [ 1.11214727e+00,  1.44467366e+00,  1.54522720e+00,
         4.30152393e-01,  4.13370292e-01, -1.52163629e-01,
        -9.79802280e-01, -1.24135502e+00, -1.03119970e+00,
        -7.58212732e-01],
       [ 2.07068481e+00,  1.41564488e+00,  1.32490423e+00,
         1.02394662e+00,  4.62804360e-02, -3.47299362e-01,
        -2.11156755e-01, -7.45168672e-01, -1.47265122e+00,
        -1.

In [None]:
sfs1.k_score_

0.9679487179487178

In [None]:
from sklearn import svm
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier

# Recursive feature elimination down to half of the fused features.
# Alternative estimators tried:
#   LogisticRegression(solver='lbfgs')
#   ExtraTreesClassifier(n_estimators=100)
#   RandomForestClassifier(n_estimators=200, random_state=0)
model = svm.SVC(kernel='linear', C=100)

# n_features_to_select is passed by keyword: newer scikit-learn releases
# reject it as a positional argument.
rfe = RFE(model, n_features_to_select=new_features.shape[1] // 2)
fit = rfe.fit(new_features, target)
print("Num Features: %d" % fit.n_features_)

rfe_features = fit.transform(new_features)
rfe_features.shape

Num Features: 32


(62, 32)

In [None]:
int(new_features.shape[1] / 2)

32