In [11]:
import numpy as np 
import pandas as pd 

class preparing_data:
    def __init__(self):
        self.data_url = "data/colon.csv"
        
    def load_data(self):
        #df = pd.read_csv(self.data_url, delimiter=',')
        df = pd.read_csv(self.data_url, delimiter=';')
        features = df.iloc[:,:-1]
        target = df.iloc[:,-1]
        
        return features,target
    
    def prepare_features_scores_list(self,features):
        features_scores = pd.DataFrame()
        features_scores.insert(0, 'feature', list(features))
        
        return features_scores
    
    def add_score_column(self,features_scores,col_name,col_value):
        # convert 1D array to rows and insert it into df colum
        features_scores[col_name] = pd.Series(col_value)
        
        return features_scores
    
    # get the rows according to true decision
    def filter_by_decision(self,features_scores):
        filtered= features_scores.loc[(features_scores['decision'] == True)]
        
        return filtered
                       

    # transform selected features into the training data X
    def transform_new_features(self,X,features_scores):
        features_list= features_scores['feature'].values
        new_features = X.loc[:, features_list]
        
        return new_features

        


In [2]:
import numpy as np 
import pandas as pd 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif,mutual_info_classif
from sklearn import svm
from sklearn.feature_selection import SelectFromModel


class features_selection:
    def __init__(self,features,target,fs_methods,threshold):
        self.fs_methods = fs_methods
        self.features = features
        self.target = target
        self.threshold = threshold

        
    def get_features_scores(self,features_scores):

        pre_data= preparing_data()
        
        if 'anova' in self.fs_methods:
            f,p_val,anova_support = self.anova(0.05)
            features_scores = pre_data.add_score_column(features_scores,'anova_f',f)
            features_scores = pre_data.add_score_column(features_scores,'anova_p_val',p_val)
            features_scores = pre_data.add_score_column(features_scores,'anova_support',anova_support)
        
        if 'mi' in self.fs_methods:
            mi,mi_threshold,mi_support = self.mutual_info()
            features_scores = pre_data.add_score_column(features_scores,'mi_value',mi)
            features_scores = pre_data.add_score_column(features_scores,'mi_support',mi_support)
        
        if 'fm' in self.fs_methods:
            coef,fm_threshold,fm__support = self.from_model()
            features_scores = pre_data.add_score_column(features_scores,'fm_coef',coef)
            features_scores = pre_data.add_score_column(features_scores,'fm_support',fm__support)
        
        return features_scores
        
    def anova(self, threshold):
        #f,p_value = f_classif(self.features,self.target)
        anova = SelectKBest(f_classif, k='all').fit(self.features,self.target)
        f= anova.scores_
        p_value = anova.pvalues_
        support = p_value < threshold
        

        return f,p_value,support
    
    def mutual_info(self):
        mi = SelectKBest(mutual_info_classif, k='all').fit(self.features,self.target)
        scores= mi.scores_
        # mean as a threshold
        #threshold = mi.estimator_.
        if self.threshold == 'median':
            threshold = np.median(scores)
        else:
            threshold = np.mean(scores)
            
        support = scores > threshold

        return scores,threshold,support
    
    def from_model(self):
        clf = svm.SVC(kernel='linear', C=1)
        model = SelectFromModel(estimator=clf).fit(self.features,self.target)
        coef = model.estimator_.coef_
        threshold = model.threshold_
       
        support = coef > threshold
        # convert 2D array to 1D
        coef = np.reshape(coef, (np.product(coef.shape),))
        support = np.reshape(support, (np.product(support.shape),))
        
        return coef,threshold,support

In [3]:
class data_fusion:
    def __init__(self,fs_methods):
        self.fs_methods = fs_methods
        self.col_support = []
        for fs in self.fs_methods:
            self.col_support.append(fs + "_support")
    
    def prepare_decision(self, decision,features_scores):
        if decision == 'consensus':
            return self.decision_consensus(features_scores)
        elif decision == 'majority':
            return self.decision_majority(features_scores)
    
    def decision_majority(self,features_scores):
        decsions=[]
        for row in features_scores[self.col_support].values:
            decsion, count = np.unique(row, return_counts=True)
            if np.all(count <= 1):
                decsions.append(row[0])
            else:
                decsions.append(decsion[np.argmax(count)] )

        features_scores['decision'] = decsions
        
        return features_scores
        
    # The support output of all FS methods must be a true 
    def decision_consensus(self,features_scores):
        features_scores['decision'] = features_scores[self.col_support].eq(True, axis=0).all(axis=1)

        return features_scores

### Loading and preparing a data frame for the features and scores based on fs methods


In [4]:
fs_methods = ["anova", "mi", "fm"]
threshold = 'median'
decision = 'consensus'

pre_data_cls = preparing_data()
features,target = pre_data_cls.load_data()
features_scores = pre_data_cls.prepare_features_scores_list(features)
features_scores

Unnamed: 0,feature
0,V1
1,V2
2,V3
3,V4
4,V5
...,...
1995,V1996
1996,V1997
1997,V1998
1998,V1999


### Filling dataframe with scores based on fs methods


In [5]:
fs = features_selection(features,target,fs_methods,threshold )
features_scores= fs.get_features_scores(features_scores)
features_scores

Unnamed: 0,feature,anova_f,anova_p_val,anova_support,mi_value,mi_support,fm_coef,fm_support
0,V1,0.641398,0.426365,False,0.000000,False,-0.001927,False
1,V2,0.002387,0.961194,False,0.093211,True,0.001249,False
2,V3,0.386330,0.536590,False,0.015255,False,0.003638,False
3,V4,0.145885,0.703849,False,0.004555,False,-0.002680,False
4,V5,0.056955,0.812188,False,0.037575,True,-0.002106,False
...,...,...,...,...,...,...,...,...
1995,V1996,0.482037,0.490181,False,0.000000,False,0.001586,False
1996,V1997,2.439861,0.123546,False,0.000000,False,0.002472,False
1997,V1998,10.313773,0.002124,True,0.093592,True,0.011555,True
1998,V1999,0.899225,0.346793,False,0.000000,False,0.012881,True


### Preparing  the decision for each feature

In [6]:
d_fusion = data_fusion(fs_methods)
features_scores = d_fusion.prepare_decision(decision,features_scores)
features_scores

Unnamed: 0,feature,anova_f,anova_p_val,anova_support,mi_value,mi_support,fm_coef,fm_support,decision
0,V1,0.641398,0.426365,False,0.000000,False,-0.001927,False,False
1,V2,0.002387,0.961194,False,0.093211,True,0.001249,False,False
2,V3,0.386330,0.536590,False,0.015255,False,0.003638,False,False
3,V4,0.145885,0.703849,False,0.004555,False,-0.002680,False,False
4,V5,0.056955,0.812188,False,0.037575,True,-0.002106,False,False
...,...,...,...,...,...,...,...,...,...
1995,V1996,0.482037,0.490181,False,0.000000,False,0.001586,False,False
1996,V1997,2.439861,0.123546,False,0.000000,False,0.002472,False,False
1997,V1998,10.313773,0.002124,True,0.093592,True,0.011555,True,True
1998,V1999,0.899225,0.346793,False,0.000000,False,0.012881,True,False


### Filtering the rows based on the decision

In [7]:
filtered = pre_data_cls.filter_by_decision(features_scores)
filtered

Unnamed: 0,feature,anova_f,anova_p_val,anova_support,mi_value,mi_support,fm_coef,fm_support,decision
42,V43,15.231294,0.000243,True,0.100077,True,0.008334,True,True
46,V47,13.658871,0.000476,True,0.098696,True,0.015714,True,True
71,V72,11.300812,0.001353,True,0.098436,True,0.010295,True,True
99,V100,16.830599,0.000125,True,0.115068,True,0.010397,True,True
117,V118,4.810661,0.032171,True,0.038136,True,0.008013,True,True
...,...,...,...,...,...,...,...,...,...
1962,V1963,6.034942,0.016936,True,0.295611,True,0.015976,True,True
1971,V1972,17.003775,0.000116,True,0.088015,True,0.009477,True,True
1982,V1983,17.970879,0.000079,True,0.104030,True,0.012622,True,True
1992,V1993,9.264142,0.003466,True,0.083429,True,0.025890,True,True


### Transforming selected features into the training data X

In [8]:
new_features = pre_data_cls.transform_new_features(features,filtered)
new_features

Unnamed: 0,V43,V47,V72,V100,V118,V141,V187,V190,V199,V237,...,V1920,V1935,V1943,V1959,V1960,V1963,V1972,V1983,V1993,V1998
0,1.638994,1.841401,0.837295,1.562545,0.969836,0.380573,0.911996,1.101830,0.722555,0.729774,...,-2.074948,-1.977302,-1.431748,-0.028511,-0.810619,-1.217036,-0.233146,-0.721758,-0.983356,-0.314519
1,1.652925,2.083153,1.299608,1.023201,0.826933,0.550491,0.863113,0.261851,0.360765,0.437548,...,-0.971048,-2.367909,-1.504903,-0.524474,-1.064721,-0.430265,0.090045,-0.658216,-3.063926,-1.213857
2,2.163850,2.527047,1.424466,0.996510,1.868558,1.032093,1.467008,0.658755,0.987293,1.248726,...,-2.166992,-0.909947,0.204145,-1.810719,-0.910476,-1.317392,-1.064836,-1.642952,0.318488,-1.014786
3,1.644518,2.052499,1.963838,0.469170,1.444674,1.452548,1.198822,0.581791,0.622009,1.077711,...,-2.745942,-1.972659,-0.467709,-1.584988,-1.327259,-0.878310,-1.498032,-3.129875,0.079492,-1.606205
4,1.832037,2.026698,1.379147,1.783923,1.415645,0.977549,1.136374,1.219419,0.890070,0.955313,...,-1.866233,-0.976367,-0.497108,-1.193273,-1.471401,-1.262787,-2.118877,-1.953679,-0.081214,-1.210256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,2.306037,2.529741,1.219807,2.266295,1.445814,1.349228,1.772102,1.043595,1.084486,1.009719,...,0.186144,-1.161223,-2.516730,0.483365,-0.011483,-0.643382,0.734046,0.084850,0.330376,-0.580251
58,2.338591,2.770440,1.940581,2.005886,1.478023,1.534218,2.229719,1.821454,1.628953,1.679681,...,-1.588604,-0.563905,-1.824751,-0.500416,-0.696111,-0.672547,0.258621,-0.446646,0.379994,-0.654357
59,2.062854,2.252410,1.832161,1.983318,2.002133,1.238813,1.374994,1.239655,1.161608,1.002147,...,-0.538933,-1.438532,-2.772417,0.252883,-0.403814,-0.723028,0.273870,-0.450565,-0.315230,-0.745123
60,2.117819,2.173026,1.878996,1.967643,1.938732,0.929690,2.114795,1.163909,1.488643,1.756576,...,-1.110134,-1.132150,-1.620037,0.246830,-0.376433,-0.529291,0.170839,-0.095567,0.186172,-0.364456


In [9]:
from sklearn import svm
from sklearn.model_selection import cross_val_score
clf = svm.SVC(kernel='linear', C=1)

scores = cross_val_score(clf, features,target, cv=10)
scores
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.84 (+/- 0.26)


In [10]:
from sklearn import svm
from sklearn.model_selection import cross_val_score
clf = svm.SVC(kernel='linear', C=1)

scores = cross_val_score(clf, new_features,target, cv=10)
scores
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.89 (+/- 0.26)


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
clf = GaussianNB()

scores = cross_val_score(clf, features,target, cv=10)
scores
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
clf = GaussianNB()

scores = cross_val_score(clf, new_features,target, cv=10)
scores
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
 from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
clf = KNeighborsClassifier(n_neighbors=3)

scores = cross_val_score(clf, features,target, cv=10)
scores
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
 from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
clf = KNeighborsClassifier(n_neighbors=3)

scores = cross_val_score(clf, new_features,target, cv=10)
scores
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(n_estimators=200, random_state=0)

scores = cross_val_score(clf, features,target, cv=10)
scores
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(n_estimators=200, random_state=0)

scores = cross_val_score(clf, new_features,target, cv=10)
scores
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
best_model = SVC(decision_function_shape = "ovo",probability=True) 
grid = {
     
'C': [0.001, 0.01, 0.1, 1, 10],'kernel': ["sigmoid","linear","rbf"],'gamma': [0.0001, 0.001, 0.01, 0.1]
}

grid_search = GridSearchCV(best_model, param_grid=grid,return_train_score=True, cv=10,n_jobs=-1)
grid_search.fit(features,target)

In [None]:
print(grid_search.best_estimator_,"\n")

best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(list(grid.keys())):
    print("\t{0}: {1}".format(param_name, best_parameters[param_name]))

selected_model = grid_search.best_estimator_


In [None]:
scores = cross_val_score(selected_model, features, target, cv=10)
scores
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))