In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pylab 
import scipy.stats as stats
import time
import sklearn.ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from pandas import ExcelWriter
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.metrics import roc_auc_score

from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.svm import LinearSVC

from sklearn.feature_selection import RFE

from sklearn.datasets import make_moons, make_circles, make_classification

import collections

# Audible "run finished" signal used at the end of the long-running cells.
# winsound is only available on Windows; elsewhere fall back to a stand-in
# exposing the same Beep(frequency, duration) call that rings the terminal
# bell, so this cell and the later winsound.Beep calls do not fail.
try:
    import winsound
except ImportError:
    import types
    winsound = types.SimpleNamespace(
        Beep=lambda frequency, duration: print('\a', end='')
    )
frequency = 2500  # beep pitch in hertz
duration = 1000  # beep length in milliseconds (1000 ms == 1 second)

In [None]:
# Change the kernel's working directory; the relative file name passed to
# read_excel in the next cell is resolved against it.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
cd C:\Users\DELL003\Desktop

In [None]:
# Load the 'All' worksheet of the workbook and show its (rows, columns) shape.
data = pd.read_excel(io='All_New.xlsx', sheet_name='All')
data.shape

In [None]:
# Separate the samples by the value of the 'grup' label column (0, 1 or 2).
group0 = data.loc[data['grup'] == 0]
group1 = data.loc[data['grup'] == 1]
group2 = data.loc[data['grup'] == 2]

# Choose only ONE of the options below for the groups
-----------------------------------------------------------------

In [None]:
# Option: keep all three groups.
# Each of the alternative group cells overwrites `data`; if several are run,
# the last one executed wins.
data = pd.concat([group0, group1, group2], axis='index')

In [None]:
# Option: controls (group 0) together with low performers (group 2).
data = pd.concat([group0, group2], axis='index')

In [None]:
# Option: controls (group 0) together with high performers (group 1).
data = pd.concat([group0, group1], axis='index')

In [None]:
# Option: high performers (group 1) together with low performers (group 2).
data = pd.concat([group1, group2], axis='index')

 -------------------------------------------------------------

In [None]:
# Separate the modalities: each frame holds the columns whose name contains
# the modality tag; the label column is kept aside so it can be re-attached.
aBV = data.filter(like='aBV', axis='columns')
CBF = data.filter(like='CBF', axis='columns')
SPECTROSCOPY = data.filter(like='SPECTROSCOPY', axis='columns')
grup = data.loc[:, 'grup']

# Choose only ONE of the options below for the modalities
 ----------------------------

In [None]:
# Option: label column plus the CBF columns only.
# Each of the alternative modality cells overwrites `data`; if several are
# run, the last one executed wins.
data = pd.concat([grup, CBF], axis='columns')

In [None]:
# Option: label column plus the aBV columns only.
data = pd.concat([grup, aBV], axis='columns')

In [None]:
# Option: ASL, i.e. label column plus both the CBF and the aBV columns.
data = pd.concat([grup, CBF, aBV], axis='columns')

In [None]:
# Option: label column plus the SPECTROSCOPY columns only.
data = pd.concat([grup, SPECTROSCOPY], axis='columns')

---------------------------

In [None]:
# Restrict the frame to numerical columns.
# Feature selection would normally follow full pre-processing (categorical
# variables encoded as numbers); for simplicity only the columns that are
# already numeric are kept here.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_vars = data.select_dtypes(include=numerics).columns.tolist()
data = data.loc[:, numerical_vars]
data.shape

In [None]:
# Drop sparse columns first, then sparse rows.
# A column is kept only if it is non-missing in at least 80% of the samples.
min_samples_per_feature = round(0.8 * data.shape[0])
data = data.dropna(axis='columns', thresh=min_samples_per_feature)
# A row is kept only if at least half of the remaining columns are non-missing.
min_features_per_sample = round(0.5 * data.shape[1])
data = data.dropna(axis='index', thresh=min_features_per_sample)
print(data.shape)

In [None]:
# Count the remaining missing values per column; raise the row display limit
# so the per-column listing is not truncated.
pd.set_option('display.max_rows', 4000)
data.isna().sum()

In [None]:
# Remove the column display limit so wide frames are shown with every column.
pd.set_option('display.max_columns', None)  


In [None]:
# Display the filtered frame (rendering is bounded by the display options set above).
data

In [None]:
# Number of features each RFE selector keeps (passed as n_features_to_select
# when the selectors are built below); change it here.
number_of_selected_features = 1

In [None]:
# Two imputation strategies to compare:
# imputer1 replaces NaN with the column mean, imputer2 replaces NaN with 0.
imputer1 = SimpleImputer(strategy='mean', missing_values=np.nan)
imputer2 = SimpleImputer(strategy='constant', fill_value=0, missing_values=np.nan)

In [None]:
# Feature selection algorithms: recursive feature elimination (RFE) wrapped
# around four different tree ensembles, each keeping
# `number_of_selected_features` features.
fs1 = RFE(RandomForestClassifier(n_estimators=100), n_features_to_select=number_of_selected_features, verbose=2)
fs2 = RFE(ExtraTreesClassifier(n_estimators=100, bootstrap=True), n_features_to_select=number_of_selected_features, verbose=2)
# max_features='auto' meant sqrt(n_features) for GradientBoostingClassifier and
# was removed in scikit-learn 1.3; 'sqrt' is the same setting on every release.
fs3 = RFE(GradientBoostingClassifier(n_estimators=100, max_features='sqrt', subsample=0.7), n_features_to_select=number_of_selected_features, verbose=2)
# The weak learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` (old name removed in scikit-learn 1.4); the
# first positional parameter is the weak learner under both names.
fs4 = RFE(AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=100), n_features_to_select=number_of_selected_features, verbose=2)
#Add more feature selection options


In [None]:
#Pipelines

def _make_pipe(imputer, selector):
    """Chain the given imputer, a fresh StandardScaler and the given selector."""
    return Pipeline([('imputer', imputer), ('scaler', StandardScaler()), ('fs', selector)])


# Odd-numbered pipelines use mean imputation (imputer1), even-numbered ones
# zero-fill imputation (imputer2); each pair shares one RFE selector.
pipe1 = _make_pipe(imputer1, fs1)
pipe2 = _make_pipe(imputer2, fs1)

pipe3 = _make_pipe(imputer1, fs2)
pipe4 = _make_pipe(imputer2, fs2)

pipe5 = _make_pipe(imputer1, fs3)
pipe6 = _make_pipe(imputer2, fs3)

pipe7 = _make_pipe(imputer1, fs4)
pipe8 = _make_pipe(imputer2, fs4)

#Add more pipelines



In [None]:
# Pipelines evaluated by the loops below.
# NOTE(review): pipe7 and pipe8 (the AdaBoost-based selector) are defined but
# are not part of this list - confirm that this is intentional.
pipelist = [
    pipe1, pipe2,
    pipe3, pipe4,
    pipe5, pipe6,
]

In [None]:
# Underfitting check: fit every pipeline on the full data set and print the
# confusion matrix of its predictions on that same data. The wall-clock time
# is kept in `start` / `end` for the estimate computed in the next cell.
start = time.time()
X_all = np.array(data.drop(labels=['grup'], axis=1))
y_all = data['grup']
for pipe in pipelist:
    pipe = pipe.fit(X_all, y_all)
    training_pred = pipe.predict(X_all)
    print(confusion_matrix(y_all, training_pred))
end = time.time()

print('Computation Time:',end - start)
winsound.Beep(frequency, duration)

In [None]:
# Rough run-time forecast: the duration of the single pass above scaled by 100
# (presumably the 100 repetitions per pipeline of the selection loop below).
elapsed_seconds = end - start
Estimated_Time = elapsed_seconds * 100
print('Estimated Time:',Estimated_Time/60,'min','or',Estimated_Time/3600,'h')

In [None]:
# Selection-stability experiment: for every pipeline, draw repeated random
# 80/20 train/test splits, fit the pipeline on the training part and record
# which column indices its RFE step keeps. The per-pipeline and overall
# frequencies of each selected index tuple are printed.
N_REPEATS = 100  # number of random splits drawn per pipeline

start = time.time()
print('Program start to run at',time.localtime())
counter_collections = collections.Counter()

# The feature matrix and labels do not change between splits, so build them once.
X_all = np.array(data.drop(labels=['grup'], axis=1))
y_all = data['grup']

for pipe in pipelist:
    selectedfeatures = []

    for i in range(N_REPEATS):
        X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2)

        # Fit imputer -> scaler -> RFE on the training part only and store the
        # indices of the columns the RFE step ('fs') retained for this split.
        pipe = pipe.fit(X_train, y_train)
        selectedfeatures.append(tuple(pipe.named_steps['fs'].get_support(indices=True)))

    counter = collections.Counter(selectedfeatures)
    counter_collections.update(counter)

    # most_common has to be called; printing the bound method shows no counts.
    # The (index_tuple, count) pairs are printed as a plain list because they
    # are ragged and cannot be packed into a regular NumPy array.
    print('Most Common Selected Features:', counter.most_common())

print('Total Selected Features in all methods:', counter_collections.most_common())

end = time.time()
print('Computation Time:',(end - start)/60,'min')
winsound.Beep(frequency, duration)

In [None]:
# Pick feature columns by position within the feature matrix (`data` without
# the 'grup' column) and list their names.
# NOTE(review): the positions are hard-coded - presumably copied from the
# indices printed by the selection loop; verify them after any re-run.
selected_positions = [92, 14, 4, 53, 60, 2]
features = data.drop(columns=['grup']).iloc[:, selected_positions]
features.columns

In [None]:
# Scatter plot of two features coloured by group.
# x and y are passed by keyword: recent seaborn releases no longer accept them
# positionally. The marker style is given through `markers=` because a
# 'marker' entry inside scatter_kws is overwritten by lmplot's own marker setting.
sns.lmplot(x='aBVmedian331',       # horizontal axis
           y='aBVmedian207',       # vertical axis
           data=data,              # data source
           fit_reg=False,          # do not fit a regression line
           hue="grup",             # colour by group label
           markers="D",            # diamond markers
           scatter_kws={"s": 40})  # marker size


In [None]:
# Heat-map of the pairwise correlations between the selected features,
# with the colour scale fixed to [-1, 1] and the feature names as tick labels.
corr = features.corr()

fig, ax = plt.subplots()
cax = ax.matshow(corr, vmin=-1, vmax=1, cmap='coolwarm')
fig.colorbar(cax)

feature_names = features.columns
ticks = np.arange(len(feature_names))
ax.set_xticks(ticks)
plt.xticks(rotation=90)  # vertical x labels
ax.set_yticks(ticks)
ax.set_xticklabels(feature_names)
ax.set_yticklabels(feature_names)
plt.show()

In [None]:
# Display the correlation matrix of the selected features as a table.
corr

In [None]:
# Re-attach the 'grup' label column to the selected feature columns.
selected_features_with_grup = pd.concat([features, data['grup']], axis='columns')

In [None]:
# Display the selected features together with the 'grup' label column.
selected_features_with_grup

In [None]:
# Change the kernel's working directory; the relative file name used by the
# Excel export in the next cell is resolved against it.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
cd C:\Users\DELL003\Desktop\Output_Features

In [None]:
# Export the selected features plus the 'grup' column to an Excel workbook.
# The context manager saves and closes the file on exit (ExcelWriter.save()
# was removed in pandas 2.0), and sheet_name is passed by keyword because the
# positional form is deprecated.
# NOTE(review): the output file name is hard-coded regardless of which
# group/modality option was run above - rename it to match the selection.
with ExcelWriter('featuresCBFmeanAll_KH.xlsx') as writer:
    selected_features_with_grup.to_excel(writer, sheet_name='Sheet1', index=False)