In [None]:
import collections
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab
import scipy.stats as stats
import seaborn as sns
%matplotlib inline

from sklearn.base import clone
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

# Windows-only module; used for the audible beep after long computations.
import winsound
# Tone used for the completion beep: pitch in hertz, length in milliseconds (1000 ms == 1 second).
frequency, duration = 2500, 1000

In [None]:
# Switch the working directory so the relative workbook path read in the next cell resolves.
# NOTE(review): machine-specific absolute path; this bare `cd` relies on IPython automagic (`%cd`).
cd C:\Users\DELL003\Desktop

In [None]:
# Read the 'ALL_median' worksheet of the workbook and display its (rows, columns) size.
workbook_path = 'Book1.xlsx'
data = pd.read_excel(workbook_path, sheet_name='ALL_median')
data.shape

In [None]:
# Feature selection normally comes after data pre-processing: categorical
# variables would first be encoded as numbers so that their relationship with
# the target can be assessed too.
#
# For simplicity only the columns that already have a numeric dtype are kept.
# NOTE(review): the target column 'grup' used by later cells must itself have
# one of these dtypes to survive this filter.

numerics = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]
numerical_vars = data.select_dtypes(include=numerics).columns.tolist()
data = data.loc[:, numerical_vars]
data.shape

In [None]:
# Drop sparsely populated columns and rows.

# Keep a feature only if it has a value in at least 80 % of the samples.
min_samples_per_feature = round(0.8 * data.shape[0])
data = data.dropna(axis='columns', thresh=min_samples_per_feature)

# Keep a sample only if it has values for at least half of the remaining features.
min_features_per_sample = round(0.5 * data.shape[1])
data = data.dropna(axis='index', thresh=min_features_per_sample)

# Alternative (disabled): data = data.fillna(0)  # fill missing values with zeroes
print(data.shape)

In [None]:
# Number of features each RFE selector keeps (passed as n_features_to_select);
# change this value to select more or fewer features.
number_of_selected_features = 2

In [None]:
# Two alternative ways of filling missing values (NaN):
#   imputer1 - median strategy
#   imputer2 - constant strategy with fill value 0
imputer1 = SimpleImputer(strategy='median', missing_values=np.nan)
imputer2 = SimpleImputer(strategy='constant', fill_value=0, missing_values=np.nan)

In [None]:
# Feature-selection algorithms: recursive feature elimination (RFE) wrapped
# around two different tree ensembles, each with 100 trees.
# NOTE(review): no random_state is passed, so the selection can differ between runs.
rfe_options = dict(n_features_to_select=number_of_selected_features, verbose=2)

fs1 = RFE(RandomForestClassifier(n_estimators=100), **rfe_options)
fs2 = RFE(ExtraTreesClassifier(n_estimators=100), **rfe_options)

# Add more feature selection options





In [None]:
# Pipelines: impute -> standardise -> recursive feature elimination.
#
# Pipeline keeps the estimator objects it is given without copying them, so
# every pipeline receives its own clone of the shared templates. Without the
# clones, pipe1 and pipe3 would hold the very same selector object (fs1) and
# fitting one would overwrite the fitted state seen through the other; the
# same applies to imputer1, which pipe1 and pipe2 would otherwise share.

# median imputation + random-forest RFE
pipe1 = Pipeline([('imputer', clone(imputer1)), ('scaler', StandardScaler()),
                 ('fs', clone(fs1))])

# median imputation + extra-trees RFE
pipe2 = Pipeline([('imputer', clone(imputer1)), ('scaler', StandardScaler()),
                 ('fs', clone(fs2))])

# zero imputation + random-forest RFE
pipe3 = Pipeline([('imputer', clone(imputer2)), ('scaler', StandardScaler()),
                 ('fs', clone(fs1))])

#Add more pipelines



In [None]:
# Pipelines to evaluate; the following cells iterate over this list in order.
gslist = [pipe1, pipe2, pipe3]

In [None]:
# Training-set check for underfitting: fit every pipeline on the complete data
# set and print the confusion matrix of its predictions on that same data.
start = time.time()

X_all = np.array(data.drop(labels=['grup'], axis=1))  # predictors (target column removed)
y_all = data['grup']                                  # target

for gs in gslist:
    training_pred = gs.fit(X_all, y_all).predict(X_all)
    print(confusion_matrix(y_all, training_pred))

end = time.time()
print('Computation Time:',end - start)
winsound.Beep(frequency, duration)  # audible signal that the cell has finished

In [None]:
# Rough run-time forecast for the repeated-resampling cell: the timing measured
# above covered one fit per pipeline, while that cell performs 100 fits per pipeline.
Estimated_Time = 100 * (end - start)
est_minutes = Estimated_Time / 60
est_hours = Estimated_Time / 3600
print('Estimated Time:', est_minutes, 'min', 'or', est_hours, 'h')

In [None]:
# Selection-stability experiment: for every pipeline, refit on 100 random 80 %
# subsamples of the data and count how often each feature subset is selected.
start = time.time()
print('Program start to run at',time.localtime())

# Loop-invariant inputs, built once instead of on every iteration.
X_all = np.array(data.drop(labels=['grup'], axis=1))  # predictors (target column removed)
y_all = data['grup']                                  # target

for gs in gslist:

    selectedfeatures = []

    for i in range(100):

        # Only the training part is used; the random split acts as an 80 % subsample.
        X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2)

        # Fit the pipeline and record the column positions kept by its RFE step.
        # The positions refer to the predictor columns (i.e. `data` without 'grup')
        # and are stored as a tuple so that identical subsets can be counted.
        gs = gs.fit(X_train, y_train)
        selectedfeatures.append(tuple(gs.named_steps['fs'].get_support(indices=True)))

    counter = collections.Counter(selectedfeatures)

    # most_common is a method and has to be called; printing the attribute itself
    # only shows the bound-method object. The result is a list of
    # (feature_positions, count) pairs sorted by descending count. It is printed
    # directly because the nested pairs do not form a regular NumPy array.
    print('Most Common Selected Features:', counter.most_common())

end = time.time()
print('Computation Time:',(end - start)/60,'min')
winsound.Beep(frequency, duration)  # audible signal that the cell has finished

In [None]:
# Look up the names of two predictor columns by position (target column excluded).
predictors = data.drop(columns=['grup'])
df = predictors.iloc[:, [148, 197]]
df.columns

In [None]:
# Scatter plot of the predictor at position 151 against the one at position 367,
# with each point coloured by its 'grup' value.
predictors = data.drop(columns=['grup'])
group_colours = np.reshape(np.array(data['grup']), (-1, 1))
plt.scatter(
    predictors.iloc[:, [151]],
    predictors.iloc[:, [367]],
    c=group_colours,
    s=30,
    alpha=0.5,
)
plt.show()

In [None]:
# Scatter plot of the predictor at position 3 against the target 'grup',
# with each point coloured by its 'grup' value.
predictors = data.drop(columns=['grup'])
group_colours = np.reshape(np.array(data['grup']), (-1, 1))
plt.scatter(
    predictors.iloc[:, [3]],
    data['grup'],
    s=30,
    c=group_colours,
    alpha=0.5,
)
plt.show()

In [None]:
# Correlation heat map for a hand-picked set of predictor columns
# (positions refer to `data` with the target column 'grup' removed).
selected_positions = [151, 367, 37, 219, 200, 404, 87, 419, 331, 323, 272, 21, 288, 125]
features = data.drop(columns=['grup']).iloc[:, selected_positions]
corr = features.corr()

fig, ax = plt.subplots()
heatmap = ax.matshow(corr, cmap='coolwarm', vmin=-1, vmax=1)  # fixed colour scale of [-1, 1]
fig.colorbar(heatmap)

# One tick per selected column, labelled with the column name.
tick_positions = np.arange(len(features.columns))
ax.set_xticks(tick_positions)
plt.xticks(rotation=90)
ax.set_yticks(tick_positions)
ax.set_xticklabels(features.columns)
ax.set_yticklabels(features.columns)
plt.show()

In [None]:
# Display the correlation matrix of the selected predictor columns as a table.
corr