In [67]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 

# features selection libraries
from sklearn.feature_selection import RFE
from sklearn import svm
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

#import libraries models
from sklearn import model_selection, metrics, grid_search
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Import training data 

CAUTION! when importing the file the arrays nummer which were previously the names of the rows are no in a column!(take a look at the table) So we have to adjust this for the test and training data!

In [3]:
x_train = pd.read_csv('../data/train_features.csv')
y_train = pd.read_csv('../data/train_labels.csv')

Correct columns and names of rows for train (will do test data later). For some reason when importing the saved x_train and y_trian data pandas puts the names of the rows in a seperate columns. Thus adding an additional column (see x and y train after importing it)

In [4]:
# rename rows of x_train
x_train = x_train.rename(columns= {'Unnamed: 0':'arrays'})

array = 0 
for i in x_train.arrays:
    x_train = x_train.rename(index = {array : i})
    array +=1
x_train = x_train.drop('arrays', axis =1)

# rename rows of y_train
y_train = y_train.rename(columns= {'Unnamed: 0':'arrays'})

array = 0 
for i in y_train.arrays:
    y_train = y_train.rename(index = {array : i})
    array +=1
y_train = y_train.drop('arrays', axis =1)

In [None]:
#y_train['Subgroups'] = y_train['Subgroups'].map(lambda x: x.strip('"'))

# RFE feature selection 

Note, RFE features selection might take a couple of minutes when testing for multiple parameters. 

In the paper they SVM as estimator and uses different penalties for C
for the SVM estimator for optimization.
C penalties used: 0.25, 1, 4, 16, 64, 256

Use the GridSearchCV function scikit-learn provides to search for best C penalty parameter


## 8 features

### Which C penalties gives best score for features selection

In [5]:
# select best SVC c penalty for 8 features
c_range = [0.25, 1, 4, 16, 64, 256]

estimator = svm.SVC(kernel="linear")
selector_8f = RFE(estimator,8)
grid8 = GridSearchCV(selector_8f, param_grid={'estimator__C': [0.25, 1, 4, 16, 64, 256]})
grid8.fit(x_train, y_train.values.ravel())

GridSearchCV(cv=None, error_score='raise',
       estimator=RFE(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
  n_features_to_select=8, step=1, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'estimator__C': [0.25, 1, 4, 16, 64, 256]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [6]:
print(grid8.best_params_)
print(grid8.best_score_)


{'estimator__C': 0.25}
0.775


### Use parameter with best score

In [7]:
# feature selection for 8 features with best C score(=0.25)
estimator = svm.SVC(kernel ='linear', C=0.25) # in the paper they used SVC as the estimator (see appendix)
selector_RFE8 = RFE(estimator, 8)
selector_RFE8 = selector_RFE8.fit(x_train,y_train.values.ravel())

Since the feature columns are label 0 to 2833 we can see which columns are selected by finding the TRUE items in selector_8f.support_. This command gives you an array with true or false. If the feature are that position was selected it is label true

In [8]:
#find selected features
selector_RFE8.get_support(indices = True)


array([ 192, 1062, 1677, 1900, 2024, 2184, 2213, 2750], dtype=int64)

In [9]:
x_train_RFE8 = selector_RFE8.transform(x_train)
x_train_RFE8.shape

(80, 8)

## 32 features

### Which C penalties gives best score for features selection

In [10]:
# select best SVC c penalty for 32 features using gridsearchCV

estimator = svm.SVC(kernel="linear")
selector_32f = RFE(estimator,32)
grid32 = GridSearchCV(selector_32f, param_grid={'estimator__C': [0.25, 1, 4, 16, 64, 256]})
grid32.fit(x_train, y_train.values.ravel())

GridSearchCV(cv=None, error_score='raise',
       estimator=RFE(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
  n_features_to_select=32, step=1, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'estimator__C': [0.25, 1, 4, 16, 64, 256]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [11]:
print(grid32.best_params_)
print(grid32.best_score_)

{'estimator__C': 0.25}
0.8


### Use parameter with best score


In [15]:
# feature selection for 32 features for best C score(=0.25)

estimator = svm.SVC(kernel ='linear', C=0.25)
selector_RFE32 = RFE(estimator, 32)
selector_RFE32 = selector_RFE32.fit(x_train,y_train.values.ravel())


In [16]:
#find selected features
selector_RFE32.get_support(indices = True)

array([ 190,  192,  226,  673,  762,  768,  771,  772,  854, 1035, 1062,
       1243, 1562, 1569, 1655, 1657, 1677, 1773, 1900, 1962, 2021, 2024,
       2030, 2079, 2125, 2183, 2184, 2213, 2214, 2548, 2655, 2750], dtype=int64)

In [17]:
x_train_RFE32 = selector_RFE32.transform(x_train)
x_train_RFE32.shape

(80, 32)

## 94 features

### Which C penalties gives best score for features selection

In [18]:
# select best SVC c penalty for 32 features using gridsearchCV

estimator = svm.SVC(kernel="linear")
selector_94f = RFE(estimator,94)
grid94 = GridSearchCV(selector_94f, param_grid={'estimator__C': [0.25, 1, 4, 16, 64, 256]})
grid94.fit(x_train, y_train.values.ravel())

GridSearchCV(cv=None, error_score='raise',
       estimator=RFE(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
  n_features_to_select=94, step=1, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'estimator__C': [0.25, 1, 4, 16, 64, 256]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [19]:
print(grid94.best_params_)
print(grid94.best_score_)

{'estimator__C': 0.25}
0.7625


### Use parameter with best score


In [20]:
# feature selection for 94 features for best C score(=0.25)
estimator = svm.SVC(kernel ='linear' , C = 0.25)
selector_RFE94 = RFE(estimator, 94)
selector_RFE94 = selector_RFE94.fit(x_train,y_train.values.ravel())
selector_RFE94.get_support(indices = True)


array([  41,  190,  192,  194,  226,  262,  463,  486,  672,  673,  762,
        765,  768,  769,  771,  772,  792,  849,  854,  855,  998,  999,
       1009, 1032, 1035, 1062, 1111, 1114, 1243, 1244, 1245, 1352, 1401,
       1561, 1562, 1569, 1596, 1655, 1657, 1663, 1677, 1773, 1812, 1817,
       1870, 1876, 1900, 1902, 1960, 1962, 1971, 1972, 2019, 2021, 2024,
       2029, 2030, 2032, 2056, 2068, 2070, 2078, 2079, 2125, 2126, 2182,
       2183, 2184, 2185, 2186, 2202, 2206, 2207, 2210, 2213, 2214, 2217,
       2218, 2515, 2528, 2547, 2548, 2549, 2655, 2734, 2748, 2749, 2750,
       2752, 2760, 2770, 2771, 2816, 2826], dtype=int64)

In [21]:
# feature selection for 32 features
selector_RFE94.get_support(indices = True)

array([  41,  190,  192,  194,  226,  262,  463,  486,  672,  673,  762,
        765,  768,  769,  771,  772,  792,  849,  854,  855,  998,  999,
       1009, 1032, 1035, 1062, 1111, 1114, 1243, 1244, 1245, 1352, 1401,
       1561, 1562, 1569, 1596, 1655, 1657, 1663, 1677, 1773, 1812, 1817,
       1870, 1876, 1900, 1902, 1960, 1962, 1971, 1972, 2019, 2021, 2024,
       2029, 2030, 2032, 2056, 2068, 2070, 2078, 2079, 2125, 2126, 2182,
       2183, 2184, 2185, 2186, 2202, 2206, 2207, 2210, 2213, 2214, 2217,
       2218, 2515, 2528, 2547, 2548, 2549, 2655, 2734, 2748, 2749, 2750,
       2752, 2760, 2770, 2771, 2816, 2826], dtype=int64)

In [22]:
x_train_RFE94 = selector_RFE94.transform(x_train)
x_train_RFE94.shape

(80, 94)

# ANOVA feature selection

 This methode does feature selection using univariate statistics. See http://scikit-learn.org/stable/modules/feature_selection.html for more info.
In this case I used f_classif, which makes the selector use ANOVA to select the best K features of the model.

In the paper no parameter selection was done for ANOVA

## 8 features

In [23]:
# feature selection for 8 features ANOVA
selector_ANOVA8= SelectKBest(f_classif, k=8).fit(x_train,y_train.values.ravel())
x_train_ANOVA8 = selector_ANOVA8.transform(x_train)
x_train_ANOVA8.shape

(80, 8)

In [24]:
selector_ANOVA8.get_support(indices=True)

array([ 673,  849,  852,  853,  854,  855, 2184, 2213], dtype=int64)

## 32 features

In [25]:
# feature selection for 32 features ANOVA
selector_ANOVA32= SelectKBest(f_classif, k=32).fit(x_train,y_train.values.ravel())
x_train_ANOVA32 = selector_ANOVA32.transform(x_train)
x_train_ANOVA32.shape

(80, 32)

In [26]:
selector_ANOVA32.get_support(indices=True)

array([ 672,  673,  848,  849,  851,  852,  853,  854,  855, 1001, 1646,
       1651, 1655, 1656, 1662, 1663, 1664, 1667, 1668, 1677, 1678, 1687,
       1972, 2021, 2032, 2034, 2039, 2040, 2184, 2210, 2213, 2214], dtype=int64)

## 94 features

In [27]:
# feature selection for 94 features ANOVA
selector_ANOVA94= SelectKBest(f_classif, k=94).fit(x_train,y_train.values.ravel())
x_train_ANOVA94 = selector_ANOVA94.transform(x_train)

In [28]:
selector_ANOVA94.get_support(indices=True)

array([ 461,  463,  464,  672,  673,  819,  837,  839,  840,  842,  843,
        844,  845,  846,  847,  848,  849,  850,  851,  852,  853,  854,
        855,  856,  857,  861,  999, 1000, 1001, 1002, 1004, 1642, 1643,
       1645, 1646, 1647, 1651, 1652, 1653, 1654, 1655, 1656, 1657, 1659,
       1660, 1662, 1663, 1664, 1665, 1667, 1668, 1669, 1670, 1671, 1672,
       1673, 1674, 1676, 1677, 1678, 1679, 1687, 1971, 1972, 1973, 2019,
       2020, 2021, 2022, 2023, 2024, 2026, 2027, 2028, 2029, 2032, 2033,
       2034, 2035, 2036, 2038, 2039, 2040, 2184, 2206, 2207, 2210, 2213,
       2214, 2223, 2748, 2749, 2750, 2751], dtype=int64)

# Save selected features for later

In [43]:
# save RFE for all features (8,32 and 94)


# Classification methodes

## KNN (K-Nearest-Neighbor)

First we want to do hyperparameter optimization. in the paper they optimize for: k_neighbors, Distance function and weight function

In [29]:
knn = KNeighborsClassifier()
# define the parameter range that should be searched
k_range = list(range(1, 31))
dist_func = [1,2]# 1 = manhattan, 2= Euclidean
dist_weight = ['uniform', 'distance']

# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_neighbors=k_range, p=dist_func,weights=dist_weight)

# instantiate the grid
grid_search_knn= GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')

### Fit parameters for all features RFE

In [32]:
# 8 features
grid_RFE8_knn = grid_search_knn.fit(x_train_RFE8, y_train.values.ravel())
print('Best score for 8 features:',grid_search_knn.best_score_)
print('Best parameters for 8 features:',grid_search_knn.best_params_)

# 32 features
grid_RFE32_knn = grid_search_knn.fit(x_train_RFE32, y_train.values.ravel())
print('Best score for 32 features:',grid_search_knn.best_score_)
print('Best parameters for 32 features:',grid_search_knn.best_params_)

# 94 features
grid_RFE94_knn = grid_search_knn.fit(x_train_RFE94, y_train.values.ravel())
print('Best score for 94 features:',grid_search_knn.best_score_)
print('Best parameters for 94 features:',grid_search_knn.best_params_)


Best score for 8 features: 0.9375
Best parameters for 8 features: {'n_neighbors': 25, 'p': 2, 'weights': 'distance'}
Best score for 32 features: 0.9625
Best parameters for 32 features: {'n_neighbors': 11, 'p': 2, 'weights': 'distance'}
Best score for 94 features: 0.875
Best parameters for 94 features: {'n_neighbors': 10, 'p': 2, 'weights': 'distance'}


For 8 features the best outcome was: k = 25 | p = euclidean distance | weights = distance

For 32 features the best outcome was: k = 11 | p = euclidean distance | weights = distance

For 94 features the best outcome was: k = 10 | p = euclidean distance | weights = distance


### Fit parameters for all features ANOVA

In [44]:
# 8 features
grid_ANOVA8_knn = grid_search_knn.fit(x_train_ANOVA8, y_train.values.ravel())
print('Best score for 8 features:',grid_search_knn.best_score_)
print('Best parameters for 8 features:',grid_search_knn.best_params_)

# 32 features
grid_ANOVA32_knn = grid_search_knn.fit(x_train_ANOVA32, y_train.values.ravel())
print('Best score for 32 features:',grid_search_knn.best_score_)
print('Best parameters for 32 features:',grid_search_knn.best_params_)

# 94 features
grid_ANOVA94_knn = grid_search_knn.fit(x_train_ANOVA94, y_train.values.ravel())
print('Best score for 94 features:',grid_search_knn.best_score_)
print('Best parameters for 94 features:',grid_search_knn.best_params_)

Best score for 8 features: 0.8375
Best parameters for 8 features: {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
Best score for 32 features: 0.8625
Best parameters for 32 features: {'n_neighbors': 6, 'p': 2, 'weights': 'uniform'}
Best score for 94 features: 0.8375
Best parameters for 94 features: {'n_neighbors': 8, 'p': 2, 'weights': 'distance'}


For 8 features the best outcome was: k = 15 | p = euclidean distance | weights = distance

For 32 features the best outcome was: k = 6 | p = euclidean distance | weights = uniform

For 94 features the best outcome was: k = 8 | p = euclidean distance | weights = distance

## Make KNN models

### RFE models

In [45]:
#8 features 
knn_model_RFE8 = KNeighborsClassifier(n_neighbors = 25, p = 2, weights = 'distance')
knn_model_RFE8.fit(x_train_RFE8,y_train.values.ravel())

#32 features 
knn_model_RFE32 = KNeighborsClassifier(n_neighbors = 11, p = 2, weights = 'distance')
knn_model_RFE32.fit(x_train_RFE32,y_train.values.ravel())

#94 features 
knn_model_RFE94 = KNeighborsClassifier(n_neighbors = 10, p = 2, weights = 'distance')
knn_model_RFE94.fit(x_train_RFE94,y_train.values.ravel())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='distance')

### ANOVA models

In [46]:
#8 features 
knn_model_ANOVA8 = KNeighborsClassifier(n_neighbors = 15, p = 2, weights = 'distance')
knn_model_ANOVA8.fit(x_train_ANOVA8,y_train.values.ravel())

#32 features 
knn_model_ANOVA32 = KNeighborsClassifier(n_neighbors = 6, p = 2, weights = 'uniform')
knn_model_ANOVA32.fit(x_train_ANOVA32,y_train.values.ravel())

#94 features 
knn_model_ANOVA94 = KNeighborsClassifier(n_neighbors = 8, p = 2, weights = 'distance')
knn_model_ANOVA94.fit(x_train_ANOVA94,y_train.values.ravel())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=8, p=2,
           weights='distance')

# Predict y_test

### Import test set

In [None]:
x_test = pd.read_csv('../data/test_features.csv')
y_test = pd.read_csv('../data/test_labels.csv')

In [None]:
# rename rows of x_train
x_test = x_test.rename(columns= {'Unnamed: 0':'arrays'})

array = 0 
for i in x_test.arrays:
    x_test = x_test.rename(index = {array : i})
    array +=1
x_test = x_test.drop('arrays', axis =1)

# rename rows of y_train
y_test = y_test.rename(columns= {'Unnamed: 0':'arrays'})

array = 0 
for i in y_test.arrays:
    y_test = y_test.rename(index = {array : i})
    array +=1
y_test = y_test.drop('arrays', axis =1)

### Take selected features per selection methode from test data

### RFE 

In [None]:
# x_test for 8 features RFE
x_test_RFE8 = x_test[x_test.columns[selector_RFE8.get_support(indices = True)]]
# x_test for 32 features
x_test_RFE32 = x_test[x_test.columns[selector_RFE32.get_support(indices = True)]]
# x_test for 94 features
x_test_RFE94 = x_test[x_test.columns[selector_RFE94.get_support(indices = True)]]

print(x_test_RFE8.shape)
print(x_test_RFE32.shape)
print(x_test_RFE94.shape)


### ANOVA

In [None]:
# x_test for 8 features ANOVA
x_test_ANOVA8 = x_test[x_test.columns[selector_ANOVA8.get_support(indices = True)]]
# x_test for 32 features
x_test_ANOVA32 = x_test[x_test.columns[selector_ANOVA32.get_support(indices = True)]]
# x_test for 94 features
x_test_ANOVA94 = x_test[x_test.columns[selector_ANOVA94.get_support(indices = True)]]

print(x_test_RFE8.shape)
print(x_test_RFE32.shape)
print(x_test_RFE94.shape)

### Prediction RFE KNN (STILL HAVE TO DO THIS FOR TEST DATA!!!)

In [47]:
# 8 features
y_predict_RFE8_knn = knn_model_RFE8.predict(x_train_RFE8) # will do for test data
print('accuracy for 8 feature RFE: ',metrics.accuracy_score(y_train, y_predict_RFE8_knn))
# 32 features
y_predict_RFE32_knn = knn_model_RFE32.predict(x_train_RFE32) # will do for test data
print('accuracy for 32 features RFE: ',metrics.accuracy_score(y_train, y_predict_RFE32_knn))
# 94 features
y_predict_RFE94_knn = knn_model_RFE94.predict(x_train_RFE94) # will do for test data
print('accuracy for 94 features RFE: ',metrics.accuracy_score(y_train, y_predict_RFE94_knn))


accuracy for 8 feature RFE:  1.0
accuracy for 32 features RFE:  1.0
accuracy for 94 features RFE:  1.0


### Prediction ANOVA KNN (STILL HAVE TO DO THIS FOR TEST DATA!!!)

In [48]:
# 8 features
y_predict_ANOVA8_knn = knn_model_ANOVA8.predict(x_train_ANOVA8) # will do for test data
print('accuracy for 8 feature ANOVA: ',metrics.accuracy_score(y_train, y_predict_ANOVA8_knn))
# 32 features
y_predict_ANOVA32_knn = knn_model_ANOVA32.predict(x_train_ANOVA32) # will do for test data
print('accuracy for 32 features ANOVA: ',metrics.accuracy_score(y_train, y_predict_ANOVA32_knn))
# 94 features
y_predict_ANOVA94_knn = knn_model_ANOVA94.predict(x_train_ANOVA94) # will do for test data
print('accuracy for 94 features ANOVA: ',metrics.accuracy_score(y_train, y_predict_ANOVA94_knn))

accuracy for 8 feature ANOVA:  0.8875
accuracy for 32 features ANOVA:  0.9
accuracy for 94 features ANOVA:  1.0


# Support Vector Machine (SVM)

Hyperparameters from paper:

Kernel Function to use as a decision function RBF and Linear

RBF Gamma Kernel coefficient for RBF 0.03125, 0.125, 0.5, 2, 8, 32, 128, and 512

RBF C Penalty Parameter of the error term 0.03125, 0.125, 0.5, 2, 8, 32, 128, and 512

Linear C Penalty Parameter of the error term 0.03125, 0.125, 0.5, 2, 8, 32, 128, and 512

## hyperparameter optimization SVM

In [71]:
svm_hyper = svm.SVC()
kernel_type = ['linear','rbf']
c_range = [0.03125, 0.125, 0.5, 2, 8, 32, 128, 512]
g_range = [0.03125, 0.125, 0.5, 2, 8, 32, 128, 512]
params_grid_SVM = dict(C=c_range, gamma = g_range, kernel = kernel_type)

grid_search_svm= GridSearchCV(svm_hyper, params_grid_SVM, cv=10, scoring='accuracy')

### fit SVM for RFE features

In [72]:
# 8 features
grid_RFE8_svm = grid_search_svm.fit(x_train_RFE8, y_train.values.ravel())
print('Best score for 8 features:',grid_search_svm.best_score_)
print('Best parameters for 8 features:',grid_search_svm.best_params_)

# 32 features
grid_RFE32_svm = grid_search_svm.fit(x_train_RFE32, y_train.values.ravel())
print('Best score for 32 features:',grid_search_svm.best_score_)
print('Best parameters for 32 features:',grid_search_svm.best_params_)

# 94 features
grid_RFE94_svm = grid_search_svm.fit(x_train_RFE94, y_train.values.ravel())
print('Best score for 94 features:',grid_search_svm.best_score_)
print('Best parameters for 94 features:',grid_search_svm.best_params_)


Best score for 8 features: 1.0
Best parameters for 8 features: {'C': 8, 'gamma': 0.03125, 'kernel': 'linear'}
Best score for 32 features: 1.0
Best parameters for 32 features: {'C': 0.03125, 'gamma': 0.03125, 'kernel': 'linear'}
Best score for 94 features: 1.0
Best parameters for 94 features: {'C': 0.03125, 'gamma': 0.03125, 'kernel': 'linear'}


Best parameters for 8 features: {'C': 8, 'gamma': 0.03125, 'kernel': 'linear'}

Best parameters for 32 features: {'C': 0.03125, 'gamma': 0.03125, 'kernel': 'linear'}

Best parameters for 94 features: {'C': 0.03125, 'gamma': 0.03125, 'kernel': 'linear'}

### fit SVM for ANOVA features

In [73]:
# 8 features
grid_ANOVA8_svm = grid_search_svm.fit(x_train_ANOVA8, y_train.values.ravel())
print('Best score for 8 features:',grid_search_svm.best_score_)
print('Best parameters for 8 features:',grid_search_svm.best_params_)

# 32 features
grid_ANOVA32_svm = grid_search_svm.fit(x_train_ANOVA32, y_train.values.ravel())
print('Best score for 32 features:',grid_search_svm.best_score_)
print('Best parameters for 32 features:',grid_search_svm.best_params_)

# 94 features
grid_ANOVA94_svm = grid_search_svm.fit(x_train_ANOVA94, y_train.values.ravel())
print('Best score for 94 features:',grid_search_svm.best_score_)
print('Best parameters for 94 features:',grid_search_svm.best_params_)

Best score for 8 features: 0.85
Best parameters for 8 features: {'C': 8, 'gamma': 0.5, 'kernel': 'rbf'}
Best score for 32 features: 0.9125
Best parameters for 32 features: {'C': 0.03125, 'gamma': 0.03125, 'kernel': 'linear'}
Best score for 94 features: 0.9
Best parameters for 94 features: {'C': 0.125, 'gamma': 0.03125, 'kernel': 'linear'}


Best parameters for 8 features: {'C': 8, 'gamma': 0.5, 'kernel': 'rbf'}

Best parameters for 32 features: {'C': 0.03125, 'gamma': 0.03125, 'kernel': 'linear'}

Best parameters for 94 features: {'C': 0.125, 'gamma': 0.03125, 'kernel': 'linear'}

## Make SVM models

### RFE Models SVM

In [74]:
#8 features 
svm_model_RFE8 = svm.SVC()
svm_model_RFE8.fit(x_train_RFE8,y_train.values.ravel())

#32 features 
svm_model_RFE32 = svm.SVC()
svm_model_RFE32.fit(x_train_RFE32,y_train.values.ravel())

#94 features 
svm_model_RFE94 = svm.SVC()
svm_model_RFE94.fit(x_train_RFE94,y_train.values.ravel())

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

### ANOVA Models SVM

In [75]:
#8 features 
svm_model_ANOVA8 = svm.SVC()
svm_model_ANOVA8.fit(x_train_ANOVA8,y_train.values.ravel())

#32 features 
svm_model_ANOVA32 = svm.SVC()
svm_model_ANOVA32.fit(x_train_ANOVA32,y_train.values.ravel())

#94 features 
svm_model_ANOVA94 = svm.SVC()
svm_model_ANOVA94.fit(x_train_ANOVA94,y_train.values.ravel())

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

### Predict y_test RFE SVM (NOT USING TEST SET YET!)

In [76]:
# 8 features
y_predict_RFE8_svm = svm_model_RFE8.predict(x_train_RFE8) # will do for test data
print('accuracy for 8 feature RFE: ',metrics.accuracy_score(y_train, y_predict_RFE8_svm))
# 32 features
y_predict_RFE32_svm = svm_model_RFE32.predict(x_train_RFE32) # will do for test data
print('accuracy for 32 features RFE: ',metrics.accuracy_score(y_train, y_predict_RFE32_svm))
# 94 features
y_predict_RFE94_svm = svm_model_RFE94.predict(x_train_RFE94) # will do for test data
print('accuracy for 94 features RFE: ',metrics.accuracy_score(y_train, y_predict_RFE94_svm))


accuracy for 8 feature RFE:  1.0
accuracy for 32 features RFE:  1.0
accuracy for 94 features RFE:  1.0


### Predict y_test ANOVA SVM (NOT USING TEST SET YET!)

In [77]:
# 8 features
y_predict_ANOVA8_svm = svm_model_ANOVA8.predict(x_train_ANOVA8) # will do for test data
print('accuracy for 8 feature ANOVA: ',metrics.accuracy_score(y_train, y_predict_ANOVA8_svm))
# 32 features
y_predict_ANOVA32_svm = svm_model_ANOVA32.predict(x_train_ANOVA32) # will do for test data
print('accuracy for 32 features ANOVA: ',metrics.accuracy_score(y_train, y_predict_ANOVA32_svm))
# 94 features
y_predict_ANOVA94_svm = svm_model_ANOVA94.predict(x_train_ANOVA94) # will do for test data
print('accuracy for 94 features ANOVA: ',metrics.accuracy_score(y_train, y_predict_ANOVA94_svm))

accuracy for 8 feature ANOVA:  0.8375
accuracy for 32 features ANOVA:  0.925
accuracy for 94 features ANOVA:  0.9125


# Logistic Regression (LR)

Logistic Regression 

Penalty Norm used in the penalization l2

C Inverse of regularization strength; smaller values specify
stronger regularization
0.25, 1, 4, 16, 64, and 256

## Optimize for parameters LR

In [None]:
from sklearn.linear_model import LogisticRegression

In [54]:
lr = LogisticRegression(penalty = 'l2')
c_range = [0.25, 1, 4, 16, 64, 256]
params_grid_lr = dict(C=c_range)

grid_search_lr= GridSearchCV(lr, params_grid_lr, cv=10, scoring='accuracy')

### fit LR for RFE features

In [55]:
# 8 features
grid_RFE8_lr = grid_search_lr.fit(x_train_RFE8, y_train.values.ravel())
print('Best score for 8 features:',grid_search_lr.best_score_)
print('Best parameters for 8 features:',grid_search_lr.best_params_)

# 32 features
grid_RFE32_lr = grid_search_lr.fit(x_train_RFE32, y_train.values.ravel())
print('Best score for 32 features:',grid_search_lr.best_score_)
print('Best parameters for 32 features:',grid_search_lr.best_params_)

# 94 features
grid_RFE94_lr = grid_search_lr.fit(x_train_RFE94, y_train.values.ravel())
print('Best score for 94 features:',grid_search_lr.best_score_)
print('Best parameters for 94 features:',grid_search_lr.best_params_)


Best score for 8 features: 0.975
Best parameters for 8 features: {'C': 1}
Best score for 32 features: 0.9875
Best parameters for 32 features: {'C': 0.25}
Best score for 94 features: 1.0
Best parameters for 94 features: {'C': 0.25}


### fit LR for ANOVA features

In [56]:
# 8 features
grid_ANOVA8_lr = grid_search_lr.fit(x_train_ANOVA8, y_train.values.ravel())
print('Best score for 8 features:',grid_search_lr.best_score_)
print('Best parameters for 8 features:',grid_search_lr.best_params_)

# 32 features
grid_ANOVA32_lr = grid_search_lr.fit(x_train_ANOVA32, y_train.values.ravel())
print('Best score for 32 features:',grid_search_lr.best_score_)
print('Best parameters for 32 features:',grid_search_lr.best_params_)

# 94 features
grid_ANOVA94_lr = grid_search_lr.fit(x_train_ANOVA94, y_train.values.ravel())
print('Best score for 94 features:',grid_search_lr.best_score_)
print('Best parameters for 94 features:',grid_search_lr.best_params_)

Best score for 8 features: 0.8
Best parameters for 8 features: {'C': 1}
Best score for 32 features: 0.9125
Best parameters for 32 features: {'C': 0.25}
Best score for 94 features: 0.9375
Best parameters for 94 features: {'C': 0.25}


# Make LR models

### RFE Models LR

In [62]:
#8 features 
lr_model_RFE8 = LogisticRegression(C = 1)
lr_model_RFE8.fit(x_train_RFE8,y_train.values.ravel())

#32 features 
lr_model_RFE32 = LogisticRegression(C = 0.25)
lr_model_RFE32.fit(x_train_RFE32,y_train.values.ravel())

#94 features 
lr_model_RFE94 = LogisticRegression(C = 0.25)
lr_model_RFE94.fit(x_train_RFE94,y_train.values.ravel())

LogisticRegression(C=0.25, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### ANOVA Models LR

In [63]:
#8 features 
lr_model_ANOVA8 = LogisticRegression(C = 1)
lr_model_ANOVA8.fit(x_train_ANOVA8,y_train.values.ravel())

#32 features 
lr_model_ANOVA32 = LogisticRegression(C = 0.25)
lr_model_ANOVA32.fit(x_train_ANOVA32,y_train.values.ravel())

#94 features 
lr_model_ANOVA94 = LogisticRegression(C = 0.25)
lr_model_ANOVA94.fit(x_train_ANOVA94,y_train.values.ravel())

LogisticRegression(C=0.25, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Predict y_test RFE LR (NOT USING TEST SET YET!)

In [64]:
# 8 features
y_predict_RFE8_lr = lr_model_RFE8.predict(x_train_RFE8) # will do for test data
print('accuracy for 8 feature RFE: ',metrics.accuracy_score(y_train, y_predict_RFE8_lr))
# 32 features
y_predict_RFE32_lr = lr_model_RFE32.predict(x_train_RFE32) # will do for test data
print('accuracy for 32 features RFE: ',metrics.accuracy_score(y_train, y_predict_RFE32_lr))
# 94 features
y_predict_RFE94_lr = lr_model_RFE94.predict(x_train_RFE94) # will do for test data
print('accuracy for 94 features RFE: ',metrics.accuracy_score(y_train, y_predict_RFE94_lr))


accuracy for 8 feature RFE:  1.0
accuracy for 32 features RFE:  1.0
accuracy for 94 features RFE:  1.0


### Predict y_test ANOVA LR (NOT USING TEST SET YET!)

In [65]:
# 8 features
y_predict_ANOVA8_lr = lr_model_ANOVA8.predict(x_train_ANOVA8) # will do for test data
print('accuracy for 8 feature ANOVA: ',metrics.accuracy_score(y_train, y_predict_ANOVA8_lr))
# 32 features
y_predict_ANOVA32_lr = lr_model_ANOVA32.predict(x_train_ANOVA32) # will do for test data
print('accuracy for 32 features ANOVA: ',metrics.accuracy_score(y_train, y_predict_ANOVA32_lr))
# 94 features
y_predict_ANOVA94_lr = lr_model_ANOVA94.predict(x_train_ANOVA94) # will do for test data
print('accuracy for 94 features ANOVA: ',metrics.accuracy_score(y_train, y_predict_ANOVA94_lr))


accuracy for 8 feature ANOVA:  0.825
accuracy for 32 features ANOVA:  0.9375
accuracy for 94 features ANOVA:  0.9625
