<a href="https://colab.research.google.com/github/Satwikram/Feature-Selection/blob/main/Feature%20Selection%20Methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Author: Satwik Ram K

### Imports

In [69]:
import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, LogisticRegression

from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score

from sklearn.metrics import *

### Loading Dataset

In [6]:
df = pd.read_csv("/content/dataset.csv")

In [7]:
df.head()

Unnamed: 0,alpha_ec_0,alpha_ec_1,alpha_ec_2,alpha_ec_3,alpha_ec_4,alpha_ec_5,alpha_ec_6,alpha_ec_7,alpha_ec_8,alpha_ec_9,...,ratio_theta_39,ratio_theta_40,ratio_theta_41,ratio_theta_42,ratio_theta_43,ratio_theta_44,ratio_theta_45,ratio_theta_46,ratio_theta_47,labels
0,0.112,0.124,0.129,0.113,0.106,0.107,0.108,0.111,0.109,0.159,...,0.739,0.587,0.552,0.548,0.579,0.593,0.604,0.563,1.01,0.0
1,0.298,0.269,0.262,0.38,0.344,0.285,0.248,0.25,0.26,0.26,...,0.328,0.432,0.496,0.407,0.313,0.664,0.584,0.467,0.749,0.0
2,0.369,0.396,0.39,0.258,0.406,0.442,0.44,0.399,0.315,0.283,...,0.865,1.09,0.318,0.364,0.506,0.335,0.494,0.624,0.614,0.0
3,0.761,0.777,0.77,0.602,0.702,0.727,0.762,0.789,0.73,0.522,...,0.251,0.432,0.713,0.533,0.285,0.681,0.663,0.253,0.854,0.0
4,0.168,0.151,0.147,0.157,0.15,0.142,0.132,0.128,0.144,0.173,...,1.73,1.24,2.76,3.01,2.44,2.53,2.58,2.01,2.29,0.0


### Taking X and Y

In [13]:
X = df.drop("labels", axis=1)
y = df["labels"]

### Splitting Dataset

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### Pre-Processing

In [15]:
sc = StandardScaler()

In [16]:
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [17]:
X_train

array([[ 1.34704064,  1.09923367,  0.93896227, ..., -0.90726301,
        -0.92109706, -0.87442269],
       [-1.37733747, -1.35593698, -1.3533501 , ..., -0.55781888,
        -0.54854731, -0.1049701 ],
       [ 0.64608572,  0.64334125,  0.87596003, ...,  0.42932678,
         0.22442007,  1.15334524],
       ...,
       [ 0.2862622 ,  0.13996003, -0.02545668, ..., -0.01937579,
        -0.24493394,  1.94883195],
       [-0.96611058, -0.71008938, -0.53432095, ..., -0.19885682,
         0.23908738, -0.97277378],
       [-0.05954223, -0.14497274, -0.08845892, ..., -0.13767011,
        -0.08212677, -0.75293018]])

### LOOCV for Evaluating Machine Learning Algorithms

In [78]:
cv = LeaveOneOut()

In [83]:
def loocv(model, cv, X_train, y_train):
  scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
  print('Accuracy: %.3f (%.3f)' % (scores.mean(), scores.std()))

### Metrics for Performance Evaluation

In [84]:
def report(model, X_test, y_test):
  
  y_pred = model.predict(X_test)
  print(classification_report(y_test, y_pred))

### KNN without Feature Selection

In [85]:
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, y_train)
report(knn, X_test, y_test)

              precision    recall  f1-score   support

         0.0       0.83      1.00      0.91        20
         1.0       1.00      0.75      0.86        16

    accuracy                           0.89        36
   macro avg       0.92      0.88      0.88        36
weighted avg       0.91      0.89      0.89        36



In [86]:
loocv(knn, cv, X_train, y_train)

Accuracy: 0.854 (0.353)


In [27]:
svm = SVC()
svm.fit(X_train, y_train)

SVC()

### Metrics

In [29]:
def report(model, X_true, y_true):
  
  y_pred = model.predict(X_test)
  print(classification_report(y_true, y_pred))

In [30]:
report(knn, X_test, y_test)

              precision    recall  f1-score   support

         0.0       0.83      1.00      0.91        20
         1.0       1.00      0.75      0.86        16

    accuracy                           0.89        36
   macro avg       0.92      0.88      0.88        36
weighted avg       0.91      0.89      0.89        36



In [31]:
report(svm, X_test, y_test)

              precision    recall  f1-score   support

         0.0       0.87      1.00      0.93        20
         1.0       1.00      0.81      0.90        16

    accuracy                           0.92        36
   macro avg       0.93      0.91      0.91        36
weighted avg       0.93      0.92      0.92        36



### feature Selection

In [37]:
num_feats = 10

#### Pearson Correlation

In [38]:
def cor_selector(X, y, num_feats):
    
    cor_list = []
    feature_name = X.columns.tolist()
    
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
    
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature


cor_support, cor_feature = cor_selector(X, y, 10)
print(str(len(cor_feature)), 'selected features')

10 selected features


### Selecting features using Lasso regularisation using SelectFromModel

In [43]:
sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l2'))
sel_.fit(X_train, y_train)

SelectFromModel(estimator=LogisticRegression(C=1))

#### Visualising features that were kept by the lasso regularisation

In [44]:
sel_.get_support()

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False,  True, False,  True,  True,
        True, False, False,  True,  True,  True, False,  True, False,
       False, False, False, False,  True, False, False, False,  True,
       False,  True, False,  True, False, False,  True, False,  True,
        True, False, False, False,  True,  True, False,  True,  True,
        True,  True, False, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False,  True, False,
       False, False,  True,  True, False, False, False,  True, False,
        True, False,  True, False,  True, False,  True, False,  True,
        True, False,  True,  True, False, False,  True, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True,  True, False,  True,
        True, False, False, False,  True,  True, False, False,  True,
       False, False,

#### Make a list of with the selected features

In [46]:
selected_feat = X.columns[(sel_.get_support())]
print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
      np.sum(sel_.estimator_.coef_ == 0)))

total features: 432
selected features: 186
features with coefficients shrank to zero: 0


### Identifying the removed features

In [None]:
X.columns, sel_.get_support()

In [63]:
selected = []
for i, v in enumerate(sel_.get_support()):
  if v:
    selected.append(i)

In [66]:
X.columns[selected]

Index(['alpha_ec_5', 'alpha_ec_14', 'alpha_ec_16', 'alpha_ec_17',
       'alpha_ec_18', 'alpha_ec_21', 'alpha_ec_22', 'alpha_ec_23',
       'alpha_ec_25', 'alpha_ec_31',
       ...
       'ratio_theta_28', 'ratio_theta_33', 'ratio_theta_35', 'ratio_theta_36',
       'ratio_theta_38', 'ratio_theta_39', 'ratio_theta_43', 'ratio_theta_45',
       'ratio_theta_46', 'ratio_theta_47'],
      dtype='object', length=186)