In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pandas import plotting

#plotly 
import plotly.offline as py
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
from plotly import tools
init_notebook_mode(connected=True)
import plotly.figure_factory as ff
import plotly.express as px

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report,precision_score
from sklearn.model_selection import train_test_split

import statsmodels.api as sm
import statsmodels.formula.api as smf


plt.style.use('fivethirtyeight')

In [None]:
df=pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')
df.head()

In [None]:
df.columns

In [None]:
col=['radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst']

In [None]:
df.columns

In [None]:
X_train=df[col]
X_train.shape

# Using variance threshold from sklearn
Variance threshold from sklearn is a simple baseline approach to feature selection. It removes all features whose variance doesn't meet some threshold. By default, it removes all zero-variance features, i.e., features that have the same value in all samples.

In [None]:
# using sklearn variancethreshold to find constant features

from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=0)
sel.fit(X_train) 

In [None]:
# get_support is a boolean vector that indicates which features are retained
# if we sum over get_support, we get the number of features that are not constant
sum(sel.get_support())

In [None]:
# alternate way of finding non-constant features
len(X_train.columns[sel.get_support()])

In [None]:
# Features rejected by the selector are exactly those where the support mask is False,
# so invert the mask once and reuse the result for both the count and the listing.
dropped_features = list(X_train.columns[~sel.get_support()])
print(len(dropped_features))

dropped_features

* We can see that there are 0 columns / variables that are constant. This means that no variable shows the same single value for all the observations of the training set.

In [None]:
# we can then drop these columns from the train and test sets
X_train = sel.transform(X_train)

In [None]:
# check the shape of training and test set

X_train.shape

* Removing constant features can reduce the feature space quite a bit when a dataset contains them.
* As we don't have any constant features here, the shape of the training set is unchanged.

# Remove quasi-constant features
* Quasi-constant features are those that show the same value for the great majority of the observations of the dataset. In general, these features provide little if any information that allows a machine learning model to discriminate or predict a target. 
* So we should be careful when removing these type of features. Identifying and removing quasi-constant features, is an easy first step towards feature selection and more easily interpretable machine learning models.


In [None]:
df=pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')
df.head()

In [None]:
X_train=df[col]
X_train.shape

In [None]:
sel = VarianceThreshold(threshold=0.01)  # variance cutoff of 0.01; for a binary (0/1) feature, p*(1-p) = 0.01 corresponds to ~99% of observations sharing one value
# NOTE(review): the threshold is applied to the raw, unscaled columns, so a continuous
# feature with a small numeric range can fall below it without being quasi-constant.

sel.fit(X_train)

In [None]:
# get_support is a boolean vector that indicates which features 
# are retained. If we sum over get_support, we get the number
# of features that are not quasi-constant
sum(sel.get_support())

In [None]:
# alternative way of doing the above operation:
len(X_train.columns[sel.get_support()])

In [None]:
# The quasi-constant features are those the selector rejected (support mask is False);
# build that list once and use it for both the count and the listing.
quasi_constant_features = list(X_train.columns[~sel.get_support()])
print(len(quasi_constant_features))

quasi_constant_features

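* We can see that 16 columns / variables are flagged as quasi-constant, i.e. their variance is below the 0.01 threshold. For a binary variable this would mean that one value is shown by ~99% of the observations of the training set; for the continuous, unscaled measurements used here it mainly reflects a small numeric range.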

In [None]:
# Share of observations taking each distinct value of the feature.
# `np.float` was removed in NumPy 1.24; Python 3 `/` is already true division,
# so dividing by the row count directly gives the same fractions.
X_train['smoothness_mean'].value_counts() / len(X_train)

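* For a truly quasi-constant feature we would see that > 99% of the observations show one value (for example 0). `smoothness_mean` is a continuous measurement, so if no single value dominates in the frequencies above, the feature was flagged only because its variance is small on its original scale.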

In [None]:
# we can then remove the features from training and test set
X_train = sel.transform(X_train)

In [None]:
# check the shape of training and test set
X_train.shape

# Univariate selection methods
* Univariate feature selection methods work by selecting the best features based on univariate statistical tests like ANOVA. This can be seen as a preprocessing step to an estimator.
* Scikit-learn exposes feature selection routines as objects that implement the transform method.
* The methods based on F-test estimate the degree of linear dependency between two random variables
* They assume a linear relationship between the feature and the target. These methods also assume that the variables follow a Gaussian distribution.
* There are 4 methods that fall under this category :-

        1 SelectKBest
        2 SelectPercentile
        3 SelectFpr, SelectFdr, or family wise error SelectFwe
        4 GenericUnivariateSelection
* Here, I will limit the discussion to SelectKBest and SelectPercentile, because these two are most commonly used in practice.

#  SelectKBest
* This method selects features according to the k highest scores.

* For instance, we can perform a chi-square test on the samples to retrieve only the two best features from the breast cancer dataset as follows:

In [None]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2


In [None]:
df=pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')
df.shape

In [None]:
X=df[col]
y=df['diagnosis']

In [None]:
X.shape

In [None]:
# select the two best features
X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
X_new.shape

* Thus, we have selected the two best features from the breast cancer dataset.

In [None]:
# select the ten best features according to the chi-square score
X_new1 = SelectKBest(chi2, k=10).fit_transform(X, y)
X_new1.shape

* Thus, we have selected the 10 best features from the breast cancer dataset.

# SelectPercentile
* Select features according to a percentile of the highest scores.

In [None]:
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectPercentile, chi2

In [None]:
X=df[col]
y=df['diagnosis']

In [None]:
X_new = SelectPercentile(chi2, percentile=10).fit_transform(X, y)
X_new.shape

* We can see that only 3 of the 30 features lie in the top 10 percentile of chi-square scores, and hence we select them accordingly.

# ANOVA F-value For Feature Selection
* Compute the ANOVA F-value for the provided sample.

* If the features are categorical, we will calculate a chi-square statistic between each feature and the target vector. However, if the features are quantitative, we will compute the ANOVA F-value between each feature and the target vector.

* The F-value scores examine if, when we group the numerical feature by the target vector, the means for each group are significantly different.

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [None]:
X=df[col]
y=df['diagnosis']

In [None]:
X.values

In [None]:
y.values

In [None]:
# Select Features With Best ANOVA F-Values

# Create an SelectKBest object to select features with two best ANOVA F-Values
fvalue_selector = SelectKBest(f_classif, k=2)

# Apply the SelectKBest object to the features and target
X_kbest = fvalue_selector.fit_transform(X, y)

In [None]:
print('Original number of features:', X.shape[1])
print('Reduced number of features:', X_kbest.shape[1])

In [None]:
# Select Features With Best ANOVA F-Values

# Create an SelectKBest object to select features with 10 best ANOVA F-Values
fvalue_selector = SelectKBest(f_classif, k=10)

# Apply the SelectKBest object to the features and target
X_kbest = fvalue_selector.fit_transform(X, y)

In [None]:
print('Original number of features:', X.shape[1])
print('Reduced number of features:', X_kbest.shape[1])

# Forward Selection Method

In [None]:
df=pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')
df.head()

In [None]:
# step forward feature selection

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [None]:
diagnosis={'M':1, 'B':0}
df['diagnosis']=[diagnosis[x] for x in df['diagnosis']]

In [None]:
df=df.drop('Unnamed: 32', axis=1)


In [None]:
df.columns

In [None]:
# In practice, feature selection should be done after data pre-processing,
# so ideally, all the categorical variables are encoded into numbers,
# and then you can assess how deterministic they are of the target

# here for simplicity I will use only numerical variables
# select numerical columns:

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_vars = list(df.select_dtypes(include=numerics).columns)
data = df[numerical_vars]
data.shape

In [None]:
data.columns

In [None]:
X=data[col]
y=data['diagnosis']

In [None]:
# separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=0)

X_train.shape, X_test.shape

In [None]:
# find and remove correlated features
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose pairwise column correlations are computed with ``dataset.corr()``.
    threshold : float
        A column is flagged when the absolute correlation with any column that
        precedes it is strictly greater than this value.

    Returns
    -------
    set
        Names of the flagged columns. For each correlated pair only the later
        column is reported, so the earlier one can be kept.
    """
    corr_matrix = dataset.corr()
    # Strict lower triangle: entry (i, j) with j < i pairs column i with an earlier column j.
    exceeds = np.tril(corr_matrix.abs().values > threshold, k=-1)
    # A row with at least one hit means that column duplicates information from a predecessor.
    flagged = exceeds.any(axis=1)
    return set(corr_matrix.columns[flagged])

corr_features = correlation(X_train, 0.8)
print('correlated features: ', len(set(corr_features)) )

In [None]:
# Remove the correlated features from both splits.
# Reassign instead of dropping in place: the frames returned by train_test_split are
# derived from X, and in-place edits on them can raise SettingWithCopyWarning.
# errors="ignore" keeps the cell safe to re-run after the columns are already gone.
correlated_columns = list(corr_features)
X_train = X_train.drop(columns=correlated_columns, errors="ignore")
X_test = X_test.drop(columns=correlated_columns, errors="ignore")

X_train.shape, X_test.shape

In [None]:
# step forward feature selection

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# `diagnosis` is a binary class label (1 = malignant, 0 = benign), so the wrapped model
# is a classifier scored with ROC-AUC rather than a regressor scored with R^2.
# random_state is fixed so the selected feature subset is reproducible across runs.
sfs1 = SFS(RandomForestClassifier(random_state=0),
           k_features=10,   # stop once 10 features have been added
           forward=True,    # start from the empty set and add one feature per step
           floating=False,
           verbose=2,
           scoring='roc_auc',
           cv=3)

# Fit on a plain array: the selector reports positional indices (k_feature_idx_),
# which are mapped back to column names afterwards.
sfs1 = sfs1.fit(np.array(X_train), y_train)

In [None]:
sfs1.k_feature_idx_

In [None]:
X_train.columns[list(sfs1.k_feature_idx_)]

* We can see that forward feature selection results in the above columns being selected from all the given columns.
* 10 features are selected

# Backward Elimination 
* In backward elimination, we start with all the features and, at each iteration, remove the least significant feature, i.e. the one whose removal hurts the performance of the model the least. We repeat this until the desired number of features remains or no further improvement is observed on removal of features.

* The procedure starts with the full set of attributes. At each step, it removes the worst attribute remaining in the set.



In [None]:
# step backward feature elimination

# `diagnosis` is a binary class label (1 = malignant, 0 = benign), so the wrapped model
# is a classifier scored with ROC-AUC rather than a regressor scored with R^2.
# random_state is fixed so the selected feature subset is reproducible across runs.
sfs1 = SFS(RandomForestClassifier(random_state=0),
           k_features=10,   # stop once only 10 features remain
           forward=False,   # start from all features and remove one per step
           floating=False,
           verbose=2,
           scoring='roc_auc',
           cv=3)

# Fit on a plain array: the selector reports positional indices (k_feature_idx_),
# which are mapped back to column names afterwards.
sfs1 = sfs1.fit(np.array(X_train), y_train)

In [None]:
sfs1.k_feature_idx_

In [None]:
X_train.columns[list(sfs1.k_feature_idx_)]

In [None]:
# step backward feature elimination

# `diagnosis` is a binary class label (1 = malignant, 0 = benign), so the wrapped model
# is a classifier scored with ROC-AUC rather than a regressor scored with R^2.
# random_state is fixed so the selected feature subset is reproducible across runs.
sfs1 = SFS(RandomForestClassifier(random_state=0),
           k_features=12,   # stop once only 12 features remain
           forward=False,   # start from all features and remove one per step
           floating=False,
           verbose=2,
           scoring='roc_auc',
           cv=3)

# Fit on a plain array: the selector reports positional indices (k_feature_idx_),
# which are mapped back to column names afterwards.
sfs1 = sfs1.fit(np.array(X_train), y_train)

In [None]:
sfs1.k_feature_idx_

In [None]:
X_train.columns[list(sfs1.k_feature_idx_)]

* So, backward feature elimination results in the following columns being selected.