# 10.1 Thresholding Numerical Feature Variance

I have a set of numerical features and want to remove those with low variance.

In [6]:
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

# Load the iris dataset and split it into a feature matrix and a target vector
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Selector configured with a variance threshold of 0.5
thresholder = VarianceThreshold(threshold=0.5)

# Learn the per-feature variances, then keep only the high-variance columns
thresholder.fit(features)
features_high_variance = thresholder.transform(features)

# Preview the first three rows of the reduced feature matrix
features_high_variance[:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

In [7]:
# View the per-feature variances learned when the thresholder was fitted;
# reading the attribute directly avoids an unnecessary second fit
thresholder.variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

In [9]:
from sklearn.preprocessing import StandardScaler

# Rescale the feature matrix with a standard scaler
scaler = StandardScaler()
features_std = scaler.fit_transform(features)

# Inspect the variance of each standardized feature: they all come out equal,
# so variance thresholding cannot tell standardized features apart
selector = VarianceThreshold()
selector.fit(features_std)
selector.variances_

array([1., 1., 1., 1.])

# 10.2 Thresholding Binary Feature Variance

You have a set of binary categorical features and want to remove those with low variance.

In [1]:
from sklearn.feature_selection import VarianceThreshold

# Toy binary feature matrix (one row per observation):
#   column 0 -> 80% class 0, 20% class 1
#   column 1 -> 20% class 0, 80% class 1
#   column 2 -> 60% class 0, 40% class 1
features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]

# A binary feature whose class-1 proportion is p has variance p * (1 - p);
# the cut-off used here corresponds to p = 0.75
thresholder = VarianceThreshold(threshold=0.75 * (1 - 0.75))
thresholder.fit_transform(features)



array([[0],
       [1],
       [0],
       [1],
       [0]])

# 10.3 Handling Highly Correlated Features

I have a feature matrix and suspect some features are highly correlated.

In [3]:
import pandas as pd
import numpy as np

# Create feature matrix with two highly correlated features (columns 0 and 1)
features = np.array([[1, 1, 1],
                     [2, 2, 0],
                     [3, 3, 1],
                     [4, 4, 0],
                     [5, 5, 1],
                     [6, 6, 0],
                     [7, 7, 1],
                     [8, 7, 0],
                     [9, 7, 1]])
# Convert feature matrix into DataFrame
dataframe = pd.DataFrame(features)
# Absolute pairwise correlations, so strong negative correlations count too
corr_matrix = dataframe.corr().abs()
# Keep only the strict upper triangle (k=1 excludes the diagonal) so each pair
# of features is considered once; every other entry becomes NaN.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape),
                                  k=1).astype(bool))
# Labels of the columns whose correlation with an earlier column exceeds 0.95
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]
# Drop by column label (not by position) so this also works when the
# DataFrame's columns are not a default 0..n-1 range
dataframe.drop(columns=to_drop).head()

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1
3,4,0
4,5,1


In [4]:
# Full (signed) correlation matrix between every pair of feature columns
dataframe.corr()

Unnamed: 0,0,1,2
0,1.0,0.976103,0.0
1,0.976103,1.0,-0.034503
2,0.0,-0.034503,1.0


In [5]:
# Upper triangle of the absolute correlation matrix; the diagonal and the
# lower triangle are masked out and show as NaN
upper

Unnamed: 0,0,1,2
0,,0.976103,0.0
1,,,0.034503
2,,,


# 10.4 Removing Irrelevant Features for Classification

I have a categorical target vector and want to remove uninformative features.

In [10]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

# Load the iris data as a feature matrix and a target vector
iris = load_iris()
features, target = iris.data, iris.target

# Cast the measurements to integers so they can be treated as categorical
features = features.astype(int)

# Keep the two features with the highest chi-squared statistics
chi2_selector = SelectKBest(chi2, k=2)
chi2_selector.fit(features, target)
features_kbest = chi2_selector.transform(features)

# Report the feature count before and after selection
print('Original number of features:', features.shape[1])
print('Reduced number of features:', features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


ANOVA F-Value

In [8]:
# Keep the two features with the highest ANOVA F-values
fvalue_selector = SelectKBest(f_classif, k=2)
fvalue_selector.fit(features, target)
features_kbest = fvalue_selector.transform(features)

# Report the feature count before and after selection
print('Original number of features:', features.shape[1])
print('Reduced number of features:', features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


In [9]:
from sklearn.feature_selection import SelectPercentile

# Keep the top 75% of features, ranked by ANOVA F-value
fvalue_selector = SelectPercentile(f_classif, percentile=75)
fvalue_selector.fit(features, target)
features_kbest = fvalue_selector.transform(features)

# Report the feature count before and after selection
print('Original number of features:', features.shape[1])
print('Reduced number of features:', features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 3


# 10.5 Recursively Eliminating Features

I want to automatically select the best features to keep.

In [15]:
import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model

# Suppress an annoying but harmless warning
warnings.filterwarnings(action='ignore', module='scipy',
                        message='^internal gelsd')
# Generate a features matrix and a target vector in which only 2 of the 100
# features are informative. The seed is fixed so the generated data (and
# therefore the selected features) are the same on every run.
features, target = make_regression(n_samples=10000,
                                   n_features=100,
                                   n_informative=2,
                                   random_state=1)
# Create a linear regression
ols = linear_model.LinearRegression()
# Recursively eliminate features one at a time (step=1), scoring each
# candidate feature set by cross-validated negative mean squared error
rfecv = RFECV(estimator=ols, step=1, scoring='neg_mean_squared_error')
rfecv.fit(features, target)
# Reduce the feature matrix to the selected features
rfecv.transform(features)



array([[ 0.72143633,  0.93960953],
       [-1.15369574,  1.75113588],
       [-1.48439924,  0.63773071],
       ...,
       [ 0.86584061, -0.51242653],
       [ 0.0365818 ,  1.09262368],
       [-0.0822978 ,  0.63684126]])

In [16]:
# Number of features kept by the fitted RFECV selector
rfecv.n_features_

2

In [17]:
# Boolean mask over the features: True marks a feature that was selected
rfecv.support_

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [18]:
# Rank of every feature, from best (1) to worst; the selected features are
# the ones ranked 1
rfecv.ranking_

array([47, 96, 98, 74, 95, 15, 80, 67, 91, 93, 65, 64, 62, 60, 58, 88, 56,
       39, 75, 30, 18, 85, 42, 52, 90, 73, 45,  6, 24, 72, 86,  2, 57, 87,
        7, 29,  1, 12, 61, 68, 44, 63, 21, 49, 97, 35, 34,  9, 11, 54, 36,
       25, 59, 41, 50, 89, 32, 23, 76, 46, 78, 69, 84, 66, 83, 17, 48, 20,
       79, 53, 10, 43, 94, 55, 33,  3, 28, 19, 51, 37, 26,  4, 27, 16, 38,
        8, 71,  1, 92, 13, 22, 31, 77, 99,  5, 82, 81, 40, 70, 14])