These examples are adapted from the book "Introduction to Machine Learning with Python" by Müller & Guido.
See the following GitHub site: https://github.com/amueller/introduction_to_ml_with_python

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split
import numpy as np
# Load the breast cancer dataset via scikit-learn; the returned object exposes
# fields such as `data`, `target` and `feature_names` used throughout this notebook.
cancer = load_breast_cancer()

In [3]:
# List the fields available on the loaded dataset object
cancer.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [6]:
# How many feature columns does the dataset ship with?
len(cancer.feature_names)

30

In [9]:
# Append 50 columns of pure Gaussian noise to the real features
random_numbers = np.random.RandomState(99)  # seeded so the noise is reproducible
n_rows = cancer.data.shape[0]
noisy_features = random_numbers.normal(size=(n_rows, 50))
# Column-wise join: real features first, the 50 noise columns after them
x_with_noise = np.concatenate((cancer.data, noisy_features), axis=1)

In [10]:
# Confirm the dimensions (rows, columns) of the combined real + noise matrix
x_with_noise.shape

(569, 80)

In [11]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x_with_noise,
    cancer.target,
    test_size=0.3,
    random_state=0,
)

# Univariate selection: SelectPercentile (scoring with its default, f_classif)
# keeps the top 40% of columns. It is fitted on the training split only.
select = SelectPercentile(percentile=40).fit(x_train, y_train)

# Reduce the training matrix to the retained columns
x_train_selected = select.transform(x_train)


In [12]:
# Compare the matrix dimensions before and after feature selection
for label, matrix in (
    ("Original Dataset:", x_train),
    ("Reduced Feature Dataset:", x_train_selected),
):
    print(label, matrix.shape)

Original Dataset: (398, 80)
Reduced Feature Dataset: (398, 32)


In [13]:
# Boolean mask with one entry per input column: True where the fitted
# selector kept the feature, False where it was dropped
select.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True,  True,  True,  True,  True,
       False, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False, False, False, False,
       False, False, False, False, False,  True,  True, False, False,
       False, False, False, False,  True, False, False, False, False,
       False,  True, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False, False])

In [16]:
# Build one name per column of x_with_noise: the dataset's own feature names
# followed by generated names for the appended noise columns.
original_feature_names = list(cancer['feature_names'])
# Derive the number of noise columns from the matrix itself rather than repeating
# a hard-coded count, so the names cannot drift out of sync with the data.
n_random_features = x_with_noise.shape[1] - len(original_feature_names)
random_features = [f"Random_{i}" for i in range(n_random_features)]
# Combine the two to get all feature names, in column order
feature_names = original_feature_names + random_features
# Guard: a boolean support mask is later used to index this list, so lengths must match
assert len(feature_names) == x_with_noise.shape[1], "feature names do not match column count"
feature_names[:5]

['mean radius',
 'mean texture',
 'mean perimeter',
 'mean area',
 'mean smoothness']

In [19]:
# Translate the selector's boolean support mask into readable column names
selected_mask = select.get_support()
np.array(feature_names)[selected_mask]

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'radius error',
       'perimeter error', 'area error', 'smoothness error',
       'compactness error', 'concavity error', 'concave points error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry',
       'worst fractal dimension', 'Random_11', 'Random_12', 'Random_19',
       'Random_25', 'Random_29', 'Random_41'], dtype='<U23')

In [21]:
# Baseline: logistic regression trained on every column (real + noise)
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=10000).fit(x_train, y_train)
baseline_accuracy = classifier.score(x_test, y_test)
print("Original dataset accuracy:", baseline_accuracy)

Original dataset accuracy: 0.9122807017543859


In [22]:
# Apply the already-fitted selector to the test split so it has the same columns
x_test_selected = select.transform(x_test)
# Refit the same classifier on the reduced training matrix, then score it
selected_accuracy = classifier.fit(x_train_selected, y_train).score(x_test_selected, y_test)
print("Selected Features accuracy:", selected_accuracy)

Selected Features accuracy: 0.9590643274853801


In [27]:
# Model-based selection on the same data: SelectFromModel wrapped around a
# random forest, with the mean feature importance as the cut-off threshold
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=100, random_state=99)
select = SelectFromModel(forest, threshold="mean")


In [28]:
# Fit on the training split and reduce it in one chained call,
# then apply the same fitted selector to the test split
x_train_selected = select.fit(x_train, y_train).transform(x_train)
x_test_selected = select.transform(x_test)

In [29]:
# Baseline for comparison: logistic regression on the full, unreduced matrix
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=10000)
baseline_accuracy = classifier.fit(x_train, y_train).score(x_test, y_test)
print("Original dataset accuracy:", baseline_accuracy)

Original dataset accuracy: 0.9122807017543859


In [30]:
# Refit the classifier on the reduced training matrix and evaluate it on the reduced test matrix
selected_accuracy = classifier.fit(x_train_selected, y_train).score(x_test_selected, y_test)
print("Selected Features accuracy:", selected_accuracy)

Selected Features accuracy: 0.9473684210526315


In [31]:
# Recursive feature elimination (RFE) wrapped around a random forest,
# configured to keep 32 features
from sklearn.feature_selection import RFE
rfe_forest = RandomForestClassifier(n_estimators=100, random_state=108)
select = RFE(rfe_forest, n_features_to_select=32)


In [35]:
#Let us get the names of features selected by RFE
#The RFE selector has only been constructed at this point, so fit it here first:
#calling get_support() on an unfitted selector raises an error under Restart & Run All.
#The forest uses a fixed random_state, so fitting again later gives the same selection.
select.fit(x_train, y_train)
np.array(feature_names)[select.get_support()]

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'radius error',
       'texture error', 'perimeter error', 'area error',
       'compactness error', 'concavity error', 'worst radius',
       'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry',
       'worst fractal dimension', 'Random_0', 'Random_9', 'Random_12',
       'Random_26', 'Random_37', 'Random_41', 'Random_47'], dtype='<U23')

In [32]:
# Fit the selector on the training split, then project both splits onto the kept columns
select.fit(x_train, y_train)
x_train_selected, x_test_selected = (
    select.transform(split) for split in (x_train, x_test)
)

In [33]:
# Reference point: logistic regression trained on all columns, before any selection
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=10000)
classifier.fit(x_train, y_train)
baseline_accuracy = classifier.score(x_test, y_test)
print("Original dataset accuracy:", baseline_accuracy)

Original dataset accuracy: 0.9122807017543859


In [34]:
# Retrain on the RFE-reduced training matrix and report accuracy on the reduced test matrix
selected_accuracy = classifier.fit(x_train_selected, y_train).score(x_test_selected, y_test)
print("Selected Features accuracy:", selected_accuracy)

Selected Features accuracy: 0.9415204678362573
