In [2]:
# feature selection examples 
from sklearn.datasets import load_diabetes
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import (ensemble, model_selection, preprocessing, tree)
from sklearn.metrics import (confusion_matrix, accuracy_score, classification_report)
from sklearn.model_selection import (train_test_split)
from sklearn.linear_model import LogisticRegression

In [None]:
## load breast cancer dataset
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
# data.data holds only the feature matrix, so the labels are attached
# explicitly as a 'target' column; later cells read df['target'] and
# drop that column to obtain the features.
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

In [None]:
# separate the label column from the feature columns
y = df.loc[:, 'target']
X = df.drop('target', axis=1)
X.head()

In [None]:
# standardize the 30 feature columns; the scaler is fitted on the whole of X
# and X is rebound to the transformed result
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit(X).transform(X)
# number of columns after scaling
X.shape[1]

In [22]:
# hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# what if I don't want to remove features yet 
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

In [None]:
# baseline: logistic regression trained on every column of the training split
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# select 10 features / columns in X
# The selector is fitted on the training split only: ranking features with
# the held-out rows included would leak test information into the selection
# and bias the scores reported on X_test afterwards.
lr = LogisticRegression()
rfe = RFE(lr, n_features_to_select=10)
rfe.fit(X_train, y_train)
# ranking_ has one entry per column; selected columns are ranked 1
print(rfe.ranking_)

In [None]:
# boolean mask over the columns: True where RFE kept the feature
print(rfe.get_support())

In [None]:
# peek at one sample (row index 1), restricted to the selected columns
X[1, rfe.support_]

In [None]:
# reduce X to the columns the fitted selector kept
X_new = rfe.transform(X)

In [32]:
# split the reduced matrix with the same test_size and random_state as the first split
X_train_new, X_test_new, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

In [None]:
# refit logistic regression using only the selected columns and report test metrics
lr = LogisticRegression().fit(X_train_new, y_train)
y_pred = lr.predict(X_test_new)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [35]:
# start over from the unscaled feature columns of df and split them again
y = df.loc[:, 'target']
X = df.drop('target', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [68]:
# chain 'scale' and 'fit' in a pipeline style
# (this pipeline has no feature-selection step: it only scales, then classifies)
from sklearn.pipeline import Pipeline

clf = Pipeline(steps=[
    ('Scale', StandardScaler()),
    ('Classification', LogisticRegression()),
])

In [None]:
# fit every pipeline step on the training split
clf.fit(X=X_train, y=y_train)

In [None]:
# predict on the held-out split and print the per-class metrics
y_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [72]:
# pipeline style again, now with an RFE step (keeping 10 features)
# placed between scaling and the final classifier
clf = Pipeline(steps=[
    ('Scale', StandardScaler()),
    ('feature_selection', RFE(LogisticRegression(), n_features_to_select=10)),
    ('Classification', LogisticRegression()),
])

In [None]:
# fit every pipeline step on the training split
clf.fit(X=X_train, y=y_train)

In [None]:
# predict on the held-out split and print the per-class metrics
y_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))