### 特征工程

#### 1. 移除低方差的特征(Removing features with low variance)

In [5]:
import numpy as np
from sklearn.feature_selection import VarianceThreshold

# Toy data set: six samples, three boolean (0/1) features.
X = [
    [0, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
]

# A Bernoulli(p) feature has variance p * (1 - p). Using p = 0.8 as the
# cut-off drops columns that hold the same value in more than 80% of samples.
min_variance = .8 * (1 - .8)
sel = VarianceThreshold(threshold=min_variance)

# Fit the selector on X and show X without the low-variance columns.
sel.fit_transform(X)

array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])

#### 2. 单变量特征选择(Univariate feature selection)

+ 这些选择器以打分函数作为输入，打分函数返回每个特征的单变量得分（部分函数还会返回 p 值）：
    + 用于回归:f_regression, mutual_info_regression
    + 用于分类:chi2, f_classif, mutual_info_classif
+ 稀疏数据的特征选择
    + 只有 chi2、mutual_info_regression、mutual_info_classif 能直接处理稀疏矩阵，而不会把数据转换为稠密矩阵。

In [3]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2

# Load the iris data set: feature matrix X and class labels y.
iris = load_iris()
X = iris.data  # inspect X.shape to see the original dimensions
y = iris.target

# Score each feature against y with the chi-squared statistic and keep the
# two highest-scoring columns.
k_best = SelectKBest(score_func=chi2, k=2)
X_new = k_best.fit_transform(X, y)  # inspect X_new.shape to see the reduction

#### 3. 递归特征消除(Recursive feature elimination)

In [4]:
from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

# Flatten every digit image into a single row of pixel features.
digits = load_digits()
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

# Build the RFE object around a linear SVM. Eliminating one feature per
# iteration (step=1) down to a single survivor yields a rank for each pixel.
svc = SVC(kernel="linear", C=1)
rfe = RFE(estimator=svc, n_features_to_select=1, step=1)
rfe.fit(X, y)

# Put the per-pixel ranks back into the shape of one image.
image_shape = digits.images[0].shape
ranking = rfe.ranking_.reshape(image_shape)

#### 4. 使用SelectFromModel选择特征(Feature selection using SelectFromModel)

In [6]:
# NOTE: `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so importing it fails on current releases. The bundled diabetes regression
# data set is used instead; it needs no download.
from sklearn.datasets import load_diabetes
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

diabetes = load_diabetes()
X, y = diabetes['data'], diabetes['target']

# Lasso regression whose regularisation strength is chosen by cross-validation.
clf = LassoCV()

# Set a minimum threshold of 0.25: features whose importance (derived from the
# fitted coefficients) falls below it are dropped.
sfm = SelectFromModel(clf, threshold=0.25)
sfm.fit(X, y)

# Number of features that survive the threshold.
n_features = sfm.transform(X).shape[1]

#### 4.1. 基于L1的特征选择(L1-based feature selection)

In [7]:
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

# `load_iris` is imported by an earlier cell of this notebook.
iris = load_iris()
X = iris.data
y = iris.target

# Linear SVM with an L1 penalty; the L1 penalty yields sparse coefficients,
# which is what the selector below relies on.
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False)
lsvc.fit(X, y)

# prefit=True: reuse the estimator fitted above instead of fitting it again.
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X)
X_new.shape

(150, 3)

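#### 4.2. 随机稀疏模型(Randomized sparse models)

> 注：scikit-learn 中的随机稀疏模型（RandomizedLasso、RandomizedLogisticRegression）已在 0.19 版本中弃用，并在 0.21 版本中移除，因此本节不提供示例代码。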

#### 4.3. 基于树的特征选择(Tree-based feature selection)

In [8]:
from sklearn.datasets import load_iris
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

iris = load_iris()
X, y = iris.data, iris.target

# Extra-trees are randomised; fixing the seed makes the importances (and thus
# the selected columns) reproducible across Restart & Run All.
clf = ExtraTreesClassifier(random_state=0)
clf = clf.fit(X, y)

# prefit=True: reuse the forest fitted above. No explicit threshold is given,
# so SelectFromModel applies its default cut-off to the feature importances.
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
X_new.shape

(150, 2)

In [9]:
# Show the importance assigned to each input feature by the tree ensemble
# `clf` fitted in the previous cell (one value per feature column).
clf.feature_importances_

array([ 0.04817595,  0.03163336,  0.35899428,  0.56119642])