本文记录了特征工程中的常用方法

In [None]:
# Load the iris data set that every later cell uses as its example input.
from sklearn.datasets import load_iris

iris = load_iris()
# Display the loaded object: features, labels, label names and description.
iris

# 1.数据预处理

### a.特征规格不一可以进行无量纲化

In [None]:
# Standardization: centre every feature column and scale it to unit variance.
from sklearn.preprocessing import StandardScaler

# Both options below are the library defaults, spelled out for readability.
SSc = StandardScaler(with_mean=True, with_std=True)
SSc.fit_transform(iris.data)

In [None]:
# Min-max scaling: map every feature column linearly onto a fixed interval.
from sklearn.preprocessing import MinMaxScaler

# (0, 1) is the default target interval, written out explicitly.
MMs = MinMaxScaler(feature_range=(0, 1))
MMs.fit_transform(iris.data)


In [None]:
# L2 normalization: rescale every sample (row) to unit Euclidean norm.
from sklearn.preprocessing import Normalizer

# "l2" is the default norm, written out explicitly.
Nm = Normalizer(norm="l2")
Nm.fit_transform(iris.data)


### b.特征二值化，如只关心18岁以上or18岁以下


In [None]:
# Binarization: values strictly greater than the threshold become 1, all others 0.
from sklearn.preprocessing import Binarizer

Bz = Binarizer(threshold=3)
Bz.fit_transform(iris.data)

### c.定性特征转换为定量特征 one-hot encoding



In [None]:
# One-hot encode the class labels.
from sklearn.preprocessing import OneHotEncoder

OHe = OneHotEncoder()
# The encoder expects a 2-D input, so view the 1-D label vector as a single column.
OHe.fit_transform(iris.target[:, None])

### d.缺失值填补



In [None]:
# Missing-value imputation.
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22; its
# replacement is `sklearn.impute.SimpleImputer`. Fall back to the old class
# only on installations that predate SimpleImputer.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

# strategy="mean" (the default) fills every NaN with the mean of its column.
Imp = SimpleImputer(strategy="mean")
Imp.fit_transform(iris.data)

### e.数据变换

In [None]:
# Polynomial feature expansion.
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# degree=2 is the library default, written out explicitly.
PLf = PolynomialFeatures(degree=2)
PLf.fit_transform(iris.data)

In [None]:
# Logarithmic transform: wrap a plain function as a transformer.
from sklearn.preprocessing import FunctionTransformer

# np.log2 is applied to the whole feature matrix.
FTf = FunctionTransformer(func=np.log2)
FTf.fit_transform(iris.data)

# 2.特征选择

### a.Filter

In [None]:
# Variance threshold (filter method).
from sklearn.feature_selection import VarianceThreshold

# Features whose variance does not exceed the threshold (here 1) are dropped.
Vt = VarianceThreshold(threshold=1)
Vt.fit_transform(iris.data)

In [None]:
# Correlation-coefficient selection (filter method).
import numpy as np
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr


def _pearson_scores(X, Y):
    """Score every feature column of X by its Pearson correlation with Y.

    Returns a list ``[correlations, p_values]`` with one entry per feature,
    which is the (scores, p-values) pair form SelectKBest accepts.
    """
    # One (r, p-value) row per feature; transpose to split scores from p-values.
    per_feature = np.array([pearsonr(column, Y) for column in X.T])
    return list(per_feature.T)


# NOTE: the scores are signed, so a strongly negative correlation ranks lowest;
# wrap the correlations in np.abs if only the strength of the relation matters.
SKb = SelectKBest(_pearson_scores, k=2)
SKb.fit_transform(iris.data, iris.target)

In [None]:
# Chi-squared test (filter method): keep the two highest-scoring features.
from sklearn.feature_selection import chi2

SKb = SelectKBest(score_func=chi2, k=2)
SKb.fit_transform(iris.data, iris.target)

In [None]:
# Mutual-information selection using the maximal information coefficient (MIC).
import numpy as np
from minepy import MINE
from sklearn.feature_selection import SelectKBest


def mic(x, y):
    """Return ``(mic_score, 0.5)`` for feature column ``x`` against target ``y``.

    The second element is a fixed placeholder: MINE provides no p-value, but
    the (score, p-value) pair shape is kept so the results can be split into
    a scores array and a p-values array for SelectKBest.
    """
    m = MINE()
    # The score must be computed from the data before mic() can be read.
    m.compute_score(x, y)
    return (m.mic(), 0.5)


# Build one (score, p-value) row per feature, then transpose and convert to a
# tuple so SelectKBest treats the result as (scores, p_values).
SKm = SelectKBest(lambda X, Y: tuple(np.array([mic(x, Y) for x in X.T]).T), k=2)
SKm.fit(iris.data, iris.target)

### b.Wrapper

In [None]:
# Recursive feature elimination (wrapper method).
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Logistic regression is the base estimator; elimination stops at two features.
Ref = RFE(LogisticRegression(), n_features_to_select=2)
Ref.fit_transform(iris.data, iris.target)

### c.Embedded

In [None]:
# Penalty-based feature selection (embedded method).
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# Logistic regression with an L1 penalty. The solver is set explicitly because
# the default solver since scikit-learn 0.22 ("lbfgs") does not support L1 and
# raises a ValueError; "liblinear" does support it.
SelectFromModel(
    LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
).fit_transform(iris.data, iris.target)

In [None]:
# Feature selection with a gradient-boosted tree model as the base estimator.
from sklearn.ensemble import GradientBoostingClassifier

SelectFromModel(estimator=GradientBoostingClassifier()).fit_transform(
    iris.data, iris.target
)

# 3.降维

### a.主成分分析法PCA

In [None]:
# Principal component analysis: project the features onto two components.
from sklearn.decomposition import PCA

PCA(n_components=2).fit_transform(X=iris.data)

### b.线性判别分析法LDA

In [None]:
# Linear discriminant analysis (LDA) for supervised dimensionality reduction.
# LatentDirichletAllocation is a topic model that merely shares the "LDA"
# abbreviation; the linear-discriminant estimator lives in
# sklearn.discriminant_analysis.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# This estimator is supervised, so the class labels are passed alongside the
# features when fitting.
LinearDiscriminantAnalysis(n_components=2).fit_transform(iris.data, iris.target)