In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr

In [2]:
iris=load_iris()

### 用相关性寻找最佳特征

In [75]:
pearson_model=SelectKBest(lambda X,Y:np.array(list(map(lambda x:pearsonr(x,Y),X.T))).T[0],k=2).fit(iris.data,iris.target)

In [76]:
pearson_model.get_support()

array([False, False,  True,  True])

In [77]:
list(feature_data.columns.values[pearson_model.get_support()])

['petal length (cm)', 'petal width (cm)']

In [78]:
pearson_model.scores_

array([ 0.78256123, -0.42665756,  0.9490347 ,  0.95654733])

### 运用特征中最大卡方方法

In [53]:
from sklearn.feature_selection import chi2

def get_feature_importance(feature_data, label_data,k=2):
    model = SelectKBest(chi2, k=k)#选择k个最佳特征
    X_new = model.fit_transform(feature_data, label_data)
    #feature_data是特征数据，label_data是标签数据，该函数可以选择出k个特征 
    import pandas as pd
    feature_data=pd.DataFrame(iris.data,columns=iris.feature_names)
    print("model shape: ",X_new.shape)
 
    scores = model.scores_
    print('model scores:', scores)  # 得分越高，特征越重要
 
    p_values = model.pvalues_
    print('model p-values', p_values)  # p-values 越小，置信度越高，特征越重要
 
    # 按重要性排序，选出最重要的 k 个
    indices = np.argsort(scores)[::-1]
    k_best_features = list(feature_data.columns.values[indices[0:k]])

    print('k best features are: ',k_best_features)
    return k_best_features

In [55]:
k_best_features=get_feature_importance(iris.data,iris.target,k=2)

model shape:  (150, 2)
model scores: [ 10.81782088   3.7107283  116.31261309  67.0483602 ]
model p-values [4.47651499e-03 1.56395980e-01 5.53397228e-26 2.75824965e-15]
k best features are:  ['petal length (cm)', 'petal width (cm)']


In [56]:
k_best_features

['petal length (cm)', 'petal width (cm)']

### RFE 法

In [61]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [64]:
lr_model=RFE(estimator=LogisticRegression(multi_class='auto',solver='lbfgs',max_iter=500),n_features_to_select=2).fit(iris.data,iris.target)

In [70]:
lr_model.get_support()

array([False, False,  True,  True])

In [72]:
list(feature_data.columns.values[lr_model.get_support()])

['petal length (cm)', 'petal width (cm)']

### 模型选择法

In [80]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

In [83]:
lr_select_model=SelectFromModel(LogisticRegression(penalty='l2',C=0.1,solver='lbfgs',multi_class='auto')).fit(iris.data,iris.target)

In [84]:
lr_select_model.get_support()

array([False, False,  True,  True])

In [85]:
list(feature_data.columns.values[lr_select_model.get_support()])

['petal length (cm)', 'petal width (cm)']