In [1]:
# Introduces two related methods: 1) RFE (feature selection); 2) PE (feature importance)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import RFE  #Recursive Feature Elimination (RFE) 
import xgboost as xgb

# Location of the input table, relative to the notebook's working directory.
DATA_PATH = "ahc.csv"

# Read the whole CSV into a DataFrame; the cells below treat the last column
# (`symptom`) as the label and every other column as a feature.
ds = pd.read_csv(DATA_PATH)

# Bare last expression: the notebook renders a preview of the first rows.
ds.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,42,43,44,45,46,47,48,49,50,symptom
0,0.385714,-0.730334,-0.13952,0.885217,0.185512,-0.092579,1.675729,-0.686223,0.531079,1.260273,...,0.310443,0.685676,0.743884,-1.707476,0.409856,0.23479,0.124626,-0.304451,-0.098937,1
1,-1.139784,0.202991,-0.172729,2.288758,0.832926,-1.270105,-0.541457,-0.299268,-0.662836,-1.111758,...,0.34789,0.022021,-0.588785,-2.409142,-0.092867,0.687847,-1.272507,0.852925,-0.070433,1
2,0.301588,-1.216219,-0.320328,1.682189,1.250116,-1.544504,-0.439693,0.573244,-0.466864,0.410886,...,0.69988,0.7227,-0.565397,-1.177416,1.832512,-1.76352,1.174136,-0.11095,-0.721374,1
3,0.610831,-0.224714,3.514447,0.002929,-2.178927,1.008115,-0.221103,0.152922,0.782051,-0.069084,...,0.100465,1.928904,-0.030931,0.060462,-1.101277,0.496262,-0.477511,1.070145,0.226384,1
4,0.683467,0.778225,-0.244681,-0.104264,-0.15393,1.351049,0.544877,1.948468,-0.672722,-0.002173,...,0.068522,0.431119,1.112156,0.008301,-0.151477,1.208057,-1.728618,-0.297925,-2.181558,1


In [2]:
# Feature columns: every column except the last one, which holds the label.
tr = ds.columns[:-1]

# Split the labelled dataset into train and test partitions.
# - stratify keeps the class proportions of `symptom` the same in both parts.
# - random_state pins the shuffle so the split, and every model fitted on it
#   further down, is reproduced exactly after Restart Kernel -> Run All.
#   67 is the seed this notebook already uses for its XGBoost models.
X_train, X_test, y_train, y_test = train_test_split(
    ds[tr],
    ds["symptom"],
    stratify=ds["symptom"],
    random_state=67,
)

In [7]:
# Method 1: RFE (feature selection).
# Recursive Feature Elimination refits the estimator repeatedly and removes
# one feature per round (step=1) until only 5 features are left.
estimator = xgb.XGBClassifier(
    n_estimators=500,
    learning_rate=0.1,
    random_state=67,
)
selector = RFE(
    estimator=estimator,
    n_features_to_select=5,
    step=1,
).fit(X_train, y_train)

In [8]:
# Mask of selected features, in the column order of the training data:
# True marks a feature RFE kept, False a feature it eliminated.
selector.support_  # ndarray of shape (n_features,)

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
        True, False,  True, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False])

In [9]:
# Feature ranking: ranking_[i] is the ranking position of the i-th feature.
# Selected (i.e. estimated best) features are assigned rank 1.
selector.ranking_  # ndarray of shape (n_features,)

array([21,  3, 25, 38, 31, 42, 27,  5, 16, 17, 12, 20, 13, 41, 10, 35, 28,
       37, 19, 22,  1, 24, 23, 32, 14, 44, 30,  1,  8,  1,  6,  1, 34, 46,
       29, 33, 39,  7, 26, 36, 45, 43,  4, 40,  1, 15, 18, 11,  9,  2])

In [2]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

'''KFold divides the dataset into k folds,
whereas StratifiedKFold also ensures that each fold has the same proportion of observations with a given label.'''

# RFECV is fitted on the full labelled table: every column except the last is
# a feature, and `symptom` is the target.
tr = ds.columns[:-1]
X, y = ds[tr], ds["symptom"]

# Smallest number of features RFECV is allowed to consider.
min_features_to_select = 1

# Same classifier settings as the plain RFE run, scored by accuracy over
# 10 stratified folds.
clf = xgb.XGBClassifier(
    n_estimators=500,
    learning_rate=0.1,
    random_state=67,
)
cv = StratifiedKFold(n_splits=10)

# Build the selector and fit it in one expression; `rfecv` is the fitted
# selector inspected by the following cells.
rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=cv,
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
    n_jobs=2,
).fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")

Optimal number of features: 44


In [3]:
# Mask of selected features, in the column order of X: True marks a feature
# kept by the cross-validated selection, False a feature that was dropped.
rfecv.support_ # ndarray of shape (n_features,)

array([ True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True, False,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])

In [5]:
# Feature ranking: ranking_[i] is the ranking position of the i-th feature.
# Selected (i.e. estimated best) features are assigned rank 1.
rfecv.ranking_  # ndarray of shape (n_features,)

array([1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 3, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1])

In [3]:
# Method 2: PE (feature importance). Which features have the biggest impact on predictions? This is called feature importance.
import eli5
from eli5.sklearn import PermutationImportance

# Train a fresh XGBoost classifier on the training split.
estimator = xgb.XGBClassifier(
    n_estimators=500,
    learning_rate=0.1,
    random_state=67,
)
my_model = estimator.fit(X_train, y_train)

# Permutation importance is computed on the held-out test split, with its own
# fixed seed so the shuffles are repeatable.
importance_estimator = PermutationImportance(my_model, random_state=1)
perm = importance_estimator.fit(X_test, y_test)

# Label the weights table with the original column names.
feature_labels = X_test.columns.tolist()
eli5.show_weights(perm, feature_names=feature_labels)

Weight,Feature
0.0056  ± 0.0043,25
0.0053  ± 0.0069,11
0.0045  ± 0.0033,6
0.0028  ± 0.0066,4
0.0028  ± 0.0107,49
0.0014  ± 0.0031,9
0.0011  ± 0.0054,18
0.0008  ± 0.0028,37
0.0008  ± 0.0063,8
0.0006  ± 0.0088,45
