In [1]:
import pandas as pd

In [46]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE ,RFECV


In [5]:
data =  pd.read_csv('BostonHousing.csv')
data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [13]:
x = data.iloc[:,:12]
y = data.iloc[:,12]

In [15]:
model =LinearRegression()

In [17]:
rfe = RFE(estimator=model , n_features_to_select=7)

In [19]:
result =  rfe.fit(x,y)

In [23]:
result.n_features_    #return the number of selected features

7

In [25]:
result.support_

array([ True, False,  True,  True,  True,  True, False,  True, False,
       False,  True, False])

In [29]:
selected = x.columns[result.support_]
selected

Index(['crim', 'indus', 'chas', 'nox', 'rm', 'dis', 'ptratio'], dtype='object')

In [43]:
x_new = x[selected]   # return it by using the new selected columns
x_new.values    # value returns it in 2D arrays 

array([[6.3200e-03, 2.3100e+00, 0.0000e+00, ..., 6.5750e+00, 4.0900e+00,
        1.5300e+01],
       [2.7310e-02, 7.0700e+00, 0.0000e+00, ..., 6.4210e+00, 4.9671e+00,
        1.7800e+01],
       [2.7290e-02, 7.0700e+00, 0.0000e+00, ..., 7.1850e+00, 4.9671e+00,
        1.7800e+01],
       ...,
       [6.0760e-02, 1.1930e+01, 0.0000e+00, ..., 6.9760e+00, 2.1675e+00,
        2.1000e+01],
       [1.0959e-01, 1.1930e+01, 0.0000e+00, ..., 6.7940e+00, 2.3889e+00,
        2.1000e+01],
       [4.7410e-02, 1.1930e+01, 0.0000e+00, ..., 6.0300e+00, 2.5050e+00,
        2.1000e+01]])

## RFECV  - REF cross validation 
- Key Concepts of RFECV:
Recursive Feature Elimination (RFE):

Like RFE, RFECV recursively eliminates the least important features based on model performance until it finds the most important subset of features.
Cross-Validation:

Instead of just eliminating features, RFECV uses cross-validation to evaluate the model’s performance at each step. Cross-validation helps ensure that the model’s performance is not just good on the training set but also generalizes well to unseen data.

In [48]:
rfecv  = RFECV(estimator=model,min_features_to_select=1,cv=10)

In [50]:
res1 = rfecv.fit(x,y)

In [52]:
res1.support_

array([ True, False,  True,  True,  True,  True,  True,  True, False,
       False,  True, False])

In [57]:
x.columns[res1.support_]

Index(['crim', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'ptratio'], dtype='object')

In [59]:
x[x.columns[res1.support_]].values

array([[6.3200e-03, 2.3100e+00, 0.0000e+00, ..., 6.5200e+01, 4.0900e+00,
        1.5300e+01],
       [2.7310e-02, 7.0700e+00, 0.0000e+00, ..., 7.8900e+01, 4.9671e+00,
        1.7800e+01],
       [2.7290e-02, 7.0700e+00, 0.0000e+00, ..., 6.1100e+01, 4.9671e+00,
        1.7800e+01],
       ...,
       [6.0760e-02, 1.1930e+01, 0.0000e+00, ..., 9.1000e+01, 2.1675e+00,
        2.1000e+01],
       [1.0959e-01, 1.1930e+01, 0.0000e+00, ..., 8.9300e+01, 2.3889e+00,
        2.1000e+01],
       [4.7410e-02, 1.1930e+01, 0.0000e+00, ..., 8.0800e+01, 2.5050e+00,
        2.1000e+01]])

In [65]:
x_new2 = rfecv.fit_transform(x,y)    # return directly the selected 2D array

In [63]:
x_new2

array([[6.3200e-03, 2.3100e+00, 0.0000e+00, ..., 6.5200e+01, 4.0900e+00,
        1.5300e+01],
       [2.7310e-02, 7.0700e+00, 0.0000e+00, ..., 7.8900e+01, 4.9671e+00,
        1.7800e+01],
       [2.7290e-02, 7.0700e+00, 0.0000e+00, ..., 6.1100e+01, 4.9671e+00,
        1.7800e+01],
       ...,
       [6.0760e-02, 1.1930e+01, 0.0000e+00, ..., 9.1000e+01, 2.1675e+00,
        2.1000e+01],
       [1.0959e-01, 1.1930e+01, 0.0000e+00, ..., 8.9300e+01, 2.3889e+00,
        2.1000e+01],
       [4.7410e-02, 1.1930e+01, 0.0000e+00, ..., 8.0800e+01, 2.5050e+00,
        2.1000e+01]])