# Step Forward Feature Selection

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
!pip install mlxtend



In [3]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

## Import dataset & split into train and set

In [4]:
dataset2 = pd.read_csv("dataset_2.csv")
dataset2.head()

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,...,var_100,var_101,var_102,var_103,var_104,var_105,var_106,var_107,var_108,var_109
0,4.53271,3.280834,17.982476,4.404259,2.34991,0.603264,2.784655,0.323146,12.009691,0.139346,...,2.079066,6.748819,2.941445,18.360496,17.726613,7.774031,1.473441,1.973832,0.976806,2.541417
1,5.821374,12.098722,13.309151,4.125599,1.045386,1.832035,1.833494,0.70909,8.652883,0.102757,...,2.479789,7.79529,3.55789,17.383378,15.193423,8.263673,1.878108,0.567939,1.018818,1.416433
2,1.938776,7.952752,0.972671,3.459267,1.935782,0.621463,2.338139,0.344948,9.93785,11.691283,...,1.861487,6.130886,3.401064,15.850471,14.620599,6.849776,1.09821,1.959183,1.575493,1.857893
3,6.02069,9.900544,17.869637,4.366715,1.973693,2.026012,2.853025,0.674847,11.816859,0.011151,...,1.340944,7.240058,2.417235,15.194609,13.553772,7.229971,0.835158,2.234482,0.94617,2.700606
4,3.909506,10.576516,0.934191,3.419572,1.871438,3.340811,1.868282,0.439865,13.58562,1.153366,...,2.738095,6.565509,4.341414,15.893832,11.929787,6.954033,1.853364,0.511027,2.599562,0.811364


In [5]:
fs = dataset2.drop(labels = "target", axis = 1) # futureSet
dv = dataset2["target"] # DecisionVar

x_train2, x_test2, y_train2, y_test2 = train_test_split(fs, 
                                                        dv, 
                                                        test_size = 0.3, 
                                                        random_state = 00)

In [6]:
x_train2.shape, x_test2.shape

((35000, 108), (15000, 108))

## Step Forwards Feature Selection

In [7]:
rfc = RandomForestClassifier(n_estimators=10, 
                             n_jobs=4, 
                             random_state=0)
sfs = SFS(rfc, 
          k_features=5, 
          forward=True, 
          floating=False, 
          verbose=2, 
          scoring="accuracy", 
          cv=2)

In [8]:
sfs = sfs.fit(x_train2, y_train2)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:   33.2s finished

[2024-03-06 14:28:07] Features: 1/5 -- score: 0.7614285714285715[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 107 out of 107 | elapsed:   28.0s finished

[2024-03-06 14:28:35] Features: 2/5 -- score: 0.7609428571428571[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 106 out of 106 | elapsed:   27.9s finished

[2024-03-06 14:29:03] Features: 3/5 -- score: 0.7577714285714285[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 

In [9]:
selected_features = x_train2.columns[list(sfs.k_feature_idx_)]
selected_features

Index(['var_16', 'var_45', 'var_48', 'var_69', 'var_91'], dtype='object')

## Regression

In [10]:
housing = pd.read_csv("housing.csv")
housing

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [11]:
numerics = ["int64", "float64"]
numerical_vars = list(housing.select_dtypes(include = numerics).columns)
housing = housing[numerical_vars]

In [12]:
fs2 = housing.drop(labels = 'SalePrice', axis = 1)
dv2 = housing["SalePrice"]


In [13]:
x_train, x_test, y_train, y_test = train_test_split(fs2, dv2, test_size = 0.3, random_state = 0)

In [14]:
x_train.shape, x_test.shape

((1022, 37), (438, 37))

In [15]:
x_train.fillna(0, inplace = True)
x_test.fillna(0, inplace = True)

## Step Forward Feature Selection for Regression dataset

In [18]:
rfr = RandomForestRegressor(n_estimators = 10, 
                            n_jobs = 4, 
                            random_state = 0)
sfs2 = SFS(rfr, 
          k_features = 20, 
          forward = True, 
          floating = True, 
          verbose = 2, 
          scoring = 'r2', 
          cv = 2)
sfs2 = sfs2.fit(x_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  37 out of  37 | elapsed:    1.2s finished

[2024-03-06 14:31:43] Features: 1/20 -- score: 0.6400959584952356[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:    1.1s finished

[2024-03-06 14:31:44] Features: 2/20 -- score: 0.6977071040391631[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jo

[Parallel(n_jobs=1)]: Done  22 out of  22 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.5s finished

[2024

In [17]:
columnNames = x_train.columns[list(sfs.k_feature_idx_)]
columnNames

Index(['BsmtFullBath', 'OpenPorchSF', 'ScreenPorch', 'PoolArea', 'MiscVal'], dtype='object')