# Step Backward Feature Selection

# Import libraries and datasets

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [2]:
# Read the two CSV inputs: dataset1 is later paired with a classifier,
# dataset2 (the housing file) with a regressor.
dataset1 = pd.read_csv(filepath_or_buffer="dataset_2.csv")
dataset2 = pd.read_csv(filepath_or_buffer="housing.csv")

In [3]:
# Show the (rows, columns) dimensions of the first dataset.
dataset1.shape

(50000, 109)

In [4]:
# Show the (rows, columns) dimensions of the housing dataset before any column filtering.
dataset2.shape

(1460, 81)

## Separate into train and test sets

In [5]:
# Keep only the int64/float64 columns of the housing frame; the
# non-numeric columns are dropped before feature selection.
numerics = ['int64', 'float64']
numerical_vars = dataset2.select_dtypes(include=numerics).columns.tolist()
dataset2 = dataset2.loc[:, numerical_vars]

In [6]:
# First dataset: 'target' is the dependent variable, every other column is a feature.
dv1 = dataset1['target']
fs1 = dataset1.drop(columns=['target'])

# Housing dataset: 'SalePrice' is the dependent variable.
dv2 = dataset2['SalePrice']
fs2 = dataset2.drop(columns=['SalePrice'])

# 70/30 train/test split for each dataset, seeded for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    fs1, dv1,
    test_size=0.3,
    random_state=0,
)
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    fs2, dv2,
    test_size=0.3,
    random_state=0,
)

In [7]:
# Replace missing values in the housing feature matrices with 0 so the
# estimator fitted below never sees NaN.
#
# The frames are rebound to the result of fillna() rather than modified with
# inplace=True: x_train2 / x_test2 were derived from fs2 by train_test_split,
# and in-place modification of such derived frames is what triggers pandas'
# SettingWithCopyWarning. Rebinding produces the same NaN-free frames and keeps
# the cell safe to re-run.
x_train2 = x_train2.fillna(0)
x_test2 = x_test2.fillna(0)

## Step backward Feature selection for both datasets

In [8]:
# Step-backward selection (forward=False, no floating) on the first dataset:
# features are removed until 105 remain, each candidate subset being scored
# by 2-fold cross-validated accuracy of a 10-tree random forest classifier.
sfs = SFS(
    estimator=RandomForestClassifier(
        n_estimators=10,
        n_jobs=4,
        random_state=0,
    ),
    k_features=105,
    forward=False,
    floating=False,
    verbose=2,
    scoring='accuracy',
    cv=2,
)

# fit() hands back the selector itself, so the name is simply rebound.
sfs = sfs.fit(x_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:  2.7min finished

[2024-03-11 16:09:24] Features: 107/105 -- score: 0.7290285714285714[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 107 out of 107 | elapsed:  2.7min finished

[2024-03-11 16:12:05] Features: 106/105 -- score: 0.7298[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 106 out of 106 | elapsed:  2.7min finished

[2024-03-11 16:14:45] Features: 105/105 -- score: 0.7299142857142857

In [9]:
# Step-backward selection (forward=False, no floating) on the housing data:
# features are removed until 20 remain, each candidate subset being scored
# by 2-fold cross-validated R^2 of a 10-tree random forest regressor.
sfs2 = SFS(
    estimator=RandomForestRegressor(
        n_estimators=10,
        n_jobs=4,
        random_state=10,
    ),
    k_features=20,
    forward=False,
    floating=False,
    verbose=2,
    scoring='r2',
    cv=2,
)

# fit() hands back the selector itself, so the name is simply rebound.
sfs2 = sfs2.fit(x_train2, y_train2)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  37 out of  37 | elapsed:    2.1s finished

[2024-03-11 16:14:47] Features: 36/20 -- score: 0.8264413785481026[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:    2.0s finished

[2024-03-11 16:14:49] Features: 35/20 -- score: 0.8288924139651588[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:    2.1s finished

[2024-03-11 16:14:51] Features: 34/20 -- score: 0.8314722992772682[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Don

In [10]:
# Names of the housing features kept by the backward selector (k_features=20).
sfs2.k_feature_names_

('MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearRemodAdd',
 'BsmtFinSF1',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'KitchenAbvGr',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'MoSold',
 'YrSold')