# House Price Dataset

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Wrapper Method (Sequential Feature Selector for Regression)
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# Regression Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR  # Support Vector Regressor
from sklearn.linear_model import Lasso       
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet

# Embedded Feature Selection (RFECV for Regression)
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold


In [3]:
df = pd.read_csv("/kaggle/input/houseprice-fs/house_fs.csv")
print(df)

      MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
0             60         65.0     8450            7            5       2003   
1             20         80.0     9600            6            8       1976   
2             60         68.0    11250            7            5       2001   
3             70         60.0     9550            7            5       1915   
4             60         84.0    14260            8            5       2000   
...          ...          ...      ...          ...          ...        ...   
1455          60         62.0     7917            6            5       1999   
1456          20         85.0    13175            6            6       1978   
1457          70         66.0     9042            7            9       1941   
1458          20         68.0     9717            5            6       1950   
1459          20         75.0     9937            5            6       1965   

      YearRemodAdd  MasVnrArea  BsmtFinSF1  BsmtUnf

In [4]:
X = df.drop('target', axis = 1)
y = df['target']

**Forward Selection**

In [6]:
sfs1 = SFS(RandomForestRegressor(n_jobs=4, random_state=42),
           k_features=5,
           forward=True,
           floating=False,
           verbose=2,
           scoring='r2', 
           cv=3)

sfs1 = sfs1.fit(X, y)

print('Best accuracy score: %.4f' % sfs1.k_score_)
print('Best subset (indices):', sfs1.k_feature_idx_)
print('Best subset (names):', sfs1.k_feature_names_)

x_forward = X[list(sfs1.k_feature_names_)]



[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   28.2s
[Parallel(n_jobs=1)]: Done  98 out of  98 | elapsed:  1.1min finished

[2025-08-27 08:55:40] Features: 1/5 -- score: 0.680991022507515[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   31.2s
[Parallel(n_jobs=1)]: Done  97 out of  97 | elapsed:  1.3min finished

[2025-08-27 08:56:55] Features: 2/5 -- score: 0.731702861129259[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   31.4s
[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed:  1.2min finished

[2025-08-27 08:58:10] Features: 3/5 -- score: 0.757744344475277[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   34.7s
[Parallel(n_jobs=1)]: Done  95 out of  95 | elapsed:  1.4min finished

[2025-08-27 08:59:32] Features: 4/5 -- score: 0.8258285174167561[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   36.9s


Best accuracy score: 0.8474
Best subset (indices): (3, 8, 12, 13, 22)
Best subset (names): ('OverallQual', 'BsmtFinSF1', '2ndFlrSF', 'GrLivArea', 'GarageCars')


[Parallel(n_jobs=1)]: Done  94 out of  94 | elapsed:  1.4min finished

[2025-08-27 09:00:59] Features: 5/5 -- score: 0.8474177876285172

**Backward Selection**

In [14]:
sfs1 = SFS(RandomForestRegressor(n_jobs=4, n_estimators=25, random_state=42),
           k_features=5,
           forward=False,
           floating=False,
           verbose=2,
           scoring='r2',
           cv=2)

sfs1 = sfs1.fit(X, y)

print('Best precision score: %.4f' % sfs1.k_score_)
print('Best subset (indices):', sfs1.k_feature_idx_)
print('Best subset (names):', sfs1.k_feature_names_)
x_backward = X[list(sfs1.k_feature_names_)]


[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   13.7s
[Parallel(n_jobs=1)]: Done  98 out of  98 | elapsed:   33.6s finished

[2025-08-27 09:55:30] Features: 97/5 -- score: 0.8563096946976734[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   13.7s
[Parallel(n_jobs=1)]: Done  97 out of  97 | elapsed:   33.1s finished

[2025-08-27 09:56:03] Features: 96/5 -- score: 0.856755583997297[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   13.4s
[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed:   32.0s finished

[2025-08-27 09:56:36] Features: 95/5 -- score: 0.86033637023918[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   13.3s
[Parallel(n_jobs=1)]: Done  95 out of  95 | elapsed:   31.6s finished

[2025-08-27 09:57:07] Features: 94/5 -- score: 0.8583655689121321[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   13.3s
[Parallel(n_jobs=1)]: Done  94 out of  94 | elapsed:   31.3s finished

[2025-08-27 09:57:38] Features: 93/5 -- score: 0.8618444607735442[Parallel(n_jobs=1

Best precision score: 0.8399
Best subset (indices): (3, 5, 8, 12, 13)
Best subset (names): ('OverallQual', 'YearBuilt', 'BsmtFinSF1', '2ndFlrSF', 'GrLivArea')


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    1.0s finished

[2025-08-27 10:17:52] Features: 5/5 -- score: 0.8398965268418335

**Exhaustive Search**

In [18]:
from sklearn.ensemble import RandomForestRegressor
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.linear_model import Ridge
import numpy as np

# Step 1: Pre-filter top features
model = RandomForestRegressor(n_estimators=50, random_state=42)
model.fit(X, y)
top_idx = np.argsort(model.feature_importances_)[-10:]
X_top = X.iloc[:, top_idx]

# Step 2: Run EFS on reduced set
efs = EFS(Ridge(alpha=1.0),
          min_features=2,
          max_features=3,   
          scoring='r2',
          cv=2,            
          print_progress=True,
          n_jobs=1)

efs = efs.fit(X_top, y)

# Final selected features
print("Best R²:", efs.best_score_)
print("Best features:", efs.best_feature_names_)


Features: 165/165

Best R²: 0.7351952170673904
Best features: ('GarageCars', 'GrLivArea', 'OverallQual')


**Recursive Feature Elimination**

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold

# Step 1: Create the model
clf = RandomForestRegressor(n_jobs=4, random_state=42)

# Step 2: Initialize RFECV
rfecv = RFECV(estimator=clf,
              step=1,
              cv=KFold(n_splits=3, shuffle=True, random_state=42),  # ✅ Use KFold
              scoring='r2',  # ✅ Regression scoring
              n_jobs=4,
              verbose=2)

# Step 3: Fit the selector
rfecv.fit(X, y)

# Step 4: Output the results
print("Optimal number of features: %d" % rfecv.n_features_)
print("Selected feature indices:", rfecv.support_.nonzero()[0])
print("Selected feature names:", X.columns[rfecv.support_].tolist())

# Step 5: Extract selected features
X_selected_rs = X.loc[:, rfecv.support_]


Fitting estimator with 98 features.
Fitting estimator with 97 features.
Fitting estimator with 96 features.
Fitting estimator with 95 features.
Fitting estimator with 94 features.
Fitting estimator with 93 features.
Fitting estimator with 92 features.
Fitting estimator with 91 features.
Fitting estimator with 90 features.
Fitting estimator with 89 features.
Fitting estimator with 88 features.
Fitting estimator with 87 features.
Fitting estimator with 86 features.
Fitting estimator with 85 features.
Fitting estimator with 84 features.
Optimal number of features: 83
Selected feature indices: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 28 29 30 31 33 34 35 36 40 41 42 43 44 45 46 48 49 50 51 52 53
 54 55 57 58 59 60 61 62 63 64 65 68 69 70 71 72 74 75 76 77 78 79 81 83
 84 85 86 87 88 89 91 94 95 96 97]
Selected feature names: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1

# Emebedded Methods

**Scaling**

In [27]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

**Lasso L1 Regularization**

In [35]:
lasso = Lasso(alpha=0.01, max_iter=10000)
lasso.fit(X_scaled, y)

selected_lasso = X.columns[(lasso.coef_ != 0)]
print("LASSO selected features:", list(selected_lasso))


LASSO selected features: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'MSZoning_FV', 'MSZoning_RL', 'MSZoning_RM', 'LotShape_Reg', 'LotConfig_CulDSac', 'Neighborhood_BrDale', 'Neighborhood_BrkSide', 'Neighborhood_CollgCr', 'Neighborhood_Crawfor', 'Neighborhood_Edwards', 'Neighborhood_IDOTRR', 'Neighborhood_MeadowV', 'Neighborhood_Mitchel', 'Neighborhood_NAmes', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_OldTown', 'Neighborhood_Somerst', 'Neighborhood_StoneBr', 'Neighborhood_Timber', 'BldgType_Duplex', 'HouseStyle_2Story', 'RoofStyle_Gable', 'RoofStyle_Hip', 'Exterior1st_CemntBd', 'Exterior1st_HdBoard', 'Exterior1st_MetalSd', 'Exterior

**L2 Ridge Regularization**

In [34]:
from sklearn.linear_model import Ridge

# Train Ridge model (L2 regularized regression)
ridge = Ridge(alpha=0.01, max_iter=10000)
ridge.fit(X_scaled, y)

# Get features with non-zero positive coefficients
selected_ridge = X.columns[ridge.coef_ > 0]

print("RIDGE selected features: ", list(selected_ridge))


RIDGE selected features:  ['LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'HalfBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'WoodDeckSF', 'EnclosedPorch', 'MSZoning_FV', 'MSZoning_RL', 'MSZoning_RM', 'LotShape_Reg', 'LotConfig_CulDSac', 'Neighborhood_BrDale', 'Neighborhood_BrkSide', 'Neighborhood_CollgCr', 'Neighborhood_Crawfor', 'Neighborhood_MeadowV', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_Somerst', 'Neighborhood_StoneBr', 'Neighborhood_Timber', 'RoofStyle_Hip', 'Exterior1st_CemntBd', 'Exterior2nd_HdBoard', 'Exterior2nd_VinylSd', 'Exterior2nd_Wd Sdng', 'ExterCond_Fa', 'Foundation_CBlock', 'Foundation_PConc', 'Foundation_Slab', 'BsmtExposure_Gd', 'BsmtFinType1_BLQ', 'BsmtFinType1_GLQ', 'GarageType_Attchd', 'GarageType_BuiltIn', 'GarageType_Detchd', 'PavedDrive_Y', 'SaleType_New', 'SaleCondition_Normal']


**L1/L2 Regularization Elastic Net**

In [33]:
elastic_net = ElasticNet(alpha=0.02, l1_ratio=0.5, random_state=42, max_iter=10000)

elastic_net.fit(X_scaled, y)

selected_elastic = X.columns[elastic_net.coef_ != 0]

print("Elastic Net selected features:", list(selected_elastic))


Elastic Net selected features: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'MSZoning_FV', 'MSZoning_RL', 'MSZoning_RM', 'LotShape_Reg', 'LotConfig_CulDSac', 'Neighborhood_BrDale', 'Neighborhood_BrkSide', 'Neighborhood_CollgCr', 'Neighborhood_Crawfor', 'Neighborhood_Edwards', 'Neighborhood_IDOTRR', 'Neighborhood_MeadowV', 'Neighborhood_Mitchel', 'Neighborhood_NAmes', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_OldTown', 'Neighborhood_Somerst', 'Neighborhood_StoneBr', 'Neighborhood_Timber', 'BldgType_Duplex', 'HouseStyle_2Story', 'RoofStyle_Gable', 'RoofStyle_Hip', 'Exterior1st_CemntBd', 'Exterior1st_HdBoard', 'Exterior1st_MetalSd', 'Ex