# P2: Feature Selection 2 (Greedy Feature Selection Methods)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import beta
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## Data Preprocessing

In [2]:
# Read the house-price CSV (relative path) and preview the first rows.
DATASET_PATH = "../dataset/House_Price_Prediction.csv"
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,date,price,bedrooms,grade,has_basement,living_in_m2,renovated,nice_view,perfect_condition,real_bathrooms,has_lavatory,single_floor,month,quartile_zone
0,2014-09-26,305000.0,2,1,False,76.18046,False,False,True,1,False,True,9,2
1,2014-05-14,498000.0,3,2,True,210.88981,False,False,False,2,True,True,5,2
2,2015-03-23,590000.0,2,4,False,262.91549,False,False,False,2,True,False,3,2
3,2014-07-15,775000.0,3,3,False,159.79316,False,False,False,1,True,False,7,3
4,2015-04-14,350000.0,2,1,False,92.903,False,False,False,1,True,True,4,3


In [3]:
# Discard the `date` column; every other column is kept unchanged.
df = df.drop(columns="date")
df.head()

Unnamed: 0,price,bedrooms,grade,has_basement,living_in_m2,renovated,nice_view,perfect_condition,real_bathrooms,has_lavatory,single_floor,month,quartile_zone
0,305000.0,2,1,False,76.18046,False,False,True,1,False,True,9,2
1,498000.0,3,2,True,210.88981,False,False,False,2,True,True,5,2
2,590000.0,2,4,False,262.91549,False,False,False,2,True,False,3,2
3,775000.0,3,3,False,159.79316,False,False,False,1,True,False,7,3
4,350000.0,2,1,False,92.903,False,False,False,1,True,True,4,3


In [4]:
# Flag columns stored as True/False; cast them to 0/1 integers in one pass.
boolean_columns = ["has_basement","renovated","nice_view","perfect_condition","has_lavatory","single_floor"]
df = df.astype({flag: int for flag in boolean_columns})
df.head()

Unnamed: 0,price,bedrooms,grade,has_basement,living_in_m2,renovated,nice_view,perfect_condition,real_bathrooms,has_lavatory,single_floor,month,quartile_zone
0,305000.0,2,1,0,76.18046,0,0,1,1,0,1,9,2
1,498000.0,3,2,1,210.88981,0,0,0,2,1,1,5,2
2,590000.0,2,4,0,262.91549,0,0,0,2,1,0,3,2
3,775000.0,3,3,0,159.79316,0,0,0,1,1,0,7,3
4,350000.0,2,1,0,92.903,0,0,0,1,1,1,4,3


In [5]:
# Standardise the numeric columns (the target `price` included) with StandardScaler.
# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split performed further down -- confirm that this is intended.
numerical_columns = ["price","bedrooms","grade","living_in_m2","real_bathrooms","month","quartile_zone"]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numerical_columns])
df[numerical_columns] = scaled_values
df.head()

Unnamed: 0,price,bedrooms,grade,has_basement,living_in_m2,renovated,nice_view,perfect_condition,real_bathrooms,has_lavatory,single_floor,month,quartile_zone
0,-0.819164,-0.347184,-1.562704,0,-1.540812,0,0,1,-1.060632,0,1,0.78348,-0.39618
1,0.108578,1.096722,-0.532991,1,0.431209,0,0,0,0.534124,1,1,-0.50723,-0.39618
2,0.550818,-0.347184,1.526433,0,1.192817,0,0,0,0.534124,1,0,-1.152585,-0.39618
3,1.440104,1.096722,0.496721,0,-0.316799,0,0,0,-1.060632,1,0,0.138125,0.537812
4,-0.602851,-0.347184,-1.562704,0,-1.296009,0,0,0,-1.060632,1,1,-0.829907,0.537812


In [6]:
# Separate the target from the predictors.
y = df["price"]
X = df.drop(columns="price")

# Positional split in file order (no shuffling): the first 80% of the rows
# are used for training, the remainder for testing.
train_split = 0.8
train_index = int(train_split * len(df))

X_train, y_train = X.iloc[:train_index], y.iloc[:train_index]
X_test, y_test = X.iloc[train_index:], y.iloc[train_index:]

## Greedy Forward and Backward Selection

In [26]:
class LR:
    """Ordinary least-squares linear regression with an intercept term.

    After ``fit``, ``i`` holds the intercept and ``c`` the per-feature
    coefficients. Both are ``None`` until the model has been fitted.
    """

    def __init__(self):
        self.c = None  # feature coefficients (set by fit)
        self.i = None  # intercept (set by fit)

    @staticmethod
    def _design_matrix(X):
        """Return X as a float array with a leading column of ones."""
        X = np.asarray(X, dtype=float)
        return np.column_stack((np.ones(X.shape[0]), X))

    def fit(self, X, y):
        """Estimate intercept and coefficients by least squares.

        Parameters
        ----------
        X : array-like, one row per sample (a 1-D input is treated as a
            single feature column).
        y : array-like of targets, one per sample.
        """
        A = self._design_matrix(X)
        y = np.asarray(y, dtype=float)

        # lstsq solves the least-squares problem directly instead of forming
        # and inverting X^T X, so collinear / rank-deficient feature sets
        # yield the minimum-norm solution rather than a LinAlgError or a
        # numerically meaningless inverse.
        theta = np.linalg.lstsq(A, y, rcond=None)[0]

        self.i = theta[0]
        self.c = theta[1:]

    def predict(self, X):
        """Return predictions for X using the fitted parameters.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.i is None or self.c is None:
            raise RuntimeError("LR.predict called before fit")

        A = self._design_matrix(X)
        return A @ np.concatenate(([self.i], self.c))
        

In [34]:
def greedyForward(model, X_fit=None, y_fit=None, X_eval=None, y_eval=None):
    """Greedy forward feature selection driven by held-out MSE.

    Starting from an empty selection, every round fits ``model`` once per
    remaining candidate (current selection plus that candidate) and keeps
    the candidate with the lowest mean squared error on the evaluation data.
    The search stops at the first round whose best candidate does not
    strictly lower the best MSE seen so far.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Refitted for every candidate subset.
    X_fit, y_fit : DataFrame and targets, optional
        Data used for fitting. Default to the notebook-level
        ``X_train`` / ``y_train``.
    X_eval, y_eval : DataFrame and targets, optional
        Data used for scoring. Default to the notebook-level
        ``X_test`` / ``y_test``.
        NOTE(review): scoring candidates on the test split lets the test
        data influence the selection; consider passing a validation split.

    Returns
    -------
    list
        Selected column labels, in the order they were added.
    """
    # Fall back to the notebook globals so existing greedyForward(model)
    # calls keep working unchanged.
    if X_fit is None:
        X_fit = X_train
    if y_fit is None:
        y_fit = y_train
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    def heldOutMse(features):
        """Fit on the given columns and return the MSE on the evaluation data."""
        model.fit(X_fit[features], y_fit)
        y_pred = model.predict(X_eval[features])
        return np.mean((y_pred - y_eval)**2)

    selectedFeatures = []
    remainingFeatures = list(X_fit.columns)
    bestMse = float('inf')

    while remainingFeatures:
        scores = {
            feature: heldOutMse(selectedFeatures + [feature])
            for feature in remainingFeatures
        }

        bestFeature = min(scores, key=scores.get)
        currBestMse = scores[bestFeature]

        # Stop as soon as the best available addition no longer helps.
        if currBestMse >= bestMse:
            break

        bestMse = currBestMse
        selectedFeatures.append(bestFeature)
        remainingFeatures.remove(bestFeature)
        print(f'Selected {bestFeature} with MSE: {currBestMse}')

    print(selectedFeatures)
    return selectedFeatures

In [43]:
def greedyBackward(model, X_fit=None, y_fit=None, X_eval=None, y_eval=None):
    """Greedy backward feature elimination driven by held-out MSE.

    The model trained on all features provides the baseline MSE. Every
    round then tries dropping each remaining feature in turn and removes
    the one whose absence gives the lowest MSE on the evaluation data,
    provided that MSE is strictly lower than the best seen so far
    (baseline included). The search stops at the first round without such
    an improvement.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Refitted for every candidate subset.
    X_fit, y_fit : DataFrame and targets, optional
        Data used for fitting. Default to the notebook-level
        ``X_train`` / ``y_train``.
    X_eval, y_eval : DataFrame and targets, optional
        Data used for scoring. Default to the notebook-level
        ``X_test`` / ``y_test``.
        NOTE(review): scoring candidates on the test split lets the test
        data influence the selection; consider passing a validation split.

    Returns
    -------
    list
        Column labels that were kept, in their original column order.
    """
    # Fall back to the notebook globals so existing greedyBackward(model)
    # calls keep working unchanged.
    if X_fit is None:
        X_fit = X_train
    if y_fit is None:
        y_fit = y_train
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    def heldOutMse(features):
        """Fit on the given columns and return the MSE on the evaluation data."""
        model.fit(X_fit[features], y_fit)
        y_pred = model.predict(X_eval[features])
        return np.mean((y_pred - y_eval)**2)

    remainingFeatures = list(X_fit.columns)

    # Score the full feature set first: a removal is only worthwhile if it
    # beats this baseline. Starting from infinity would accept the first
    # removal unconditionally, even when it makes the model worse.
    bestMse = heldOutMse(remainingFeatures) if remainingFeatures else float('inf')

    while remainingFeatures:
        scores = {
            feature: heldOutMse([f for f in remainingFeatures if f != feature])
            for feature in remainingFeatures
        }

        # "Worst" feature = the one whose removal leaves the lowest MSE.
        worstFeature = min(scores, key=scores.get)
        currBestMse = scores[worstFeature]

        if currBestMse >= bestMse:
            break

        bestMse = currBestMse
        remainingFeatures.remove(worstFeature)
        print(f'Removed {worstFeature} with MSE: {currBestMse}')

    print(remainingFeatures)
    return remainingFeatures

In [45]:
# Run both searches with the same model instance, then show each resulting
# feature set in alphabetical order so the two can be compared line by line.
model = LR()
features1 = sorted(greedyForward(model))
features2 = sorted(greedyBackward(model))
for chosen in (features1, features2):
    print("\n")
    print(chosen)

Selected quartile_zone with MSE: 0.5462550640011844
Selected living_in_m2 with MSE: 0.3077117910218584
Selected nice_view with MSE: 0.2786002252881445
Selected grade with MSE: 0.25432037550966985
Selected perfect_condition with MSE: 0.2499873153737846
Selected renovated with MSE: 0.24661147332355376
Selected has_basement with MSE: 0.24537545833171126
Selected has_lavatory with MSE: 0.24437544412173187
Selected month with MSE: 0.24366830855987331
Selected single_floor with MSE: 0.24345525586182948
['quartile_zone', 'living_in_m2', 'nice_view', 'grade', 'perfect_condition', 'renovated', 'has_basement', 'has_lavatory', 'month', 'single_floor']
Removed real_bathrooms with MSE: 0.24347201281783895
Removed bedrooms with MSE: 0.24345525586182948
['grade', 'has_basement', 'living_in_m2', 'renovated', 'nice_view', 'perfect_condition', 'has_lavatory', 'single_floor', 'month', 'quartile_zone']


['grade', 'has_basement', 'has_lavatory', 'living_in_m2', 'month', 'nice_view', 'perfect_condition', '