# Feature Selection

### Reproducibility: Setting the seed
To ensure reproducibility between runs of the same notebook, and between the research and production environments, it is extremely important that we set the seed for every step that includes an element of randomness.

---

In [1]:
# Import Libraries
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [2]:
# Load the dataset and separate the features from the label
data = pl.read_csv('Data/final_data.csv')
labels = data.select('SalePrice')
features = data.drop('SalePrice')

# Convert to pandas DataFrames for sklearn compatibility
features_pd = features.to_pandas()
labels_pd = labels.to_pandas()

# Split BEFORE imputing: the fill values must be learned from the training rows
# only, otherwise statistics of the held-out rows leak into the training data.
# The fixed random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(features_pd, labels_pd, test_size=0.1, random_state=26)

# Handle missing values by filling NaNs with the per-column mean of the training set
feature_fill_values = X_train.mean()
label_fill_values = y_train.mean()
X_train = X_train.fillna(feature_fill_values)
X_test = X_test.fillna(feature_fill_values)
y_train = y_train.fillna(label_fill_values)
y_test = y_test.fillna(label_fill_values)

# Keep the full frames NaN-free as well (same training-set statistics), so any
# later use of `features_pd` / `labels_pd` still sees imputed data.
# NOTE(review): imputing a missing target with the mean is questionable; consider
# dropping rows without a SalePrice instead - confirm whether any exist.
features_pd = features_pd.fillna(feature_fill_values)
labels_pd = labels_pd.fillna(label_fill_values)

# A column with no observed value in the training rows has a NaN mean and stays
# unfilled; fail here with a clear message instead of inside the estimator.
assert not X_train.isna().any().any(), "X_train still contains NaNs after mean imputation"
assert not y_train.isna().any().any(), "y_train still contains NaNs after mean imputation"

# Create the linear regression model used as the base estimator for RFE
# NOTE(review): with a linear model RFE ranks features by coefficient size, which
# depends on each feature's scale; nothing is standardized in this cell -
# confirm that 'Data/final_data.csv' is already scaled.
model = LinearRegression()

# Run Recursive Feature Elimination, keeping exactly 10 features
n_features_to_select = 10
rfe = RFE(estimator=model, n_features_to_select=n_features_to_select)
rfe.fit(X_train, y_train.values.ravel())  # ravel: sklearn expects a 1-D target

# One row per feature, best-ranked first (selected features have ranking 1)
rfe_df = pd.DataFrame({'feature': X_train.columns, 'selected_by_rfe': rfe.support_, 'ranking': rfe.ranking_}).sort_values(by='ranking')

# The DataFrame index (the feature's original column position) is written too
rfe_df.to_csv('Data/features_selected.csv')