# Feature Engineering

**Objective:**  
In this notebook, we apply feature engineering techniques to enhance the dataset and prepare it for modeling. The goal is to extract, transform, and encode features in a way that improves the model’s ability to predict `SalePrice`.

---

## Key Steps:

1. **Handle missing values**
   - Domain-specific imputation strategies
   - Use of indicators for missingness (if meaningful)

2. **Transform variables**
   - Log transformation for skewed features
   - Binning or discretization of continuous variables
   - Date or time-based feature extraction

3. **Encode categorical features**
   - Ordinal encoding for ranked categories
   - One-hot encoding for nominal variables
   - Group rare categories

4. **Create new features**
   - Total square footage (`TotalSF = TotalBsmtSF + 1stFlrSF + 2ndFlrSF`)
   - Age-related features (`Age = YrSold - YearBuilt`)
   - Quality-related scores (`OverallQual * OverallCond`)

5. **Feature scaling (if required)**
   - Standardization or normalization of numerical values for certain models

---

**Outcome:**  
We produce a cleaned and transformed dataset with meaningful features that are ready to be fed into machine learning models for accurate price prediction.


In [None]:
# Import Libraries
import polars as pl
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from bokeh.models import NumeralTickFormatter
import holoviews as hv
from sklearn.model_selection import train_test_split
from numpy import log
hv.extension('bokeh')

In [2]:
# Load the dataset, handle missing values and get basic information
data = pl.read_csv('data/train.csv', null_values="NA")
print(f'The dataset contains {data.shape[0]} rows and {data.shape[1]} columns')
data.head()

The dataset contains 1460 rows and 81 columns


Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,…,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
i64,i64,str,i64,i64,str,str,str,str,str,str,str,str,str,str,str,str,i64,i64,i64,i64,str,str,str,str,str,i64,str,str,str,str,str,str,str,i64,str,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,i64,str,i64,str,str,i64,str,i64,i64,str,str,str,i64,i64,i64,i64,i64,i64,str,str,str,i64,i64,i64,str,str,i64
1,60,"""RL""",65,8450,"""Pave""",,"""Reg""","""Lvl""","""AllPub""","""Inside""","""Gtl""","""CollgCr""","""Norm""","""Norm""","""1Fam""","""2Story""",7,5,2003,2003,"""Gable""","""CompShg""","""VinylSd""","""VinylSd""","""BrkFace""",196,"""Gd""","""TA""","""PConc""","""Gd""","""TA""","""No""","""GLQ""",706,"""Unf""",0,…,854,0,1710,1,0,2,1,3,1,"""Gd""",8,"""Typ""",0,,"""Attchd""",2003,"""RFn""",2,548,"""TA""","""TA""","""Y""",0,61,0,0,0,0,,,,0,2,2008,"""WD""","""Normal""",208500
2,20,"""RL""",80,9600,"""Pave""",,"""Reg""","""Lvl""","""AllPub""","""FR2""","""Gtl""","""Veenker""","""Feedr""","""Norm""","""1Fam""","""1Story""",6,8,1976,1976,"""Gable""","""CompShg""","""MetalSd""","""MetalSd""","""None""",0,"""TA""","""TA""","""CBlock""","""Gd""","""TA""","""Gd""","""ALQ""",978,"""Unf""",0,…,0,0,1262,0,1,2,0,3,1,"""TA""",6,"""Typ""",1,"""TA""","""Attchd""",1976,"""RFn""",2,460,"""TA""","""TA""","""Y""",298,0,0,0,0,0,,,,0,5,2007,"""WD""","""Normal""",181500
3,60,"""RL""",68,11250,"""Pave""",,"""IR1""","""Lvl""","""AllPub""","""Inside""","""Gtl""","""CollgCr""","""Norm""","""Norm""","""1Fam""","""2Story""",7,5,2001,2002,"""Gable""","""CompShg""","""VinylSd""","""VinylSd""","""BrkFace""",162,"""Gd""","""TA""","""PConc""","""Gd""","""TA""","""Mn""","""GLQ""",486,"""Unf""",0,…,866,0,1786,1,0,2,1,3,1,"""Gd""",6,"""Typ""",1,"""TA""","""Attchd""",2001,"""RFn""",2,608,"""TA""","""TA""","""Y""",0,42,0,0,0,0,,,,0,9,2008,"""WD""","""Normal""",223500
4,70,"""RL""",60,9550,"""Pave""",,"""IR1""","""Lvl""","""AllPub""","""Corner""","""Gtl""","""Crawfor""","""Norm""","""Norm""","""1Fam""","""2Story""",7,5,1915,1970,"""Gable""","""CompShg""","""Wd Sdng""","""Wd Shng""","""None""",0,"""TA""","""TA""","""BrkTil""","""TA""","""Gd""","""No""","""ALQ""",216,"""Unf""",0,…,756,0,1717,1,0,1,0,3,1,"""Gd""",7,"""Typ""",1,"""Gd""","""Detchd""",1998,"""Unf""",3,642,"""TA""","""TA""","""Y""",0,35,272,0,0,0,,,,0,2,2006,"""WD""","""Abnorml""",140000
5,60,"""RL""",84,14260,"""Pave""",,"""IR1""","""Lvl""","""AllPub""","""FR2""","""Gtl""","""NoRidge""","""Norm""","""Norm""","""1Fam""","""2Story""",8,5,2000,2000,"""Gable""","""CompShg""","""VinylSd""","""VinylSd""","""BrkFace""",350,"""Gd""","""TA""","""PConc""","""Gd""","""TA""","""Av""","""GLQ""",655,"""Unf""",0,…,1053,0,2198,1,0,2,1,4,1,"""Gd""",9,"""Typ""",1,"""TA""","""Attchd""",2000,"""RFn""",3,836,"""TA""","""TA""","""Y""",192,84,0,0,0,0,,,,0,12,2008,"""WD""","""Normal""",250000


###  Missing Values

In [3]:
# We start the missing value process by the categorical data
print('Missing Data - Categorical Columns')
categorical_features = data.select(pl.col(pl.String)).columns

# First we find which columns have more or less than 10% missing data. 
lot_missing_data = [column for column in categorical_features if data.select(pl.col(column).is_null().sum()).item() / data.height > 0.1]
few_missing_data = [column for column in categorical_features if data.select(pl.col(column).is_null().sum()).item() / data.height <= 0.1]
print(f'We have {len(lot_missing_data)} of columns with more than 10% data and {len(few_missing_data)} of columns with less than 10%')

# Ensure that all the categorical features are regarded as strings
data = data.with_columns([pl.col(col).cast(pl.Utf8).alias(col) for col in categorical_features])

Missing Data - Categorical Columns
We have 5 of columns with more than 10% data and 38 of columns with less than 10%


In [None]:
# Assume `data` is a Polars DataFrame
labels = data["SalePrice"]
features = data.drop(["Id", "SalePrice"])

# Convert to pandas for sklearn compatibility
features_pd = features.to_pandas()
labels_np = labels.to_numpy()

# Split
X_train_pd, X_test_pd, y_train, y_test = train_test_split(features_pd, labels_np, test_size=0.1, random_state=26)

# Log-transform target
y_train = np.log(y_train)
y_test = np.log(y_test)

# Scale
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_pd)
X_test_scaled = scaler.transform(X_test_pd)

# Convert back to Polars
X_train = pl.DataFrame(X_train_scaled, schema=X_train_pd.columns)
X_test = pl.DataFrame(X_test_scaled, schema=X_test_pd.columns)

For the column MSZoning we have 0
For the column Street we have 0
For the column Alley we have 0
For the column LotShape we have 0
For the column LandContour we have 0
For the column Utilities we have 0
For the column LotConfig we have 0
For the column LandSlope we have 0
For the column Neighborhood we have 0
For the column Condition1 we have 0
For the column Condition2 we have 0
For the column BldgType we have 0
For the column HouseStyle we have 0
For the column RoofStyle we have 0
For the column RoofMatl we have 0
For the column Exterior1st we have 0
For the column Exterior2nd we have 0
For the column MasVnrType we have 0
For the column ExterQual we have 0
For the column ExterCond we have 0
For the column Foundation we have 0
For the column BsmtQual we have 0
For the column BsmtCond we have 0
For the column BsmtExposure we have 0
For the column BsmtFinType1 we have 0
For the column BsmtFinType2 we have 0
For the column Heating we have 0
For the column HeatingQC we have 0
For the colu

In [5]:
# We follow the missing value process by the numerical data
print('Missing Data - Numerical Columns')
numerical_features = data.select(pl.col(pl.Int64)).columns
numerical_features.remove('SalePrice')

# First we find which columns have more or less than 10% missing data. 
lot_missing_data = [column for column in numerical_features if data.select(pl.col(column).is_null().sum()).item() / data.height > 0.1]
few_missing_data = [column for column in numerical_features if data.select(pl.col(column).is_null().sum()).item() / data.height <= 0.1]
print(f'We have {len(lot_missing_data)} of columns with more than 10% data and {len(few_missing_data)} of columns with less than 10%')

# Ensure that all the categorical features are regarded as strings
data = data.with_columns([pl.col(column).cast(pl.Utf8).alias(column) for column in numerical_features])

Missing Data - Numerical Columns
We have 1 of columns with more than 10% data and 36 of columns with less than 10%


### Numerical variable transformation


In [6]:
# We know that the columns LotFrontage, 1stFlrSF, GrLivArea have a right skewed distribution
log_columns = ['LotFrontage', '1stFlrSF', 'GrLivArea']
data = data.with_columns([pl.col(column).cast(pl.Float64).fill_null(1.0).log().alias(column) for column in log_columns])

### Binarize skewed variables


In [7]:
# We know that the columns 'BsmtFinSF2', 'LowQualFinSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'MiscVal' have a skewed distribution
binarised_columns = ['BsmtFinSF2', 'LowQualFinSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'MiscVal']
data = data.with_columns([(pl.col(column).cast(pl.Float64).fill_null(0) > 0).cast(pl.Int8).alias(column) for column in binarised_columns])

### Categorical Variables Mapping

In [8]:
# Map categorical values to numerical values for the quality related columns
quality_mappings = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'Missing': 0, 'NA': 0}
quality_columns  = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond']
data = data.with_columns(pl.col(column).replace(quality_mappings) for column in quality_columns)

In [9]:
# Map categorical values to numerical values for the exposure column
exposure_mappings = {'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}
exposure_column   = 'BsmtExposure'
data = data.with_columns(pl.col(exposure_column).replace(exposure_mappings))

In [10]:
# Map categorical values to numerical values for the finishing columns
finish_mappings = {'Missing': 0, 'NA': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}
finish_columns = ['BsmtFinType1', 'BsmtFinType2']
data = data.with_columns(pl.col(column).replace(finish_mappings) for column in finish_columns)

In [11]:
# Map categorical values to numerical values for the garage column
garage_mappings = {'Missing': 0, 'NA': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}
garage_columns  = 'GarageFinish'
data = data.with_columns(pl.col(garage_columns).replace(garage_mappings))

In [12]:
# Map categorical values to numerical values for the fence column
fence_mappings = {'Missing': 0, 'NA': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}
fence_column  = 'Fence'
data = data.with_columns(pl.col(fence_column).replace(fence_mappings))

### Feature Scaling

In [13]:
# Let's separate into train and test set
labels = data['SalePrice']
features = data.drop(['Id', 'SalePrice'])
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.1, random_state=26)

# Remember to use log for the target label that is the SalePrice to fix distribution
y_train = np.log(y_train)
y_test = np.log(y_test)

# Create scaler
scaler = MinMaxScaler()
scaler.fit(X_train) 

# 
X_train = pl.DataFrame({col: scaler.transform(X_train)[:, idx] for idx, col in enumerate(X_train.columns)})
X_test  = pl.DataFrame({col: scaler.transform(X_test)[:, idx] for idx, col in enumerate(X_train.columns)})

ValueError: could not convert string to float: 'RL'