# Feature Engineering

**Objective:**  
In this notebook, we apply feature engineering techniques to enhance the dataset and prepare it for modeling. The goal is to extract, transform, and encode features in a way that improves the model’s ability to predict `SalePrice`.

---

## Key Steps:

1. **Handle missing values**
   - Domain-specific imputation strategies
   - Use of indicators for missingness (if meaningful)

2. **Transform variables**
   - Log transformation for skewed features
   - Binning or discretization of continuous variables
   - Date or time-based feature extraction

3. **Encode categorical features**
   - Ordinal encoding for ranked categories
   - One-hot encoding for nominal variables
   - Group rare categories

4. **Create new features**
   - Total square footage (`TotalSF = TotalBsmtSF + 1stFlrSF + 2ndFlrSF`)
   - Age-related features (`Age = YrSold - YearBuilt`)
   - Quality-related scores (`OverallQual * OverallCond`)

5. **Feature scaling (if required)**
   - Standardization or normalization of numerical values for certain models

---

**Outcome:**  
We produce a cleaned and transformed dataset with meaningful features that are ready to be fed into machine learning models for accurate price prediction.


In [5]:
# Import Libraries
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import hvplot.polars
import hvplot.pandas
import numpy as np
from bokeh.models import NumeralTickFormatter
import holoviews as hv
from sklearn.model_selection import train_test_split
from numpy import log
hv.extension('bokeh')

In [2]:
# Load the dataset, handle missing values and get basic information
data = pl.read_csv('data/train.csv', null_values="NA")
print(f'The dataset contains {data.shape[0]} rows and {data.shape[1]} columns')
data.head()

The dataset contains 1460 rows and 81 columns


Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,…,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
i64,i64,str,i64,i64,str,str,str,str,str,str,str,str,str,str,str,str,i64,i64,i64,i64,str,str,str,str,str,i64,str,str,str,str,str,str,str,i64,str,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,i64,str,i64,str,str,i64,str,i64,i64,str,str,str,i64,i64,i64,i64,i64,i64,str,str,str,i64,i64,i64,str,str,i64
1,60,"""RL""",65,8450,"""Pave""",,"""Reg""","""Lvl""","""AllPub""","""Inside""","""Gtl""","""CollgCr""","""Norm""","""Norm""","""1Fam""","""2Story""",7,5,2003,2003,"""Gable""","""CompShg""","""VinylSd""","""VinylSd""","""BrkFace""",196,"""Gd""","""TA""","""PConc""","""Gd""","""TA""","""No""","""GLQ""",706,"""Unf""",0,…,854,0,1710,1,0,2,1,3,1,"""Gd""",8,"""Typ""",0,,"""Attchd""",2003,"""RFn""",2,548,"""TA""","""TA""","""Y""",0,61,0,0,0,0,,,,0,2,2008,"""WD""","""Normal""",208500
2,20,"""RL""",80,9600,"""Pave""",,"""Reg""","""Lvl""","""AllPub""","""FR2""","""Gtl""","""Veenker""","""Feedr""","""Norm""","""1Fam""","""1Story""",6,8,1976,1976,"""Gable""","""CompShg""","""MetalSd""","""MetalSd""","""None""",0,"""TA""","""TA""","""CBlock""","""Gd""","""TA""","""Gd""","""ALQ""",978,"""Unf""",0,…,0,0,1262,0,1,2,0,3,1,"""TA""",6,"""Typ""",1,"""TA""","""Attchd""",1976,"""RFn""",2,460,"""TA""","""TA""","""Y""",298,0,0,0,0,0,,,,0,5,2007,"""WD""","""Normal""",181500
3,60,"""RL""",68,11250,"""Pave""",,"""IR1""","""Lvl""","""AllPub""","""Inside""","""Gtl""","""CollgCr""","""Norm""","""Norm""","""1Fam""","""2Story""",7,5,2001,2002,"""Gable""","""CompShg""","""VinylSd""","""VinylSd""","""BrkFace""",162,"""Gd""","""TA""","""PConc""","""Gd""","""TA""","""Mn""","""GLQ""",486,"""Unf""",0,…,866,0,1786,1,0,2,1,3,1,"""Gd""",6,"""Typ""",1,"""TA""","""Attchd""",2001,"""RFn""",2,608,"""TA""","""TA""","""Y""",0,42,0,0,0,0,,,,0,9,2008,"""WD""","""Normal""",223500
4,70,"""RL""",60,9550,"""Pave""",,"""IR1""","""Lvl""","""AllPub""","""Corner""","""Gtl""","""Crawfor""","""Norm""","""Norm""","""1Fam""","""2Story""",7,5,1915,1970,"""Gable""","""CompShg""","""Wd Sdng""","""Wd Shng""","""None""",0,"""TA""","""TA""","""BrkTil""","""TA""","""Gd""","""No""","""ALQ""",216,"""Unf""",0,…,756,0,1717,1,0,1,0,3,1,"""Gd""",7,"""Typ""",1,"""Gd""","""Detchd""",1998,"""Unf""",3,642,"""TA""","""TA""","""Y""",0,35,272,0,0,0,,,,0,2,2006,"""WD""","""Abnorml""",140000
5,60,"""RL""",84,14260,"""Pave""",,"""IR1""","""Lvl""","""AllPub""","""FR2""","""Gtl""","""NoRidge""","""Norm""","""Norm""","""1Fam""","""2Story""",8,5,2000,2000,"""Gable""","""CompShg""","""VinylSd""","""VinylSd""","""BrkFace""",350,"""Gd""","""TA""","""PConc""","""Gd""","""TA""","""Av""","""GLQ""",655,"""Unf""",0,…,1053,0,2198,1,0,2,1,4,1,"""Gd""",9,"""Typ""",1,"""TA""","""Attchd""",2000,"""RFn""",3,836,"""TA""","""TA""","""Y""",192,84,0,0,0,0,,,,0,12,2008,"""WD""","""Normal""",250000


In [4]:
# Initially we are going to split the dataset in train and test
label = data.select('SalePrice')
features = data.drop('Id', 'SalePrice')

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.1, random_state=26)
print(f'The size of the training set is {X_train.shape} and the size of the test set is: {X_test.shape}')

The size of the training set is (1314, 79) and the size of the test set is: (146, 79)


### Target Label Distribution Fix

In [None]:
# As we saw during the EDA we want to use the log of the SalePrice to create a data distribution that is closer to the 
y_train = log(y_train)
y_test  = log(y_test)

###  Missing Values

In [None]:
# For both the train and test set 

# Find all the columns with missing values
missing_val_columns = [column for column in X_train.columns if X_train[column].null_count() > 0] # type: ignore
print(f'The dataset contains {len(missing_val_columns)} columns with missing values: {missing_val_columns}')


# Calculate the percentage of missing values for each column and get the top 10 columns with the highest percentage of missing values
missing_val_percentage = X_train.select([pl.col(column).null_count() / X_train.height * 100 for column in missing_val_columns]) # type: ignore
missing_val_percentage_sorted = missing_val_percentage.melt().sort("value", descending=True)
missing_val_percentage_sorted.head(10)


# It makes sense that columns where we have more than 5% of missing values are not useful for our analysis and we can drop them
columns_to_drop = missing_val_percentage_sorted.filter(pl.col('value') > 10).select('variable').to_series().to_list()
X_train = X_train.drop(columns_to_drop)
X_test  = X_test.drop(columns_to_drop)

print(f'The dataset now contains {X_train.height} rows and {X_train.width} columns after dropping the columns with more than 10% of missing values')

The dataset contains 18 columns with missing values: ['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']


  missing_val_percentage_sorted = missing_val_percentage.melt().sort("value", descending=True)


ColumnNotFoundError: "Alley" not found

In [10]:
X_train.null_count()

MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,…,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,…,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,237,0,0,1231,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,8,0,0,0,34,34,35,34,0,35,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,625,70,70,70,0,0,70,70,0,0,0,0,0,0,0,1308,1054,1262,0,0,0,0,0
