In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

### Feature Engineering for Boston Housing Dataset: Interaction, Polynomial, and Domain-Specific Enhancements

In [5]:
# Load the dataset
data = pd.read_csv('../data/bostonhousing.csv')

# Define numerical features and the target variable
numerical_features = [
    'crim', 'zn', 'indus', 'nox', 'rm',
    'age', 'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat'
]
target_variable = 'medv'

# Interaction features: element-wise product of two columns, so a model can
# pick up an effect that depends on both at once.
data['interaction_rm_age'] = data['rm'] * data['age']               # rm × age
data['interaction_tax_ptratio'] = data['tax'] * data['ptratio']     # tax × ptratio

# Polynomial features: squared terms let a linear model express curvature.
data['polynomial_rm_squared'] = data['rm'] ** 2
data['polynomial_age_squared'] = data['age'] ** 2

# Domain-specific feature: `age` subtracted from the reference year 2023.
# NOTE(review): this treats `age` as a number of years. In the classic Boston
# Housing data `age` is usually documented as the percentage of owner-occupied
# units built before 1940 — confirm against this CSV's data dictionary.
# The result is an exact linear function of `age`, so it is perfectly
# collinear with that column.
data['estimated_house_built_year'] = 2023 - data['age']

# Aggregated feature: product of `rm` and `ptratio`.
# NOTE(review): the name suggests a room count, but the value is only the
# product of these two columns — confirm the intended meaning.
data['aggregated_total_rooms'] = data['rm'] * data['ptratio']

# Categorize `age` into five equal-width bins (for segmented analysis).
# pd.cut builds right-closed intervals, so without include_lowest=True the
# first bin is (0, 20] and an age of exactly 0 would become NaN even though
# the label advertises "0-20".
data['binned_age_category'] = pd.cut(
    data['age'],
    bins=[0, 20, 40, 60, 80, 100],
    labels=['1 (0-20)', '2 (21-40)', '3 (41-60)', '4 (61-80)', '5 (81-100)'],
    include_lowest=True,
)

# Preview the first 5 rows of the enhanced dataset
print(data.head())

      crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ...  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296  ...   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242  ...   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242  ...   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222  ...   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222  ...   

        b  lstat  medv  interaction_rm_age  interaction_tax_ptratio  \
0  396.90   4.98  24.0            428.6900                   4528.8   
1  396.90   9.14  21.6            506.6169                   4307.6   
2  392.83   4.03  34.7            439.0035                   4307.6   
3  394.63   2.94  33.4            320.5084                   4151.4   
4  396.90   5.33  36.2            387.3674                   4151.4   

   polynomial_rm_squared  polynomial_age_squared  estimated_house_built_year  \
0              43.230625            

### Testing Feature Combinations for Predicting Boston Housing Prices

In [8]:
# Recreate the engineered columns so this cell produces every column that
# `feature_sets` below refers to.
data['interaction_rm_age'] = data['rm'] * data['age']
data['interaction_tax_ptratio'] = data['tax'] * data['ptratio']
data['polynomial_rm_squared'] = data['rm'] ** 2
data['polynomial_age_squared'] = data['age'] ** 2
data['estimated_house_built_year'] = 2023 - data['age']
data['aggregated_total_rooms'] = data['rm'] * data['ptratio']

# Age bins use integer labels here so the column can be cast to float and
# used as a numeric regressor.
# include_lowest=True keeps an age of exactly 0 in bin 1; with the default
# right-closed bins that row would be NaN, and scikit-learn's LinearRegression
# rejects NaN inputs.
data['binned_age_category'] = pd.cut(
    data['age'],
    bins=[0, 20, 40, 60, 80, 100],
    labels=[1, 2, 3, 4, 5],
    include_lowest=True,
).astype(float)

# Define the target variable
y = data['medv']

# Building blocks for the feature sets; composing them keeps the twelve base
# columns in a single place instead of repeating the list per set.
base_features = ['crim', 'zn', 'indus', 'nox', 'rm', 'age',
                 'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat']
interaction_features = ['interaction_rm_age', 'interaction_tax_ptratio']
polynomial_features = ['polynomial_rm_squared', 'polynomial_age_squared']
other_engineered_features = ['estimated_house_built_year',
                             'aggregated_total_rooms',
                             'binned_age_category']

# Named feature sets to compare; every value is an independent list of
# column names, in the same order as before.
feature_sets = {
    'Original': list(base_features),
    'Original + Interaction': base_features + interaction_features,
    'Original + Polynomial': base_features + polynomial_features,
    'All Features': (base_features + interaction_features
                     + polynomial_features + other_engineered_features),
}

# Function to train and evaluate models
def evaluate_model(features):
    """Fit a linear regression on the given columns and score it on a hold-out split.

    Reads the notebook-level `data` frame and target `y`, holds out 20% of
    the rows (random_state=42) and fits on the remainder.

    Parameters
    ----------
    features : list of str
        Column names of `data` to use as predictors.

    Returns
    -------
    tuple
        `(r2, rmse)` computed on the held-out rows.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        data[features], y, test_size=0.2, random_state=42
    )

    # fit() returns the estimator itself, so it can be chained.
    regressor = LinearRegression().fit(X_fit, y_fit)
    y_hat = regressor.predict(X_holdout)

    score = r2_score(y_holdout, y_hat)
    error = np.sqrt(mean_squared_error(y_holdout, y_hat))
    return score, error

# Evaluate each named feature set and print its R² and RMSE on one line
print("Feature Set Performance Comparison:")
for name in feature_sets:
    features = feature_sets[name]
    r2, rmse = evaluate_model(features)
    report_line = f"{name} → R² Score: {r2:.3f}, RMSE: {rmse:.3f}"
    print(report_line)

    

Feature Set Performance Comparison:
Original → R² Score: 0.664, RMSE: 4.963
Original + Interaction → R² Score: 0.715, RMSE: 4.570
Original + Polynomial → R² Score: 0.783, RMSE: 3.991
All Features → R² Score: 0.802, RMSE: 3.811


### Evaluating the Impact of Feature Engineering on Housing Price Prediction using the Boston Dataset

In [10]:
# Feature engineering (new features)
# Products of two columns (interaction terms).
data['interaction_rm_age'] = data['rm'] * data['age']
data['interaction_tax_ptratio'] = data['tax'] * data['ptratio']
# Squared terms (polynomial features).
data['polynomial_rm_squared'] = data['rm'] ** 2
data['polynomial_age_squared'] = data['age'] ** 2
# `age` subtracted from the reference year 2023; an exact linear function of `age`.
data['estimated_house_built_year'] = 2023 - data['age']
# Product of `rm` and `ptratio`.
data['aggregated_total_rooms'] = data['rm'] * data['ptratio']
# Five equal-width age bins encoded as 1.0–5.0.
# include_lowest=True keeps an age of exactly 0 in bin 1; with the default
# right-closed bins that row would be NaN, and scikit-learn's LinearRegression
# rejects NaN inputs.
data['binned_age_category'] = pd.cut(
    data['age'],
    bins=[0, 20, 40, 60, 80, 100],
    labels=[1, 2, 3, 4, 5],
    include_lowest=True,
).astype(float)

# Define feature sets
# Columns taken directly from the CSV.
original_features = ['crim', 'zn', 'indus', 'nox', 'rm', 'age',
                     'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat']
# Columns created by the feature-engineering statements above.
new_features = [
    'interaction_rm_age', 'interaction_tax_ptratio',
    'polynomial_rm_squared', 'polynomial_age_squared',
    'estimated_house_built_year', 'aggregated_total_rooms',
    'binned_age_category'
]
# Name of the column to predict.
target = 'medv'

# Function to evaluate model
def evaluate_model(X, y):
    """Fit a linear regression on an 80/20 split of (X, y) and score the held-out part.

    Note: this redefines the one-argument `evaluate_model(features)` from the
    earlier cell; after this cell runs, only the two-argument form is available.

    Parameters
    ----------
    X : predictors, passed straight to `train_test_split`.
    y : target values aligned with `X`.

    Returns
    -------
    tuple
        `(r2, rmse)` computed on the 20% of rows held out with random_state=1.
    """
    split = train_test_split(X, y, test_size=0.2, random_state=1)
    X_fit, X_holdout, y_fit, y_holdout = split

    # fit() returns the estimator itself, so it can be chained.
    regressor = LinearRegression().fit(X_fit, y_fit)
    y_hat = regressor.predict(X_holdout)

    score = r2_score(y_holdout, y_hat)
    error = np.sqrt(mean_squared_error(y_holdout, y_hat))
    return score, error

# Run evaluations
y = data[target]

# Baseline: model with original features only
X_orig = data[original_features]
r2_orig, rmse_orig = evaluate_model(X_orig, y)

# Candidate: model with original + new features
X_new = data[original_features + new_features]
r2_new, rmse_new = evaluate_model(X_new, y)

# Compare results
print("\n🔍 Model Performance Comparison:")
print(f"Original Features Only → R²: {r2_orig:.4f}, RMSE: {rmse_orig:.4f}")
print(f"With New Features      → R²: {r2_new:.4f}, RMSE: {rmse_new:.4f}")

# Evaluate improvement.
# delta_r2 > 0 means R² went up; delta_rmse > 0 means RMSE went down.
delta_r2 = r2_new - r2_orig
delta_rmse = rmse_orig - rmse_new
print("\n📈 Improvement from New Features:")
# Let the format spec emit the sign: a hard-coded "+" / "-" prefix would print
# "+-0.0123" or "--0.0123" whenever the new features make the model worse.
print(f"Δ R²   = {delta_r2:+.4f}")
# Negated so the printed value is the signed change in RMSE (new − original).
print(f"Δ RMSE = {-delta_rmse:+.4f}")


🔍 Model Performance Comparison:
Original Features Only → R²: 0.7553, RMSE: 4.9172
With New Features      → R²: 0.8198, RMSE: 4.2195

📈 Improvement from New Features:
Δ R²   = +0.0645
Δ RMSE = -0.6977
