In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor

# Single seed passed as `random_state` to both the KFold splitter and the
# random forest, so fold assignment and tree construction are repeatable.
RANDOM_STATE = 42


In [2]:
# Project layout, resolved relative to the notebook's working directory
# (the notebook is expected to run from a folder one level below the root).
PROJECT_ROOT = Path("..").resolve()
DATA_RAW = PROJECT_ROOT / "data" / "raw"
SPLITS_DIR = PROJECT_ROOT / "data" / "splits"
REPORTS_DIR = PROJECT_ROOT / "reports"

# Make sure every directory this notebook reads from or writes to exists.
# REPORTS_DIR is included because metrics and feature importances are saved
# there further down; without it a fresh checkout fails at the first to_csv().
for _directory in (DATA_RAW, SPLITS_DIR, REPORTS_DIR):
    _directory.mkdir(parents=True, exist_ok=True)

In [3]:
# Read the raw training table (features plus the target column) from disk
df = pd.read_csv(
    DATA_RAW / "house_prices_train.csv",
)

In [4]:
# Name of the column the model is trained to predict
TARGET_COL = "SalePrice"

# Features are every column except the target; the target is kept as a Series
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

In [5]:
# Row labels of the training split, saved earlier as a NumPy array.
# NOTE(review): these are used with .loc (label-based lookup); presumably the
# frame still has its default integer index so labels equal positions - confirm.
train_idx = np.load(SPLITS_DIR / "train_indices.npy")

# Independent copies so later edits cannot write back into X / y
X_train, y_train = (part.loc[train_idx].copy() for part in (X, y))

In [6]:
# Split the feature columns by dtype. 'Id' is dropped from the numeric list
# so it is never passed to the numeric transformer as a feature.
numeric_cols = [
    col
    for col in X_train.select_dtypes(include=["int64", "float64"]).columns
    if col != 'Id'
]
categorical_cols = list(
    X_train.select_dtypes(include=["object", "category"]).columns
)

print(f"Categorical columns: {categorical_cols}")
print(f"Numeric columns: {numeric_cols}")

Categorical columns: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
Numeric columns: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea'

In [7]:
# Numeric branch: fill gaps with the column median, then standardise
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode; categories not seen during fit are ignored at transform time
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own branch. Columns named in neither
# list are not passed to any transformer.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

In [8]:
# Five shuffled folds; the fixed seed keeps fold membership identical across runs
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

In [9]:
# Random forest placed behind the shared preprocessing so that imputation,
# scaling and encoding are re-fitted inside every cross-validation fold
rf_model = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "regressor",
            RandomForestRegressor(
                n_estimators=200,
                random_state=RANDOM_STATE,
                n_jobs=-1,
            ),
        ),
    ]
)

# Score the pipeline on the training split with the shared splitter
cv_scores = cross_validate(
    rf_model,
    X_train,
    y_train,
    scoring=["neg_root_mean_squared_error", "neg_mean_absolute_error"],
    cv=cv,
    n_jobs=-1,
)

# Both scorers report negated errors; flip the sign to get RMSE / MAE per fold
rmse_values = -cv_scores["test_neg_root_mean_squared_error"]
mae_values = -cv_scores["test_neg_mean_absolute_error"]

# Mean and spread across folds, keyed by the report's column names
rf_values = {
    "model": "RandomForestRegressor",
    "rmse_mean": rmse_values.mean(),
    "rmse_std": rmse_values.std(),
    "mae_mean": mae_values.mean(),
    "mae_std": mae_values.std(),
}

metrics_path = REPORTS_DIR / "metrics_baseline.csv"
metric_cols = ["rmse_mean", "rmse_std", "mae_mean", "mae_std"]

# Upsert the random-forest row: indexing by model name overwrites an existing
# row or appends a new one, so re-running the cell does not duplicate it
df_baseline = pd.read_csv(metrics_path).set_index("model")
df_baseline.loc[rf_values["model"], metric_cols] = [
    rf_values[col] for col in metric_cols
]

# Restore 'model' as an ordinary column and write the table back in place
df_baseline = df_baseline.reset_index()
df_baseline.to_csv(metrics_path, index=False)

# Show the refreshed comparison table
print(df_baseline)


                   model     rmse_mean     rmse_std      mae_mean      mae_std
0             Dummy_mean  77160.108457  3673.252798  56318.232277  2612.148500
1      Linear_Regression  36397.273713  7544.717946  19621.981805   806.121901
2        Ridge_alpha_1.0  32814.497631  7332.537866  18676.071378  1116.810159
3        Lasso_alpha_0.1  36039.211360  7401.016075  19444.763807   899.418995
4  RandomForestRegressor  30187.811059  4925.198978  18172.212787  1123.688346
5    Lasso (alpha=100.0)  31597.031137  7633.433043  20612.670160   857.574200
6     Ridge (alpha=10.0)  31556.174539  7216.615113  18001.482651   581.320602


In [11]:
# cross_validate() trains clones of the estimator it is given, so `rf_model`
# itself is still unfitted after the cross-validation cell. Fit it here on the
# full training split so the steps inspected below exist on a fresh
# Restart-and-Run-All instead of depending on an earlier, since-removed cell.
rf_model.fit(X_train, y_train)

# Fitted preprocessing step (source of the expanded feature names)
preprocessor_fitted = rf_model.named_steps["preprocessor"]

# Fitted random forest (source of the importance scores)
rf_model_fitted = rf_model.named_steps["regressor"]

# Column names as they appear after imputation, scaling and one-hot encoding
feature_names = preprocessor_fitted.get_feature_names_out()

# One importance score per post-preprocessing feature
importances = rf_model_fitted.feature_importances_

# Sanity check: the two arrays must line up one-to-one
print("Number of features after preprocessing:", len(feature_names))
print("Number of importance scores:", len(importances))

Number of features after preprocessing: 285
Number of importance scores: 285


In [12]:
# Pair every post-preprocessing feature name with its importance score and
# rank the rows from most to least important
rf_importances_df = (
    pd.DataFrame(
        {
            "feature_name": feature_names,
            "importance": importances,
        }
    )
    .sort_values(by="importance", ascending=False)
    .reset_index(drop=True)
)

# Persist the full ranking next to the other reports
output_path = REPORTS_DIR / "rf_feature_importances.csv"
rf_importances_df.to_csv(output_path, index=False)

# Confirm where the file went; the notebook's recorded output shows this line,
# so emitting it keeps the saved output reproducible from the code
print(f"Saved Random Forest importances to: {output_path}")

# Show the ten most important features
rf_importances_df.head(10)


Saved Random Forest importances to: /Users/uvlazhnitel/Documents/coding/DataScience/p2-house-price/reports/rf_feature_importances.csv


Unnamed: 0,feature_name,importance
0,num__OverallQual,0.554462
1,num__GrLivArea,0.122914
2,num__TotalBsmtSF,0.034234
3,num__2ndFlrSF,0.031181
4,num__BsmtFinSF1,0.027653
5,num__1stFlrSF,0.026944
6,num__LotArea,0.017562
7,num__GarageArea,0.015578
8,num__GarageCars,0.013764
9,num__YearBuilt,0.011995
