In [54]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import cross_validate
from matplotlib import pyplot as plt
from sklearn.model_selection import learning_curve
import numpy as np



# Single seed reused by the CV splitter, the gradient-boosting regressor and
# the learning-curve shuffle so that results are repeatable across runs.
RANDOM_STATE = 42


In [55]:
# Project layout, resolved relative to the current working directory.
# NOTE(review): assumes the notebook runs from a direct subfolder of the
# project root (e.g. notebooks/) - confirm.
PROJECT_ROOT = Path("..").resolve()
DATA_RAW = PROJECT_ROOT / "data" / "raw"
SPLITS_DIR = PROJECT_ROOT / "data" / "splits"
REPORTS_DIR = PROJECT_ROOT / "reports"
FIGURES_DIR = REPORTS_DIR / "figures"

# Create every directory the notebook reads from or writes to.
# FIGURES_DIR has to exist before the learning-curve figure is saved:
# matplotlib's savefig does not create missing parent folders and would raise
# FileNotFoundError on a fresh checkout. parents=True also creates REPORTS_DIR.
DATA_RAW.mkdir(parents=True, exist_ok=True)
SPLITS_DIR.mkdir(parents=True, exist_ok=True)
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

In [56]:
# Load the raw training table from the raw-data folder.
df = pd.read_csv(
    filepath_or_buffer=DATA_RAW / "house_prices_train.csv",
)

In [57]:
# Name of the column the model learns to predict.
TARGET_COL = "SalePrice"

# Split the frame: X keeps every column except the target, y is the target alone.
X = df.drop(TARGET_COL, axis=1)
y = df.loc[:, TARGET_COL]

In [58]:
# Reuse the persisted train/test split instead of re-splitting here.
train_idx = np.load(SPLITS_DIR / "train_indices.npy")
test_idx = np.load(SPLITS_DIR / "test_indices.npy")

# Select the training rows by index label and copy them so later steps never
# touch the full frame.
# NOTE(review): .loc is label-based, so the saved indices are presumably index
# labels of the raw frame - confirm against the notebook that wrote them.
X_train, y_train = X.loc[train_idx].copy(), y.loc[train_idx].copy()

In [59]:
# Numeric features: int64/float64 columns, leaving out the 'Id' column.
numeric_cols = [
    col
    for col in X_train.select_dtypes(include=["int64", "float64"]).columns
    if col != 'Id'
]

# Categorical features: object- or category-typed columns.
categorical_cols = list(
    X_train.select_dtypes(include=["object", "category"]).columns
)

print(f"Categorical columns: {categorical_cols}")
print(f"Numeric columns: {numeric_cols}")

Categorical columns: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
Numeric columns: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea'

In [60]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then map each
# category to an integer code. Categories not seen during fit are encoded as -1
# instead of raising.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    (
        "encoder",
        OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value"),
    ),
])

# Route each column group through its own branch. Columns in neither list
# (such as 'Id') are not passed to any transformer.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
])

In [61]:
# 5-fold cross-validation with seeded shuffling, so folds are identical on every run.
cv = KFold(
    n_splits=5,
    random_state=RANDOM_STATE,
    shuffle=True,
)

In [62]:
# Full modelling pipeline: preprocessing followed by a histogram-based
# gradient-boosting regressor with fixed hyperparameters.
hgb_model = Pipeline([
    ("preprocessor", preprocessor),
    (
        "regressor",
        HistGradientBoostingRegressor(
            learning_rate=0.1254335218612733,
            max_iter=120,
            max_leaf_nodes=23,
            max_depth=None,
            min_samples_leaf=11,
            # Early stopping holds out 10% of each fit's training data.
            early_stopping=True,
            validation_fraction=0.1,
            random_state=RANDOM_STATE,
        ),
    ),
])

# Fit on log1p(target) and map predictions back with expm1, so scores are
# reported on the original target scale.
log_target_model = TransformedTargetRegressor(
    regressor=hgb_model,
    inverse_func=np.expm1,
    func=np.log1p,
)

In [63]:
# Fractions of the available training data at which the curve is evaluated:
# six evenly spaced values from 0.1 to 1.0.
train_sizes_fraction = np.linspace(0.1, 1.0, 6)

# Fit and score the log-target model at each training-set size in every CV fold.
train_sizes, train_scores, val_scores = learning_curve(
    log_target_model,
    X_train,
    y_train,
    train_sizes=train_sizes_fraction,
    cv=cv,
    scoring="neg_root_mean_squared_error",
    shuffle=True,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# The scorer returns negated RMSE; flip the sign to get plain RMSE.
train_rmse = np.negative(train_scores)
val_rmse = np.negative(val_scores)

# Aggregate along axis 1 to get one mean and one spread per training-set size.
train_rmse_mean, train_rmse_std = train_rmse.mean(axis=1), train_rmse.std(axis=1)
val_rmse_mean, val_rmse_std = val_rmse.mean(axis=1), val_rmse.std(axis=1)

print("Train sizes (absolute):", train_sizes)
print("Train RMSE (mean):", train_rmse_mean)
print("Val RMSE (mean):", val_rmse_mean)



Train sizes (absolute): [ 93 261 429 597 765 934]
Train RMSE (mean): [22493.18236453 13682.06952265 14130.35677496 11966.89674907
 14323.0758572  12781.67890264]
Val RMSE (mean): [40735.49599696 33599.04262548 31433.37263949 30543.44492998
 30400.22943262 27998.61263475]




In [64]:

# Draw the learning curve on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))

# Training RMSE with a +/- one standard deviation band.
ax.plot(train_sizes, train_rmse_mean, marker="o", label="Train RMSE")
train_lower = train_rmse_mean - train_rmse_std
train_upper = train_rmse_mean + train_rmse_std
ax.fill_between(train_sizes, train_lower, train_upper, alpha=0.2)

# Validation RMSE with a +/- one standard deviation band.
ax.plot(train_sizes, val_rmse_mean, marker="o", label="Validation RMSE")
val_lower = val_rmse_mean - val_rmse_std
val_upper = val_rmse_mean + val_rmse_std
ax.fill_between(train_sizes, val_lower, val_upper, alpha=0.2)

ax.set_xlabel("Training set size (number of samples)")
ax.set_ylabel("RMSE (lower is better)")
ax.set_title("Learning Curve - HistGradientBoostingRegressor (tuned, log1p target)")
ax.legend()
ax.grid(True)

# Save the figure to disk and close it; the cell's only output is the path message.
output_path = FIGURES_DIR / "learning_curve.png"
fig.tight_layout()
fig.savefig(output_path, dpi=150)
plt.close(fig)

print(f"Learning curve saved to: {output_path}")

Learning curve saved to: /Users/uvlazhnitel/Documents/coding/DataScience/p2-house-price/reports/figures/learning_curve.png
