In [1]:
# 1. Import libraries
import os
import tarfile
import urllib.request
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from zlib import crc32
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib
from scipy import stats

In [2]:
# 2. Fetch & Load Dataset
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives both the downloaded ``housing.tgz`` and the
        extracted members. It is created if it does not exist.

    Raises
    ------
    urllib.error.URLError
        If the archive cannot be downloaded.
    tarfile.TarError
        If the archive cannot be read, or (when extraction filters are
        available) if a member is rejected by the ``"data"`` filter.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: the "data" filter refuses
            # members that would be written outside housing_path, and it also
            # silences the extractall() deprecation warning on Python 3.12+.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Older interpreters have no extraction filters.
            housing_tgz.extractall(path=housing_path)

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` found in ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

# Download only when the CSV is not already on disk, so re-running the
# notebook does not fetch and unpack the archive again.
if not os.path.isfile(os.path.join(HOUSING_PATH, "housing.csv")):
    fetch_housing_data()
housing_data = load_housing_data()


  housing_tgz.extractall(path=housing_path)


In [3]:
# 3. Stratified Train-Test Split
# Bucket median income into five strata used only to stratify the split.
housing_data["income_cat"] = pd.cut(
    housing_data["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5]
)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(housing_data, housing_data["income_cat"]))

# split() yields positional row numbers, so rows are selected with .iloc;
# .loc would only be correct while housing_data keeps its default RangeIndex.
# Remove income_cat for modeling: drop() returns new frames, so nothing
# derived from housing_data is mutated in place.
strat_train_set = housing_data.iloc[train_index].drop(columns="income_cat")
strat_test_set = housing_data.iloc[test_index].drop(columns="income_cat")


In [4]:
# 4. Prepare Data for ML
# Separate the target column from the predictors of the training set.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns="median_house_value")

# Custom transformer for feature engineering
# Module-level column positions; the transformer class below keeps its own
# copies of these values as instance attributes.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing columns.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, a third ratio column (bedrooms / rooms) is appended.
    rooms_ix, bedrooms_ix, population_ix, households_ix : int
        Positions of the source columns in the array given to ``transform``.
        The defaults (3, 4, 5, 6) are the values that used to be hard-coded
        in ``__init__``; exposing them as constructor parameters makes them
        visible to ``get_params`` / ``set_params``.
    """

    def __init__(self, add_bedrooms_per_room=True, rooms_ix=3, bedrooms_ix=4,
                 population_ix=5, households_ix=6):
        # Store every argument unmodified under its own name, as scikit-learn
        # expects from estimator constructors.
        self.add_bedrooms_per_room = add_bedrooms_per_room
        self.rooms_ix = rooms_ix
        self.bedrooms_ix = bedrooms_ix
        self.population_ix = population_ix
        self.households_ix = households_ix

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        The appended columns are, in order: rooms / households,
        population / households and, if ``add_bedrooms_per_room`` is true,
        bedrooms / rooms.
        """
        # Ensure X is a NumPy array so positional column indexing works.
        X_array = X.values if isinstance(X, pd.DataFrame) else X

        rooms = X_array[:, self.rooms_ix]
        households = X_array[:, self.households_ix]

        columns = [
            X_array,
            rooms / households,
            X_array[:, self.population_ix] / households,
        ]
        if self.add_bedrooms_per_room:
            columns.append(X_array[:, self.bedrooms_ix] / rooms)
        return np.c_[tuple(columns)]

In [5]:
# 5. Create Pipelines
# The single categorical column is one-hot encoded; every other column is
# treated as numeric.
cat_attribs = ["ocean_proximity"]
num_attribs = housing.columns.drop(cat_attribs)

# Numeric branch: fill missing values with the median, add the ratio
# features, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("attribs_adder", CombinedAttributesAdder()),
        ("std_scaler", StandardScaler()),
    ]
)

# Route each group of columns to its own transformer and concatenate the results.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ]
)

housing_prepared = full_pipeline.fit_transform(housing)


In [6]:
# 6. Train Models
# Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# Decision Tree
# Seeded (same seed as the train/test split) so the fitted tree, and the
# scores computed from it, are the same on every run.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

# Random Forest
# Seeded for the same reason: bootstrap sampling and feature sub-sampling are
# otherwise different on every run.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, housing_labels)


In [7]:
# 7. Evaluate with Cross-Validation
def display_scores(scores):
    """Print the scores followed by their mean and standard deviation."""
    print("Scores:", scores)
    # Each statistic is looked up by method name and printed under its label.
    for label, method_name in (("Mean:", "mean"), ("Standard Deviation:", "std")):
        print(label, getattr(scores, method_name)())

# Each model is scored with 10-fold cross-validation. The scorer returns
# negated MSE, so the sign is flipped before taking the square root (RMSE).
print("Linear Regression:")
lin_scores = cross_val_score(
    lin_reg, X=housing_prepared, y=housing_labels,
    cv=10, scoring="neg_mean_squared_error",
)
lin_rmse_scores = np.sqrt(np.negative(lin_scores))
display_scores(lin_rmse_scores)

print("\nDecision Tree:")
tree_scores = cross_val_score(
    tree_reg, X=housing_prepared, y=housing_labels,
    cv=10, scoring="neg_mean_squared_error",
)
tree_rmse_scores = np.sqrt(np.negative(tree_scores))
display_scores(tree_rmse_scores)

print("\nRandom Forest:")
forest_scores = cross_val_score(
    forest_reg, X=housing_prepared, y=housing_labels,
    cv=10, scoring="neg_mean_squared_error",
)
forest_rmse_scores = np.sqrt(np.negative(forest_scores))
display_scores(forest_rmse_scores)


Linear Regression:
Scores: [71762.76364394 64114.99166359 67771.17124356 68635.19072082
 66846.14089488 72528.03725385 73997.08050233 68802.33629334
 66443.28836884 70139.79923956]
Mean: 69104.07998247063
Standard Deviation: 2880.3282098180675

Decision Tree:
Scores: [72393.12173553 70656.28713813 68374.63412382 71757.73337849
 69928.59183962 76699.98795124 70719.05217355 73364.80375219
 68671.91195637 69584.06246083]
Mean: 71215.01865097716
Standard Deviation: 2362.142713028468

Random Forest:
Scores: [51497.49898552 48942.06596824 46689.8228144  51809.0043813
 47277.11403038 51851.72449042 52073.88632448 50160.55153405
 48443.26072754 53934.25457836]
Mean: 50267.918383469645
Standard Deviation: 2234.8437081934303


In [8]:
# 8. Grid Search for Best Hyperparameters
# Two sub-grids: the first uses the forest's default bootstrap setting, the
# second disables bootstrapping explicitly.
param_grid = [
    {'n_estimators': [30, 50, 100], 'max_features': [6, 8, 10]},
    {'bootstrap': [False], 'n_estimators': [30, 50], 'max_features': [6, 8]},
]

# The forest is seeded so that every candidate is compared under the same
# randomness and the selected parameters do not change between runs.
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

print("Best Parameters:", grid_search.best_params_)


Best Parameters: {'bootstrap': False, 'max_features': 6, 'n_estimators': 50}


In [9]:
# 9. Evaluate Final Model on Test Set
final_model = grid_search.best_estimator_

# Separate the target column from the predictors of the held-out set.
y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop(columns="median_house_value")

# The already-fitted pipeline is only applied here (transform, not fit_transform).
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_true=y_test, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)
print("Final RMSE on Test Set:", final_rmse)

# 95% Confidence Interval
# A t-interval is built around the mean squared error; the square root of its
# bounds expresses the interval on the RMSE scale.
confidence = 0.95
squared_errors = (y_test - final_predictions) ** 2
interval = np.sqrt(
    stats.t.interval(
        confidence,
        len(squared_errors) - 1,
        loc=squared_errors.mean(),
        scale=stats.sem(squared_errors),
    )
)
print("95% Confidence Interval:", interval)


Final RMSE on Test Set: 46541.118676110134
95% Confidence Interval: [44538.72916047 48460.84047989]


In [10]:
import joblib  # redundant: joblib is already imported in the first cell

# Persist the fitted preprocessing pipeline and the selected model to disk.
# Both full_pipeline and final_model must already exist and be fitted in the
# current kernel, i.e. the pipeline-creation and grid-search cells must have
# been run before this one.

# Save the full pipeline (including preprocessors)
# NOTE(review): full_pipeline contains CombinedAttributesAdder, a class defined
# in this notebook; loading this file elsewhere presumably requires that class
# to be defined or importable there - confirm before deploying.
joblib.dump(full_pipeline, "full_pipeline.pkl")

# Save the trained model
# Kept as the cell's last expression, so the list returned by joblib.dump is
# what the cell displays.
joblib.dump(final_model, "house_price_model.pkl")

['house_price_model.pkl']