In [1]:
import os
import sys

sys.path.append(os.path.dirname(os.getcwd()))
from src import config
from src.dataset import load_housing_data

housing = load_housing_data(config.PATH_DATA_RAW / "housing.csv")

In [2]:
import numpy as np
import pandas as pd

housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5],
)

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit

seed = config.RANDOM_SEED = 42
test_size = config.TEST_SIZE = 0.2

split = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [4]:
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

In [5]:
housing_stratified = strat_train_set.copy()

quirk_values = [500001, 500000, 450000, 350000, 280000]

# Filter the DataFrame to remove rows with these quirk values
# ? ~ This is the logical NOT operator, which inverts the boolean values,
# ? so it selects rows where median_house_value is NOT in quirk_values.
housing_filtered = housing_stratified[
    ~housing_stratified["median_house_value"].isin(quirk_values)
]

In [6]:
X_train = housing_stratified.drop("median_house_value", axis=1).copy()
y_train = housing_stratified["median_house_value"].copy()

X_test = strat_test_set.drop("median_house_value", axis=1).copy()
y_test = strat_test_set["median_house_value"].copy()

X_num = X_train.drop("ocean_proximity", axis=1)

In [8]:
print(X_train["ocean_proximity"].value_counts(), X_train["ocean_proximity"].unique())

ocean_proximity
<1H OCEAN     7277
INLAND        5262
NEAR OCEAN    2124
NEAR BAY      1847
ISLAND           2
Name: count, dtype: int64 ['INLAND' 'NEAR OCEAN' '<1H OCEAN' 'NEAR BAY' 'ISLAND']


In [9]:
from sklearn.base import BaseEstimator, TransformerMixin


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(
        self, add_bedrooms_per_room=True, columns=None
    ):  # Adding columns as an argument
        self.add_bedrooms_per_room = add_bedrooms_per_room
        self.columns = columns

    def fit(self, X, y=None):
        if self.columns is not None:
            self.rooms_ix = self.columns.index("total_rooms")
            self.bedrooms_ix = self.columns.index("total_bedrooms")
            self.population_ix = self.columns.index("population")
            self.households_ix = self.columns.index("households")
        return self  # nothing else to do

    def transform(self, X, y=None):
        rooms_per_household = X[:, self.rooms_ix] / X[:, self.households_ix]
        population_per_household = X[:, self.population_ix] / X[:, self.households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, self.bedrooms_ix] / X[:, self.rooms_ix]
            return np.c_[
                X, rooms_per_household, population_per_household, bedrooms_per_room
            ]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [10]:
from sklearn.tree import DecisionTreeRegressor


class ImportantAttributesSelector(BaseEstimator, TransformerMixin):
    def __init__(self, count):
        self.count = count
        self.feature_importances_ = None
        self.selected_indices_ = None

    def fit(self, X, y=None):
        tree_reg = DecisionTreeRegressor(random_state=seed)
        tree_reg.fit(X, y)
        self.feature_importances_ = tree_reg.feature_importances_
        # * The np.argsort reutrns a list of indicies in ascending order ( so the highest value is in the last place)
        self.selected_indices_ = np.argsort(self.feature_importances_)[-self.count :]
        return self

    def transform(self, X, y=None):
        return X[:, self.selected_indices_]

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

col_names = list(X_num)

num_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("attribs_adder", CombinedAttributesAdder(columns=col_names)),
        ("std_scaler", StandardScaler()),
    ]
)

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

num_attribs = list(X_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer(
    [
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ]
)

In [13]:
from sklearn.ensemble import RandomForestRegressor

final_pipeline = Pipeline(
    [
        ("preparation", full_pipeline),
        ("feature_selection", ImportantAttributesSelector(count=6)),
        (
            "regressor",
            RandomForestRegressor(random_state=42),
        ),
    ]
)

In [14]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    # Number of features to select
    "feature_selection__count": [4, 6, 8],
    "preparation__num__attribs_adder__add_bedrooms_per_room": [True, False],
    # RandomForestRegressor parameters
    "regressor__n_estimators": [10, 50, 100],
    "regressor__max_features": [4, 6, 8],
}

grid_pipeline = GridSearchCV(
    final_pipeline, param_grid=param_grid, cv=5, scoring="neg_mean_squared_error"
)
grid_pipeline.fit(X_train, y_train)

# Get the best parameters
print("Best parameters found by GridSearchCV:")
print(grid_pipeline.best_params_)

# Get the best estimator
best_model = grid_pipeline.best_estimator_

# Use the best model to predict on the test set
y_test_predictions = best_model.predict(X_test)

Traceback (most recent call last):
  File "/Users/aleksandrmedvedev/Desktop/Repositories/Hands-On-ML-CH2/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/aleksandrmedvedev/Desktop/Repositories/Hands-On-ML-CH2/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/aleksandrmedvedev/Desktop/Repositories/Hands-On-ML-CH2/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/aleksandrmedvedev/Desktop/Repositories/Hands-On-ML-CH2/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", lin

KeyboardInterrupt: 