In [None]:
# CHAPTER 2: End-to-End Machine Learning Project
# Source: Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow (O'Reilly)

# 1️⃣ Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# 2️⃣ Download data
import os
import tarfile
import urllib.request

# Location of the remote archive and of the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the gzipped tar archive to download.
        housing_path: Directory that receives the downloaded ``housing.tgz``
            and its extracted members. Created if it does not exist.

    Raises:
        tarfile.TarError: If the archive cannot be read, or (on Python
            versions with extraction filters) a member is rejected as unsafe.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: refuse members that would
            # land outside housing_path (absolute paths, "..", bad links).
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Older Python without extraction filters: legacy behaviour.
            housing_tgz.extractall(path=housing_path)

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` found in ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

# Download and unpack the archive, then load the extracted CSV.
# The arguments spelled out here are the functions' own defaults.
fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH)
housing = load_housing_data(housing_path=HOUSING_PATH)


In [None]:
# 3️⃣ Inspect the data
housing.info()  # per-column dtype and non-null count
housing.head()  # first rows; head() defaults to n=5

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
# 4️⃣ Add income_cat to drive the stratified sampling
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3., 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

# Stratified split. With n_splits=1 the generator yields exactly one
# (train, test) pair, so take it directly instead of looping.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# The splitter returns row positions, not index labels, so select with
# .iloc; .loc only gives the same rows when the index is a default RangeIndex.
# income_cat was only needed for stratification: drop it while building the
# two sets (new frames, no in-place mutation of a derived frame).
strat_train_set = housing.iloc[train_index].drop(columns="income_cat")
strat_test_set = housing.iloc[test_index].drop(columns="income_cat")

In [None]:
# 5️⃣ Separate the label from the predictors
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns=["median_house_value"])


In [None]:
# 6️⃣ Preprocessing pipeline
# All predictors except the text column ocean_proximity.
housing_num = housing.drop(columns=["ocean_proximity"])

from sklearn.base import BaseEstimator, TransformerMixin

# Positions of the columns used to build the ratio features, matching the
# column order of housing_num (total_rooms, total_bedrooms, population,
# households).
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append per-household / per-room ratio columns to a 2-D array.

    Args:
        add_bedrooms_per_room: When true, a third ratio column
            (bedrooms divided by rooms) is appended as well.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with two (or three) ratio columns appended on the right."""
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]
        # Order matters: rooms/household, population/household, then the
        # optional bedrooms/room column.
        extra_columns = [rooms / households,
                         X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        return np.c_[(X, *extra_columns)]

# Numeric branch: fill missing values with the column median, append the
# ratio features, then standardize every column.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

num_attribs = list(housing_num)  # iterating a DataFrame yields its column names
cat_attribs = ["ocean_proximity"]

# Route numeric columns through num_pipeline and one-hot encode the
# categorical column.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    # handle_unknown="ignore": a category not seen during fit is encoded as
    # all zeros instead of raising. This matters when the pipeline is refit
    # on subsets of the data (cross-validation), where a rare category may
    # be missing from the training portion. Output on already-seen
    # categories is unchanged.
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
# 7️⃣ Train a simple baseline model
lin_reg = LinearRegression()
lin_reg.fit(X=housing_prepared, y=housing_labels)

In [None]:
# 8️⃣ Evaluate on the training set
# Predictions are made on the same rows the model was fitted on.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(y_true=housing_labels,
                             y_pred=housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print(f"Training RMSE: ${lin_rmse:,.2f}")


Training RMSE: $68,627.87


In [None]:
# 9️⃣ Evaluate on the test set
housing_test_labels = strat_test_set["median_house_value"].copy()
housing_test = strat_test_set.drop(columns=["median_house_value"])
# transform() only: the preprocessing fitted on the training set is reused.
housing_test_prepared = full_pipeline.transform(housing_test)

test_predictions = lin_reg.predict(housing_test_prepared)
test_mse = mean_squared_error(y_true=housing_test_labels,
                              y_pred=test_predictions)
test_rmse = np.sqrt(test_mse)
print(f"Test RMSE: ${test_rmse:,.2f}")

Test RMSE: $66,913.44


# **EXERCISES**

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from scipy.stats import reciprocal, uniform

# ------------------------------------
# Exercise 1: SVR + GridSearchCV
# ------------------------------------
# Two sub-grids: 4 linear-kernel candidates plus 3 x 4 RBF candidates.
param_grid = [
    dict(kernel=["linear"], C=[1, 10, 100, 1000]),
    dict(kernel=["rbf"], C=[1, 10, 100], gamma=["scale", 0.01, 0.1, 1]),
]

svr = SVR()

grid_search_svr = GridSearchCV(
    estimator=svr,
    param_grid=param_grid,
    cv=3,
    scoring="neg_mean_squared_error",
    verbose=2,
    n_jobs=-1,
)
grid_search_svr.fit(housing_prepared, housing_labels)

# Score the best candidate (refit on the full training data) on the test set.
best_svr = grid_search_svr.best_estimator_
test_predictions_svr = best_svr.predict(housing_test_prepared)
test_rmse_svr = np.sqrt(
    mean_squared_error(y_true=housing_test_labels, y_pred=test_predictions_svr)
)

print(f"Best SVR Test RMSE (GridSearchCV): ${test_rmse_svr:,.2f}")
print(f"Best SVR Params (GridSearchCV): {grid_search_svr.best_params_}")

# ------------------------------------
# Exercise 2: SVR + RandomizedSearchCV
# ------------------------------------
param_distributions = dict(
    kernel=["rbf"],
    # scipy's uniform(loc, scale) samples from [loc, loc + scale].
    C=uniform(loc=1, scale=100),
    # Log-uniform between 0.001 and 0.1.
    gamma=reciprocal(0.001, 0.1),
)

rnd_search_svr = RandomizedSearchCV(
    estimator=SVR(),
    param_distributions=param_distributions,
    n_iter=20,
    cv=3,
    scoring="neg_mean_squared_error",
    verbose=2,
    n_jobs=-1,
    random_state=42,
)
rnd_search_svr.fit(housing_prepared, housing_labels)

# Score the best sampled candidate on the test set.
best_svr_rnd = rnd_search_svr.best_estimator_
test_predictions_svr_rnd = best_svr_rnd.predict(housing_test_prepared)
test_rmse_svr_rnd = np.sqrt(
    mean_squared_error(y_true=housing_test_labels,
                       y_pred=test_predictions_svr_rnd)
)

print(f"Best SVR Test RMSE (RandomizedSearchCV): ${test_rmse_svr_rnd:,.2f}")
print(f"Best SVR Params (RandomizedSearchCV): {rnd_search_svr.best_params_}")

# ------------------------------------
# Exercise 3: Add a transformer that keeps only the important features
# ------------------------------------
# A random forest supplies the feature importances used for selection.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
forest.fit(housing_prepared, housing_labels)

# prefit=True: reuse the forest fitted above rather than refitting it.
selector = SelectFromModel(estimator=forest, threshold="median", prefit=True)

housing_test_reduced = selector.transform(housing_test_prepared)
housing_reduced = selector.transform(housing_prepared)

# Train a linear model on the reduced feature set.
lin_reg_reduced = LinearRegression()
lin_reg_reduced.fit(X=housing_reduced, y=housing_labels)

test_predictions_reduced = lin_reg_reduced.predict(housing_test_reduced)
test_rmse_reduced = np.sqrt(
    mean_squared_error(y_true=housing_test_labels,
                       y_pred=test_predictions_reduced)
)

print(f"Linear Regression Test RMSE (Feature Selection): ${test_rmse_reduced:,.2f}")

# ------------------------------------
# Exercise 4: Single pipeline for full preprocessing + prediction
# ------------------------------------
full_pipeline_with_model = Pipeline(steps=[
    ("preparation", full_pipeline),
    ("model", RandomForestRegressor(n_estimators=100, random_state=42)),
])

# The pipeline takes the raw (unprepared) predictors; preprocessing happens
# inside it.
full_pipeline_with_model.fit(
    strat_train_set.drop(columns=["median_house_value"]),
    strat_train_set["median_house_value"],
)

test_preds_pipeline = full_pipeline_with_model.predict(
    strat_test_set.drop(columns=["median_house_value"])
)
test_rmse_pipeline = np.sqrt(
    mean_squared_error(y_true=strat_test_set["median_house_value"],
                       y_pred=test_preds_pipeline)
)

print(f"Random Forest Test RMSE (Full Pipeline): ${test_rmse_pipeline:,.2f}")

# ------------------------------------
# Exercise 5: GridSearchCV over preprocessing + model options
# ------------------------------------
from sklearn.base import clone

# Double-underscore paths address nested steps: pipeline step ->
# ColumnTransformer branch -> inner step -> parameter.
param_grid_pipeline = dict([
    ("preparation__num__imputer__strategy", ["median", "mean"]),
    ("model__n_estimators", [50, 100]),
    ("model__max_features", [4, 6, 8]),
])

grid_search_pipeline = GridSearchCV(
    estimator=full_pipeline_with_model,
    param_grid=param_grid_pipeline,
    cv=3,
    scoring="neg_mean_squared_error",
    verbose=2,
    n_jobs=-1,
)
grid_search_pipeline.fit(
    strat_train_set.drop(columns=["median_house_value"]),
    strat_train_set["median_house_value"],
)

best_pipeline_model = grid_search_pipeline.best_estimator_

# Score the best pipeline on the raw test predictors.
test_preds_best_pipeline = best_pipeline_model.predict(
    strat_test_set.drop(columns=["median_house_value"])
)
test_rmse_best_pipeline = np.sqrt(
    mean_squared_error(y_true=strat_test_set["median_house_value"],
                       y_pred=test_preds_best_pipeline)
)

print(f"Best Random Forest Test RMSE (GridSearchCV Pipeline): ${test_rmse_best_pipeline:,.2f}")
print(f"Best Params (GridSearchCV Pipeline): {grid_search_pipeline.best_params_}")


Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best SVR Test RMSE (GridSearchCV): $68,366.37
Best SVR Params (GridSearchCV): {'C': 1000, 'kernel': 'linear'}
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best SVR Test RMSE (RandomizedSearchCV): $94,687.27
Best SVR Params (RandomizedSearchCV): {'C': np.float64(97.56320330745594), 'gamma': np.float64(0.041380401125610165), 'kernel': 'rbf'}
Linear Regression Test RMSE (Feature Selection): $70,344.35
Random Forest Test RMSE (Full Pipeline): $48,373.63
Fitting 3 folds for each of 12 candidates, totalling 36 fits




Best Random Forest Test RMSE (GridSearchCV Pipeline): $47,570.60
Best Params (GridSearchCV Pipeline): {'model__max_features': 4, 'model__n_estimators': 50, 'preparation__num__imputer__strategy': 'median'}
