# Load the data

In [43]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_housing_data():
    """Return the housing dataset as a DataFrame, fetching it first if needed.

    The CSV is expected at ``datasets/housing/housing.csv``. When it is
    missing, the tarball ``datasets/housing.tgz`` is downloaded (unless it is
    already on disk) and extracted into ``datasets/``.

    Returns:
        pandas.DataFrame: the parsed contents of ``housing.csv``.
    """
    datasets_dir = Path("datasets")
    tarball_path = datasets_dir / "housing.tgz"
    csv_path = datasets_dir / "housing" / "housing.csv"
    # Key the work on the CSV rather than on the tarball: an interrupted run
    # can leave the tarball behind without the extracted file, and an
    # already-extracted CSV should not require the tarball (or the network).
    if not csv_path.is_file():
        datasets_dir.mkdir(parents=True, exist_ok=True)
        if not tarball_path.is_file():
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            # Use the "data" extraction filter where this Python supports it,
            # so archive members cannot be written outside the target folder.
            if hasattr(tarfile, "data_filter"):
                housing_tarball.extractall(path=datasets_dir, filter="data")
            else:
                housing_tarball.extractall(path=datasets_dir)
    return pd.read_csv(csv_path)
    
# Load the dataset (downloading it on first use) and preview the first rows.
housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [44]:
import numpy as np

# Bucket median_income into five categories (labels 1-5); the result is used
# below as the stratification key for the train/test split.
housing["income_cat"] = pd.cut(housing["median_income"], bins=[0., 1.5, 3.0, 4.5, 6., np.inf], labels=[1, 2, 3, 4, 5])

# Split the data

In [45]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, stratifying on income_cat; the fixed random_state
# makes the split repeatable.
strat_train_set, strat_test_set = train_test_split(housing, test_size=0.2, stratify=housing["income_cat"], random_state=42)

In [12]:
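# NOTE: this drop is disabled, so "income_cat" stays in strat_train_set and
# strat_test_set, and therefore in every frame derived from them below.
# for set_ in (strat_train_set, strat_test_set):
#     set_.drop("income_cat", axis=1, inplace=True)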

In [46]:
# Work on a copy so the columns added in the next cell do not touch strat_train_set.
housing = strat_train_set.copy()

In [47]:
# Exploratory ratio features. These live only on the current `housing` copy:
# the "Prepare data" section rebuilds `housing` from strat_train_set, so the
# three columns below are not carried forward.
housing["rooms_per_house"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_ratio"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["people_per_house"] = housing["population"] / housing["households"]

# Prepare data

In [48]:
# Split the training set into predictors and target. Only the target column is
# removed here, so every other column of strat_train_set (including
# "income_cat") remains in `housing`.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [49]:
# Option 3: replace missing total_bedrooms values with the column median.
median = housing["total_bedrooms"].median()
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form operates on an intermediate object, which
# pandas warns about and which no longer updates `housing` under Copy-on-Write.
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(median)

In [50]:
##########################
from sklearn.impute import SimpleImputer

# Fit a median imputer on the numeric columns only, then display the
# statistic it learned for each of those columns.
imputer = SimpleImputer(strategy="median")
housing_num = housing.select_dtypes(include=[np.number])
imputer.fit(housing_num)
imputer.statistics_

array([-118.51  ,   34.26  ,   29.    , 2125.    ,  434.    , 1167.    ,
        408.    ,    3.5385])

In [51]:
# Apply the fitted imputer to the numeric columns.
X = imputer.transform(housing_num)
###########################

# Categorical attributes

In [53]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# `housing_cat` was not defined anywhere in the notebook, so this cell failed
# on a fresh kernel. Select the categorical column as a one-column DataFrame
# (double brackets keep it 2D for the encoder).
housing_cat = housing[["ocean_proximity"]]

# One-hot encode the category, returning a dense array rather than a sparse matrix.
cat_encoder = OneHotEncoder(sparse_output=False)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

housing_cat_1hot

array([[0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.]])

# Feature scaling 

## custom transformer that acts much like the `StandardScaler`

In [54]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted

class StandardScalerClone(BaseEstimator, TransformerMixin):
    """Minimal transformer that standardizes features column by column.

    Parameters
    ----------
    with_mean : bool, default=True
        When True, the per-column mean learned in ``fit`` is subtracted before
        dividing by the per-column standard deviation.

    Attributes set by ``fit``
    -------------------------
    mean_ : per-column mean of the training data.
    scale_ : per-column standard deviation of the training data.
    n_features_in_ : number of columns seen in ``fit``.
    """
    def __init__(self, with_mean=True): # no *args or **kwargs!
        self.with_mean = with_mean

    def fit(self, X, y=None): # y is required even though we don't use it
        """Learn the per-column mean and standard deviation of X; return self."""
        X = check_array(X) # checks that X is an array with finite float values
        self.mean_ = X.mean(axis=0)
        self.scale_ = X.std(axis=0)
        self.n_features_in_ = X.shape[1] # every estimator stores this in fit()
        return self # always return self!

    def transform(self, X):
        """Return X centered (if ``with_mean``) and divided by ``scale_``.

        Raises
        ------
        ValueError
            If X does not have the number of columns seen in ``fit``.
        """
        check_is_fitted(self) # looks for learned attributes (with trailing _)
        X = check_array(X)
        # Raise explicitly rather than `assert`: assertions are stripped under
        # `python -O`, which would let a column-count mismatch through.
        if self.n_features_in_ != X.shape[1]:
            raise ValueError(
                f"X has {X.shape[1]} features, but {self.__class__.__name__} "
                f"is expecting {self.n_features_in_} features as input"
            )
        if self.with_mean:
            X = X - self.mean_
        return X / self.scale_

In [55]:
from sklearn.cluster import KMeans

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Describe each sample by its RBF similarity to KMeans cluster centers.

    Parameters
    ----------
    n_clusters : int, default=10
        Number of KMeans clusters, and therefore of output features.
    gamma : float, default=1.0
        ``gamma`` passed to ``rbf_kernel`` in ``transform``.
    random_state : default=None
        Forwarded to ``KMeans``.
    """
    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit KMeans on X (optionally weighted by ``sample_weight``); y is unused."""
        self.kmeans_ = KMeans(self.n_clusters, random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self # always return self!

    def transform(self, X):
        """Return the RBF kernel between X and the fitted cluster centers."""
        # Same guard as the other transformers in this notebook: calling
        # transform() before fit() reports "not fitted" instead of failing on
        # the missing kmeans_ attribute.
        check_is_fitted(self)
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one name per cluster; the ``names`` argument is ignored."""
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]

In [71]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

# Column-name lists for the numeric and categorical attributes.
# NOTE(review): the ColumnTransformer built in the cells shown below names its
# columns inline and does not reference these two lists - confirm they are
# still needed.
num_attribs = ["longitude", "latitude", "housing_median_age", "total_rooms",
               "total_bedrooms", "population", "households", "median_income"]

cat_attribs = ["ocean_proximity"]

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode, ignoring categories not seen during fit.
cat_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"),
                             OneHotEncoder(handle_unknown="ignore"))


In [57]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline

def column_ratio(X):
    """Divide the first column of a 2D array by its second column.

    Both columns are picked with list indices, so each keeps its column axis
    and the quotient has a single column.
    """
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Feature-name callback for the ratio step: always yields ``["ratio"]``.

    Both arguments are part of the callback signature and are not used.
    """
    feature_names_out = ["ratio"]
    return feature_names_out

def ratio_pipeline():
    """Build a new pipeline: median imputation, column ratio, standardization.

    A fresh pipeline is returned on every call, so each caller gets its own
    independent transformer instances.
    """
    steps = (
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler(),
    )
    return make_pipeline(*steps)

# Median-impute, apply np.log, then standardize; output feature names mirror
# the input names one-to-one.
log_pipeline = make_pipeline(SimpleImputer(strategy="median"), FunctionTransformer(np.log, feature_names_out="one-to-one"), StandardScaler())

# Similarity of each row to 10 KMeans cluster centers (seeded for repeatability).
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)

# Fallback for columns no named transformer claims: median-impute, then standardize.
default_num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

# Full preprocessing: three ratio features, log-scaled counts and income,
# geographic cluster similarities, one-hot categoricals (object-dtype columns),
# and a default numeric pipeline for whatever is left over.
preprocessing = ColumnTransformer([("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
                                   ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
                                   ("people_per_house", ratio_pipeline(), ["population", "households"]),
                                   ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population", "households", "median_income"]),
                                   ("geo", cluster_simil, ["latitude", "longitude"]),
                                   ("cat", cat_pipeline, make_column_selector(dtype_include=object)),],
                                   remainder=default_num_pipeline) # columns not listed above: housing_median_age, plus income_cat (it was never dropped from the split sets)

In [58]:
# Fit the whole preprocessing step on the training predictors and inspect the
# shape of the resulting feature matrix.
housing_prepared = preprocessing.fit_transform(housing)
housing_prepared.shape

(16512, 25)

# Select and train model

In [67]:
from sklearn.svm import SVR

# Baseline: preprocessing followed by an SVR with default hyperparameters.
svm_reg = make_pipeline(preprocessing, SVR())
# Train on the first 5,000 rows only. Use .iloc so the selection is explicitly
# positional (the labels keep the shuffled integer index from the split) and
# matches the .iloc[:5000] used by every later cell.
svm_reg.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

In [68]:
# Predict on the full training set (the model was fit on its first 5,000 rows)
# and show the first five predictions rounded to the nearest hundred.
housing_predictions = svm_reg.predict(housing)
housing_predictions[:5].round(-2)

array([177200., 177700., 176900., 177000., 177300.])

In [75]:
housing_labels.iloc[:5].values
# housing_labels[:5].values

array([458300., 483800., 101700.,  96100., 361800.])

# Hyperparameter tuning
## 1. Grid search over SVR hyperparameters

In [76]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR


# Named steps so the search can address SVR parameters with the "svr__" prefix.
svr_pipeline = Pipeline([("preprocessing", preprocessing), ("svr", SVR()),])

# Two sub-grids: C alone for the linear kernel, and C x gamma for the RBF kernel.
param_grid = [{'svr__kernel': ["linear"], 'svr__C': [10, 30, 100, 300, 1000, 3000, 10000, 30000]},
              {'svr__kernel': ["rbf"], 'svr__C': [1.0, 3.0, 10., 30., 100., 300., 1000.0], 
                                       'svr__gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},]

# 3-fold cross-validation scored by negated RMSE.
grid_search = GridSearchCV(svr_pipeline, param_grid, cv=3, scoring='neg_root_mean_squared_error')

# Search on the first 5,000 training rows only.
grid_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

In [77]:
# The scorer is a negated RMSE, so flip the sign to report a plain RMSE.
svr_grid_search_rmse = -grid_search.best_score_
svr_grid_search_rmse

69314.46436684526

In [78]:
grid_search.best_params_

{'svr__C': 10000, 'svr__kernel': 'linear'}

In [87]:
# Tabulate the cross-validation results, best mean score first. The sorted
# frame is produced in one chained expression (no inplace=True), so re-running
# the cell always rebuilds the same result from grid_search.
cv_res = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
)
cv_res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svr__C,param_svr__kernel,param_svr__gamma,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
6,0.509136,0.030525,0.121501,0.001612,10000,linear,,"{'svr__C': 10000, 'svr__kernel': 'linear'}",-71658.198461,-70333.506457,-65951.688183,-69314.464367,2438.565148,1
7,0.78631,0.108371,0.122474,0.001162,30000,linear,,"{'svr__C': 30000, 'svr__kernel': 'linear'}",-73470.458504,-70449.55954,-65137.198152,-69685.738732,3444.645477,2
5,0.414946,0.002298,0.120336,0.000348,3000,linear,,"{'svr__C': 3000, 'svr__kernel': 'linear'}",-71627.67483,-71759.468187,-67434.017793,-70273.72027,2008.693604,3
4,0.379121,0.004031,0.125169,0.004546,1000,linear,,"{'svr__C': 1000, 'svr__kernel': 'linear'}",-72864.200142,-73247.551275,-68675.366943,-71595.70612,2070.91368,4
3,0.374873,0.017858,0.122212,0.001953,300,linear,,"{'svr__C': 300, 'svr__kernel': 'linear'}",-74692.73483,-75212.652008,-70565.627058,-73490.337965,2078.94667,5


## 2. Randomized search over SVR hyperparameters

In [91]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
from scipy.stats import expon, loguniform


# Same two-step pipeline as in the grid search.
svr_pipeline = Pipeline([("preprocessing", preprocessing), ("svr", SVR()),])

# Sample the kernel from a list, C from a log-uniform distribution over
# [20, 200_000], and gamma from an exponential distribution with scale 1.
param_distribs = {'svr__kernel': ["linear", "rbf"],
                  'svr__C': loguniform(20, 200_000),
                  'svr__gamma': expon(scale=1)}

# 50 sampled candidates, 3-fold CV, negated-RMSE scoring, seeded for repeatability.
random_search = RandomizedSearchCV(svr_pipeline, param_distribs, n_iter=50, cv=3, scoring='neg_root_mean_squared_error', random_state=42)

# Search on the first 5,000 training rows only.
random_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

In [92]:
# The scorer is a negated RMSE, so flip the sign to report a plain RMSE.
svr_random_search_rmse = -random_search.best_score_
svr_random_search_rmse

56650.27944265537

In [93]:
random_search.best_params_

{'svr__C': 157055.10989448498,
 'svr__gamma': 0.26497040005002437,
 'svr__kernel': 'rbf'}

## 3. Feature selection with SelectFromModel

In [95]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

# Preprocess, select features using a random forest's importances (with the
# threshold given below), then fit an SVR configured with the best
# hyperparameters found by the randomized search.
selector_pipeline = Pipeline([
    ('preprocessing', preprocessing),
    ('selector', SelectFromModel(RandomForestRegressor(random_state=42),
                                 threshold=0.005)),  # min feature importance
    ('svr', SVR(C=random_search.best_params_["svr__C"],
                gamma=random_search.best_params_["svr__gamma"],
                kernel=random_search.best_params_["svr__kernel"])),
])

In [97]:
from sklearn.model_selection import cross_val_score

# 3-fold cross-validated RMSE of the selector pipeline on the first 5,000
# training rows (scores are negated RMSEs, hence the leading minus).
selector_rmses = -cross_val_score(selector_pipeline,
                                  housing.iloc[:5000],
                                  housing_labels.iloc[:5000],
                                  scoring="neg_root_mean_squared_error",
                                  cv=3)
pd.Series(selector_rmses).describe()

count        3.000000
mean     56621.114270
std       2273.182398
min      54240.926178
25%      55546.904774
50%      56852.883371
75%      57811.208315
max      58769.533260
dtype: float64

## 4. Custom transformer that turns a regressor's predictions into a feature

In [101]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.base import MetaEstimatorMixin, clone

class FeatureFromRegressor(MetaEstimatorMixin, BaseEstimator, TransformerMixin):
    """Transformer whose output features are the predictions of a wrapped regressor.

    Parameters
    ----------
    estimator : estimator object
        Prototype regressor. ``fit`` trains a clone of it (stored as
        ``estimator_``) and leaves the prototype itself untouched.
        NOTE(review): the default value ``True`` is not an estimator, so a
        regressor presumably always has to be passed - confirm the intended
        default.
    """
    def __init__(self, estimator=True): # no *args or **kwargs!
        self.estimator = estimator

    def fit(self, X, y=None): # y is forwarded to the wrapped regressor's fit()
        """Fit a clone of ``estimator`` on (X, y) and return self."""
        estimator_ = clone(self.estimator)
        estimator_.fit(X, y)
        self.estimator_ = estimator_
        self.n_features_in_ = self.estimator_.n_features_in_
        # Take the feature names from the *fitted* clone. The prototype in
        # self.estimator is never fitted here, so reading it either found
        # nothing or picked up stale names from an unrelated earlier fit.
        if hasattr(self.estimator_, "feature_names_in_"):
            self.feature_names_in_ = self.estimator_.feature_names_in_
        elif hasattr(self, "feature_names_in_"):
            # Refit on unnamed data: drop names left over from a previous fit.
            del self.feature_names_in_
        return self # always return self!

    def transform(self, X):
        """Return the fitted regressor's predictions for X as a 2D array."""
        check_is_fitted(self) # looks for learned attributes (with trailing _)
        predictions = self.estimator_.predict(X)
        if predictions.ndim == 1:
            # Single-output regressors return a 1D array; make it one column.
            predictions = predictions.reshape(-1, 1)
        return predictions

    def get_feature_names_out(self, names=None):
        """Return one name per output, built from the wrapped class name.

        The ``names`` argument is ignored.
        """
        check_is_fitted(self)
        # Regressors that do not expose n_outputs_ are treated as single-output.
        n_outputs = getattr(self.estimator_, "n_outputs_", 1)
        estimator_class_name = self.estimator_.__class__.__name__
        estimator_short_name = estimator_class_name.lower().replace("_", "")
        return [f"{estimator_short_name}_prediction_{i}"
                for i in range(n_outputs)]

In [102]:
from sklearn.utils.estimator_checks import check_estimator

# Run scikit-learn's estimator checks against the custom transformer.
check_estimator(FeatureFromRegressor(KNeighborsRegressor()))

In [103]:
# Wrap a distance-weighted 3-nearest-neighbours regressor and fit it on the
# two geographic columns, using the house values as the regression target.
knn_reg = KNeighborsRegressor(n_neighbors=3, weights="distance")
knn_transformer = FeatureFromRegressor(knn_reg)
geo_features = housing[["latitude", "longitude"]]
knn_transformer.fit_transform(geo_features, housing_labels)

array([[486100.66666667],
       [435250.        ],
       [105100.        ],
       ...,
       [148800.        ],
       [500001.        ],
       [234333.33333333]])

In [104]:
knn_transformer.get_feature_names_out()

['kneighborsregressor_prediction_0']

Including the new transformer in the preprocessing pipeline:

In [105]:
from sklearn.base import clone

# Copy the (name, transformer, columns) triples of the existing preprocessing,
# cloning each transformer so the originals are left untouched.
transformers = [(name, clone(transformer), columns)
                for name, transformer, columns in preprocessing.transformers]
# Swap the "geo" entry (cluster similarity) for the KNN-based transformer.
geo_index = [name for name, _, _ in transformers].index("geo")
transformers[geo_index] = ("geo", knn_transformer, ["latitude", "longitude"])

# NOTE(review): `remainder` is not passed here, so the remainder pipeline of
# the original `preprocessing` is not carried over - confirm this is intended.
new_geo_preprocessing = ColumnTransformer(transformers)

In [106]:
# New preprocessing followed by an SVR configured with the best
# hyperparameters found by the randomized search.
new_geo_pipeline = Pipeline([
    ('preprocessing', new_geo_preprocessing),
    ('svr', SVR(C=random_search.best_params_["svr__C"],
                gamma=random_search.best_params_["svr__gamma"],
                kernel=random_search.best_params_["svr__kernel"])),
])

In [107]:
# 3-fold cross-validated RMSE of the KNN-feature pipeline on the first 5,000
# training rows (scores are negated RMSEs, hence the leading minus).
new_pipe_rmses = -cross_val_score(new_geo_pipeline,
                                  housing.iloc[:5000],
                                  housing_labels.iloc[:5000],
                                  scoring="neg_root_mean_squared_error",
                                  cv=3)
pd.Series(new_pipe_rmses).describe()

count         3.000000
mean     105034.396111
std        2919.196941
min      101811.040419
25%      103801.592804
50%      105792.145190
75%      106646.073957
max      107500.002724
dtype: float64

## 5. Randomized search including the KNN transformer's hyperparameters

In [109]:
# Tune the wrapped KNN regressor (reached through the nested
# "preprocessing__geo__estimator__" prefix) together with the SVR's C and gamma.
# NOTE: this cell rebinds `param_distribs` and `random_search`. The new search
# has no 'svr__kernel' entry, so the earlier cells that read
# random_search.best_params_["svr__kernel"] must run before this one.
param_distribs = {'preprocessing__geo__estimator__n_neighbors': range(1, 30),
              'preprocessing__geo__estimator__weights': ["distance", "uniform"],
              'svr__C': loguniform(20, 200_000),
              'svr__gamma': expon(scale=1)}

# 50 sampled candidates, 3-fold CV, negated-RMSE scoring, seeded for repeatability.
random_search = RandomizedSearchCV(new_geo_pipeline, param_distribs, n_iter=50, cv=3, scoring='neg_root_mean_squared_error', random_state=42)

# Search on the first 5,000 training rows only.
random_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

In [110]:
# The scorer is a negated RMSE, so flip the sign to report a plain RMSE.
# NOTE(review): the "km_" prefix looks like a leftover name; this search
# tunes the KNN-based pipeline.
km_random_search_rmse = -random_search.best_score_
km_random_search_rmse

106867.3603029704

In [111]:
random_search.best_params_

{'preprocessing__geo__estimator__n_neighbors': 20,
 'preprocessing__geo__estimator__weights': 'distance',
 'svr__C': 55456.48365602121,
 'svr__gamma': 0.006976409181650647}

## 6. StandardScalerClone with inverse_transform and feature-name support

In [137]:

class StandardScalerClone(BaseEstimator, TransformerMixin):
    """Standardizing transformer with inverse transform and feature-name support.

    Parameters
    ----------
    with_mean : bool, default=True
        When True, the per-column mean learned in ``fit`` is subtracted in
        ``transform`` and added back in ``inverse_transform``.

    Attributes set by ``fit``
    -------------------------
    mean_ : per-column mean of the training data.
    scale_ : per-column standard deviation of the training data.
    n_features_in_ : number of columns seen in ``fit``.
    feature_names_in_ : column names, only when ``fit`` received an object
        with a ``columns`` attribute.
    """
    def __init__(self, with_mean=True): # no *args or **kwargs!
        self.with_mean = with_mean

    def _ensure_n_features(self, n_features):
        """Raise ValueError unless ``n_features`` matches what ``fit`` saw."""
        if self.n_features_in_ != n_features:
            raise ValueError(
                f"X has {n_features} features, but {self.__class__.__name__} "
                f"is expecting {self.n_features_in_} features as input"
            )

    def fit(self, X, y=None): # y is required even though we don't use it
        """Learn per-column mean and standard deviation of X; return self."""
        X_orig = X
        X = check_array(X) # checks that X is an array with finite float values
        self.mean_ = X.mean(axis=0)
        self.scale_ = X.std(axis=0)
        self.n_features_in_ = X.shape[1] # every estimator stores this in fit()
        if hasattr(X_orig, "columns"):
            self.feature_names_in_ = np.array(X_orig.columns, dtype=object)
        elif hasattr(self, "feature_names_in_"):
            # Refit on unnamed data: drop names left over from a previous fit.
            del self.feature_names_in_
        return self # always return self!

    def transform(self, X):
        """Return X centered (if ``with_mean``) and divided by ``scale_``.

        Raises ValueError if X has a different number of columns than in ``fit``.
        """
        check_is_fitted(self) # looks for learned attributes (with trailing _)
        X = check_array(X)
        self._ensure_n_features(X.shape[1])
        if self.with_mean:
            X = X - self.mean_
        return X / self.scale_

    def inverse_transform(self, X):
        """Undo ``transform``: multiply by ``scale_``, then add ``mean_`` if ``with_mean``.

        Raises ValueError if X has a different number of columns than in ``fit``.
        """
        check_is_fitted(self)
        X = check_array(X)
        self._ensure_n_features(X.shape[1])
        # Rescale unconditionally. The previous one-line conditional expression
        # returned X untouched when with_mean was False, so scaling was never
        # undone in that mode.
        X = X * self.scale_
        return X + self.mean_ if self.with_mean else X

    def get_feature_names_out(self, input_features=None):
        """Return the output feature names (identical to the input names).

        With ``input_features=None`` this is ``feature_names_in_`` when known,
        else ``["x0", "x1", ...]``. Otherwise ``input_features`` is validated
        against what ``fit`` saw and returned as given.

        Raises ValueError if ``input_features`` has the wrong length or
        disagrees with ``feature_names_in_``.
        """
        check_is_fitted(self)
        # `is None`, not `== None`: the equality test is elementwise on arrays
        # and then has no single truth value.
        if input_features is None:
            return getattr(self, "feature_names_in_",
                           [f"x{i}" for i in range(self.n_features_in_)])
        # Validate the names that were passed in. This method has no X
        # argument; the old check silently read whatever global X existed.
        if len(input_features) != self.n_features_in_:
            raise ValueError(
                f"input_features has {len(input_features)} elements, "
                f"expected {self.n_features_in_}"
            )
        if hasattr(self, "feature_names_in_") and not np.all(self.feature_names_in_ == input_features):
            raise ValueError("input_features ≠ feature_names_in_")
        return input_features

In [138]:
from sklearn.utils.estimator_checks import check_estimator
 
# Run scikit-learn's estimator checks against the custom scaler.
check_estimator(StandardScalerClone())

In [139]:
# Seeded random data so the checks below are repeatable.
np.random.seed(42)
X = np.random.rand(1000, 3)

scaler = StandardScalerClone()
X_scaled = scaler.fit_transform(X)

# Default mode must match the textbook formula (x - mean) / std per column.
assert np.allclose(X_scaled, (X - X.mean(axis=0)) / X.std(axis=0))

In [140]:
# with_mean=False must only divide by the standard deviation, without centering.
scaler = StandardScalerClone(with_mean=False)
X_scaled_uncentered = scaler.fit_transform(X)

assert np.allclose(X_scaled_uncentered, X / X.std(axis=0))

In [141]:
# Round trip in the default (with_mean=True) mode must recover the input.
scaler = StandardScalerClone()
X_back = scaler.inverse_transform(scaler.fit_transform(X))

assert np.allclose(X, X_back)

In [142]:
# Fitted on a plain array: default names are x0..x2, and explicitly passed
# names are returned unchanged.
assert np.all(scaler.get_feature_names_out() == ["x0", "x1", "x2"])
assert np.all(scaler.get_feature_names_out(["a", "b", "c"]) == ["a", "b", "c"])

In [143]:
# Fitted on a DataFrame: the column names must be recorded and reported back.
df = pd.DataFrame({"a": np.random.rand(100), "b": np.random.rand(100)})
scaler = StandardScalerClone()
X_scaled = scaler.fit_transform(df)

assert np.all(scaler.feature_names_in_ == ["a", "b"])
assert np.all(scaler.get_feature_names_out() == ["a", "b"])