In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import rbf_kernel

In [2]:
from sklearn import set_config
set_config(display="diagram")

In [3]:
housing = pd.read_csv(Path("../data/datasets/housing/housing.csv"))

In [4]:
housing["income_cat"] = pd.cut(housing["median_income"],bins=[0., 1.5, 3.0, 4.5, 6., np.inf],labels=[1, 2, 3, 4, 5])

In [5]:
strat_train_set, strat_test_set = train_test_split(housing, test_size=0.2, stratify=housing["income_cat"], random_state=42)

In [6]:
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

In [7]:
housing = strat_train_set.copy()

In [8]:
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [9]:
from sklearn.cluster import KMeans

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state
        
    def fit(self, X, y=None, sample_weight=None):
        self.kmeans_ = KMeans(self.n_clusters, random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self # always return self!
    
    def transform(self, X):
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)
    
    def get_feature_names_out(self, names=None):
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]

In [10]:
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)

In [11]:
def column_ratio(X):
    return X[:, [0]] / X[:, [1]]

In [12]:
def ratio_name(function_transformer, feature_names_in):
    return ["ratio"] # feature names out

def ratio_pipeline():
    return make_pipeline(
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler()
    )


In [13]:
log_pipeline = make_pipeline(
                SimpleImputer(strategy="median"),
                FunctionTransformer(np.log, feature_names_out="one-to-one"),
                StandardScaler()
            )

In [14]:
cat_pipeline = make_pipeline(
                SimpleImputer(strategy="most_frequent"),
                OneHotEncoder(handle_unknown="ignore")
            )

In [15]:
default_num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

In [16]:
preprocessing = ColumnTransformer([
                    ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
                    ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
                    ("people_per_house", ratio_pipeline(), ["population", "households"]),
                    ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                    "households", "median_income"]),
                    ("geo", cluster_simil, ["latitude", "longitude"]),
                    ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
                ],
                remainder=default_num_pipeline # one column remaining: housing_median_age
            ) 

In [17]:
housing_prepared = preprocessing.fit_transform(housing)
housing_prepared.shape

(16512, 24)

In [18]:
preprocessing.get_feature_names_out()

array(['bedrooms__ratio', 'rooms_per_house__ratio',
       'people_per_house__ratio', 'log__total_bedrooms',
       'log__total_rooms', 'log__population', 'log__households',
       'log__median_income', 'geo__Cluster 0 similarity',
       'geo__Cluster 1 similarity', 'geo__Cluster 2 similarity',
       'geo__Cluster 3 similarity', 'geo__Cluster 4 similarity',
       'geo__Cluster 5 similarity', 'geo__Cluster 6 similarity',
       'geo__Cluster 7 similarity', 'geo__Cluster 8 similarity',
       'geo__Cluster 9 similarity', 'cat__ocean_proximity_<1H OCEAN',
       'cat__ocean_proximity_INLAND', 'cat__ocean_proximity_ISLAND',
       'cat__ocean_proximity_NEAR BAY', 'cat__ocean_proximity_NEAR OCEAN',
       'remainder__housing_median_age'], dtype=object)

# Linear Regression (Base model)

In [32]:
lin_reg = make_pipeline(preprocessing, LinearRegression())

lin_reg.fit(housing, housing_labels)

0,1,2
,steps,"[('columntransformer', ...), ('linearregression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('bedrooms', ...), ('rooms_per_house', ...), ...]"
,remainder,Pipeline(step...ardScaler())])
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...001D4E6884B80>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...001D4E6884D60>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...001D4E6884B80>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...001D4E6884D60>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...001D4E6884B80>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...001D4E6884D60>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<ufunc 'log'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_clusters,10.0
,gamma,1.0
,random_state,42.0

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [41]:
housing_predictions = lin_reg.predict(housing)

In [42]:
housing_predictions[:5].round(-2)

array([246000., 372700., 135700.,  91400., 330900.])

In [43]:
housing_labels.iloc[:5].values

array([458300., 483800., 101700.,  96100., 361800.])

In [19]:
from sklearn.metrics import root_mean_squared_error

In [45]:
lin_rmse = root_mean_squared_error(housing_labels, housing_predictions)
lin_rmse

68972.88910758478

# Decision Tree

In [20]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
tree_reg = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=42))
tree_reg.fit(housing, housing_labels)

0,1,2
,steps,"[('columntransformer', ...), ('decisiontreeregressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('bedrooms', ...), ('rooms_per_house', ...), ...]"
,remainder,Pipeline(step...ardScaler())])
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...001D4E6884B80>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...001D4E6884D60>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...001D4E6884B80>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...001D4E6884D60>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...001D4E6884B80>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...001D4E6884D60>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<ufunc 'log'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_clusters,10.0
,gamma,1.0
,random_state,42.0

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [49]:
housing_predictions = tree_reg.predict(housing)
tree_rmse = root_mean_squared_error(housing_labels, housing_predictions)
tree_rmse

0.0

## Better Evaluation Using Cross-Validation

In [21]:
from sklearn.model_selection import cross_val_score

In [None]:
tree_rmses = -cross_val_score(tree_reg, housing, housing_labels, scoring="neg_root_mean_squared_error", cv=10)

In [51]:
pd.Series(tree_rmses).describe()

count       10.000000
mean     66573.734600
std       1103.402323
min      64607.896046
25%      66204.731788
50%      66388.272499
75%      66826.257468
max      68532.210664
dtype: float64

# Random Forest 

In [22]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
forest_reg = make_pipeline(preprocessing, RandomForestRegressor(random_state=42))

forest_rmses = -cross_val_score(forest_reg, housing, housing_labels, scoring="neg_root_mean_squared_error", cv=10)

In [54]:
pd.Series(forest_rmses).describe()

count       10.000000
mean     47038.092799
std       1021.491757
min      45495.976649
25%      46510.418013
50%      47118.719249
75%      47480.519175
max      49140.832210
dtype: float64

# Fine-Tune Your Model

### 1. Grid Search

In [25]:
from sklearn.model_selection import GridSearchCV

full_pipeline = Pipeline([
                    ("preprocessing", preprocessing),
                    ("random_forest", RandomForestRegressor(random_state=42)),
                ])

In [25]:
param_grid = [
            {'preprocessing__geo__n_clusters': [5, 8, 10], 'random_forest__max_features': [4, 6, 8]},
            {'preprocessing__geo__n_clusters': [10, 15], 'random_forest__max_features': [6, 8, 10]},
        ]

grid_search = GridSearchCV(full_pipeline, param_grid, cv=3, scoring='neg_root_mean_squared_error')

grid_search.fit(housing, housing_labels)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"[{'preprocessing__geo__n_clusters': [5, 8, ...], 'random_forest__max_features': [4, 6, ...]}, {'preprocessing__geo__n_clusters': [10, 15], 'random_forest__max_features': [6, 8, ...]}]"
,scoring,'neg_root_mean_squared_error'
,n_jobs,
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('bedrooms', ...), ('rooms_per_house', ...), ...]"
,remainder,Pipeline(step...ardScaler())])
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...00230C18BD3A0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...00230C169E8E0>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...00230C18BD3A0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...00230C169E8E0>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...00230C18BD3A0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...00230C169E8E0>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<ufunc 'log'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_clusters,15.0
,gamma,1.0
,random_state,42.0

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,6
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [26]:
grid_search.best_params_


{'preprocessing__geo__n_clusters': 15, 'random_forest__max_features': 6}

In [31]:
grid_search.best_estimator_

0,1,2
,steps,"[('preprocessing', ...), ('random_forest', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('bedrooms', ...), ('rooms_per_house', ...), ...]"
,remainder,Pipeline(step...ardScaler())])
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...00230C18BD3A0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...00230C169E8E0>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...00230C18BD3A0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...00230C169E8E0>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...00230C18BD3A0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...00230C169E8E0>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<ufunc 'log'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_clusters,15.0
,gamma,1.0
,random_state,42.0

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,6
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [40]:
cv_res = pd.DataFrame(grid_search.cv_results_)
cv_res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessing__geo__n_clusters,param_random_forest__max_features,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,4.620865,0.298328,0.170574,0.019077,5,4,"{'preprocessing__geo__n_clusters': 5, 'random_...",-46226.404919,-46146.401866,-47195.120579,-46522.642455,476.634202,13
1,5.971928,0.047455,0.153887,0.005787,5,6,"{'preprocessing__geo__n_clusters': 5, 'random_...",-46473.214688,-46383.628085,-47620.692379,-46825.84505,563.230649,14
2,7.90537,0.110492,0.144925,0.005617,5,8,"{'preprocessing__geo__n_clusters': 5, 'random_...",-46839.745574,-46904.579633,-47813.212568,-47185.845925,444.404127,15
3,4.517651,0.062169,0.159797,0.00885,8,4,"{'preprocessing__geo__n_clusters': 8, 'random_...",-44511.04679,-44519.243029,-45214.107933,-44748.132584,329.511319,7
4,6.210973,0.073935,0.158303,0.006383,8,6,"{'preprocessing__geo__n_clusters': 8, 'random_...",-44851.823543,-44827.316247,-45654.572385,-45111.237392,384.32611,10
5,8.116516,0.014063,0.147393,0.004252,8,8,"{'preprocessing__geo__n_clusters': 8, 'random_...",-45567.827559,-45323.269449,-46133.989451,-45675.02882,339.54461,12
6,4.616115,0.062954,0.160185,0.012527,10,4,"{'preprocessing__geo__n_clusters': 10, 'random...",-43797.854175,-44036.240246,-44960.694004,-44264.929475,501.51317,3
7,6.424798,0.075807,0.151665,0.003545,10,6,"{'preprocessing__geo__n_clusters': 10, 'random...",-43709.66105,-44163.463178,-44966.539107,-44279.887778,519.680433,4
8,9.198964,0.694314,0.230702,0.038025,10,8,"{'preprocessing__geo__n_clusters': 10, 'random...",-44498.988402,-44883.300454,-45264.655671,-44882.314842,312.583131,8
9,6.856958,0.096966,0.156879,0.004428,10,6,"{'preprocessing__geo__n_clusters': 10, 'random...",-43709.66105,-44163.463178,-44966.539107,-44279.887778,519.680433,4


In [41]:
cv_res.sort_values(by="mean_test_score", ascending=False, inplace=True)
cv_res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessing__geo__n_clusters,param_random_forest__max_features,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
12,8.036563,0.049754,0.207212,0.015743,15,6,"{'preprocessing__geo__n_clusters': 15, 'random...",-42725.4238,-43708.197434,-44334.935606,-43589.518946,662.417543,1
13,10.599247,0.053074,0.234922,0.00703,15,8,"{'preprocessing__geo__n_clusters': 15, 'random...",-43486.175916,-43819.842374,-44899.96868,-44068.662323,603.399271,2
6,4.616115,0.062954,0.160185,0.012527,10,4,"{'preprocessing__geo__n_clusters': 10, 'random...",-43797.854175,-44036.240246,-44960.694004,-44264.929475,501.51317,3
9,6.856958,0.096966,0.156879,0.004428,10,6,"{'preprocessing__geo__n_clusters': 10, 'random...",-43709.66105,-44163.463178,-44966.539107,-44279.887778,519.680433,4
7,6.424798,0.075807,0.151665,0.003545,10,6,"{'preprocessing__geo__n_clusters': 10, 'random...",-43709.66105,-44163.463178,-44966.539107,-44279.887778,519.680433,4


In [42]:
cv_res = cv_res[[
    "param_preprocessing__geo__n_clusters",
    "param_random_forest__max_features",
    "split0_test_score",
    "split1_test_score",
    "split2_test_score",
    "mean_test_score"
]].copy()

score_cols = ["split0", "split1", "split2", "mean_test_rmse"]

cv_res.columns = ["n_clusters", "max_features"] + score_cols

cv_res.loc[:, score_cols] = (
    -cv_res.loc[:, score_cols]
        .round()
        .astype(np.int64)
)

cv_res.head()


Unnamed: 0,n_clusters,max_features,split0,split1,split2,mean_test_rmse
12,15,6,42725.0,43708.0,44335.0,43590.0
13,15,8,43486.0,43820.0,44900.0,44069.0
6,10,4,43798.0,44036.0,44961.0,44265.0
9,10,6,43710.0,44163.0,44967.0,44280.0
7,10,6,43710.0,44163.0,44967.0,44280.0


### Randomized Search


In [23]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [26]:
param_distribs = {'preprocessing__geo__n_clusters': randint(low=3, high=50), 'random_forest__max_features': randint(low=2, high=20)}

rnd_search = RandomizedSearchCV(full_pipeline, param_distributions=param_distribs, n_iter=10, cv=3, scoring='neg_root_mean_squared_error', random_state=42)

rnd_search.fit(housing, housing_labels)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_distributions,"{'preprocessing__geo__n_clusters': <scipy.stats....0020D9E8133B0>, 'random_forest__max_features': <scipy.stats....0020D9EA0DEE0>}"
,n_iter,10
,scoring,'neg_root_mean_squared_error'
,n_jobs,
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,transformers,"[('bedrooms', ...), ('rooms_per_house', ...), ...]"
,remainder,Pipeline(step...ardScaler())])
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...0020D9E941260>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...0020D9E941080>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...0020D9E941260>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...0020D9E941080>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...0020D9E941260>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...0020D9E941080>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<ufunc 'log'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_clusters,45.0
,gamma,1.0
,random_state,42.0

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,9
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


Scikit-Learn also has HalvingRandomSearchCV and
HalvingGridSearchCV hyperparameter search classes

In [29]:
final_model = rnd_search.best_estimator_
final_model

0,1,2
,steps,"[('preprocessing', ...), ('random_forest', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('bedrooms', ...), ('rooms_per_house', ...), ...]"
,remainder,Pipeline(step...ardScaler())])
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...0020D9E941260>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...0020D9E941080>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...0020D9E941260>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...0020D9E941080>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function col...0020D9E941260>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function rat...0020D9E941080>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<ufunc 'log'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_clusters,45.0
,gamma,1.0
,random_state,42.0

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,9
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### Analyzing the Best Models and Their Errors

In [32]:
feature_importances = final_model["random_forest"].feature_importances_
feature_importances.round(2)

array([0.07, 0.05, 0.05, 0.01, 0.01, 0.01, 0.01, 0.19, 0.01, 0.02, 0.01,
       0.01, 0.01, 0.  , 0.01, 0.02, 0.01, 0.02, 0.01, 0.  , 0.01, 0.02,
       0.01, 0.01, 0.01, 0.  , 0.02, 0.01, 0.01, 0.  , 0.01, 0.01, 0.01,
       0.03, 0.01, 0.01, 0.01, 0.01, 0.04, 0.01, 0.02, 0.01, 0.02, 0.01,
       0.02, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.  , 0.07,
       0.  , 0.  , 0.  , 0.01])

In [33]:
sorted(zip(feature_importances, final_model["preprocessing"].get_feature_names_out()), reverse=True)

[(np.float64(0.18599734460509473), 'log__median_income'),
 (np.float64(0.07338850855844488), 'cat__ocean_proximity_INLAND'),
 (np.float64(0.06556941990883974), 'bedrooms__ratio'),
 (np.float64(0.05364871007672531), 'rooms_per_house__ratio'),
 (np.float64(0.04598870861894748), 'people_per_house__ratio'),
 (np.float64(0.04175269214442518), 'geo__Cluster 30 similarity'),
 (np.float64(0.025976797232869678), 'geo__Cluster 25 similarity'),
 (np.float64(0.023595895886342252), 'geo__Cluster 36 similarity'),
 (np.float64(0.02021056221732893), 'geo__Cluster 9 similarity'),
 (np.float64(0.018606917076661445), 'geo__Cluster 34 similarity'),
 (np.float64(0.01813798837462886), 'geo__Cluster 37 similarity'),
 (np.float64(0.017404353166326745), 'geo__Cluster 18 similarity'),
 (np.float64(0.01677838614384489), 'geo__Cluster 1 similarity'),
 (np.float64(0.015459009666188978), 'geo__Cluster 7 similarity'),
 (np.float64(0.015325731028175922), 'geo__Cluster 32 similarity'),
 (np.float64(0.01507377201503834

#### TIP

The sklearn.feature_selection.SelectFromModel transformer can automatically drop the least useful features for you: when you fit it, it trains a model (typically a random forest), looks at its feature_importances_ attribute, and selects the most useful features. Then when you call transform(), it drops the other features.


### Evaluate Your System on the Test Set

In [34]:
X_test = strat_test_set.drop("median_house_value", axis=1)

y_test = strat_test_set["median_house_value"].copy()

In [35]:
final_predictions = final_model.predict(X_test)
final_rmse = root_mean_squared_error(y_test, final_predictions)
print(final_rmse)

41445.533268606625


In [36]:
from scipy import stats

In [37]:
confidence = 0.95

squared_errors = (final_predictions - y_test) ** 2

np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1, loc=squared_errors.mean(), scale=stats.sem(squared_errors)))

array([39293.55594722, 43491.1590655 ])

## Save the Model

In [38]:
import joblib

joblib.dump(final_model, "housing_model_final.pkl")

['housing_model_final.pkl']

## Load the model

In [40]:
final_model_reloaded = joblib.load("housing_model_final.pkl")

In [41]:
new_data = housing.iloc[:5]
predictions = final_model_reloaded.predict(new_data)

In [None]:
predictions

array([441046.12, 454713.09, 104832.  , 101316.  , 336181.05])

: 