In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM so the dataset stored there is readable.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

# Folder that holds housing.csv. The file name is appended directly to this
# prefix when the data is loaded, so the trailing slash is required.
# Built from the absolute mount point so that the path does not depend on the
# notebook's current working directory.
dirpath = MOUNT_POINT + '/MyDrive/MachineLearning/HandsOnMachineLearning/california-housing/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import (StratifiedShuffleSplit, cross_val_score,
                                     GridSearchCV, RandomizedSearchCV)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from scipy.stats import randint, expon, reciprocal
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Column positions read by CombinedAttributesAdder.transform from the numeric
# feature array it receives.
# NOTE(review): assumes the numeric columns keep the order of housing.csv — confirm.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6

def indices_of_top(arr , k):
    """Return the sorted indices of the ``k`` largest values in ``arr``.

    Parameters
    ----------
    arr : array-like
        One-dimensional sequence of scores (e.g. feature importances).
    k : int
        Number of top entries to select; must be non-negative.

    Returns
    -------
    numpy.ndarray
        Indices of the ``k`` largest entries, in ascending index order.
        Empty when ``k`` is 0.

    Raises
    ------
    ValueError
        If ``k`` is negative, or (raised by numpy) if ``k`` exceeds the
        number of elements.
    """
    if k < 0:
        raise ValueError("k must be non-negative, got %r" % (k,))
    if k == 0:
        # Without this guard the slice below becomes [-0:], i.e. the whole
        # array, and every index would be returned instead of none.
        return np.array([], dtype=np.intp)

    values = np.asarray(arr)
    # argpartition places the k largest entries in the last k slots (in no
    # particular order); sorting afterwards yields ascending indices.
    return np.sort(np.argpartition(values, -k)[-k:])

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends ratio columns to a 2-D array.

    Two columns are always appended: rooms per household and population per
    household. When ``add_bedrooms_per_room`` is true a third column,
    bedrooms per room, is appended as well. Column positions are taken from
    the module-level ``*_ix`` constants.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]

        extra_columns = [
            rooms / households,                  # rooms per household
            X[:, population_ix] / households,    # population per household
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)  # bedrooms per room

        # np.c_ turns each 1-D ratio into a column and concatenates it to X.
        return np.c_[tuple([X] + extra_columns)]

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the ``k`` most important columns.

    Deriving from ``BaseEstimator`` supplies ``get_params``/``set_params``,
    which a hyperparameter search needs in order to tune ``k`` through a
    pipeline (``<step>__k``); ``TransformerMixin`` supplies ``fit_transform``.

    Parameters
    ----------
    feature_importances : array-like
        One importance score per input column.
    k : int
        Number of highest-scoring columns to keep.

    Attributes
    ----------
    feature_indices_ : numpy.ndarray
        Sorted indices of the selected columns; set by ``fit``.
    """

    def __init__(self, feature_importances, k):
        # Stored unmodified under the same names as the constructor
        # arguments, as get_params/set_params require.
        self.feature_importances = feature_importances
        self.k = k

    def fit(self, X, y=None):
        """Compute the indices of the top-``k`` columns; ``X`` and ``y`` are unused."""
        self.feature_indices_ = indices_of_top(self.feature_importances, self.k)
        return self

    def transform(self, X):
        """Return only the selected columns of the 2-D array ``X``."""
        return X[:, self.feature_indices_]

In [None]:
# --- Load the raw data --------------------------------------------------------
housing = pd.read_csv(dirpath + 'housing.csv')

# Bucket median income into five categories; the column is only used to
# stratify the split and is dropped again right after.
housing['income_cat'] = pd.cut(
    housing['median_income'],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5],
)

# --- Stratified 80/20 split ---------------------------------------------------
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(housing, housing['income_cat']))
train_set = housing.loc[train_index].drop(columns='income_cat')
test_set = housing.loc[test_index].drop(columns='income_cat')

# --- Separate the target from the predictors ----------------------------------
train_labels = train_set['median_house_value'].copy()
train_set = train_set.drop(columns='median_house_value')
test_labels = test_set['median_house_value'].copy()
test_set = test_set.drop(columns='median_house_value')

# --- Column groups --------------------------------------------------------------
cat_attributes = ['ocean_proximity']
train_set_num = train_set.drop(columns=cat_attributes)
num_attributes = list(train_set_num)

# --- Preprocessing --------------------------------------------------------------
# Numeric columns: median imputation -> ratio features -> standardisation.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
# Categorical column: one-hot encoding.
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attributes),
    ('cat', OneHotEncoder(), cat_attributes),
])
train_set_prepared = full_pipeline.fit_transform(train_set)

In [None]:
# Baseline: support-vector regressor with default hyperparameters.
svr = SVR()
svr.fit(train_set_prepared, train_labels)

# RMSE on the data the model was fitted on. Despite its name,
# test_predictions holds predictions for the *training* set.
test_predictions = svr.predict(train_set_prepared)
svr_mse = mean_squared_error(train_labels, test_predictions)
svr_rmse = np.sqrt(svr_mse)

# 10-fold cross-validation; the scorer returns negated MSE, so flip the sign
# before taking the square root.
svr_scores = cross_val_score(
    svr,
    train_set_prepared,
    train_labels,
    scoring='neg_mean_squared_error',
    cv=10,
)
svr_rmse_scores = np.sqrt(-svr_scores)

# Training RMSE, then mean and standard deviation of the CV RMSE, one per line.
print(svr_rmse, svr_rmse_scores.mean(), svr_rmse_scores.std(), sep='\n')

In [None]:
def print_search_result(search):
    """Print a summary of a fitted hyperparameter search.

    Prints the best parameters and best estimator, followed by one line per
    evaluated candidate showing ``sqrt(-mean_test_score)`` and its parameters.

    Parameters
    ----------
    search : object
        A fitted search exposing ``best_params_``, ``best_estimator_`` and
        ``cv_results_`` (with ``'mean_test_score'`` and ``'params'`` entries).
        The scores are expected to be negated squared errors, so that the
        printed value is an RMSE.
    """
    print('best_params:', search.best_params_)
    print('best_estimator:', search.best_estimator_)

    results = search.cv_results_
    # Scores are negated before the square root: the searches are scored with
    # a negated-MSE metric.
    for mean_score, params in zip(results['mean_test_score'], results['params']):
        print(np.sqrt(-mean_score), params)

In [None]:
# A wider grid, left disabled:
# param_grid =  [
#     {'kernel': ['linear'], 'C': [1, 10, 20]},
#     {'kernel': ['rbf'], 'C': [1, 10, 20], 'gamma': ['scale', 'auto', 1, 10, 20]}
#     ]

# Active grid: linear kernel only, three values of C.
param_grid = [
    dict(kernel=['linear'], C=[20, 40, 60]),
]

# Exhaustive 5-fold search scored by negated MSE.
grid_search = GridSearchCV(
    estimator=svr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)
grid_search.fit(train_set_prepared, train_labels)

print_search_result(grid_search)

In [None]:
# Distributions to sample SVR hyperparameters from:
#   kernel - chosen uniformly from the two options
#   C      - log-uniform between 20 and 200000
#   gamma  - exponential with scale 1
random_param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': reciprocal(20, 200000),
    'gamma': expon(scale=1),
    }
# random_state pins the sampled candidates so a fresh run reproduces the same
# best_params_ (which a later cell feeds into the final SVR).
# NOTE(review): n_iter=1 evaluates a single sampled candidate; raise it for a
# real search if the extra training time is acceptable.
randomized_search = RandomizedSearchCV(svr, random_param_grid, cv=5, n_iter=1,
                                       scoring='neg_mean_squared_error',
                                       return_train_score=True,
                                       random_state=42)
randomized_search.fit(train_set_prepared, train_labels)
print_search_result(randomized_search)

In [None]:
param_grid = [
    # 3 x 4 = 12 combinations, bootstrap left at its default
    dict(n_estimators=[3, 10, 30], max_features=[2, 4, 6, 8]),
    # 2 x 3 = 6 further combinations with bootstrapping switched off
    dict(bootstrap=[False], n_estimators=[3, 10], max_features=[2, 3, 4]),
]

forest_reg = RandomForestRegressor(random_state=42)

# 18 candidates x 5 folds = 90 cross-validation fits
forest_grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)
forest_grid_search.fit(train_set_prepared, train_labels)

In [None]:
# Number of highest-importance features to keep in the cells that follow.
k = 5

# One importance score per prepared column, taken from the best forest found
# by the grid search.
best_forest = forest_grid_search.best_estimator_
feature_importances = best_forest.feature_importances_
print(feature_importances)

In [None]:
# Column groups, re-declared with the same values used to build full_pipeline.
cat_attributes = ['ocean_proximity']
num_attributes = list(train_set_num)

# Preparation followed by selection of the k most important columns.
feature_selection_pipeline = Pipeline(steps=[
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
])

train_set_prepared_top_k_features = feature_selection_pipeline.fit_transform(
    train_set
)

In [None]:
# Eyeball check: first three rows of the reduced matrix, then of the full
# prepared matrix, to compare the selected columns against their source.
print(train_set_prepared_top_k_features[:3])
print(train_set_prepared[:3])

In [None]:
# End-to-end pipeline: preparation -> top-k feature selection -> SVR using the
# hyperparameters found by the randomized search.
prepare_select_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('prediction', SVR(**randomized_search.best_params_))
])
# Fit on the training data only. The test set must stay unseen until the final
# evaluation; fitting on it would make the reported test error meaningless.
prepare_select_predict_pipeline.fit(train_set, train_labels)

In [None]:
# Score the complete pipeline on the test set and report the RMSE.
final_test_predictions = prepare_select_predict_pipeline.predict(test_set)
final_svr_mse = mean_squared_error(
    y_true=test_labels,
    y_pred=final_test_predictions,
)
final_svr_rmse = np.sqrt(final_svr_mse)
print(final_svr_rmse)

113757.33191694897


# 5. Grid search over data-preparation options

In [None]:
# Make the one-hot encoder ignore categories it did not see during fit, so a
# CV fold whose training part lacks a rare category does not fail at
# transform time. This is set through set_params on the transformer
# definition because GridSearchCV clones the pipeline from its parameters;
# changing the already-fitted copy in named_transformers_ would not carry
# over to those clones.
full_pipeline.set_params(cat__handle_unknown='ignore')

# Search over the imputation strategy and the number of features kept.
# NOTE: tuning 'feature_selection__k' requires the feature-selection step to
# implement get_params/set_params (i.e. to be a scikit-learn BaseEstimator).
param_grid = [{
    'preparation__num__imputer__strategy': ['mean', 'median', 'most_frequent'],
    'feature_selection__k': list(range(1, len(feature_importances) + 1))
}]

grid_search_prep = GridSearchCV(prepare_select_predict_pipeline, param_grid, cv=5,
                                scoring='neg_mean_squared_error', verbose=2)
grid_search_prep.fit(train_set, train_labels)

In [None]:
# Fitted-search attributes carry a trailing underscore: best_params_.
print(grid_search_prep.best_params_)