In [None]:
import joblib
import pandas as pd
import os
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the raw housing CSV into a DataFrame and preview its first rows.
data = pd.read_csv("../datasets/housing/housing.csv")
data.head()

In [25]:
from sklearn.model_selection import cross_val_score
# Helper function used in the first notebook
def display_scores(model, X_train, y_train):
    """Print the cross-validated RMSE of ``model`` on the given training data.

    The model is scored with ``cross_val_score`` using 10 folds and the
    ``neg_mean_squared_error`` scorer; each fold's score is converted to a
    root-mean-squared error before printing.

    Parameters
    ----------
    model : estimator
        Estimator passed straight through to ``cross_val_score``.
    X_train, y_train : array-like
        Features and targets passed straight through to ``cross_val_score``.

    Returns
    -------
    None
        The per-fold scores, their mean and their standard deviation are
        printed, not returned.
    """
    neg_mse_per_fold = cross_val_score(
        model,
        X_train,
        y_train,
        scoring="neg_mean_squared_error",
        cv=10,
        n_jobs=-1,
    )
    # The scorer reports negated MSE, so flip the sign before taking the root.
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)

    report = (
        ("Scores: ", rmse_per_fold),
        ("Mean: ", rmse_per_fold.mean()),
        ("STD: ", rmse_per_fold.std()),
    )
    for label, value in report:
        print(label, value)

# Exercise 1
Trying an SVR on the data to see if it does better than the Random Forest (I used a Gradient Boosting Regressor for the model).

In [4]:
%%time
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# Load the feature matrix and the target values from comma-separated text files.
# NOTE(review): presumably these files were written by the first notebook — confirm.
housing_prepared = np.loadtxt("../datasets/housing/housing_prepared.csv", delimiter=",")
housing_labels = np.loadtxt("../datasets/housing/housing_labels.csv", delimiter=",")

# 2 kernels x 7 values of C (300, 400, ..., 900; np.arange excludes the stop
# value 1000) = 14 candidates, each evaluated with 10-fold cross-validation.
param_grid = [
    {"kernel": ["linear", "rbf"], "C": np.arange(300, 1000, 100)}
]
grid_search = GridSearchCV(SVR(), param_grid, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)

# Slow cell: the recorded run took a little over 9 minutes of wall time.
grid_search.fit(housing_prepared, housing_labels)

CPU times: user 9.39 s, sys: 149 ms, total: 9.54 s
Wall time: 9min 20s


GridSearchCV(cv=10, estimator=SVR(), n_jobs=-1,
             param_grid=[{'C': array([300, 400, 500, 600, 700, 800, 900]),
                          'kernel': ['linear', 'rbf']}],
             scoring='neg_mean_squared_error')

In [5]:
# Kernel / C combination selected by the SVR grid search fitted above.
grid_search.best_params_

{'C': 700, 'kernel': 'linear'}

Grid Search likes the linear kernel and a C around 700. I'll learn more about what these actually mean later in the book. For now I will try the LinearSVR class, since it is faster, and wrap the search in a function for easier use further down the notebook.

In [20]:
from sklearn.svm import LinearSVR

def do_grid_search_linear(search_min, search_max, search_step, random_state=42):
    """Grid-search the ``C`` hyperparameter of a ``LinearSVR`` with 10-fold CV.

    The candidates are ``np.arange(search_min, search_max, search_step)``, so
    ``search_max`` itself is never tried. The search is fitted on the
    notebook-level ``housing_prepared`` / ``housing_labels`` arrays and scored
    with ``neg_mean_squared_error``.

    Parameters
    ----------
    search_min, search_max, search_step : number
        Start, exclusive stop and step of the ``C`` candidates.
    random_state : int or None, default=42
        Seed given to ``LinearSVR`` so that repeated runs select the same
        ``C``. Pass ``None`` to get the previous, unseeded behaviour.

    Returns
    -------
    GridSearchCV
        The fitted search object (``best_params_``, ``best_estimator_``, ...).
    """
    linear_param_grid = {
        "C": np.arange(search_min, search_max, search_step)
    }

    # LinearSVR draws random numbers while fitting; without a seed the best C
    # reported by the search can change from one run to the next.
    estimator = LinearSVR(random_state=random_state)

    linear_grid_search = GridSearchCV(estimator, linear_param_grid, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
    linear_grid_search.fit(housing_prepared, housing_labels)
    return linear_grid_search

Starting with the initial values I tested earlier

In [21]:
# Candidates are C = 300, 400, ..., 900 (np.arange excludes the stop value 1000).
search = do_grid_search_linear(300, 1000, 100)
search.best_params_

{'C': 900}

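It wants a larger C, which actually means *less* regularization (the regularization strength is inversely proportional to C). Interesting. I'll try values from 1000 up to 10000 in steps of 1000 and see what happens.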

In [22]:
# Candidates are C = 1000, 2000, ..., 10000 (the stop value 11000 is excluded).
search = do_grid_search_linear(1000, 11000, 1000)
search.best_params_

{'C': 10000}

In [23]:
%%time
# Candidates are C = 5000, 6000, ..., 49000: 45 values, each cross-validated over 10 folds.
search = do_grid_search_linear(5000, 50000, 1000)
search.best_params_

CPU times: user 1.17 s, sys: 54.3 ms, total: 1.22 s
Wall time: 2min 21s


{'C': 27000}

In [24]:
# Zoom in around the previous best (27000): C = 26000, 26100, ..., 27900.
search = do_grid_search_linear(26000, 28000, 100)
search.best_params_

{'C': 26600}

I could go exploring further, but that's outside the scope of the exercise. This value will be good enough, so let's see how well it does compared against the data.

In [26]:
# Print the 10-fold cross-validated RMSE of the best estimator from the last grid search.
display_scores(search.best_estimator_, X_train=housing_prepared, y_train=housing_labels)

Scores:  [72577.09811839 66094.399554   67178.39110867 70234.21497558
 67265.14917758 73946.68950485 77233.41884311 70028.74098011
 67481.15413525 70944.50959103]
Mean:  70298.37659885749
STD:  3341.2096780778534


This value is really close to what we got with the Random Forest and only a couple of thousand off the Gradient Booster. That's pretty good!

# Exercise 2
Replacing GridSearchCV with RandomizedSearchCV

In [29]:
from sklearn.model_selection import RandomizedSearchCV

def linear_svm_random_search(search_min, search_max, search_step, n_iter=100, random_state=42):
    """Randomized search over the ``C`` hyperparameter of a ``LinearSVR``.

    ``n_iter`` candidates are sampled from
    ``np.arange(search_min, search_max, search_step)`` (``search_max`` itself
    is excluded) and each one is evaluated with 10-fold cross-validation on
    the notebook-level ``housing_prepared`` / ``housing_labels`` arrays, using
    the ``neg_mean_squared_error`` scorer.

    Parameters
    ----------
    search_min, search_max, search_step : number
        Start, exclusive stop and step of the ``C`` candidates.
    n_iter : int, default=100
        Number of candidates to sample (previously hard-coded to 100).
    random_state : int or None, default=42
        Seed used both for sampling the candidates and for ``LinearSVR``, so
        repeated runs evaluate the same candidates and select the same ``C``.
        Pass ``None`` to get the previous, unseeded behaviour.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object (``best_params_``, ``best_estimator_``, ...).
    """
    linear_param_grid = {
        "C": np.arange(search_min, search_max, search_step)
    }

    # Two sources of randomness: which candidates get sampled, and the
    # estimator's own fitting procedure. Seed both for reproducible results.
    estimator = LinearSVR(random_state=random_state)

    random_search = RandomizedSearchCV(
        estimator,
        linear_param_grid,
        scoring="neg_mean_squared_error",
        n_jobs=-1,
        cv=10,
        n_iter=n_iter,
        random_state=random_state,
    )
    random_search.fit(housing_prepared, housing_labels)
    return random_search

In [30]:
# 499 candidate values of C (100, 200, ..., 49900), of which 100 are sampled.
search = linear_svm_random_search(100, 50000, 100)
search.best_params_



{'C': 38200}

In [31]:
# Print the 10-fold cross-validated RMSE of the best estimator from the randomized search.
display_scores(search.best_estimator_, housing_prepared, housing_labels)

Scores:  [72622.06149849 66081.56987982 67150.57716848 70217.77785789
 67283.07336727 73991.10336937 77151.11053227 69998.54895571
 67522.12514634 70918.10143383]
Mean:  70293.60492094714
STD:  3331.1507725819392


This seems to get closer to a solution with a smaller number of iterations. Random Search worked well here.

# Exercise 3
Try adding a transformer in the preparation pipeline to select only the most important attributes

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class BestAttributesSelector(BaseEstimator, TransformerMixin):
    def __init__(self, num_to_keep=3):
        self.num_to_keep = num_to_keep

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        