In [1]:
import pandas as pd
import numpy as np
import os

In [31]:
# Remote location of the dataset archive and the local directory it lives in.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory that contains ``housing.csv``. Defaults to ``HOUSING_PATH``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [40]:
# Load the dataset from the default path and preview its first five rows.
housing = load_housing_data()
housing.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [41]:
# Split the frame into the regression target and the remaining predictor columns.
housing_labels = housing["median_house_value"].copy()
housing_features = housing.drop(columns="median_house_value")


In [42]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    housing_features,
    housing_labels,
    test_size=0.2,
    random_state=42,
)

In [43]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Column groups are derived from dtypes: numeric columns vs. everything else.
# NOTE(review): every numeric column of X_train becomes a feature here; if the
# CSV carries a saved index column it would be included as well — confirm.
num_attribs = list(X_train.select_dtypes(include=np.number).columns)
cat_attribs = list(X_train.select_dtypes(exclude=np.number).columns)

# Numeric columns: fill missing values with the column median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Non-numeric columns: dense one-hot encoding; categories unseen during fit are ignored.
categorical_steps = [
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
cat_pipeline = Pipeline(steps=categorical_steps)

full_pipeline = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_attribs),
        ('cat', cat_pipeline, cat_attribs),
    ]
)

# Fit the preprocessing on the training split only, then reuse it for the test split.
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test)


In [57]:
from sklearn.svm import SVR
# Two support-vector regressors that differ only in kernel.
svm_reg = SVR(kernel="linear")
svm_rbg = SVR(kernel="rbf")

# Train both on the prepared training split.
svm_reg.fit(X_train_prepared, y_train)
svm_rbg.fit(X_train_prepared, y_train)

In [53]:
# Spot-check the linear model on the first five rows of the full dataset.
some_data = housing_features.head(5)
some_labels = housing_labels.head(5)
some_data_prepared = full_pipeline.transform(some_data)
linear_preds = svm_reg.predict(some_data_prepared)
print("Predictions:", linear_preds)

Predictions: [197385.97433131 204083.19285412 194450.61190235 187455.5561234
 180096.48261072]


In [58]:
# Same five rows, scored by the RBF-kernel model.
rbf_preds = svm_rbg.predict(some_data_prepared)
print("RBF Predictions:", rbf_preds)

RBF Predictions: [180660.68689952 180434.77714358 180607.92611473 180407.41077562
 180050.52338272]


In [55]:
# True target values for the same five rows, for comparison with the predictions.
print("Labels:", some_labels.tolist())

Labels: [452600.0, 358500.0, 352100.0, 341300.0, 342200.0]


In [59]:
from sklearn.metrics import mean_squared_error
# Test-set RMSE of the linear-kernel SVR.
housing_predictions = svm_reg.predict(X_test_prepared)
lin_mse = mean_squared_error(y_true=y_test, y_pred=housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

np.float64(110280.09168598946)

In [64]:
from sklearn.model_selection import GridSearchCV

# Search on the first 10,000 prepared training rows only
# (presumably to limit search time — the full set is not used here).
sample_rows = slice(None, 10000)
X_trained_Sample = X_train_prepared[sample_rows]
y_trained_Sample = y_train.iloc[sample_rows]  # positional slice, aligned with the array rows

# Both kernels share the same C candidates; gamma is only searched for the RBF kernel.
c_values = [0.1, 1, 10, 100]
param_grid = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values, 'gamma': ['scale', 0.01, 0.1, 1]},
]

svr_reg = SVR()

# 5-fold CV scored by negative MSE (higher is better), using all available cores.
grid_search = GridSearchCV(
    estimator=svr_reg,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    n_jobs=-1,
)

grid_search.fit(X_trained_Sample, y_trained_Sample)


In [65]:
# Show the hyperparameter combination with the best mean cross-validation score.
grid_search.best_params_

{'C': 100, 'kernel': 'linear'}

In [66]:
# Show the estimator refit with the best parameters found by the grid search.
grid_search.best_estimator_

In [67]:
# Print the cross-validated RMSE for every candidate; scores are stored as
# negative MSE, so negate before taking the square root.
cvres = grid_search.cv_results_
for neg_mse, candidate in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-neg_mse), candidate)

118592.87737476436 {'C': 0.1, 'kernel': 'linear'}
115355.16600926072 {'C': 1, 'kernel': 'linear'}
92249.28842446826 {'C': 10, 'kernel': 'linear'}
72219.92664861096 {'C': 100, 'kernel': 'linear'}
118944.64032902388 {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
118957.063194294 {'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}
118944.47530998438 {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}
118960.01057179246 {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}
118783.09724782022 {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
118900.2917978224 {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
118781.53935129552 {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
118931.82889512545 {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
117187.27199112996 {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
118351.45042036651 {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
117170.50440726774 {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
118628.5776423217 {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
104058.2460964514 {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}
113179.049735060

In [68]:
# Evaluate the grid-search winner on the held-out test set.
# Note: with the default refit behaviour, best_estimator_ is trained on the data
# passed to grid_search.fit (the 10,000-row sample), not the full training set.
final_model = grid_search.best_estimator_

final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)   # test-set RMSE; about 72,717 in the recorded run
final_rmse

np.float64(72717.25090507677)

In [72]:

from sklearn.model_selection import RandomizedSearchCV

# C candidates: five log-spaced values from 1e-2 to 1e2, shared by both kernels.
c_candidates = np.logspace(-2, 2, 5)

search_space = [
    {'kernel': ['linear'], 'C': c_candidates},
    {'kernel': ['rbf'], 'C': c_candidates, 'gamma': ['scale', 0.001, 0.01, 0.1, 1]},
]

# Sample 25 candidates from the space above; 5-fold CV on the full prepared
# training set, scored by negative MSE, with a fixed seed for repeatable sampling.
random_search = RandomizedSearchCV(
    estimator=svr_reg,
    param_distributions=search_space,
    n_iter=25,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    n_jobs=-1,
    random_state=42,
)

random_search.fit(X_train_prepared, y_train)

In [73]:
# Show the sampled hyperparameter combination with the best mean cross-validation score.
random_search.best_params_

{'kernel': 'linear', 'C': np.float64(100.0)}

In [74]:
# Show the estimator refit with the best parameters found by the randomized search.
random_search.best_estimator_

In [75]:
# Print the cross-validated RMSE for every sampled candidate; scores are stored
# as negative MSE, so negate before taking the square root.
rvres = random_search.cv_results_
for neg_mse, candidate in zip(rvres["mean_test_score"], rvres["params"]):
    print(np.sqrt(-neg_mse), candidate)

109505.07880041133 {'kernel': 'rbf', 'gamma': 0.01, 'C': np.float64(100.0)}
118488.93643614659 {'kernel': 'rbf', 'gamma': 'scale', 'C': np.float64(1.0)}
115890.06613312107 {'kernel': 'rbf', 'gamma': 0.1, 'C': np.float64(10.0)}
118689.25251238632 {'kernel': 'rbf', 'gamma': 0.01, 'C': np.float64(1.0)}
118791.23829925107 {'kernel': 'rbf', 'gamma': 0.1, 'C': np.float64(0.01)}
118793.88694501399 {'kernel': 'rbf', 'gamma': 1, 'C': np.float64(0.01)}
97194.1957280252 {'kernel': 'rbf', 'gamma': 0.1, 'C': np.float64(100.0)}
118269.60139762377 {'kernel': 'rbf', 'gamma': 1, 'C': np.float64(10.0)}
118783.57549693368 {'kernel': 'rbf', 'gamma': 0.01, 'C': np.float64(0.1)}
118728.39104432875 {'kernel': 'linear', 'C': np.float64(0.01)}
71506.25796023685 {'kernel': 'linear', 'C': np.float64(100.0)}
118781.30741229387 {'kernel': 'rbf', 'gamma': 0.001, 'C': np.float64(1.0)}
118791.26239930486 {'kernel': 'rbf', 'gamma': 'scale', 'C': np.float64(0.01)}
118762.11941402762 {'kernel': 'rbf', 'gamma': 0.1, 'C':

In [76]:
# Evaluate the randomized-search winner on the held-out test set.
# This rebinds final_model / final_predictions / final_mse / final_rmse.
final_model = random_search.best_estimator_

final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)   # test-set RMSE; about 72,170 in the recorded run
final_rmse

np.float64(72169.54043307822)