Exercise: train an SVM regressor on the California housing dataset.

In [None]:
from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
import matplotlib as mpl
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.svm import LinearSVR
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler

In [60]:
# Load the California housing data; as_frame=True asks scikit-learn for pandas objects.
housing = fetch_california_housing(as_frame=True)

In [61]:
# Display the names of the input features available in the dataset.
housing.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [62]:
# Feature matrix: the columns listed in housing.feature_names.
x = housing.data

In [63]:
# Regression target that the models below are trained to predict.
y = housing.target

Split the data into training and test sets

In [64]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

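IMPORTANT!
Scale the features before training: SVMs are sensitive to feature scales. Fit the scaler on the training set only, then apply the same transformation to the test set.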

In [65]:
# Standardize the features. The scaler's statistics are learned from the
# training split only and then reused on the test split, so nothing about
# the test data leaks into preprocessing.
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scale = scaler.transform(x_train)
x_test_scale = scaler.transform(x_test)

In [66]:
# Baseline: a linear support-vector regressor trained on the standardized
# features. fit() returns the estimator itself, so construction and training
# can be chained; the seed keeps the fit repeatable.
model = LinearSVR(random_state=42).fit(x_train_scale, y_train)
model



LinearSVR(random_state=42)

In [67]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the LinearSVR baseline on the data it was trained on.
y_pred = model.predict(x_train_scale)
mse = mean_squared_error(y_true=y_train, y_pred=y_pred)
mse

0.9641780189950231

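Take the square root of the MSE to get the RMSE. Remember that only the input features were scaled, not the target, so this RMSE is expressed in the target's original units (median house value in units of $100,000). It is also measured on the training set, not on held-out data.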

In [68]:
# RMSE: square root of the MSE computed in the previous cell.
np.sqrt(mse)


0.9819256687728574

Randomized search for the best values of the SVR hyperparameters `C` and `gamma`

In [69]:
from scipy.stats import reciprocal, uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR

# Sampling distributions for the SVR hyperparameters:
#   gamma ~ log-uniform (reciprocal) on [0.001, 0.1]
#   C     ~ uniform on [1, 11]; scipy's uniform(loc, scale) spans [loc, loc + scale]
param_distributions = dict(
    gamma=reciprocal(0.001, 0.1),
    C=uniform(1, 10),
)

# 10 sampled candidates x 3-fold cross-validation = 30 fits.
# random_state fixes which candidates are drawn.
rnd_search_cv = RandomizedSearchCV(
    estimator=SVR(),
    param_distributions=param_distributions,
    n_iter=10,
    cv=3,
    verbose=2,
    random_state=42,
)
rnd_search_cv.fit(x_train_scale, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END .....C=4.745401188473625, gamma=0.07969454818643928; total time=  10.8s
[CV] END .....C=4.745401188473625, gamma=0.07969454818643928; total time=  10.3s
[CV] END .....C=4.745401188473625, gamma=0.07969454818643928; total time=  10.0s
[CV] END .....C=8.31993941811405, gamma=0.015751320499779724; total time=  10.3s
[CV] END .....C=8.31993941811405, gamma=0.015751320499779724; total time=   9.9s
[CV] END .....C=8.31993941811405, gamma=0.015751320499779724; total time=  10.1s
[CV] END ....C=2.560186404424365, gamma=0.002051110418843397; total time=   9.9s
[CV] END ....C=2.560186404424365, gamma=0.002051110418843397; total time=   9.8s
[CV] END ....C=2.560186404424365, gamma=0.002051110418843397; total time=   9.7s
[CV] END ....C=1.5808361216819946, gamma=0.05399484409787431; total time=   9.7s
[CV] END ....C=1.5808361216819946, gamma=0.05399484409787431; total time=  10.0s
[CV] END ....C=1.5808361216819946, gamma=0.05399

RandomizedSearchCV(cv=3, estimator=SVR(),
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000235B25903A0>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000235A3AD6C40>},
                   random_state=42, verbose=2)

Get the best estimator found by the search

In [70]:
# Show the estimator (with its sampled C and gamma) that the randomized search selected as best.
rnd_search_cv.best_estimator_

SVR(C=4.745401188473625, gamma=0.07969454818643928)

In [71]:
# RMSE of the selected SVR on the training set.
predict = rnd_search_cv.best_estimator_.predict(X=x_train_scale)
mse = mean_squared_error(y_true=y_train, y_pred=predict)
np.sqrt(mse)

0.572758599828496

In [72]:
# RMSE of the selected SVR on the held-out test set, transformed with the
# scaler that was fitted on the training data.
test_predict = rnd_search_cv.best_estimator_.predict(X=x_test_scale)
mse = mean_squared_error(y_true=y_test, y_pred=test_predict)
np.sqrt(mse)

0.5929263484626331