In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.model_selection import train_test_split
# Only use this if running the notebook on your local machine
#plt.style.use('notebook.mplstyle')

In [None]:
from sklearn.datasets import fetch_california_housing
# Fetch the California housing dataset object. Later cells read its
# .data, .target, .feature_names and .target_names attributes.
california_housing = fetch_california_housing()

In [None]:
# Feature matrix: one row per sample, one column per feature
X = california_housing.data
# Target reshaped into a single column so it can be placed beside X
y = california_housing.target.reshape(-1, 1)

# Features and target side by side; the target ends up as the last column
all_data = np.concatenate((X, y), axis=1)
# Column names, in the same order as the columns of all_data
all_labels = california_housing.feature_names + california_housing.target_names

In [None]:
# Pairwise correlation coefficients between all columns of all_data
# (rowvar=False: every column is a variable, every row an observation)
corr_coeff = np.corrcoef(all_data, rowvar=False)

# One tick per labelled column, derived from the labels instead of a
# hard-coded count so the plot stays correct if columns are added or removed
tick_positions = range(len(all_labels))

fig, ax = plt.subplots(1, 1)
# Pin the colour scale to [-1, 1] so zero correlation sits at the colormap midpoint
ih = ax.imshow(corr_coeff, cmap=cm.coolwarm, vmin=-1, vmax=1)
ax.set(xticks=tick_positions, yticks=tick_positions, yticklabels=all_labels);
ax.set_xticklabels(all_labels, rotation=90);
fig.colorbar(ih, label='Correlation coefficient');

In [None]:
# Place every sample by feature columns 6 and 7 and colour it by the target value
fig, ax = plt.subplots(1, 1)
sh = ax.scatter(
    x=X[:, 6],
    y=X[:, 7],
    c=y,
    s=20,
    alpha=0.5,
    cmap=cm.coolwarm,
)
ax.set_xlabel('Latitude');
ax.set_ylabel('Longitude');
fig.colorbar(sh, label='Median house value')

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Define alpha (regularization) values to try: 8 log-spaced values from 1e-4 to 1e3
alpha_vals = np.logspace(-4, 3, 8)

# Use a ridge regression model with built in cross-validation.
# The per-sample cross-validation scores are stored so they can be inspected
# after fitting. scikit-learn 1.5 renamed `store_cv_values` to
# `store_cv_results`, so try the new keyword first and fall back to the
# original one on releases that do not accept it.
try:
    ridge_reg_cv = RidgeCV(fit_intercept=True, alphas=alpha_vals, cv=None, store_cv_results=True)
except TypeError:
    ridge_reg_cv = RidgeCV(fit_intercept=True, alphas=alpha_vals, cv=None, store_cv_values=True)
ridge_reg_cv.fit(X_train, y_train)

# Predictions for the training split, the test split, and the full data set
y_hat_train = ridge_reg_cv.predict(X_train)
y_hat_test = ridge_reg_cv.predict(X_test)
y_hat = ridge_reg_cv.predict(X)

# Visualize what we have: predicted vs. true target for both splits
fig, ax = plt.subplots(1, 1)
ax.plot(y_train, y_hat_train, '.', alpha=0.25, label='Train')
ax.plot(y_test, y_hat_test, '.', alpha=0.25, label='Test')
# Dotted identity line: points on it are predicted exactly
ax.plot([0, 5], [0, 5], 'k:')
# Raw string so the LaTeX backslash in \hat is not parsed as a Python escape sequence
ax.set(xlabel='$y$', ylabel=r'$\hat{y}$');
ax.legend();

In [None]:
# Per-sample cross-validation scores stored by the fitted RidgeCV model.
# scikit-learn 1.5 renamed the attribute from `cv_values_` to `cv_results_`;
# read whichever one this installation provides.
if hasattr(ridge_reg_cv, 'cv_results_'):
    cv_errors = ridge_reg_cv.cv_results_
else:
    cv_errors = ridge_reg_cv.cv_values_

fig, ax = plt.subplots(1, 1)
# Average over the first axis (presumably samples -- verify against the RidgeCV
# docs) and flatten, leaving one mean score per candidate alpha
ax.plot(ridge_reg_cv.alphas, cv_errors.mean(axis=0).flatten(), 'o-', alpha=0.5)
ax.set(xscale='log', xlabel='alpha', ylabel='MSE');

In [None]:
# Place every sample by feature columns 6 and 7 and colour it by the model
# residual (true target minus prediction)
fig, ax = plt.subplots(1, 1)
sh = ax.scatter(X[:, 6], X[:, 7], s=20, c=y-y_hat, cmap=cm.coolwarm, alpha=0.5)
ax.set(xlabel='Latitude', ylabel='Longitude');
# Raw string so the LaTeX backslash in \hat is not parsed as a Python escape sequence
fig.colorbar(sh, label=r'$y-\hat{y}$');

In [None]:
# Wide figure holding two side-by-side panels of the target against single features
fig = plt.figure(figsize=(14, 4))

# Left panel: target against feature column 1
ax = fig.add_subplot(121)
ax.plot(X[:, 1], y, marker='o', linestyle='None', alpha=0.1)
ax.set(xlabel=all_labels[1], ylabel=all_labels[-1])

# Right panel: target against feature column 2
ax = fig.add_subplot(122)
ax.plot(X[:, 2], y, marker='o', linestyle='None', alpha=0.25)
ax.set(xlabel=all_labels[2], ylabel=all_labels[-1])

In [None]:
# Boolean mask of rows to discard: a row is dropped if ANY criterion holds
remove = np.logical_or.reduce([
    y.ravel() == y.max(),       # target equal to its maximum (presumably a capped value -- verify)
    X[:, 1] == X[:, 1].max(),   # feature column 1 equal to its maximum
    X[:, 2] > 10,               # feature column 2 greater than 10
])
# Integer indices of the discarded rows
remove_rows = np.flatnonzero(remove)

# Keep only the rows that were not flagged
X_cleaned = X[~remove]
y_cleaned = y[~remove]

In [None]:
# Wide figure holding the same two panels, now drawn from the cleaned arrays
fig = plt.figure(figsize=(14, 4))

# Left panel: cleaned target against cleaned feature column 1
ax = fig.add_subplot(121)
ax.plot(X_cleaned[:, 1], y_cleaned, marker='o', linestyle='None', alpha=0.1)
ax.set(xlabel=all_labels[1], ylabel=all_labels[-1])

# Right panel: cleaned target against cleaned feature column 2
ax = fig.add_subplot(122)
ax.plot(X_cleaned[:, 2], y_cleaned, marker='o', linestyle='None', alpha=0.1)
ax.set(xlabel=all_labels[2], ylabel=all_labels[-1])

In [None]:
# Re-split using the cleaned arrays; same test fraction and seed as the earlier split
X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y_cleaned, test_size=0.2, random_state=1)

# Define alpha (regularization) values to try: 8 log-spaced values from 1e-4 to 1e3
alpha_vals = np.logspace(-4, 3, 8)

# Use a ridge regression model with built in cross-validation.
# The per-sample cross-validation scores are stored so they can be inspected
# after fitting. scikit-learn 1.5 renamed `store_cv_values` to
# `store_cv_results`, so try the new keyword first and fall back to the
# original one on releases that do not accept it.
try:
    ridge_reg_cv = RidgeCV(fit_intercept=True, alphas=alpha_vals, cv=None, store_cv_results=True)
except TypeError:
    ridge_reg_cv = RidgeCV(fit_intercept=True, alphas=alpha_vals, cv=None, store_cv_values=True)
ridge_reg_cv.fit(X_train, y_train)

# Predictions for the training split and the test split
y_hat_train = ridge_reg_cv.predict(X_train)
y_hat_test = ridge_reg_cv.predict(X_test)
# NOTE(review): this predicts on the full, uncleaned X (removed rows included),
# not on X_cleaned -- confirm that is intended before comparing with y_cleaned
y_hat = ridge_reg_cv.predict(X)

# Visualize what we have: predicted vs. true target for both splits
fig, ax = plt.subplots(1, 1)
ax.plot(y_train, y_hat_train, '.', alpha=0.25, label='Train')
ax.plot(y_test, y_hat_test, '.', alpha=0.25, label='Test')
# Dotted identity line: points on it are predicted exactly
ax.plot([0, 5], [0, 5], 'k:')
# Raw string so the LaTeX backslash in \hat is not parsed as a Python escape sequence
ax.set(xlabel='$y$', ylabel=r'$\hat{y}$');
ax.legend();