In [1]:
import numpy as np

In [28]:
import bokeh.plotting
from bokeh.palettes import Category10_10 as palette

In [4]:
# Tell Bokeh to display the output of later `show()` calls inline in this notebook.
bokeh.plotting.output_notebook()

# Training Models

Code following along with Aurélien Géron's book [Hands-On Machine Learning with Scikit-Learn and TensorFlow](http://shop.oreilly.com/product/0636920052289.do).



Consider a data set of $m$ samples that have $n$ features.

Linear Regression model, for one sample with feature vector ${\bf x}$ (including the constant $x_0 = 1$ for the intercept)
$$\hat{y} = \vartheta^T {\bf x}$$

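Mean Squared Error (MSE), where ${\bf x}^{(i)}$ and $y^{(i)}$ are the feature vector and target of the $i$-th sample
$$MSE = \frac{1}{m}\sum^m_{i=1} \left(\vartheta^T {\bf x}^{(i)} - y^{(i)}\right)^2$$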

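There is a closed-form solution that minimizes the MSE, known as the Normal Equation. Here ${\bf X}$ is the matrix whose rows are the samples' feature vectors and ${\bf y}$ is the vector of targets.
$$\vartheta = \left({\bf X}^T {\bf X}\right)^{-1} {\bf X}^T {\bf y}$$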

In [2]:
# Seed NumPy's global RNG so the synthetic data (and every number derived
# from it in later cells) is identical on each Restart & Run All.
np.random.seed(42)

# 100 samples of a single feature, drawn uniformly from [0, 2).
X = 2 * np.random.rand(100, 1)
# Targets follow y = 4 + 3*x plus standard-normal noise.
y = 4 + 3 * X + np.random.randn(100, 1)

In [30]:
# Scatter plot of the raw samples: feature x1 on the horizontal axis, target y on the vertical.
p = bokeh.plotting.figure(
    x_axis_label='x1',
    y_axis_label='y',
    width=500,
    height=300,
)
p.scatter(X[:, 0], y[:, 0], color=palette[0])

bokeh.plotting.show(p)

In [16]:
# Prepend the bias feature x0 = 1 to every sample. The row count is read from
# X so this keeps working if the sample size is changed.
X_b = np.c_[np.ones((X.shape[0], 1)), X]

# Normal Equation: solve (X_b^T X_b) theta = X_b^T y as a linear system
# instead of forming the explicit inverse, which is less accurate numerically.
theta_best = np.linalg.solve(X_b.T.dot(X_b), X_b.T.dot(y))

In [17]:
# Fitted parameters from the Normal Equation: row 0 multiplies the x0 = 1 column (intercept), row 1 is the slope.
theta_best

array([[ 3.83673933],
       [ 3.06358562]])

In [20]:
# Predict at the two ends of the feature range, x1 = 0 and x1 = 2.
X_new = np.array([0, 2]).reshape(-1, 1)
# Add the same x0 = 1 bias column that the training matrix carries.
X_new_b = np.hstack((np.ones((2, 1)), X_new))
y_predict = np.dot(X_new_b, theta_best)
y_predict

array([[ 3.83673933],
       [ 9.96391057]])

In [25]:
# Parameters of the generating process y = 4 + 3*x1 (intercept first, then slope).
theta_real = np.array([4, 3]).reshape(2, 1)
# Noise-free targets at the same two end-points.
y_real = np.dot(X_new_b, theta_real)

In [32]:
# Overlay the fitted line and the true (noise-free) line on the training data.
p = bokeh.plotting.figure(width=500, height=300, x_axis_label='x1', y_axis_label='y')

p.scatter(X[:, 0], y[:, 0], color=palette[0])
# `legend_label` replaces the old `legend` keyword, which was deprecated in
# Bokeh 1.4 and removed in Bokeh 3.0.
p.line(X_new[:, 0], y_predict[:, 0], legend_label='Prediction', color=palette[1])
p.line(X_new[:, 0], y_real[:, 0], legend_label='Real', color=palette[2])
p.legend.location = 'top_left'

bokeh.plotting.show(p)

Now using Scikit-Learn

In [34]:
import sklearn.linear_model

In [36]:
# Ordinary least squares via Scikit-Learn, fitted on the raw X
# (without the manually added x0 = 1 column).
lin_reg = sklearn.linear_model.LinearRegression()
lin_reg.fit(X=X, y=y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

Comparing the Scikit-Learn results to the prior results, we see there is no difference.

In [39]:
# Side-by-side view of the fitted parameters from both approaches.
print('Theta:')
intercept_ne, slope_ne = theta_best  # rows of the (2, 1) Normal Equation result
print('normal equation: {}, {}'.format(intercept_ne, slope_ne))
print('Scikit-Learn: {}, {}'.format(lin_reg.intercept_, lin_reg.coef_))

Theta:
normal equation: [ 3.83673933], [ 3.06358562]
Scikit-Learn: [ 3.83673933], [[ 3.06358562]]


In [44]:
# Side-by-side view of the predictions at the two end-points of X_new.
print('End-points:')
print('normal equation: {}, {}'.format(y_predict[0], y_predict[1]))
sk_first, sk_last = lin_reg.predict(X_new)  # one row per end-point
print('Scikit-Learn: {}, {}'.format(sk_first, sk_last))

End-points:
normal equation: [ 3.83673933], [ 9.96391057]
Scikit-Learn: [ 3.83673933], [ 9.96391057]
