In [2]:
# Alternatives to Linear Regression

# Polynomial Regression.
# Idea: do linear regression on combinations of features.
# For y=W*X, add a feature X^2, y=W1*X + W2*X^2.
# For y=W1*X1+W2*X2, add features for X1^2 and X2^2 and X1*X2.

# Linear Regression risk: underfit. 
# Indicator of underfit: as train set size increases, train error remains high.
# Polynomial Regression risk: overfit.
# Indicator of overfit: validation error stays well above train error (a persistent gap).
# How to analyze:
# Plot learning curves i.e. (train loss, test loss) vs growing training set size.

# Rebuild the synthetic linear data set used in the previous notebook.
import numpy as np

m = 100  # number of training samples
X = np.random.rand(m, 1) * 2             # one feature, uniform on [0, 2)
y = 4 + 3 * X + np.random.randn(m, 1)    # linear target plus standard normal noise
biasX = np.c_[np.ones((m, 1)), X]        # X with a leading column of ones (bias feature x0)

In [3]:
# Polynomial regression: expand the features, then run ordinary linear regression.
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# include_bias=False: do not generate a constant column of ones.
poly = PolynomialFeatures(degree=2, include_bias=False)
Xpoly = poly.fit_transform(X)  # with a single input feature the columns are X and X^2
linreg = LinearRegression().fit(Xpoly, y)
linreg.intercept_, linreg.coef_
# The data is linear, yet the degree-2 fit stays close: y = 4.5 + 2.1x + 0.4x^2

(array([4.53385405]), array([[2.06226898, 0.40245008]]))

In [8]:
# Draw a fresh data set from a quadratic: y = 5 + 3x + x^2 + noise.
X = np.random.rand(m, 1) * 2
y = 5 + 3 * X + X ** 2 + np.random.randn(m, 1)
biasX = np.c_[np.ones((m, 1)), X]  # X with a leading column of ones
# Re-expand the new X (columns X and X^2) and refit the same estimator on it.
Xpoly = poly.fit_transform(X)
linreg.fit(Xpoly, y)
linreg.intercept_, linreg.coef_
# On quadratic data the fit is close: y = 5.5 + 2.1x + 1.3x^2

(array([5.48818983]), array([[2.0536459 , 1.34175627]]))

In [None]:
# Sources of modeling error.
# 1) Bias = wrong assumptions. (not related to bias node)
#    Model complexity reduces bias e.g. assumption of linearity.
# 2) Variance = sensitivity to training data. 
#    Model complexity increases overfitting and variance. 
# 3) Irreducible error = noise.
#    Must preprocess to denoise the data!

In [None]:
# Regularization.
# Increase generalization and decrease overfitting.
# These types: 
# 1) Ridge, 
# 2) Lasso, 
# 3) Elastic Net,
# 4) Early stopping.
# Important to scale the data first (so all weights can get small).

In [None]:
# Ridge Regression.
# Linear regression + ridge regularization.
# aka Tikhonov regularization.
# Uses the L2 norm i.e. sum of squares of weights.

# During training (but not testing), penalize large weights.
# Choose regularization parameter alpha:
#    zero for no regularization, large number for smooth predictors.
# Cost = MSE(theta) + (alpha/2) * sum(theta^2).
# Note sum over theta 1..n, so the bias term theta0 (weight of x0) is not penalized.
# The added cost is (alpha/2)*L2norm(W)^2 for weights W excluding bias.
# For gradient descent, the added gradient term is alpha*w.

# Closed form solution with identity matrix I
# theta = inverse(XT*X + alpha*I) * XT * y

In [10]:
# Ridge Regression with scikit-learn.
# solver="cholesky": closed-form solution using Cholesky matrix factorization.
from sklearn.linear_model import Ridge

value_0 = 1.5  # arbitrary x at which the models are asked to predict y
reg = Ridge(alpha=1, solver="cholesky").fit(X, y)
reg.predict([[value_0]])

array([[11.65848192]])

In [12]:
# Ridge Regression with scikit-learn, trained by Gradient Descent.
from sklearn.linear_model import SGDRegressor

# penalty="l2" adds the L2-norm penalty, which amounts to using Ridge regression.
sgd = SGDRegressor(penalty="l2")
sgd.fit(X, y.ravel())  # y is passed as a flat 1-D array
sgd.predict([[value_0]])

array([11.78141339])

In [13]:
# Lasso Regression.
# Linear regression + lasso regularization.
# LASSO = Least Absolute Shrinkage and Selection Operator.
# Uses the L1 norm i.e. sum of abs(weight).
# Lasso is a form of feature selection.
# Lasso tends to produce a sparse model i.e. to zero out many weights.
# Lasso gradient descent bounces around the optimum, so it needs a learning
# schedule that gradually reduces the learning rate (not the regularization alpha).
# Cost function is not differentiable where a weight is zero.
# Work-around uses a subgradient vector.

# SKLearn has two ways to do it.
# Way 1: SGDRegressor with the L1 penalty (way 2 is the dedicated Lasso class).
# The penalty must be "l1" here; "l2" would just repeat the Ridge model.
sgd = SGDRegressor(penalty="l1")   # L1 norm amounts to using Lasso regression
sgd.fit(X, y.ravel())              # y is passed as a flat 1-D array
sgd.predict([[value_0]])

array([11.77004])

In [14]:
# Lasso Regression, second way: the dedicated Lasso estimator.
from sklearn.linear_model import Lasso

las = Lasso(alpha=0.1).fit(X, y)  # alpha is the regularization strength
las.predict([[value_0]])

array([11.5894603])

In [15]:
# Elastic Net.
# Blend of the Ridge (L2) and Lasso (L1) penalties.
# Mix ratio r, passed as l1_ratio: r=0 => Ridge L2 norm, r=1 => Lasso L1 norm.
from sklearn.linear_model import ElasticNet

net = ElasticNet(alpha=0.1, l1_ratio=0.5).fit(X, y)  # r=0.5: halfway between the two
net.predict([[value_0]])

array([11.39566937])

In [None]:
# Early stopping.
# Stop gradient descent at the minimum validation error
# i.e. between underfit and overfit.

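# No easy solution in sklearn? (SGDRegressor does accept early_stopping=True, which holds out validation_fraction of the training data.)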
# Book shows code to break X into X_train and X_validate,
# then loop over epochs (nice - no need to start again each time),
# and retain clone of the model with min val error.