In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import style

style.use('fivethirtyeight')
%matplotlib inline

  import pandas.util.testing as tm


In [None]:
# Path to the Auto dataset CSV used in the regression examples below.
auto_csv_path = '/content/drive/My Drive/Repos/Git/Machine-Learning/An Introduction to Statistical Learning/Dataset/Auto.csv'
data = pd.read_csv(auto_csv_path)

In [None]:
# '?' is treated as a missing horsepower value: turn it into NaN, drop every
# row that contains a NaN, then convert the column to a numeric dtype.
# The frame is rebuilt and reassigned instead of mutated in place because
# `data.horsepower.replace(..., inplace=True)` operates on a temporary Series
# under pandas copy-on-write (leaving '?' in `data`), and the `np.NaN` alias
# was removed in NumPy 2.0. Re-running this cell is a no-op.
data = (
    data
    .assign(horsepower=data['horsepower'].replace('?', np.nan))
    .dropna()
    .assign(horsepower=lambda frame: pd.to_numeric(frame['horsepower']))
)

In [None]:
# Predictor columns for the regression models; `mpg` is the response.
feature_columns = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration']
X = data.loc[:, feature_columns]
y = data['mpg']

# **Regularization**

## **What is Regularization?**

**Regularization** is a technique to discourage the complexity of the model. It does so by adding a penalty term to the loss function. This helps us solve the overfitting problem.

**Loss Function** is the sum of the squared differences between the actual values and the predicted values.

<center>$L(x,y) = \overset{n}{\underset{i=1}{\sum}}(y_i-f(x_i))^2$</center>
<center>$f(x_i) = h_\theta x = \theta_0+\theta_1x_1+\theta_2x_2^2+\theta_3x_3^3+\theta_4x_4^4$</center>

As the degree of the input features increases, the complexity of the model increases and it tries to fit all the data points.

An Example:

![alt text](https://drive.google.com/uc?export=view&id=1VGP9VyWgVHsq94qMGHsR5YSySs1M6PS-)

Regularization works on the assumption that smaller weights generate a simpler model and thus help avoid overfitting. Hence, we penalize the weights $\theta_3$ and $\theta_4$ and make them very small, close to zero, to simplify the model.

<center>$f(x_i) = h_\theta x = \theta_0+\theta_1x_1+\theta_2x_2^2$</center>

To ensure we take all the input variables into account, we penalize all the weights by making them small. This also makes the model simpler and less prone to **overfitting**.

<center>$L(x,y) = \overset{n}{\underset{i=1}{\sum}}(y_i-f(x_i))^2+\lambda\overset{n}{\underset{i=1}{\sum}}\theta_i^2$</center>

We have added the **regularization** term to the **Loss Function**. The regularization term keeps the weights small, which makes the model simpler and helps avoid overfitting.

$\lambda$ is the penalty term or regularization parameter which determines how much to penalize the weights.

## **L1 Regularization or Lasso or L1 norm**

In L1 Regularization we shrink some of the parameters all the way to zero. Driving the weights of many input features to zero leads to a sparse solution: the majority of the input features have zero weights and only a few features have non-zero weights.

<center>$L(x,y) = \overset{n}{\underset{i=1}{\sum}}(y_i-f(x_i))^2+\lambda\overset{n}{\underset{i=1}{\sum}}|\theta_i|$</center>

In L1 Regression, we penalize the absolute value of the weights. Lasso produces a model that is simple, interpretable and contains a subset of input features.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error,r2_score

In [None]:
lr = LinearRegression()

In [None]:
# Convert the feature table to a plain 2-D array with 5 columns for scikit-learn.
# np.asarray accepts both a DataFrame and an ndarray, so this cell can be
# re-run safely; the original `X.values` raised AttributeError on a second
# run because X had already been rebound to an ndarray.
X = np.asarray(X).reshape(-1, 5)
X

array([[   8. ,  307. ,  130. , 3504. ,   12. ],
       [   8. ,  350. ,  165. , 3693. ,   11.5],
       [   8. ,  318. ,  150. , 3436. ,   11. ],
       ...,
       [   4. ,  135. ,   84. , 2295. ,   11.6],
       [   4. ,  120. ,   79. , 2625. ,   18.6],
       [   4. ,  119. ,   82. , 2720. ,   19.4]])

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=3)

In [None]:
model = lr.fit(X_train,y_train)

In [None]:
r2_score(y_test,model.predict(X_test))

0.7494076237091598

In [None]:
mean_squared_log_error(y_test,model.predict(X_test))

0.027159419844415323

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [None]:
reg_lr = Lasso(fit_intercept=True)

In [None]:
# Candidate penalty strengths 0.001, 0.002, ..., 0.999.
# The grid deliberately starts above zero: alpha=0 disables the penalty
# (plain least squares), and coordinate descent does not converge well there,
# which floods the grid search with convergence warnings.
# linspace is used so the number of grid points is exact despite the float step.
param_grid = {'alpha': np.linspace(0.001, 0.999, 999)}

In [None]:
# `SCORERS` was removed in scikit-learn 1.3. Fall back to the supported
# `get_scorer_names()` API and expose the scorer names under the same
# `SCORERS` name so that `SCORERS.keys()` keeps working on any version.
try:
    from sklearn.metrics import SCORERS
except ImportError:
    from sklearn.metrics import get_scorer_names
    SCORERS = dict.fromkeys(get_scorer_names())

In [None]:
SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [None]:
# Grid-search the Lasso `alpha` over `param_grid` using 5-fold
# cross-validation, ranking candidates by R^2.
cross_validation = GridSearchCV(
    reg_lr,
    param_grid,
    scoring='r2',
    cv=5,
)

In [None]:
cross_validation.fit(X_train,y_train)

  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)


GridSearchCV(cv=5, error_score=nan,
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': array([0.   , 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008,
       0.009, 0.01 , 0.011, 0.012, 0...
       0.954, 0.955, 0.956, 0.957, 0.958, 0.959, 0.96 , 0.961, 0.962,
       0.963, 0.964, 0.965, 0.966, 0.967, 0.968, 0.969, 0.97 , 0.971,
       0.972, 0.973, 0.974, 0.975, 0.976, 0.977, 0.978, 0.979, 0.98 ,
       0.981, 0.982, 0.983, 0.984, 0.985, 0.986, 0.987, 0.988, 0.989,
       0.99 , 0.991, 0.992, 0.993, 0.994, 0.995, 0.996, 0.997, 0.998,
       0.999])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='r2', verbose=0)

In [None]:
cross_validation.best_score_

0.6835081660272463

In [None]:
cross_validation.best_params_

{'alpha': 0.666}

In [None]:
lasso = cross_validation.best_estimator_

In [None]:
r2_score(y_test,lasso.predict(X_test))

0.7506243780485066

In [None]:
model.coef_

array([-0.47432468,  0.0038803 , -0.05381485, -0.00524477, -0.03547731])

In [None]:
lasso.coef_

array([-0.        , -0.00323215, -0.0476521 , -0.00546493, -0.        ])

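As we can see by comparing the two sets of coefficients, most of the Lasso coefficients are closer to zero, and two of them (for `cylinders` and `acceleration`) have been shrunk to exactly zero.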

## **L2 Regularization or Ridge Regularization**

In L2 Regularization, the regularization term is the sum of the squares of all the feature weights. It forces the weights to be small but does not make them exactly zero, so it produces a non-sparse solution.

<center>$L(x,y) = \overset{n}{\underset{i=1}{\sum}}(y_i-f(x_i))^2+\lambda\overset{n}{\underset{i=1}{\sum}}\theta_i^2$</center>

L2 is not robust to outliers, as the squared terms blow up the error differences, and the regularization term tries to compensate by penalizing the weights.

Ridge regression performs better when all the input features influence the output and all the weights are of roughly equal size.

In [None]:
from sklearn.linear_model import Ridge

In [None]:
r = Ridge(fit_intercept=True)

In [None]:
# Tune the Ridge `alpha` over the existing `param_grid` with 5-fold CV on R^2.
# NOTE(review): the best alpha reported below (0.999) is the largest value in
# the grid, so the optimum may lie beyond the searched range — consider widening it.
cross_validation = GridSearchCV(r,param_grid,scoring='r2',cv=5)

In [None]:
cross_validation.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': array([0.   , 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008,
       0.009, 0.01 , 0.011, 0.012, 0.013, 0.014, 0.015, 0.016, 0.017,
       0.018, 0.019, 0.02 , 0.021, 0.0...
       0.954, 0.955, 0.956, 0.957, 0.958, 0.959, 0.96 , 0.961, 0.962,
       0.963, 0.964, 0.965, 0.966, 0.967, 0.968, 0.969, 0.97 , 0.971,
       0.972, 0.973, 0.974, 0.975, 0.976, 0.977, 0.978, 0.979, 0.98 ,
       0.981, 0.982, 0.983, 0.984, 0.985, 0.986, 0.987, 0.988, 0.989,
       0.99 , 0.991, 0.992, 0.993, 0.994, 0.995, 0.996, 0.997, 0.998,
       0.999])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='r2', verbose=0)

In [None]:
cross_validation.best_params_

{'alpha': 0.999}

In [None]:
ridge = cross_validation.best_estimator_

In [None]:
r2_score(y_test,ridge.predict(X_test))

0.7494275630823424

## **Elastic Net**

Elastic Net combines characteristics of both lasso and ridge. It reduces the impact of different features while not eliminating all the features.

The formula as you can see below is the sum of lasso and ridge formulas.

***Elastic Net Formula***

<center>$L(x,y) = \overset{n}{\underset{i=1}{\sum}}(y_i-f(x_i))^2+\lambda_2\overset{n}{\underset{i=1}{\sum}}\theta_i^2+\lambda_1\overset{n}{\underset{i=1}{\sum}}|\theta_i|$</center>

- To conclude, Lasso, Ridge and Elastic Net are excellent methods to improve the performance of Linear Model.

- This also applies if you are training a neural network, which can loosely be viewed as a collection of linear models.

- Lasso will eliminate many features and reduce overfitting in your linear model.

- Ridge will reduce the impact of features that are not important in predicting $y$ values.

- Elastic Net combines feature elimination from Lasso and feature coefficient reduction from Ridge to improve your model's predictions.

In [None]:
from sklearn.datasets import make_regression
from sklearn.linear_model import ElasticNet

In [None]:
# Switch to a synthetic regression problem: 15 features, 10 of them informative,
# with a fixed seed. Note that this rebinds the X and y used by the earlier cells.
# NOTE(review): no `noise` argument is passed, which presumably explains the
# near-perfect R^2 scores below — confirm against the make_regression defaults.
X,y = make_regression(n_features=15,n_informative=10,random_state=3)

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [None]:
en = ElasticNet(fit_intercept=True)
# Candidate penalty strengths 0.01, 0.02, ..., 1.00 for Elastic Net.
# The grid starts above zero: with alpha=0 the penalty vanishes, so the search
# would simply select the unregularized model and coordinate descent would
# emit convergence warnings on every fit.
param_grid = {'alpha': np.linspace(0.01, 1.0, 100)}

In [None]:
cross_validation = GridSearchCV(en,param_grid,scoring='r2',cv=5)

In [None]:
cross_validation.fit(X_train,y_train)

  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  self.best_estimator_.fit(X, y, **fit_params)
  positive)


GridSearchCV(cv=5, error_score=nan,
             estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True,
                                  l1_ratio=0.5, max_iter=1000, normalize=False,
                                  positive=False, precompute=False,
                                  random_state=None, selection='cyclic',
                                  tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11,...
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
       0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
       0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
       0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
       0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
       0.99, 1.  ])},
             pre_dispatch='2*n_jobs', refit=Tr

In [None]:
cross_validation.best_params_

{'alpha': 0.0}

In [None]:
cross_validation.best_estimator_

ElasticNet(alpha=0.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [None]:
cross_validation.best_score_

0.9999999968724677

In [None]:
elastic_net = cross_validation.best_estimator_

In [None]:
r2_score(y_test,elastic_net.predict(X_test))

0.9999999980887385

In [None]:
elastic_net.coef_

array([ 3.54687290e+01,  8.26319782e+01,  6.61331368e-04,  3.17512885e-03,
        2.51570147e+01,  6.47970302e+01,  6.98212315e+01,  5.47048989e-04,
        5.45103963e+01, -1.69987747e-03,  6.16328994e+01,  4.68878982e+01,
       -3.20707479e-04,  4.06480766e+01,  5.01764030e+01])