In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
%matplotlib inline

In [2]:
# Set the notebook-wide default figure size to 20 x 10 so every later plot is large.
matplotlib.rcParams.update({'figure.figsize': [20, 10]})

Vamos a aplicar regularización $l_2$ y regularización $l_1$ usando como ejemplo una competición de <kaggle.com> llamada __House Prices: Advanced Regression Techniques__, ver <https://www.kaggle.com/c/house-prices-advanced-regression-techniques>. 

Por favor, descarga el dataset `train.csv` y guárdalo en `./data/house-prices-advanced-regression-techniques/`. 

In [3]:
# Read the Kaggle training split from disk (path is relative to the notebook)
# and display the column labels as an array.
path = "./data/house-prices-advanced-regression-techniques/train.csv"
houses = pd.read_csv(filepath_or_buffer=path)
houses.columns.to_numpy()

array(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu',
       'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars',
       'GarageArea', 'GarageQual', 'GarageCond', 'Pav

### Para evitar complejidad, seleccionamos las columnas numéricas sin valores NA

In [4]:
# Partition the column labels by dtype. Only the numeric labels are used by the
# cells visible below; the non-numeric ones are computed but not used there.
_numeric_kinds = [np.number]
numeric_columns = houses.select_dtypes(include=_numeric_kinds).columns
all_non_numeric_columns = houses.select_dtypes(exclude=_numeric_kinds).columns

In [5]:
# Name of the regression target; it is still one of the columns of Xy at this point.
target_col = "SalePrice"

# Keep only the numeric columns (features and target together).
# NOTE(review): this appears to keep the 'Id' row identifier as a feature — confirm it is intended.
Xy = houses.loc[:, numeric_columns]

Borramos las columnas que contienen valores NA:

In [6]:
# Count missing values per column in a single vectorised pass (the previous version
# ran a Python-level sum over every column, twice for the columns that had NAs).
na_counts = Xy.isna().sum()
na_counts = na_counts[na_counts > 0]

# Report every affected column together with its number of missing entries.
for col, nas in na_counts.items():
    print(col, nas)

# Drop the affected columns entirely rather than imputing them.
cols_with_na = list(na_counts.index)
Xy = Xy.drop(columns=cols_with_na)

LotFrontage 259
MasVnrArea 8
GarageYrBlt 81


Separamos los datos en entrenamiento, validación (dev) y test (60 % / 20 % / 20 %):

In [7]:
# Separate the target column from the feature matrix.
y = Xy[target_col]
X = Xy.drop(columns=[target_col])

# Two-stage split with fixed seeds: first hold out 20% of the rows as the test set,
# then take 25% of the remainder as the dev set (25% of 80% = 20% of all rows).
X_train_dev, X_test, y_train_dev, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666
)
X_train, X_dev, y_train, y_dev = train_test_split(
    X_train_dev, y_train_dev, test_size=0.25, random_state=667
)

## Método 1: Regresión lineal simple

In [8]:
from sklearn.linear_model import LinearRegression

# Baseline without regularisation: fit on the training split and print the
# root mean squared error obtained on the dev split.
reg = LinearRegression().fit(X_train, y_train)
y_dev_hat = reg.predict(X_dev)
dev_rmse = np.sqrt(mean_squared_error(y_dev, y_dev_hat))
print(dev_rmse)

34461.69716294582


## Método 2: Regresión Ridge (regularización $l_2$)

Dado el modelo lineal: 


$$\hat{y} = a_1x_1 + \ldots + a_nx_n + b$$



Para regularización $l_2$, la función objetivo es :

$$l(\hat{Y}) = \frac{1}{N}\sum_{i=1}^N \left(y_i - \hat{y}_i\right)^2 + \alpha \left(a_1^2 + \ldots + a_n^2\right)$$

Este modelo se llama Ridge.




In [9]:
from sklearn.linear_model import Ridge

# Sweep the l2 penalty strength over powers of two (2**-10 ... 2**9).
# For each value, fit on the training split and print "<dev RMSE> <alpha>".
for exponent in range(-10, 10):
    alpha = 2**exponent
    ridge_reg = Ridge(alpha=alpha).fit(X_train, y_train)
    y_dev_hat = ridge_reg.predict(X_dev)
    dev_rmse = np.sqrt(mean_squared_error(y_dev, y_dev_hat))
    print(dev_rmse, alpha)

34461.678180096664 0.0009765625
34461.65919907327 0.001953125
34461.62124246699 0.00390625
34461.54535101685 0.0078125
34461.39365516826 0.015625
34461.09061110556 0.03125
34460.485909638504 0.0625
34459.28202148575 0.125
34456.89605317127 0.25
34452.20938684532 0.5
34443.16215816408 1
34426.262776393334 2
34396.51044932244 4
34348.951020385815 8
34282.42188338061 16
34202.48205212977 32
34125.16701254985 64
34099.00768051498 128
34245.40793648863 256
34744.590236435695 512


¿Cuál es entonces el mejor modelo?

## Regresión Lasso (regularización $l_1$)

En este caso la función objetivo es:

$$l(\hat{Y}) = \frac{1}{N}\sum_{i=1}^N \left(y_i - \hat{y}_i\right)^2 + \alpha \left(|a_1| + \ldots + |a_n|\right)$$

Este modelo se llama Lasso.

In [10]:
from sklearn.linear_model import Lasso

# Sweep the l1 penalty strength over powers of two (2**-10 ... 2**9).
# For each value, fit on the training split and print "<dev RMSE> <alpha>".
for exponent in range(-10, 10):
    alpha = 2**exponent
    lasso_reg = Lasso(alpha=alpha).fit(X_train, y_train)
    y_dev_hat = lasso_reg.predict(X_dev)
    dev_rmse = np.sqrt(mean_squared_error(y_dev, y_dev_hat))
    print(dev_rmse, alpha)

34461.69542173473 0.0009765625
34461.69368052823 0.001953125
34461.6901981263 0.00390625
34461.683233412696 0.0078125
34461.66930413723 0.015625
34461.64144640941 0.03125
34461.58573441035 0.0625
34461.47432458774 0.125
34461.2515619947 0.25
34460.80626696905 0.5
34459.91658585277 1
34458.14084437147 2
34454.60385885485 4
34447.587976514675 8
34433.78811793976 16
34407.119962130426 32
34357.519619665545 64
34288.09525971574 128
34229.02126168853 256
34270.67818181146 512


¿Cuál es el mejor modelo?


### Interpretación de los coeficientes en Lasso

Lasso tiene un efecto interesante: al aplicar la regularización, anula (pone exactamente a cero)
bastantes coeficientes. Veamos un ejemplo con $\alpha = 100000$:

In [11]:
# Fit Lasso with a very large l1 penalty and display the learned coefficients.
lasso_reg = Lasso(alpha=100000).fit(X_train, y_train)
lasso_reg.coef_

array([-3.17677896e+00, -5.19281675e+01,  1.81804546e-01,  0.00000000e+00,
        0.00000000e+00,  4.60597162e+02,  3.74462039e+02,  1.25190335e+01,
        0.00000000e+00, -0.00000000e+00,  2.42657167e+01,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  6.60160074e+01,  0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        6.00591708e+01,  3.89130770e+01, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  6.20233278e+01, -1.61332470e+02, -5.81768963e+00,
        0.00000000e+00, -0.00000000e+00])

Compáralo con la regularización $l_2$:

In [12]:
# Fit Ridge with the same very large penalty and display the learned coefficients
# for comparison with the Lasso ones.
ridge_reg = Ridge(alpha=100000).fit(X_train, y_train)
ridge_reg.coef_

array([-2.74360780e+00, -1.16036290e+02,  1.49606822e-01,  1.54492603e+02,
        3.53637430e+01,  5.12887253e+02,  4.73797562e+02,  1.56538396e+01,
        3.83463135e+00,  1.73549128e+00,  2.12239622e+01,  1.09050761e+01,
        1.37610861e+01,  2.87665725e+01,  5.34327347e+01,  1.11132958e+01,
       -2.12506534e+00,  8.81515714e+00,  1.44562256e-02, -3.37008992e+01,
       -1.24206558e+01,  1.52790155e+01,  3.37324777e+01,  2.43274700e+01,
        5.59984263e+01,  4.30133223e+01, -1.16274809e+01,  1.95784426e+01,
        6.77256124e+01,  9.63614776e+01, -2.17259354e+02, -6.30007045e+00,
        3.33652332e+01, -1.91128798e+01])

# Elastic Net: ambas regularizaciones al mismo tiempo

Hay un tercer tipo de regularización que trabaja muy bien que es la combinación de $l_2$ y $l_1$:


$$l(\hat{Y}) = \frac{1}{N}\sum_{i=1}^N \left(y_i - \hat{y}_i\right)^2 + 
\alpha_1 \left(|a_1| + \ldots + |a_n|\right)+
\alpha_2 \left(a_1^2 + \ldots + a_n^2\right)
$$

El modelo lineal con esta función objetivo se llama a menudo Elastic Net. En scikit-learn tiene dos parámetros, `alpha` y `l1_ratio`, cuya relación con $\alpha_1$ y $\alpha_2$ es (salvo factores constantes de escala) la siguiente:

$$\alpha_1 = \texttt{alpha}\cdot\texttt{l1_ratio}$$

$$\alpha_2 = \texttt{alpha}\cdot(\texttt{1 - l1_ratio})$$

Veamos en el ejemplo anterior:

In [13]:
import warnings

from sklearn.linear_model import ElasticNet

# One [alpha, l1_ratio, dev RMSE] row per fitted model, in grid order
# (alpha varies in the outer loop, l1_ratio in the inner loop).
alpha_ratio_score = []

# Silence warnings only while the grid is being fitted, instead of disabling
# every warning for the rest of the session.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for i in range(-10, 10):
        alpha = 2**i  # constant across the inner loop, so computed once per outer step
        for j in range(20):
            l1_ratio = 0.6**j  # from 1.0 (j = 0) towards 0 as j grows
            en_reg = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
            en_reg.fit(X_train, y_train)
            y_dev_hat = en_reg.predict(X_dev)
            rmse = np.sqrt(mean_squared_error(y_dev, y_dev_hat))
            alpha_ratio_score.append([alpha, l1_ratio, rmse])

# Tabulate the grid results; row order matches the order in which models were fitted.
scores = pd.DataFrame(alpha_ratio_score, columns=["alpha", "ratio", "score"])

In [14]:
# Show the five grid points with the lowest dev RMSE.
scores.sort_values(by="score").head(5)

Unnamed: 0,alpha,ratio,score
159,0.125,6.1e-05,34095.612059
158,0.125,0.000102,34095.612107
157,0.125,0.000169,34095.612188
156,0.125,0.000282,34095.612327
155,0.125,0.00047,34095.612565
