# Lasso Regression

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

In [2]:
# Load the ccpp.csv dataset straight from the workshop's GitHub repository.
_df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter07/Dataset/ccpp.csv'
)

In [3]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9568 entries, 0 to 9567
Data columns (total 5 columns):
AT    9568 non-null float64
V     9568 non-null float64
AP    9568 non-null float64
RH    9568 non-null float64
PE    9568 non-null float64
dtypes: float64(5)
memory usage: 373.9 KB


In [None]:
# Feature matrix: every column except the target 'PE', as a NumPy array.
X = _df.drop(columns=['PE']).values

In [4]:
# Target vector: the 'PE' column as a NumPy array.
y = _df.loc[:, 'PE'].values

In [5]:
# Keep 80% of the rows for training and the rest for evaluation;
# the fixed random_state makes the split repeatable.
train_X, eval_X, train_y, eval_y = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=0,
)

# Implement a LinearRegression model

In [6]:
# Baseline model: LinearRegression with default settings, fitted on the unscaled features.
lr_model_1 = LinearRegression()

In [7]:
# Fit the baseline on the training split (the fitted estimator is echoed as the cell output).
lr_model_1.fit(X=train_X, y=train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [8]:
# Baseline predictions for the held-out rows.
lr_model_1_preds = lr_model_1.predict(X=eval_X)

In [9]:
# R2 of the baseline on the held-out split.
print('lr_model_1 R2 Score: {}'.format(lr_model_1.score(X=eval_X, y=eval_y)))

lr_model_1 R2 Score: 0.9261145309646464


In [10]:
# Mean squared error of the baseline predictions against the held-out targets.
print('lr_model_1 MSE: {}'.format(mean_squared_error(y_true=eval_y, y_pred=lr_model_1_preds)))

lr_model_1 MSE: 21.674769717765997


# Engineer cubic features

In [11]:
# Stages of the cubic pipeline, applied in order.
# NOTE: `steps` is rebound by later cells; build the Pipeline right after running this cell.
steps = [
    ('scaler', MinMaxScaler()),  # rescale each feature to the default (0, 1) range
    ('poly', PolynomialFeatures(degree=3)),  # polynomial terms up to degree 3 (bias column included by default)
    ('lr', LinearRegression())  # least-squares fit on the expanded features
]

In [12]:
# Assemble the cubic pipeline from the stages defined in `steps`.
lr_model_2 = Pipeline(steps=steps)

In [13]:
# Fit every stage of the cubic pipeline on the training split.
lr_model_2.fit(X=train_X, y=train_y)

Pipeline(memory=None,
         steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('poly',
                 PolynomialFeatures(degree=3, include_bias=True,
                                    interaction_only=False, order='C')),
                ('lr',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)

In [14]:
# R2 of the cubic pipeline on the held-out split.
print('lr_model_2 R2 Score: {}'.format(lr_model_2.score(X=eval_X, y=eval_y)))

lr_model_2 R2 Score: 0.936925927889115


In [15]:
# Cubic-pipeline predictions for the held-out rows.
lr_model_2_preds = lr_model_2.predict(X=eval_X)

In [16]:
# Mean squared error of the cubic pipeline on the held-out targets.
print('lr_model_2 MSE: {}'.format(mean_squared_error(y_true=eval_y, y_pred=lr_model_2_preds)))

lr_model_2 MSE: 18.503178040475618


In [17]:
# Coefficients learned by the final ('lr') step of the cubic pipeline.
print(lr_model_2.named_steps['lr'].coef_)

[-2.15189170e-13 -1.59714520e+02 -5.07487866e+01 -1.35497263e+02
 -1.00367558e+02  1.92608902e+00  4.45016098e+00  1.31690065e+02
  1.46522307e+02  2.74623828e+01  1.65922936e+02 -8.20000178e+01
  9.61251976e+01  1.44191991e+02  9.00656698e+01  2.26361569e+01
  9.31492168e+01  2.54173115e+01 -9.74087063e+01 -1.54404701e+02
 -8.45227379e+01  1.81001096e+02 -5.35312812e+01 -1.07570259e+02
 -7.66085266e+01  9.52213337e+01 -1.09419311e+02 -8.17077925e+01
 -3.33612742e+01  5.03156671e+01  3.53116894e+01 -3.80733397e+01
 -3.47905095e+01 -7.93328011e+01 -2.20892842e+01]


In [18]:
# Number of coefficients, i.e. the number of columns produced by the degree-3 expansion.
print(len(lr_model_2.named_steps['lr'].coef_))

35


# Engineer degree-10 polynomial features

In [19]:
# Same stages as the cubic pipeline, but with a degree-10 expansion; the fitted
# model ends up with 1001 coefficients for the 4 input columns.
# NOTE: this rebinds `steps`; build the Pipeline right after running this cell.
steps = [
    ('scaler', MinMaxScaler()),  # rescale each feature to the default (0, 1) range
    ('poly', PolynomialFeatures(degree=10)),  # polynomial terms up to degree 10 (bias column included by default)
    ('lr', LinearRegression())  # unregularised least-squares fit on the expanded features
]

In [20]:
# Assemble the degree-10 pipeline from the stages defined in `steps`.
lr_model_3 = Pipeline(steps=steps)

In [21]:
# Fit every stage of the degree-10 pipeline on the training split.
lr_model_3.fit(X=train_X, y=train_y)

Pipeline(memory=None,
         steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('poly',
                 PolynomialFeatures(degree=10, include_bias=True,
                                    interaction_only=False, order='C')),
                ('lr',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)

In [22]:
# R2 of the degree-10 pipeline on the held-out split.
print('lr_model_3 R2 Score: {}'.format(lr_model_3.score(X=eval_X, y=eval_y)))

lr_model_3 R2 Score: -0.29111247935325424


In [23]:
# Degree-10 pipeline predictions for the held-out rows.
lr_model_3_preds = lr_model_3.predict(X=eval_X)

In [24]:
# Mean squared error of the degree-10 pipeline on the held-out targets.
print('lr_model_3 MSE: {}'.format(mean_squared_error(y_true=eval_y, y_pred=lr_model_3_preds)))

lr_model_3 MSE: 378.7560129898519


In [25]:
# Number of coefficients, i.e. the number of columns produced by the degree-10 expansion.
print(len(lr_model_3.named_steps['lr'].coef_))

1001


In [26]:
# First 35 coefficients of the degree-10 fit (35 matches the size of the cubic model's coefficient vector).
print(lr_model_3.named_steps['lr'].coef_[:35])

[-2.71708454e+05 -5.63255227e+07 -3.17707898e+07 -8.37452575e+06
 -4.58514958e+07  2.24573033e+08  1.80178848e+08  1.62742658e+08
  2.99515046e+08  6.40359089e+07  2.87190773e+07  1.92985885e+08
 -8.32707259e+07  1.32842682e+08  1.28186460e+08 -5.26319764e+08
 -3.90385876e+08 -9.12424648e+08 -1.00650673e+09 -4.91557656e+08
  6.47533197e+07 -8.65741493e+08 -4.30708667e+07 -9.94505491e+08
 -6.75489453e+08  8.81561278e+07 -4.96976659e+08 -2.88179118e+08
  4.84127358e+08 -3.39229291e+08 -5.29799196e+08  3.55436078e+08
 -5.54934445e+07 -4.16798364e+08 -1.98362102e+08]


# Implement Lasso on the same pipeline

In [27]:
# Degree-10 pipeline again, with Lasso (alpha=0.01) as the final estimator instead
# of LinearRegression. The final step keeps the name 'lr'.
# NOTE: this rebinds `steps`; build the Pipeline right after running this cell.
# NOTE(review): Lasso's max_iter is left at its default; check the fit for a
# ConvergenceWarning with this many features (TODO confirm).
steps = [
    ('scaler', MinMaxScaler()),  # rescale each feature to the default (0, 1) range
    ('poly', PolynomialFeatures(degree=10)),  # polynomial terms up to degree 10 (bias column included by default)
    ('lr', Lasso(alpha=0.01))  # L1-penalised linear regression on the expanded features
]

In [28]:
# Assemble the Lasso pipeline from the stages defined in `steps`.
lasso_model = Pipeline(steps=steps)

In [29]:
# Fit every stage of the Lasso pipeline on the training split.
lasso_model.fit(X=train_X, y=train_y)

Pipeline(memory=None,
         steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('poly',
                 PolynomialFeatures(degree=10, include_bias=True,
                                    interaction_only=False, order='C')),
                ('lr',
                 Lasso(alpha=0.01, copy_X=True, fit_intercept=True,
                       max_iter=1000, normalize=False, positive=False,
                       precompute=False, random_state=None, selection='cyclic',
                       tol=0.0001, warm_start=False))],
         verbose=False)

In [30]:
# R2 of the Lasso pipeline on the held-out split.
print('lasso_model R2 Score: {}'.format(lasso_model.score(X=eval_X, y=eval_y)))

lasso_model R2 Score: 0.9342826333586903


In [31]:
# Lasso-pipeline predictions for the held-out rows.
lasso_preds = lasso_model.predict(X=eval_X)

In [32]:
# Mean squared error of the Lasso pipeline on the held-out targets.
print('lasso_model MSE: {}'.format(mean_squared_error(y_true=eval_y, y_pred=lasso_preds)))

lasso_model MSE: 19.27860521162577


In [33]:
# Number of coefficients held by the Lasso step (one per expanded feature column).
print(len(lasso_model.named_steps['lr'].coef_))

1001


In [34]:
# First 35 Lasso coefficients, for comparison with the unregularised degree-10 values.
print(lasso_model.named_steps['lr'].coef_[:35])

[  0.         -70.74812452 -12.4772347    6.06371386  -0.
   0.          -0.           0.          -0.16591481  -0.6369311
   0.          -0.50313613   0.          -0.          -0.
   0.           0.           0.          -0.           0.
   0.          -0.           0.          -0.          -8.11528994
  -0.           0.          -0.           0.          -0.
  -4.77116607  -0.          -0.          -0.          -0.        ]
