In [1]:
# Title:   Python Regression Examples
# Author:  Peter Scarbrough
# Date:    24 Jan 2020
# Purpose: Practice regression in Python, leading to pipeline creation

In [12]:
# 1. Load required packages
import math
import numpy as np
import pandas as pd
import sklearn.datasets as datasets
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [3]:
# 2. Load, tidy data (boston housing)
bdata = datasets.load_boston()

# convert to pandas dataframe
b_df = pd.DataFrame(bdata.data, columns=bdata.feature_names)
b_df["MEDV"] = bdata.target

# print data frame to inspect
# note: skipping EDA for this exercise
#       but pandas df usually a good starting point for that
print(b_df)

        CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0    0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1    0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2    0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3    0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4    0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   
..       ...   ...    ...   ...    ...    ...   ...     ...  ...    ...   
501  0.06263   0.0  11.93   0.0  0.573  6.593  69.1  2.4786  1.0  273.0   
502  0.04527   0.0  11.93   0.0  0.573  6.120  76.7  2.2875  1.0  273.0   
503  0.06076   0.0  11.93   0.0  0.573  6.976  91.0  2.1675  1.0  273.0   
504  0.10959   0.0  11.93   0.0  0.573  6.794  89.3  2.3889  1.0  273.0   
505  0.04741   0.0  11.93   0.0  0.573  6.030  80.8  2.5050  1.0  273.0   

     PTRATIO       B  LSTAT  MEDV  
0       15.3  396.90   4.98  24.0  
1       17.8  396.90   9.14

In [4]:
# 3. linear regression modeling - include all predictors
#    note: this model is overfitted

# get features, response variable
X = b_df.iloc[:,:13]
y = b_df["MEDV"]

# use sklearn.linear_model.LinearRegression
lr    = LinearRegression()
model = lr.fit(X, y)

# get model R2
r2 = model.score(X, y)
print(r2)

0.7406426641094095


In [5]:
# 4. Linear Regression w/Regularization - Lasso Method

# scale X (mean = 0, std = 1)
scaler = StandardScaler()
stdX   = scaler.fit_transform(X)

# do lasso regression
lr_lasso    = Lasso(alpha=1)
model_lasso = lr_lasso.fit(stdX, y) 

# get model R2
r2_lasso = model_lasso.score(stdX, y)
print(r2_lasso)

0.6628192101128285


In [6]:
# 5. optimize lasso method (also a kind of feature selection)

# split into training and test subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12345)

# create a pipeline for model selection 
# include feature selection (alpha) and standardization (required for lasso)
pipeline      = Pipeline([("standardize", StandardScaler()),
                          ("classifier", Lasso())])
alpha_space   = np.logspace(-3, 3, 30)
hyperp        = [{"classifier__alpha": alpha_space}]
lasso_modeler = GridSearchCV(pipeline, hyperp, cv=10, verbose=0, iid=False)
lasso_fit     = lasso_modeler.fit(X_train, y_train)

# get optimal alpha estimate from test data using lasso modeling object
optimal_alpha = lasso_fit.best_params_['classifier__alpha']
print("Optimal alpha for lasso model: ", optimal_alpha)

Optimal alpha for lasso model:  0.02807216203941177


In [7]:
# 6. refit model with test data, calculate R2

# standardize predictor (X) test data
scaler = StandardScaler()
std_X_test = scaler.fit(X_test).transform(X_test)

# make lasso fit using the putative 'optimal_alpha' value
lasso_fit_test = Lasso(alpha=optimal_alpha).fit(std_X_test, y_test)

# get R^2 estimate from test data modeling
lasso_fit_test_r2 = lasso_fit_test.score(std_X_test, y_test)
print("Estimated R2 from optimized lasso model: ", lasso_fit_test_r2)

Estimated R2 from optimized lasso model:  0.7104744913539625


In [38]:
# 7. Improve pipeline, add ridge regression
#    test lasso and ridge while tuning hyperparameters with standardization
#    also include option to allow parallelization of calculation

# note: feature selection didn't work with lasso optimization
#       lack of zeroes suggests no automatic feature selection
print("lasso model coefficients: ", lasso_fit_test.coef_, "", sep="\n") 

# so it's worth trying ridge regression as well. let's incorporate into pipeline...

# split into training and test subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=111)

# create a pipeline for model selection 
# include feature selection (alpha) and standardization (required for lasso)
pipeline      = Pipeline([("standardize", StandardScaler()),
                          ("classifier", Ridge())])
alpha_space    = np.logspace(-3, 3, 30)
hyperp         = [{"classifier": [Lasso()],
                   "classifier__alpha": alpha_space},
                  {"classifier": [Ridge()],
                   "classifier__alpha": alpha_space}]
linear_modeler = GridSearchCV(pipeline, hyperp, cv=10, verbose=0, iid=False, n_jobs=-1)
linear_fit     = linear_modeler.fit(X_train, y_train)

# get best model information
print("best model information: ", linear_fit.best_params_['classifier'], sep="\n")

# suggests the Ridge model may perform slightly better
optimal_alpha = linear_fit.best_params_['classifier__alpha']

lasso model coefficients: 
[-0.5903414   0.94604396 -0.00668398  0.3508278  -2.1576922   1.07793228
  0.52763615 -3.74778731  3.24525877 -2.36980758 -1.83938538  1.87885042
 -4.59068475]

best model information: 
Ridge(alpha=13.738237958832638, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)


In [39]:
# 8. estimate R2 from final pipeline product

# standardize predictor (X) test data
scaler = StandardScaler()
std_X_test = scaler.fit(X_test).transform(X_test)

# make lasso fit using the putative 'optimal_alpha' value
linear_fit_test = Ridge(alpha=optimal_alpha).fit(std_X_test, y_test)

# get R^2 estimate from test data modeling
linear_fit_test_r2 = linear_fit_test.score(std_X_test, y_test)
print("Estimated R2 from optimized lasso model: ", linear_fit_test_r2)

Estimated R2 from optimized lasso model:  0.7560225184329137


In [None]:
# 9. final comments

# regularization improved the R2 estimate from even the overfitted 
# original naive linear model that included all predictors

# this is a promising first step in linear modeling, however repeated
# runs suggested this estimate is unstable, with the alpha levels and
# ridge/lasso model selection varying from run to run

# this suggests it may be worth trying other machine learning models
# playing around with feature engineering or adjusting some of the
# cross-validation parameters in order to achieve a more stable result