In [1]:
import pandas as pd

acs = pd.read_csv('data/acs_ny.csv')

print(acs.columns)

Index(['Acres', 'FamilyIncome', 'FamilyType', 'NumBedrooms', 'NumChildren',
       'NumPeople', 'NumRooms', 'NumUnits', 'NumVehicles', 'NumWorkers',
       'OwnRent', 'YearBuilt', 'HouseCosts', 'ElectricBill', 'FoodStamp',
       'HeatingFuel', 'Insurance', 'Language'],
      dtype='object')


In [2]:
from patsy import dmatrices

In [5]:
response, predictors = dmatrices(

    'FamilyIncome ~ NumBedrooms + NumChildren + NumPeople + NumRooms + NumUnits + NumVehicles + NumWorkers + OwnRent + ' 
'YearBuilt + ElectricBill + FoodStamp + HeatingFuel + ' 
    'Insurance + Language',

    data=acs

)

In [8]:
predictors

DesignMatrix with shape (22745, 39)
  Columns:
    ['Intercept',
     'NumUnits[T.Single attached]',
     'NumUnits[T.Single detached]',
     'OwnRent[T.Outright]',
     'OwnRent[T.Rented]',
     'YearBuilt[T.1940-1949]',
     'YearBuilt[T.1950-1959]',
     'YearBuilt[T.1960-1969]',
     'YearBuilt[T.1970-1979]',
     'YearBuilt[T.1980-1989]',
     'YearBuilt[T.1990-1999]',
     'YearBuilt[T.2000-2004]',
     'YearBuilt[T.2005]',
     'YearBuilt[T.2006]',
     'YearBuilt[T.2007]',
     'YearBuilt[T.2008]',
     'YearBuilt[T.2009]',
     'YearBuilt[T.2010]',
     'YearBuilt[T.Before 1939]',
     'FoodStamp[T.Yes]',
     'HeatingFuel[T.Electricity]',
     'HeatingFuel[T.Gas]',
     'HeatingFuel[T.None]',
     'HeatingFuel[T.Oil]',
     'HeatingFuel[T.Other]',
     'HeatingFuel[T.Solar]',
     'HeatingFuel[T.Wood]',
     'Language[T.English]',
     'Language[T.Other]',
     'Language[T.Other European]',
     'Language[T.Spanish]',
     'NumBedrooms',
     'NumChildren',
     'NumPeople',


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(predictors, response,
                                                    random_state=0)

In [9]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression(normalize=True).fit(X_train, y_train)



model_coefs = pd.DataFrame(list(zip(predictors.design_info.column_names,

                                    lr.coef_[0])),

                          columns=['variable', 'coef_lr'])

print(model_coefs)

                       variable       coef_lr
0                     Intercept  3.522660e-11
1   NumUnits[T.Single attached]  3.135646e+04
2   NumUnits[T.Single detached]  2.418368e+04
3           OwnRent[T.Outright]  2.839186e+04
4             OwnRent[T.Rented]  7.229586e+03
5        YearBuilt[T.1940-1949]  1.292169e+04
6        YearBuilt[T.1950-1959]  2.057793e+04
7        YearBuilt[T.1960-1969]  1.764835e+04
8        YearBuilt[T.1970-1979]  1.756881e+04
9        YearBuilt[T.1980-1989]  2.552566e+04
10       YearBuilt[T.1990-1999]  2.983944e+04
11       YearBuilt[T.2000-2004]  3.012502e+04
12            YearBuilt[T.2005]  4.318648e+04
13            YearBuilt[T.2006]  3.242038e+04
14            YearBuilt[T.2007]  3.562061e+04
15            YearBuilt[T.2008]  3.712470e+04
16            YearBuilt[T.2009]  3.035133e+04
17            YearBuilt[T.2010]  7.364529e+04
18     YearBuilt[T.Before 1939]  1.218711e+04
19             FoodStamp[T.Yes] -2.745712e+04
20   HeatingFuel[T.Electricity]  1

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




In [10]:
print(lr.score(X_train, y_train))

0.2726140465638568


In [11]:
print(lr.score(X_test, y_test))

0.26976979568488124


In [14]:
from sklearn.linear_model import Lasso

lasso = Lasso(normalize=True, random_state=0).fit(X_test, y_test)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


In [15]:
coefs_lasso = pd.DataFrame(

  list(zip(predictors.design_info.column_names, lasso.coef_)),

  columns=['variable', 'coef_lasso'])

In [16]:
model_coefs = pd.merge(model_coefs, coefs_lasso, on='variable')

print(model_coefs)

                       variable       coef_lr    coef_lasso
0                     Intercept  3.522660e-11      0.000000
1   NumUnits[T.Single attached]  3.135646e+04  23847.097905
2   NumUnits[T.Single detached]  2.418368e+04  20278.620009
3           OwnRent[T.Outright]  2.839186e+04  30153.611697
4             OwnRent[T.Rented]  7.229586e+03   1440.140884
5        YearBuilt[T.1940-1949]  1.292169e+04  -6382.312453
6        YearBuilt[T.1950-1959]  2.057793e+04   -905.142030
7        YearBuilt[T.1960-1969]  1.764835e+04     -0.000000
8        YearBuilt[T.1970-1979]  1.756881e+04  -1579.827129
9        YearBuilt[T.1980-1989]  2.552566e+04   7854.066748
10       YearBuilt[T.1990-1999]  2.983944e+04   1355.026160
11       YearBuilt[T.2000-2004]  3.012502e+04  11212.207583
12            YearBuilt[T.2005]  4.318648e+04   8770.315635
13            YearBuilt[T.2006]  3.242038e+04  34814.310436
14            YearBuilt[T.2007]  3.562061e+04  27415.800873
15            YearBuilt[T.2008]  3.71247

In [17]:
print(lasso.score(X_train, y_train))

0.26670104659430227


In [18]:
print(lasso.score(X_test, y_test))

0.27506204638605314


In [21]:
from sklearn.linear_model import Ridge

ridge = Ridge(normalize=True, random_state=0).fit(X_train, y_train)



coefs_ridge = pd.DataFrame(

  list(zip(predictors.design_info.column_names, ridge.coef_[0])),

  columns=['variable', 'coef_ridge'])



model_coefs = pd.merge(model_coefs, coefs_ridge, on='variable')

print(model_coefs)

                       variable       coef_lr    coef_lasso    coef_ridge
0                     Intercept  3.522660e-11      0.000000      0.000000
1   NumUnits[T.Single attached]  3.135646e+04  23847.097905   4571.129321
2   NumUnits[T.Single detached]  2.418368e+04  20278.620009   4514.956813
3           OwnRent[T.Outright]  2.839186e+04  30153.611697  10674.890982
4             OwnRent[T.Rented]  7.229586e+03   1440.140884 -10180.631863
5        YearBuilt[T.1940-1949]  1.292169e+04  -6382.312453  -3672.096659
6        YearBuilt[T.1950-1959]  2.057793e+04   -905.142030   1221.616020
7        YearBuilt[T.1960-1969]  1.764835e+04     -0.000000    -15.801437
8        YearBuilt[T.1970-1979]  1.756881e+04  -1579.827129  -1868.746915
9        YearBuilt[T.1980-1989]  2.552566e+04   7854.066748   2664.343363
10       YearBuilt[T.1990-1999]  2.983944e+04   1355.026160   4079.639281
11       YearBuilt[T.2000-2004]  3.012502e+04  11212.207583   5615.285677
12            YearBuilt[T.2005]  4.318

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 


In [22]:
from sklearn.linear_model import ElasticNet



en = ElasticNet(random_state=42).fit(X_train, y_train)

In [23]:
coefs_en = pd.DataFrame(

  list(zip(predictors.design_info.

  column_names, en.coef_)), columns=['variable', 'coef_en'])



model_coefs = pd.merge(model_coefs, coefs_en, on='variable')

print(model_coefs)

                       variable       coef_lr    coef_lasso    coef_ridge  \
0                     Intercept  3.522660e-11      0.000000      0.000000   
1   NumUnits[T.Single attached]  3.135646e+04  23847.097905   4571.129321   
2   NumUnits[T.Single detached]  2.418368e+04  20278.620009   4514.956813   
3           OwnRent[T.Outright]  2.839186e+04  30153.611697  10674.890982   
4             OwnRent[T.Rented]  7.229586e+03   1440.140884 -10180.631863   
5        YearBuilt[T.1940-1949]  1.292169e+04  -6382.312453  -3672.096659   
6        YearBuilt[T.1950-1959]  2.057793e+04   -905.142030   1221.616020   
7        YearBuilt[T.1960-1969]  1.764835e+04     -0.000000    -15.801437   
8        YearBuilt[T.1970-1979]  1.756881e+04  -1579.827129  -1868.746915   
9        YearBuilt[T.1980-1989]  2.552566e+04   7854.066748   2664.343363   
10       YearBuilt[T.1990-1999]  2.983944e+04   1355.026160   4079.639281   
11       YearBuilt[T.2000-2004]  3.012502e+04  11212.207583   5615.285677   

In [25]:
from sklearn.linear_model import ElasticNetCV



en_cv = ElasticNetCV(cv=5, random_state=42).fit(X_train, y_train)

coefs_en_cv = pd.DataFrame(

  list(zip(predictors.design_info.

  column_names, en_cv.coef_)), columns=['variable', 'coef_en_cv'])



model_coefs = pd.merge(model_coefs, coefs_en_cv, on='variable')

print(model_coefs)

                       variable       coef_lr    coef_lasso    coef_ridge  \
0                     Intercept  3.522660e-11      0.000000      0.000000   
1   NumUnits[T.Single attached]  3.135646e+04  23847.097905   4571.129321   
2   NumUnits[T.Single detached]  2.418368e+04  20278.620009   4514.956813   
3           OwnRent[T.Outright]  2.839186e+04  30153.611697  10674.890982   
4             OwnRent[T.Rented]  7.229586e+03   1440.140884 -10180.631863   
5        YearBuilt[T.1940-1949]  1.292169e+04  -6382.312453  -3672.096659   
6        YearBuilt[T.1950-1959]  2.057793e+04   -905.142030   1221.616020   
7        YearBuilt[T.1960-1969]  1.764835e+04     -0.000000    -15.801437   
8        YearBuilt[T.1970-1979]  1.756881e+04  -1579.827129  -1868.746915   
9        YearBuilt[T.1980-1989]  2.552566e+04   7854.066748   2664.343363   
10       YearBuilt[T.1990-1999]  2.983944e+04   1355.026160   4079.639281   
11       YearBuilt[T.2000-2004]  3.012502e+04  11212.207583   5615.285677   

  y = column_or_1d(y, warn=True)
