# Regression Week 5: Feature Selection and Lasso

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Explicit column dtypes for the kc_house CSV files (passed as dtype= to
# pd.read_csv), grouped by target type.
dtype_dict = {
    column: column_type
    for column_type, columns in (
        (str, ('id', 'date', 'zipcode')),
        (float, ('price', 'bedrooms', 'bathrooms', 'floors', 'lat', 'long',
                 'sqft_living', 'sqft_living15', 'sqft_lot15')),
        (int, ('sqft_lot', 'sqft_above', 'sqft_basement', 'waterfront', 'view',
               'condition', 'grade', 'yr_built', 'yr_renovated')),
    )
    for column in columns
}

In [51]:
# Load the full house-sales table, forcing column types via dtype_dict.
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

## Create new features

In [52]:
from math import log, sqrt

In [53]:
def add_features(sales):
    """Return a copy of `sales` with four derived feature columns added.

    Added columns:
      * sqft_living_sqrt -- square root of `sqft_living`
      * sqft_lot_sqrt    -- square root of `sqft_lot`
      * bedrooms_square  -- `bedrooms` multiplied by itself
      * floors_square    -- `floors` multiplied by itself

    Parameters
    ----------
    sales : pandas.DataFrame
        Must contain the columns `sqft_living`, `sqft_lot`, `bedrooms`
        and `floors`.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched, so re-running
        the calling cell always starts from the same input.
    """
    # assign() builds a new frame instead of writing into the caller's object,
    # and np.sqrt works on the whole column at once rather than row by row.
    return sales.assign(
        sqft_living_sqrt=np.sqrt(sales['sqft_living']),
        sqft_lot_sqrt=np.sqrt(sales['sqft_lot']),
        bedrooms_square=sales['bedrooms'] * sales['bedrooms'],
        floors_square=sales['floors'] * sales['floors'],
    )

In [54]:
# Add the derived square-root and squared columns to the full data set.
sales = add_features(sales)

## Learn regression weights with L1 penalty

In [55]:
from sklearn.linear_model import Lasso

In [56]:
# Candidate inputs: the raw columns plus the derived columns created by
# add_features. The order here is the order of model.coef_.
all_features = [
    'bedrooms',
    'bedrooms_square',
    'bathrooms',
    'sqft_living',
    'sqft_living_sqrt',
    'sqft_lot',
    'sqft_lot_sqrt',
    'floors',
    'floors_square',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

# NOTE(review): `normalize=` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older release -- confirm the pinned version.
model_all = Lasso(alpha=500.0, normalize=True)  # set parameters
model_all.fit(sales[all_features], sales['price'])  # learn weights

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [57]:
# Tabulate the fitted terms of model_all: intercept first, then one row per
# entry of all_features in the same order as coef_.
intercept = model_all.intercept_
coef = model_all.coef_
t = pd.DataFrame({
    'name': ['intercept', *all_features],
    'value': [intercept, *coef.tolist()],
})

In [58]:
# Show only the rows whose fitted value is non-zero.
t[t.value!=0]

Unnamed: 0,name,value
0,intercept,-218136.214035
4,sqft_living,134.439314
11,view,24750.004586
13,grade,61749.103091


## Selecting an L1 penalty

In [40]:
# Pre-made train / validation / test splits, parsed with the same dtype map
# as the full table.
training = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)
testing = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)

In [59]:
# Give every split the same derived columns that were added to `sales`.
training, validation, testing = (
    add_features(split) for split in (training, validation, testing)
)

In [60]:
# Sweep 13 log-spaced penalties from 1e1 to 1e7: fit on the training split and
# record the residual sum of squares on the validation split for each one.
l1_penalty = np.logspace(1, 7, num=13)
RSS = []
for value in l1_penalty:
    model = Lasso(alpha=value, normalize=True).fit(
        training[all_features], training['price']
    )
    validation_error = model.predict(validation[all_features]) - validation['price']
    RSS.append(np.sum(np.square(validation_error)))

In [75]:
# Take the penalty with the smallest validation RSS, refit on the training
# split, and print the RSS on the test split.
l = l1_penalty[np.argmin(RSS)]
model_best = Lasso(alpha=l, normalize=True, max_iter=2000).fit(
    training[all_features], training['price']
)
test_error = model_best.predict(testing[all_features]) - testing['price']
print(np.sum(np.square(test_error)))

98467402552698.78


In [76]:
# Display the penalty chosen from the validation sweep.
l

10.0

In [77]:
# Number of non-zero terms in model_best: non-zero coefficients plus the
# intercept if it is non-zero.
np.count_nonzero(model_best.coef_) + np.count_nonzero(model_best.intercept_)

15

## Limit the number of nonzero weights

In [78]:
# Target number of non-zero terms; the cells below count coefficients plus
# the intercept when comparing against this value.
max_nonzeros = 7

In [79]:
# Coarse grid: 20 log-spaced penalties from 1e1 to 1e4.
l1_penalty_values = np.logspace(1, 4, num=20)

In [80]:
# Fit one Lasso per candidate penalty on the training split and record how
# many terms stay non-zero (coefficients plus intercept).
#
# The loop variable is deliberately not named `l1_penalty`: that name holds
# the penalty grid built in the "Selecting an L1 penalty" section, and
# rebinding it to a scalar here makes `l1_penalty[np.argmin(RSS)]` fail when
# that earlier cell is re-run.
number_of_nnz = []
for penalty in l1_penalty_values:
    model = Lasso(alpha=penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    number_of_nnz.append(
        np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    )

In [82]:
nnz_counts = np.array(number_of_nnz)
# Largest penalty on the grid that still leaves more than max_nonzeros terms.
l1_penalty_min = l1_penalty_values[nnz_counts > max_nonzeros].max()
# Smallest penalty on the grid that leaves fewer than max_nonzeros terms.
l1_penalty_max = l1_penalty_values[nnz_counts < max_nonzeros].min()

In [83]:
# Show the two penalties that bracket the target sparsity.
print(l1_penalty_min, l1_penalty_max)

127.42749857031335 263.6650898730358


## Exploring the narrow range of values to find the solution with the right number of non-zero weights that has the lowest RSS on the validation set

In [84]:
# Fine grid: 20 evenly spaced penalties between the two bracketing values.
l1_penalty_values = np.linspace(l1_penalty_min,l1_penalty_max,20)

In [87]:
# For every penalty on the fine grid: fit on the training split, then record
# the number of non-zero terms (coefficients plus intercept) and the residual
# sum of squares on the validation split.
#
# The loop variable is deliberately not named `l1_penalty`: that name holds
# the penalty grid built in the "Selecting an L1 penalty" section, and
# rebinding it to a scalar here breaks that section when it is re-run.
RSS_validation = []
nnz_validation = []
for penalty in l1_penalty_values:
    model = Lasso(alpha=penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    nnz_validation.append(
        np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    )
    residuals = model.predict(validation[all_features]) - validation['price']
    # np.sum/np.square matches the RSS computation used for the earlier sweep
    # and avoids a Python-level loop over the Series.
    RSS_validation.append(np.sum(np.square(residuals)))
# Arrays so the next cell can filter them with boolean masks.
RSS_validation = np.array(RSS_validation)
nnz_validation = np.array(nnz_validation)

In [100]:
# Among the penalties that give exactly max_nonzeros terms, keep the one with
# the lowest validation RSS.
has_target_nnz = nnz_validation == max_nonzeros
l = l1_penalty_values[has_target_nnz][np.argmin(RSS_validation[has_target_nnz])]

In [101]:
# Display the penalty chosen under the sparsity constraint.
l

156.10909673930755

In [96]:
# Refit on the training split using the penalty held in `l`.
# NOTE(review): the saved output under this cell reports
# alpha=127.42749857031335, not the displayed value of `l`
# (156.10909673930755), and this cell's execution count (96) precedes the
# cell that assigns `l` (100). The saved repr and the coefficient table that
# follows appear to be stale -- restart the kernel and run all cells.
model = Lasso(alpha=l, normalize=True)
model.fit(training[all_features],training['price'])

Lasso(alpha=127.42749857031335, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=True, positive=False, precompute=False,
   random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [97]:
# Tabulate the fitted terms of `model`: intercept first, then one row per
# entry of all_features in the same order as coef_.
intercept = model.intercept_
coef = model.coef_
t = pd.DataFrame({
    'name': ['intercept', *all_features],
    'value': [intercept, *coef.tolist()],
})

In [98]:
# Show only the rows whose fitted value is non-zero.
t[t['value']!=0]

Unnamed: 0,name,value
0,intercept,4827874.0
1,bedrooms,-2786.608
3,bathrooms,16295.28
4,sqft_living,164.9116
7,sqft_lot_sqrt,-48.72995
9,floors_square,390.2552
10,waterfront,528877.0
11,view,42420.81
13,grade,118475.5
16,yr_built,-2827.808
