In [8]:
from sklearn import linear_model
import pandas as pd
import numpy as np
from math import log, sqrt

In [9]:
# Explicit column dtypes for every house-sales CSV read in this notebook.
# Identifier-like columns (id, zipcode, date) are read as strings.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

# Full data set, parsed with the dtypes declared above.
sales = pd.read_csv(filepath_or_buffer='kc_house_data.csv', dtype=dtype_dict)

In [10]:
# Preview the first rows to sanity-check the parsed columns.
sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [11]:
# Add the derived columns used as extra regression features on `sales`.
# Square-root transforms of the two area columns.
for new_col, base_col in [('sqft_living_sqrt', 'sqft_living'),
                          ('sqft_lot_sqrt', 'sqft_lot')]:
    sales[new_col] = sales[base_col].apply(sqrt)

# Squared terms of the two count-like columns.
for new_col, base_col in [('bedrooms_square', 'bedrooms'),
                          ('floors_square', 'floors')]:
    sales[new_col] = sales[base_col] * sales[base_col]

In [12]:
# Ordered list of regression inputs. The order matters: it fixes the
# position of each weight in a fitted model's `coef_` array.
all_features = (
    ['bedrooms', 'bedrooms_square', 'bathrooms']
    + ['sqft_living', 'sqft_living_sqrt']
    + ['sqft_lot', 'sqft_lot_sqrt']
    + ['floors', 'floors_square']
    + ['waterfront', 'view', 'condition', 'grade']
    + ['sqft_above', 'sqft_basement']
    + ['yr_built', 'yr_renovated']
)

In [13]:
# Lasso (L1-penalised linear regression) over every candidate feature,
# fitted on the full `sales` frame with a penalty of 500.
model_all = linear_model.Lasso(alpha=500.0, normalize=True)
# fit() is left as the final expression so the estimator repr is displayed.
model_all.fit(X=sales[all_features], y=sales['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [96]:
# Boolean mask over `all_features`: True where the Lasso kept the feature,
# i.e. gave it a nonzero weight. Compare with `!= 0` rather than `> 0` so
# that features with a negative weight are not silently dropped.
bool1 = model_all.coef_ != 0

In [31]:
# Show the names of the features picked out by the mask `bool1`.
pd.Series(all_features)[bool1]

3     sqft_living
10           view
12          grade
dtype: object

In [32]:
# Load the test / train / validation splits with the same explicit
# dtypes that were used for the full data set.
testing, training, validation = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in (
        'wk3_kc_house_test_data.csv',
        'wk3_kc_house_train_data.csv',
        'wk3_kc_house_valid_data.csv',
    )
)

In [33]:
def _add_derived_features(frame):
    """Add the square-root and squared feature columns to `frame` in place.

    Creates 'sqft_living_sqrt', 'sqft_lot_sqrt', 'bedrooms_square' and
    'floors_square' (in that order) from the matching base columns.
    """
    frame['sqft_living_sqrt'] = frame['sqft_living'].apply(sqrt)
    frame['sqft_lot_sqrt'] = frame['sqft_lot'].apply(sqrt)
    frame['bedrooms_square'] = frame['bedrooms'] * frame['bedrooms']
    frame['floors_square'] = frame['floors'] * frame['floors']


# Every split needs the same derived columns so that `all_features`
# can be selected from each of them.
for split_frame in (testing, training, validation):
    _add_derived_features(split_frame)

In [43]:
# Candidate L1 penalties: 13 log-spaced values from 10**1 to 10**7.
l1_penalty = np.logspace(start=1, stop=7, num=13)

In [44]:
# Display the grid of candidate L1 penalties.
l1_penalty

array([  1.00000000e+01,   3.16227766e+01,   1.00000000e+02,
         3.16227766e+02,   1.00000000e+03,   3.16227766e+03,
         1.00000000e+04,   3.16227766e+04,   1.00000000e+05,
         3.16227766e+05,   1.00000000e+06,   3.16227766e+06,
         1.00000000e+07])

In [45]:
# For each candidate penalty, fit a Lasso on the training split and report
# the residual sum of squares (RSS) on the validation split.
#
# The loop variable deliberately has its own name. Iterating with
# `for l1_penalty in l1_penalty` rebinds the grid to its last scalar value,
# which makes this cell fail on a second run and corrupts `l1_penalty` for
# any later cell that reads it.
for penalty in l1_penalty:
    model = linear_model.Lasso(alpha=penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    predicted_value = model.predict(validation[all_features])
    # RSS = sum of squared prediction errors on the validation prices.
    RSS = np.sum((predicted_value - validation['price'])**2)
    print("RSS for %d l1-penalty is %d" %(penalty, RSS))

RSS for 10 l1-penalty is 398213327300134
RSS for 31 l1-penalty is 399041900253348
RSS for 100 l1-penalty is 429791604072557
RSS for 316 l1-penalty is 463739831045119
RSS for 1000 l1-penalty is 645898733633810
RSS for 3162 l1-penalty is 1222506859427156
RSS for 10000 l1-penalty is 1222506859427156
RSS for 31622 l1-penalty is 1222506859427156
RSS for 100000 l1-penalty is 1222506859427156
RSS for 316227 l1-penalty is 1222506859427156
RSS for 1000000 l1-penalty is 1222506859427156
RSS for 3162277 l1-penalty is 1222506859427156
RSS for 10000000 l1-penalty is 1222506859427156


In [46]:
# Refit with alpha=10 (the penalty with the lowest validation RSS in the
# search output above) and measure the RSS on the held-out test split.
model_best = linear_model.Lasso(alpha=10, normalize=True)
model_best.fit(X=training[all_features], y=training['price'])

predict_test = model_best.predict(testing[all_features])
test_residuals = predict_test - testing['price']
RSS = np.sum(test_residuals ** 2)
print("RSS for test dataset is %d" % RSS)

RSS for test dataset is 98467402552698


In [47]:
# Learned weights of model_best, in the same order as `all_features`.
model_best.coef_

array([ -1.61445628e+04,   3.73245384e+02,   5.08412433e+04,
         6.17853560e+02,  -4.44113549e+04,   7.85623065e-01,
        -7.01194765e+02,  -0.00000000e+00,   5.01420046e+03,
         6.19488752e+05,   3.80418557e+04,   2.49987718e+04,
         1.28716235e+05,   0.00000000e+00,   0.00000000e+00,
        -3.29383118e+03,   1.00573209e+01])

In [48]:
# Learned intercept term of model_best.
model_best.intercept_

6630155.6686283695

In [49]:
# Sparsity of model_best: the number of nonzero feature weights, plus one
# if the intercept is nonzero as well.
n_nonzero_weights = np.count_nonzero(model_best.coef_)
n_nonzero_intercept = np.count_nonzero(model_best.intercept_)
n_nonzero_weights + n_nonzero_intercept

15

In [74]:
# Target sparsity: the number of nonzero parameters the final model
# should have.
max_nonzeros = 7

# Coarse search: for 20 log-spaced penalties between 10**1 and 10**4,
# fit a Lasso on the training split and report how many parameters
# (weights plus intercept) are nonzero.
for i in np.logspace(1, 4, num=20):
    # Call-style print works on both Python 2 and Python 3; the statement
    # form `print i` is a SyntaxError on Python 3.
    print(i)
    model_i = linear_model.Lasso(alpha=i, normalize=True)
    model_i.fit(training[all_features], training['price'])
    temp = (np.count_nonzero(model_i.coef_) + np.count_nonzero(model_i.intercept_))
    print("no of non zero coefficient for l1_penalty %d is %d" % (i, temp))

10.0
no of non zero coefficient for l1_penalty 10 is 15
14.3844988829
no of non zero coefficient for l1_penalty 14 is 15
20.6913808111
no of non zero coefficient for l1_penalty 20 is 15
29.7635144163
no of non zero coefficient for l1_penalty 29 is 15
42.8133239872
no of non zero coefficient for l1_penalty 42 is 13
61.5848211066
no of non zero coefficient for l1_penalty 61 is 12
88.586679041
no of non zero coefficient for l1_penalty 88 is 11
127.42749857
no of non zero coefficient for l1_penalty 127 is 10
183.298071083
no of non zero coefficient for l1_penalty 183 is 7
263.665089873
no of non zero coefficient for l1_penalty 263 is 6
379.269019073
no of non zero coefficient for l1_penalty 379 is 6
545.559478117
no of non zero coefficient for l1_penalty 545 is 6
784.759970351
no of non zero coefficient for l1_penalty 784 is 5
1128.83789168
no of non zero coefficient for l1_penalty 1128 is 3
1623.77673919
no of non zero coefficient for l1_penalty 1623 is 3
2335.72146909
no of non zero coef

In [72]:
# Bounds for the narrow penalty search, read off the coarse search output:
#   min -> largest penalty that still gave more than 7 nonzeros (10 nonzeros)
#   max -> smallest penalty that gave fewer than 7 nonzeros (6 nonzeros)
l1_penalty_min, l1_penalty_max = 127.42749857, 263.665089873

In [86]:
# Narrow search: 20 evenly spaced penalties between the two bounds.
# For each one, report the sparsity of the fitted model and its RSS on
# the validation split.
narrow_grid = np.linspace(l1_penalty_min, l1_penalty_max, num=20)
train_X, train_y = training[all_features], training['price']
valid_X, valid_y = validation[all_features], validation['price']

for i in narrow_grid:
    model = linear_model.Lasso(alpha=i, normalize=True).fit(train_X, train_y)
    # Nonzero parameters = nonzero weights plus a nonzero intercept.
    temp = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    predict = model.predict(valid_X)
    RSS = np.sum((predict - valid_y)**2)
    print("(l1_penalty, no_of_nonZero_variables, RSS) ==> (%f, %d, %f)" % (i, temp, RSS))

(l1_penalty, no_of_nonZero_variables, RSS) ==> (127.427499, 10, 435374677102612.500000)
(l1_penalty, no_of_nonZero_variables, RSS) ==> (134.597898, 10, 437009229124402.625000)
(l1_penalty, no_of_nonZero_variables, RSS) ==> (141.768298, 8, 438236128386877.062500)
(l1_penalty, no_of_nonZero_variables, RSS) ==> (148.938697, 8, 439158937799623.875000)
(l1_penalty, no_of_nonZero_variables, RSS) ==> (156.109097, 7, 440037365263289.875000)
(l1_penalty, no_of_nonZero_variables, RSS) ==> (163.279496, 7, 440777489641579.000000)
(l1_penalty, no_of_nonZero_variables, RSS) ==> (170.449896, 7, 441566698090113.812500)
(l1_penalty, no_of_nonZero_variables, RSS) ==> (177.620295, 7, 442406413188640.250000)
(l1_penalty, no_of_nonZero_variables, RSS) ==> (184.790695, 7, 443296716874289.625000)
(l1_penalty, no_of_nonZero_variables, RSS) ==> (191.961094, 7, 444239780526117.687500)
(l1_penalty, no_of_nonZero_variables, RSS) ==> (199.131494, 7, 445230739842591.187500)
(l1_penalty, no_of_nonZero_variables, RSS

In [87]:
# Final model: alpha=156.109097 is the first penalty in the narrow search
# output that reaches 7 nonzero parameters, and it has the lowest
# validation RSS among the 7-parameter rows shown.
final_model = linear_model.Lasso(alpha=156.109097, normalize=True).fit(
    training[all_features], training['price']
)
# Display the learned weights (ordered as in `all_features`).
final_model.coef_

array([ -0.00000000e+00,  -0.00000000e+00,   1.06108902e+04,
         1.63380252e+02,   0.00000000e+00,  -0.00000000e+00,
        -0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         5.06451687e+05,   4.19600436e+04,   0.00000000e+00,
         1.16253554e+05,   0.00000000e+00,   0.00000000e+00,
        -2.61223488e+03,   0.00000000e+00])

In [98]:
# Names of the features whose weight in final_model is nonzero.
pd.Series(all_features)[final_model.coef_ != 0]

2       bathrooms
3     sqft_living
9      waterfront
10           view
12          grade
15       yr_built
dtype: object

In [97]:
# Display the full feature list, to map coefficient positions to names.
all_features

['bedrooms',
 'bedrooms_square',
 'bathrooms',
 'sqft_living',
 'sqft_living_sqrt',
 'sqft_lot',
 'sqft_lot_sqrt',
 'floors',
 'floors_square',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated']

In [100]:
# Display the 20-point log-spaced penalty grid (same arguments as the
# coarse sparsity search) at full precision.
np.logspace(1, 4, num=20)

array([    10.        ,     14.38449888,     20.69138081,     29.76351442,
           42.81332399,     61.58482111,     88.58667904,    127.42749857,
          183.29807108,    263.66508987,    379.26901907,    545.55947812,
          784.75997035,   1128.83789168,   1623.77673919,   2335.72146909,
         3359.81828628,   4832.93023857,   6951.92796178,  10000.        ])