In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
%matplotlib inline

In [2]:
# Explicit column dtypes passed to pd.read_csv so every CSV in this notebook
# is parsed the same way (identifiers and dates stay as strings).
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

# Load the full sales table using the dtypes declared in dtype_dict.
sales = pd.read_csv(
    filepath_or_buffer='kc_house_data.csv',
    dtype=dtype_dict,
)

In [3]:
# Derived features: square roots of the area columns and squares of the
# bedroom/floor counts. The NumPy ufuncs are applied to whole columns at once
# instead of calling a Python lambda per row; the resulting values are the same.
sales['sqft_living_sqrt'] = np.sqrt(sales['sqft_living'])
sales['sqft_lot_sqrt'] = np.sqrt(sales['sqft_lot'])
sales['bedrooms_square'] = np.square(sales['bedrooms'])
sales['floors_square'] = np.square(sales['floors'])

In [5]:
# Model inputs: the raw columns plus the engineered sqrt/square columns.
# The order here is the column order handed to every Lasso fit below.
all_features = [
    'bedrooms',
    'bedrooms_square',
    'bathrooms',
    'sqft_living',
    'sqft_living_sqrt',
    'sqft_lot',
    'sqft_lot_sqrt',
    'floors',
    'floors_square',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

In [6]:
# Lasso on the full sales table with an L1 penalty of 500.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the installed version still accepts it.
model_all = linear_model.Lasso(alpha=500.0, normalize=True)
# Fit on every feature column against price; the fitted estimator is the
# cell's displayed value.
model_all.fit(X=sales[all_features], y=sales['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [48]:
# Label each fitted coefficient of model_all with its feature name.
pd.Series(data=model_all.coef_, index=all_features)

bedrooms                0.000000
bedrooms_square         0.000000
bathrooms               0.000000
sqft_living           134.439314
sqft_living_sqrt        0.000000
sqft_lot                0.000000
sqft_lot_sqrt           0.000000
floors                  0.000000
floors_square           0.000000
waterfront              0.000000
view                24750.004586
condition               0.000000
grade               61749.103091
sqft_above              0.000000
sqft_basement           0.000000
yr_built               -0.000000
yr_renovated            0.000000
dtype: float64

In [8]:
# Show the intercept term learned by model_all.
model_all.intercept_

-218136.21403514093

In [9]:
# Load the pre-made test / train / validation splits (in that order) with the
# same dtypes as the full table.
testing, training, validation = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in (
        'wk3_kc_house_test_data.csv',
        'wk3_kc_house_train_data.csv',
        'wk3_kc_house_valid_data.csv',
    )
)

In [10]:
def _add_engineered_features(frame):
    """Add the sqrt / squared feature columns to ``frame`` in place.

    Adds ``sqft_living_sqrt``, ``sqft_lot_sqrt``, ``bedrooms_square`` and
    ``floors_square``, computed column-wise with NumPy ufuncs rather than a
    per-row Python lambda.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``sqft_living``, ``sqft_lot``, ``bedrooms`` and ``floors``.

    Returns
    -------
    None
        ``frame`` is modified in place.
    """
    frame['sqft_living_sqrt'] = np.sqrt(frame['sqft_living'])
    frame['sqft_lot_sqrt'] = np.sqrt(frame['sqft_lot'])
    frame['bedrooms_square'] = np.square(frame['bedrooms'])
    frame['floors_square'] = np.square(frame['floors'])


# Every split gets exactly the same derived columns, so apply one helper to
# each instead of repeating the four assignments three times.
for split_frame in (training, testing, validation):
    _add_engineered_features(split_frame)

In [29]:
# Coarse grid: 13 log-spaced L1 penalties from 10**1 up to 10**7.
l1_penalty = np.logspace(start=1, stop=7, num=13)

In [30]:
class lasso:
    """Fit one Lasso model on the training split and score it on the others.

    The constructor only stores the three (features, target) pairs and the
    L1 penalty; all work happens in :meth:`fit`.
    """

    def __init__(self, train_x, train_y, val_x, val_y, test_x, test_y, penalty):
        """Keep references to the train / validation / test data and the penalty."""
        self.train_x, self.train_y = train_x, train_y
        self.val_x, self.val_y = val_x, val_y
        self.test_x, self.test_y = test_x, test_y
        self.penalty = penalty

    def fit(self):
        """Train the model and evaluate it.

        Stores the fitted coefficients on ``self.coff`` and the intercept on
        ``self.intercept``.

        Returns
        -------
        tuple
            ``(validation RSS, test RSS, non-zero count)`` where the count is
            the number of non-zero coefficients plus one if the intercept is
            non-zero.
        """
        # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and
        # removed in 1.2 — confirm the installed version still accepts it.
        estimator = linear_model.Lasso(alpha=self.penalty, normalize=True)
        estimator.fit(self.train_x, self.train_y)

        self.coff = estimator.coef_
        self.intercept = estimator.intercept_

        # Score validation first, then test, with the same RSS helper.
        held_out = ((self.val_x, self.val_y), (self.test_x, self.test_y))
        val_rss, test_rss = [
            self.RSS_calc(estimator.predict(features), target)
            for features, target in held_out
        ]

        non_zero = np.count_nonzero(self.coff) + np.count_nonzero(self.intercept)
        return val_rss, test_rss, non_zero

    def RSS_calc(self, prediction, target):
        """Return the residual sum of squares between ``prediction`` and ``target``."""
        residuals = prediction - target
        return np.sum(residuals ** 2)
        

In [31]:
# Coarse sweep: fit one Lasso per penalty in l1_penalty and record its
# validation RSS, test RSS and number of non-zero parameters.

# These slices do not depend on the penalty, so build them once instead of on
# every loop iteration.
train_x = training[all_features]
train_y = training['price']
test_x = testing[all_features]
test_y = testing['price']
val_x = validation[all_features]
val_y = validation['price']

# Collect plain rows and build the frame once: avoids enlarging the DataFrame
# row by row with a hand-maintained index counter, and lets pandas infer
# numeric dtypes for every column.
summary_rows = []
for l1 in l1_penalty:
    model = lasso(train_x=train_x, train_y=train_y, val_x=val_x, val_y=val_y,
                  test_x=test_x, test_y=test_y, penalty=l1)
    val_rss, test_rss, nonzero = model.fit()
    summary_rows.append([l1, val_rss, test_rss, nonzero])

summary = pd.DataFrame(
    summary_rows,
    columns=['L1 Penalty', 'Validation RSS', 'Test RSS', 'Non Zero'],
)

# Best validation fit first; rebinding (no inplace=True) keeps the cell
# safe to re-run.
summary = summary.sort_values('Validation RSS')






In [32]:
# Display the coarse-sweep results (sorted by validation RSS).
summary

Unnamed: 0,L1 Penalty,Validation RSS,Test RSS,Non Zero
0,10.0,398213300000000.0,98467400000000.0,15.0
1,31.62278,399041900000000.0,99776330000000.0,15.0
2,100.0,429791600000000.0,107020700000000.0,11.0
3,316.2278,463739800000000.0,114213700000000.0,6.0
4,1000.0,645898700000000.0,151694100000000.0,4.0
5,3162.278,1222507000000000.0,284718900000000.0,1.0
6,10000.0,1222507000000000.0,284718900000000.0,1.0
7,31622.78,1222507000000000.0,284718900000000.0,1.0
8,100000.0,1222507000000000.0,284718900000000.0,1.0
9,316227.8,1222507000000000.0,284718900000000.0,1.0


In [33]:
# Target sparsity for the selected model (compared against the 'Non Zero' column).
max_nonzeros = 7
# Finer grid: 20 log-spaced L1 penalties from 10**1 up to 10**4.
l1_penalty = np.logspace(start=1, stop=4, num=20)


In [34]:
# Inspect the penalty grid that the next sweep iterates over.
l1_penalty

array([    10.        ,     14.38449888,     20.69138081,     29.76351442,
           42.81332399,     61.58482111,     88.58667904,    127.42749857,
          183.29807108,    263.66508987,    379.26901907,    545.55947812,
          784.75997035,   1128.83789168,   1623.77673919,   2335.72146909,
         3359.81828628,   4832.93023857,   6951.92796178,  10000.        ])

In [35]:
# Second sweep over the current l1_penalty grid: one Lasso fit per penalty,
# recording validation RSS, test RSS and the non-zero parameter count.
train_x = training[all_features]
train_y = training['price']
test_x = testing[all_features]
test_y = testing['price']
val_x = validation[all_features]
val_y = validation['price']

# Collect plain rows and build the frame once: avoids enlarging the DataFrame
# row by row with a hand-maintained index counter, and lets pandas infer
# numeric dtypes for every column.
search_rows = []
for l1 in l1_penalty:
    model = lasso(train_x=train_x, train_y=train_y, val_x=val_x, val_y=val_y,
                  test_x=test_x, test_y=test_y, penalty=l1)
    val_rss, test_rss, nonzero = model.fit()
    search_rows.append([l1, val_rss, test_rss, nonzero])

# Rows keep the grid order, so the default 0..n-1 index matches the position
# of each penalty in l1_penalty.
search = pd.DataFrame(
    search_rows,
    columns=['L1 Penalty', 'Validation RSS', 'Test RSS', 'Non Zero'],
)

In [36]:
# Display the per-penalty results of the sweep above.
search

Unnamed: 0,L1 Penalty,Validation RSS,Test RSS,Non Zero
0,10.0,398213300000000.0,98467400000000.0,15.0
1,14.384499,396831800000000.0,98466490000000.0,15.0
2,20.691381,396210900000000.0,98698790000000.0,15.0
3,29.763514,398215500000000.0,99535880000000.0,15.0
4,42.813324,406877300000000.0,101711800000000.0,13.0
5,61.584821,424647500000000.0,105687900000000.0,12.0
6,88.586679,427906300000000.0,106558300000000.0,11.0
7,127.427499,435374700000000.0,108321000000000.0,10.0
8,183.298071,443107200000000.0,110002300000000.0,7.0
9,263.66509,454176700000000.0,112271300000000.0,6.0


In [37]:
# Bracket the target sparsity using the `search` results instead of numbers
# copied by hand from the printed table (previously 127.427499 / 263.665090):
#   l1_min — largest penalty whose model still has MORE than max_nonzeros
#            non-zero parameters;
#   l1_max — smallest penalty whose model has FEWER than max_nonzeros.
l1_min = search.loc[search['Non Zero'] > max_nonzeros, 'L1 Penalty'].max()
l1_max = search.loc[search['Non Zero'] < max_nonzeros, 'L1 Penalty'].min()

# Fails loudly if either side of the bracket is missing (NaN) or the bracket
# is inverted, rather than silently sweeping a meaningless range.
assert l1_min < l1_max, "search results do not bracket max_nonzeros"

# 20 evenly spaced penalties across the bracket for the final, narrow sweep.
l1_penalty = np.linspace(l1_min, l1_max, 20)

In [38]:
# Inspect the linearly spaced penalty grid between l1_min and l1_max.
l1_penalty

array([ 127.427499  ,  134.59789853,  141.76829805,  148.93869758,
        156.10909711,  163.27949663,  170.44989616,  177.62029568,
        184.79069521,  191.96109474,  199.13149426,  206.30189379,
        213.47229332,  220.64269284,  227.81309237,  234.98349189,
        242.15389142,  249.32429095,  256.49469047,  263.66509   ])

In [39]:
# Narrow sweep over the current l1_penalty grid: one Lasso fit per penalty,
# recording validation RSS, test RSS and the non-zero parameter count.
train_x = training[all_features]
train_y = training['price']
test_x = testing[all_features]
test_y = testing['price']
val_x = validation[all_features]
val_y = validation['price']

# Collect plain rows and build the frame once: avoids enlarging the DataFrame
# row by row with a hand-maintained index counter, and lets pandas infer
# numeric dtypes for every column.
optimal_rows = []
for l1 in l1_penalty:
    model = lasso(train_x=train_x, train_y=train_y, val_x=val_x, val_y=val_y,
                  test_x=test_x, test_y=test_y, penalty=l1)
    val_rss, test_rss, nonzero = model.fit()
    optimal_rows.append([l1, val_rss, test_rss, nonzero])

# Rows keep the grid order, so the default 0..n-1 index matches the position
# of each penalty in l1_penalty.
optimal = pd.DataFrame(
    optimal_rows,
    columns=['L1 Penalty', 'Validation RSS', 'Test RSS', 'Non Zero'],
)

In [40]:
# Display the per-penalty results of the narrow sweep.
optimal

Unnamed: 0,L1 Penalty,Validation RSS,Test RSS,Non Zero
0,127.427499,435374700000000.0,108321000000000.0,10.0
1,134.597899,437009200000000.0,108673600000000.0,10.0
2,141.768298,438236100000000.0,108940100000000.0,8.0
3,148.938698,439158900000000.0,109150700000000.0,8.0
4,156.109097,440037400000000.0,109346800000000.0,7.0
5,163.279497,440777500000000.0,109506000000000.0,7.0
6,170.449896,441566700000000.0,109674800000000.0,7.0
7,177.620296,442406400000000.0,109853600000000.0,7.0
8,184.790695,443296700000000.0,110042400000000.0,7.0
9,191.961095,444239800000000.0,110241700000000.0,7.0


In [41]:
# Order candidates by sparsity first, then by validation RSS within each
# sparsity level.
optimal = optimal.sort_values(by=['Non Zero', 'Validation RSS'])

In [42]:
# Display the table after sorting by non-zero count, then validation RSS.
optimal

Unnamed: 0,L1 Penalty,Validation RSS,Test RSS,Non Zero
11,206.301894,446268900000000.0,110667900000000.0,6.0
12,213.472293,447112900000000.0,110838900000000.0,6.0
13,220.642693,447998200000000.0,111018300000000.0,6.0
14,227.813092,448924700000000.0,111206100000000.0,6.0
15,234.983492,449892500000000.0,111402300000000.0,6.0
16,242.153891,450901500000000.0,111607000000000.0,6.0
17,249.324291,451952400000000.0,111819900000000.0,6.0
18,256.49469,453043900000000.0,112041400000000.0,6.0
19,263.66509,454176700000000.0,112271300000000.0,6.0
4,156.109097,440037400000000.0,109346800000000.0,7.0


In [43]:
# Select the penalty from the narrow-sweep results instead of hard-coding a
# rounded value read off the printed table (previously 156.109097): among
# candidates with exactly `max_nonzeros` non-zero parameters, take the one
# with the lowest validation RSS.
sparse_candidates = optimal[optimal['Non Zero'] == max_nonzeros]
if sparse_candidates.empty:
    raise ValueError(
        "no penalty in `optimal` yields exactly max_nonzeros non-zero parameters"
    )
optimal_l1 = sparse_candidates.loc[
    sparse_candidates['Validation RSS'].idxmin(), 'L1 Penalty'
]

In [44]:
# Refit on the training split with the selected penalty.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the installed version still accepts it.
optimal_model = linear_model.Lasso(alpha=optimal_l1, normalize=True)
# The fitted estimator is the cell's displayed value.
optimal_model.fit(X=training[all_features], y=training['price'])

Lasso(alpha=156.109097, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [45]:
# List the feature names (used below to label the fitted coefficients).
all_features

['bedrooms',
 'bedrooms_square',
 'bathrooms',
 'sqft_living',
 'sqft_living_sqrt',
 'sqft_lot',
 'sqft_lot_sqrt',
 'floors',
 'floors_square',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated']

In [47]:
# Label each fitted coefficient of optimal_model with its feature name.
pd.Series(data=optimal_model.coef_, index=all_features)

bedrooms                -0.000000
bedrooms_square         -0.000000
bathrooms            10610.890230
sqft_living            163.380252
sqft_living_sqrt         0.000000
sqft_lot                -0.000000
sqft_lot_sqrt           -0.000000
floors                   0.000000
floors_square            0.000000
waterfront          506451.686907
view                 41960.043550
condition                0.000000
grade               116253.553672
sqft_above               0.000000
sqft_basement            0.000000
yr_built             -2612.234878
yr_renovated             0.000000
dtype: float64