# Predicting House Prices Using LASSO (Coordinate Descent)

In [1]:
import graphlab

## The dataset contains house sales from King County, the region where the city of Seattle, WA is located.

In [None]:
# Load the house-sales data from the local 'kc_house_data.gl/' directory into an SFrame.
sales = graphlab.SFrame('kc_house_data.gl/')
# Replace the 'floors' column with its integer-cast version.
sales['floors'] = sales['floors'].astype(int) 

In [26]:
# Bare expression: lets the notebook render `sales` so its columns and values can be inspected.
sales

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
7129300520,2014-10-13 00:00:00+00:00,221900.0,3.0,1.0,1180.0,5650,1,0
6414100192,2014-12-09 00:00:00+00:00,538000.0,3.0,2.25,2570.0,7242,2,0
5631500400,2015-02-25 00:00:00+00:00,180000.0,2.0,1.0,770.0,10000,1,0
2487200875,2014-12-09 00:00:00+00:00,604000.0,4.0,3.0,1960.0,5000,1,0
1954400510,2015-02-18 00:00:00+00:00,510000.0,3.0,2.0,1680.0,8080,1,0
7237550310,2014-05-12 00:00:00+00:00,1225000.0,4.0,4.5,5420.0,101930,1,0
1321400060,2014-06-27 00:00:00+00:00,257500.0,3.0,2.25,1715.0,6819,2,0
2008000270,2015-01-15 00:00:00+00:00,291850.0,3.0,1.5,1060.0,9711,1,0
2414600126,2015-04-15 00:00:00+00:00,229500.0,3.0,1.0,1780.0,7470,1,0
3793500160,2015-03-12 00:00:00+00:00,323000.0,3.0,2.5,1890.0,6560,2,0

view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat
0,3,7,1180,0,1955,0,98178,47.51123398
0,3,7,2170,400,1951,1991,98125,47.72102274
0,3,6,770,0,1933,0,98028,47.73792661
0,5,7,1050,910,1965,0,98136,47.52082
0,3,8,1680,0,1987,0,98074,47.61681228
0,3,11,3890,1530,2001,0,98053,47.65611835
0,3,7,1715,0,1995,0,98003,47.30972002
0,3,7,1060,0,1963,0,98198,47.40949984
0,3,7,1050,730,1960,0,98146,47.51229381
0,3,7,1890,0,2003,0,98038,47.36840673

long,sqft_living15,sqft_lot15
-122.25677536,1340.0,5650.0
-122.3188624,1690.0,7639.0
-122.23319601,2720.0,8062.0
-122.39318505,1360.0,5000.0
-122.04490059,1800.0,7503.0
-122.00528655,4760.0,101930.0
-122.32704857,2238.0,6819.0
-122.31457273,1650.0,9711.0
-122.33659507,1780.0,8113.0
-122.0308176,2390.0,7570.0


In [3]:
import numpy as np 

In [4]:
def get_numpy_data(data_sframe, features, output):
    """Convert selected columns of a frame into NumPy arrays for regression.

    A column named 'constant' holding the value 1 is written into
    `data_sframe` (the caller's frame is modified) and placed first in the
    feature matrix so that the first weight acts as the intercept.

    Parameters
    ----------
    data_sframe : frame-like object supporting column assignment, column
        selection by name / list of names, and `.to_numpy()`.
    features : list of str
        Names of the feature columns, in the order they should appear.
    output : str
        Name of the target column.

    Returns
    -------
    (feature_matrix, output_array) : tuple
        The matrix of ['constant'] + features columns and the target values.
    """
    # Intercept column goes first.
    data_sframe['constant'] = 1
    selected_columns = ['constant'] + features

    feature_matrix = data_sframe[selected_columns].to_numpy()
    output_array = data_sframe[output].to_numpy()
    return feature_matrix, output_array

In [5]:
def predict_outcome(feature_matrix, weights):
    """Return the linear-model predictions `feature_matrix . weights`.

    Parameters
    ----------
    feature_matrix : array-like
        Feature values, one row per observation.
    weights : array-like
        One weight per column of `feature_matrix`.

    Returns
    -------
    The dot product of `feature_matrix` and `weights`.
    """
    return np.dot(feature_matrix, weights)

In [6]:
def normalize_features(feature_matrix):
    """Scale every column of `feature_matrix` to unit 2-norm.

    Parameters
    ----------
    feature_matrix : array-like
        Feature values, one column per feature.

    Returns
    -------
    (normalized_features, norms) : tuple
        `normalized_features` is `feature_matrix` divided column-wise by
        `norms`. `norms` holds the 2-norm of each column, except that a
        column whose norm is 0 (all zeros) reports a divisor of 1 so the
        column stays all-zero instead of turning into NaN. Weights learned
        on the normalized features can be mapped back with `weights / norms`.
    """
    feature_matrix = np.asarray(feature_matrix, dtype=float)
    norms = np.linalg.norm(feature_matrix, axis=0)
    # Guard against division by zero for all-zero columns.
    norms = np.where(norms == 0, 1.0, norms)
    normalized_features = feature_matrix / norms
    return (normalized_features, norms)

## Coordinate Descent

In [8]:
def calculating_ro(simple_feature_matrix_coloumn,weights,output,predictions):
    """Compute rho for one coordinate of the LASSO coordinate-descent update.

    rho = SUM[ feature * (output - predictions + weight * feature) ]

    Parameters
    ----------
    simple_feature_matrix_coloumn : numpy array
        Values of the single feature (one matrix column) being updated.
    weights : scalar
        Current weight of that feature (the caller passes `weights[i]`).
    output : numpy array
        Observed target values.
    predictions : numpy array
        Predictions made with the current full weight vector.

    Returns
    -------
    The scalar rho for this feature.
    """
    feature = simple_feature_matrix_coloumn
    # Residual with this feature's own contribution added back in.
    residual_without_feature = output - predictions + (weights * feature)
    return (feature * residual_without_feature).sum()

In [9]:
# Number of rows in the full `sales` frame.
len(sales)

21613

##Single Coordinate Descent Step:-

In [10]:
def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    """Return the updated weight for coordinate `i` in LASSO coordinate descent.

    Parameters
    ----------
    i : int
        Index of the coordinate (column of `feature_matrix`) to update.
        Index 0 is treated as the intercept and is not regularized.
    feature_matrix : 2-D numpy array
        Feature values, one row per observation.
        NOTE(review): the update uses rho directly with no division by the
        column's squared norm, so it presumably expects unit-norm columns
        (as produced by `normalize_features`) -- confirm with callers.
    output : numpy array
        Observed target values, one per row of `feature_matrix`.
    weights : array-like
        Current weight vector; it is only read here.
    l1_penalty : float
        L1 regularization strength.

    Returns
    -------
    The new value for `weights[i]` (soft-thresholded by `l1_penalty / 2`
    for every coordinate except the intercept).
    """
    # compute prediction
    prediction = predict_outcome(feature_matrix, weights)
    # compute ro[i] = SUM[ [feature_i]*(output - prediction + weight[i]*[feature_i]) ]
    # Use every row of column i; the previous hard-coded `0:21613` slice silently
    # truncated any matrix with more rows than the original dataset.
    ro_i = calculating_ro(feature_matrix[:, i], weights[i], output, prediction)

    if i == 0:  # intercept -- do not regularize
        new_weight_i = ro_i
    elif ro_i < -l1_penalty/2.:
        new_weight_i = ro_i + (l1_penalty/2.)
    elif ro_i > l1_penalty/2.:
        new_weight_i = ro_i - (l1_penalty/2.)
    else:
        new_weight_i = 0.

    return new_weight_i

To test the function, run the following cell:

##Cyclical Coordinate Descent:- 

In [12]:
import sys
def lasso_cyclical_coordinate_descent(feature_matrix, output, weights, l1_penalty, tolerance):
    """Fit LASSO weights by cycling coordinate-descent updates until convergence.

    Each sweep updates every coordinate in index order with
    `lasso_coordinate_descent_step`. Sweeps repeat until the largest absolute
    change of any single weight within a sweep is below `tolerance`.

    Parameters
    ----------
    feature_matrix : 2-D numpy array
        Feature values, one row per observation.
    output : numpy array
        Observed target values.
    weights : array-like
        Initial weights, one per column of `feature_matrix`. The caller's
        object is not modified.
    l1_penalty : float
        L1 regularization strength.
    tolerance : float
        Convergence threshold on the largest per-sweep weight change.

    Returns
    -------
    numpy array of the fitted weights.
    """
    # Work on a float copy: the caller's initial weights stay intact, and an
    # integer-typed input array can no longer truncate the updated values.
    weights = np.array(weights, dtype=float)

    converged = False
    while not converged:
        max_change = 0.0
        for i in range(len(weights)):
            old_weight_i = weights[i]
            weights[i] = lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty)
            max_change = max(max_change, abs(weights[i] - old_weight_i))
        converged = max_change < tolerance
    return weights

In [13]:
# Split `sales` into training and test sets; the fixed seed keeps the split repeatable.
train_data, test_data = sales.random_split(0.8, seed=0)

##Considering the following set of features:-

In [14]:
# Feature columns used as predictors, in the order they appear in the weight vector
# (after the leading intercept weight).
all_features = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
]

In [15]:
# Training feature matrix (intercept column + all_features) and the 'price' target.
simple_feature_matrix, output = get_numpy_data(train_data, all_features, 'price')
# Normalize the feature columns; the norms are kept for rescaling the weights later.
normalized_simple_feature_matrix, simple_norms = normalize_features(simple_feature_matrix)

##Learning the weights with `l1_penalty=1e7`, on the training data. Initializing weights to all zeros, and setting the `tolerance=1`:-

In [16]:
# One initial weight per column of the training matrix (intercept + features),
# derived from the matrix instead of a hard-coded 14.
initialweights = np.zeros(normalized_simple_feature_matrix.shape[1])
l1_penalty = 1e7
tolerance = 1.0
weights1e7 = lasso_cyclical_coordinate_descent(normalized_simple_feature_matrix, output,
                                               initialweights, l1_penalty, tolerance)
print(weights1e7)

[ 24429600.60933314         0.                 0.          48389174.35227978
         0.                 0.           3317511.16271982   7329961.9848964
         0.                 0.                 0.                 0.
         0.                 0.        ]


##Now, learning the weights with `l1_penalty=1e8`, on the training data. Initializing weights to all zeros, and setting the `tolerance=1`:-

In [18]:
# One initial weight per column of the training matrix (intercept + features),
# derived from the matrix instead of a hard-coded 14.
initialweights = np.zeros(normalized_simple_feature_matrix.shape[1])
l1_penalty = 1e8
tolerance = 1.0
weights1e8 = lasso_cyclical_coordinate_descent(normalized_simple_feature_matrix, output,
                                               initialweights, l1_penalty, tolerance)
print(weights1e8)

[ 71114625.75280938         0.                 0.                 0.
         0.                 0.                 0.                 0.
         0.                 0.                 0.                 0.
         0.                 0.        ]


## Now, learning the weights with `l1_penalty=1e4`, on the training data. Initializing weights to all zeros, and setting the `tolerance=5e5`:-

In [19]:
# One initial weight per column of the training matrix (intercept + features),
# derived from the matrix instead of a hard-coded 14.
initialweights = np.zeros(normalized_simple_feature_matrix.shape[1])
l1_penalty = 1e4
tolerance = 5e5
weights1e4 = lasso_cyclical_coordinate_descent(normalized_simple_feature_matrix, output,
                                               initialweights, l1_penalty, tolerance)
print(weights1e4)

[ 77779073.91265215 -22884012.25023361  15348487.08089997
  92166869.69883084  -2139328.0824278   -8818455.54409496
   6494209.73310655   7065162.05053197   4119079.21006765
  18436483.52618778 -14566678.54514349  -5528348.75179429
 -83591746.20730527   2784276.46012858]


## Rescaling each set of learned weights (`weights1e4`, `weights1e7`, `weights1e8`) by the feature norms, so that they can be applied to un-normalized features:-

In [20]:
# Rebuild the training matrix and its column norms.
simple_feature_matrix, output = get_numpy_data(train_data, all_features, 'price')
normalized_simple_feature_matrix, simple_norms = normalize_features(simple_feature_matrix)

# The weights were learned on columns divided by `simple_norms`. Dividing the
# weights by the same norms gives equivalent weights for un-normalized features:
# (X / norms) . w == X . (w / norms). These names are required by the RSS cells
# below and were previously never defined.
normalized_weights1e4 = weights1e4 / simple_norms
normalized_weights1e7 = weights1e7 / simple_norms
normalized_weights1e8 = weights1e8 / simple_norms
print(weights1e7)

[ 24429600.60933314         0.                 0.          48389174.35227978
         0.                 0.           3317511.16271982   7329961.9848964
         0.                 0.                 0.                 0.
         0.                 0.        ]


In [22]:
# Un-normalized test feature matrix (intercept column + all_features) and test prices.
test_feature_matrix, test_output = get_numpy_data(test_data, all_features, 'price')

## Computing the RSS of each of the three sets of normalized weights on the `test_feature_matrix`:-

In [23]:
# Residual sum of squares on the test set for the l1_penalty=1e7 weights.
# The result is stored as `rss1e7` rather than `sum`, which shadowed the builtin
# for the rest of the session.
pred = predict_outcome(test_feature_matrix, normalized_weights1e7)
rss1e7 = np.sum((pred - test_output) ** 2)
print(rss1e7)

2.75962079909e+14


In [24]:
# Residual sum of squares on the test set for the l1_penalty=1e8 weights,
# computed with a vectorized NumPy reduction instead of a per-element loop.
pred1 = predict_outcome(test_feature_matrix, normalized_weights1e8)
sum1 = np.sum((pred1 - test_output) ** 2)
print(sum1)

5.37166150034e+14


In [25]:
# Residual sum of squares on the test set for the l1_penalty=1e4 weights,
# computed with a vectorized NumPy reduction instead of a per-element loop.
pred2 = predict_outcome(test_feature_matrix, normalized_weights1e4)
sum2 = np.sum((pred2 - test_output) ** 2)
print(sum2)

2.2778100476e+14
