In [1]:
import graphlab
import numpy as np
import pandas as pd

In [2]:
def get_numpy_data(data_dframe, features, output):
    data_dframe['constant'] = 1 # add a constant column to an dataframe
    # prepend variable 'constant' to the features list
    features = ['constant'] + features
    # select the columns of data_dataframe given by the ‘features’ list into the dataFrame ‘features_df’
    features_df = pd.DataFrame(data_dframe, columns=features)
    # this will convert the features_df into a numpy matrix with GraphLab Create >= 1.7!!
    features_matrix = features_df.as_matrix()
    # assign the column of data_dframe associated with the target to the variable ‘output_sarray’
    output_series = pd.Series(data_dframe['price'])
    # this will convert the Series into a numpy array:
    output_array = output_series.as_matrix()
    return(features_matrix, output_array)

In [3]:
def predict_outcome(feature_matrix, weights):
    predictions = np.dot(feature_matrix,weights)
    return(predictions)

In [45]:
def normalize_features(features):
    norms = np.linalg.norm(features, axis=0)
    normalized_features = features / norms
    return (normalized_features, norms)

In [46]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}
house_data_train = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
house_data_test = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [47]:
simple_features = ['sqft_living', 'bedrooms']
my_output = 'price'
(simple_feature_matrix, output) = get_numpy_data(sales, simple_features, my_output)

In [48]:
weights = np.array([1., 4., 1.])
predictions = predict_outcome(simple_feature_matrix, weights)

In [49]:
simple_feature_matrix, norms = normalize_features(simple_feature_matrix)

In [54]:
1.73e8/2
#79704172
#70000000

86500000.0

In [51]:
ro = []
for i in range(0, simple_feature_matrix.shape[1]):
    ro.append(np.sum(simple_feature_matrix[:, i]*( sales['price'] - predictions + ( weights[i]*simple_feature_matrix[:, i] ))))

In [52]:
ro

[78176568.83295034, 86601827.95929945, 79704172.60800135]

In [12]:
#print sales['price']
print predictions
print weights[1] * simple_feature_matrix[:, 1] 

[  4724.  10284.   3083. ...,   4083.   6404.   4083.]
[  4720.  10280.   3080. ...,   4080.   6400.   4080.]


In [13]:
def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    # compute prediction
    prediction = predict_outcome(feature_matrix, weights)
    # compute ro[i] = SUM[ [feature_i]*(output - prediction + weight[i]*[feature_i]) ]
    ro_i = np.sum( feature_matrix[:, i]*(output - prediction + ( weights[i]*feature_matrix[:, i])))
    if i == 0: # intercept -- do not regularize
        new_weight_i = ro_i 
    elif ro_i < -l1_penalty/2.:
        new_weight_i = ro_i + l1_penalty/2.
    elif ro_i > l1_penalty/2.:
        new_weight_i = ro_i - l1_penalty/2.
    else:
        new_weight_i = 0.
    
    return new_weight_i

In [14]:
# should print 0.425558846691
import math
print lasso_coordinate_descent_step(1, np.array([[3./math.sqrt(13),1./math.sqrt(10)],
                   [2./math.sqrt(13),3./math.sqrt(10)]]), np.array([-1., 1.]), np.array([1., 4.]), 0.1)

-0.106896685343


In [15]:
print np.array([[3./math.sqrt(13),1./math.sqrt(10)],
                   [2./math.sqrt(13),3./math.sqrt(10)]])

print np.array([1., 1.])

print weights

[[ 0.83205029  0.31622777]
 [ 0.5547002   0.9486833 ]]
[ 1.  1.]
[ 1.  4.  1.]


In [16]:
def lasso_cyclical_coordinate_descent(feature_matrix, output, weights, l1_penalty, tolerance):
    converged = False
    while not converged:
        check_list = []
        for i in range(len(weights)):
            old_weights_i = weights[i] # remember old value of weight[i], as it will be overwritten
            # the following line uses new values for weight[0], weight[1], ..., weight[i-1]
            #     and old values for weight[i], ..., weight[d-1]
            weights[i] = lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty)
            # use old_weights_i to compute change in coordinate
            if abs(weights[i] - old_weights_i) > tolerance:
                check_list.append(True)
            else:
                check_list.append(False)
            #print i
            #print weights[i]
        if not pd.Series(check_list).any():
            converged = True
    return weights

In [17]:
simple_features = ['sqft_living', 'bedrooms']
my_output = 'price'
initial_weights = np.zeros(3)
l1_penalty = 1e7
tolerance = 1.0

In [18]:
(simple_feature_matrix, output) = get_numpy_data(sales, simple_features, my_output)
(normalized_simple_feature_matrix, simple_norms) = normalize_features(simple_feature_matrix)

In [19]:
weights = lasso_cyclical_coordinate_descent(normalized_simple_feature_matrix, output,
                                            initial_weights, l1_penalty, tolerance)

In [20]:
weights

array([ 21624997.95951909,  63157247.20788956,         0.        ])

In [21]:
RSS = np.sum((predict_outcome(normalized_simple_feature_matrix, weights) - sales['price'])**2)

In [63]:
print RSS

1.63049247672e+15


In [23]:
house_data_train['floors'] = house_data_train['floors'].astype(float) 
house_data_test['floors'] = house_data_test['floors'].astype(float) 
all_features = ['bedrooms',
                'bathrooms',
                'sqft_living',
                'sqft_lot',
                'floors',
                'waterfront', 
                'view', 
                'condition', 
                'grade',
                'sqft_above',
                'sqft_basement',
                'yr_built', 
                'yr_renovated']

In [24]:
(feature_matrix, output_new) = get_numpy_data(house_data_train, all_features, 'price')
(normalized_feature_matrix, norms) = normalize_features(feature_matrix)

In [25]:
initial_weights_14 = np.zeros(14)
l1_penalty = 1e7
tolerance = 1.0
weights1e7 = lasso_cyclical_coordinate_descent(normalized_feature_matrix, output_new,initial_weights_14, l1_penalty, tolerance)

In [26]:
weights1e7

array([ 24429600.23440312,         0.        ,         0.        ,
        48389174.77154896,         0.        ,         0.        ,
         3317511.21492165,   7329961.81171425,         0.        ,
               0.        ,         0.        ,         0.        ,
               0.        ,         0.        ])

In [27]:
pd.Series(all_features)[weights1e7[1:] > 0]

2    sqft_living
5     waterfront
6           view
dtype: object

In [28]:
l1_penalty=1e8
initial_weights_14 = np.zeros(14)
tolerance = 1.0
weights1e8 = lasso_cyclical_coordinate_descent(normalized_feature_matrix, output_new,initial_weights_14, l1_penalty, tolerance)

In [29]:
weights1e8

array([ 71114625.71488702,         0.        ,         0.        ,
               0.        ,         0.        ,         0.        ,
               0.        ,         0.        ,         0.        ,
               0.        ,         0.        ,         0.        ,
               0.        ,         0.        ])

In [30]:
l1_penalty=1e4
initial_weights_14 = np.zeros(14)
tolerance = 5e5
weights1e4 = lasso_cyclical_coordinate_descent(normalized_feature_matrix, output_new,initial_weights_14, l1_penalty, tolerance)

In [31]:
weights1e4

array([ 78564738.34156762, -22097398.92430532,  12791071.87278517,
        93808088.09281193,  -2013172.75704954,  -4219184.93265014,
         6482842.81753506,   7127408.53480689,   5001664.8546964 ,
        14327518.43714051, -15770959.15237397,  -5159591.22213147,
       -84495341.7684364 ,   2824439.49703683])

In [32]:
norms

array([  1.31848398e+02,   4.60040216e+02,   2.96850552e+02,
         2.99962419e+05,   5.81709718e+06,   2.09458827e+02,
         1.15325626e+01,   1.05933942e+02,   4.57793622e+02,
         1.02101959e+03,   2.59726472e+05,   7.01224951e+04,
         2.59922094e+05,   5.36953839e+04])

In [33]:
normalized_weights1e4 = weights1e4 / norms
normalized_weights1e7 = weights1e7 / norms
normalized_weights1e8 = weights1e8 / norms

In [34]:
print normalized_weights1e4

[  5.95871771e+05  -4.80336244e+04   4.30892643e+04   3.12732803e+02
  -3.46078585e-01  -2.01432664e+04   5.62133764e+05   6.72816325e+04
   1.09255888e+04   1.40325598e+04  -6.07214159e+01  -7.35796867e+01
  -3.25079490e+02   5.26011603e+01]


In [35]:
print normalized_weights1e7

[  1.85285530e+05   0.00000000e+00   0.00000000e+00   1.61317458e+02
   0.00000000e+00   0.00000000e+00   2.87664705e+05   6.91937041e+04
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00]


In [36]:
print normalized_weights1e8

[ 539366.62793373       0.               0.               0.               0.
       0.               0.               0.               0.               0.
       0.               0.               0.               0.        ]


In [37]:
(test_feature_matrix, test_output) = get_numpy_data(house_data_test, all_features, 'price')

In [38]:
output_1e4 = predict_outcome(test_feature_matrix, normalized_weights1e4)
output_1e7 = predict_outcome(test_feature_matrix, normalized_weights1e7)
output_1e8 = predict_outcome(test_feature_matrix, normalized_weights1e8)

In [39]:
RSS_1e4 = np.sum((test_output - output_1e4)**2)
RSS_1e7 = np.sum((test_output - output_1e7)**2)
RSS_1e8 = np.sum((test_output - output_1e8)**2)

In [40]:
print("RSS of 1e4 is %d" % RSS_1e4)
print("RSS of 1e7 is %d" % RSS_1e7)
print("RSS of 1e8 is %d" % RSS_1e8)

RSS of 1e4 is 228459958971393
RSS of 1e7 is 275962075920366
RSS of 1e8 is 537166151497322


In [43]:
4.0e10/1

40000000000.0

In [None]:
40974255627
40000000000