In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import linear_model

In [3]:
# Explicit per-column dtypes passed to every `pd.read_csv` call in this notebook,
# so identifier-like columns (`id`, `zipcode`, `date`) are read as strings.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

# Load the sales data and order rows by living area, breaking ties by price.
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict).sort_values(['sqft_living', 'price'])

In [4]:
# Very small L2 penalty used for the first ridge fit on the full `sales` data.
l2_small_penalty = 1.5e-5

In [5]:
def polynomial_dataframe(df, feature, degree):
    """Expand one column of `df` into its powers 1..degree.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; only the column `feature` is read.
    feature : column label (e.g. str)
        Name of the column to expand. This is a label looked up in `df`,
        not a Series.
    degree : int
        Highest power to generate. Assumed to be >= 1; for smaller values
        only the first-power column is produced.

    Returns
    -------
    pandas.DataFrame
        The first column is named `feature` and holds the original values.
        Each further column is named 'power_<p>' for p = 2..degree and holds
        the values raised to that power. The index of `df` is preserved.
    """
    base = df[feature]
    powers = pd.DataFrame({feature: base})
    # range() is empty when degree < 2, so no explicit guard is required.
    for exponent in range(2, degree + 1):
        column = 'power_' + str(exponent)
        # Element-wise power; the exponent is bound as a default argument so
        # the lambda does not depend on the loop variable's later values.
        powers[column] = base.apply(lambda value, p=exponent: value ** p)
    return powers

In [6]:
# Degree-15 polynomial expansion of `sqft_living` for the full sales data.
poly15_data = polynomial_dataframe(df = sales, feature='sqft_living', degree=15)

In [8]:
# Ridge regression of price on the degree-15 features with the tiny L2 penalty.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this cell needs an older scikit-learn — confirm the
# environment's pinned version.
model = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model.fit(poly15_data, sales['price'])

Ridge(alpha=1.5e-05, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)

In [9]:
model.coef_

array([  1.24873306e+02,  -4.77376011e-02,   3.01446238e-05,
        -2.44419942e-09,  -1.94153675e-13,   8.54085686e-18,
         1.51142121e-21,   8.27979094e-26,   6.52603100e-31,
        -3.27895017e-34,  -3.87962315e-38,  -2.72437650e-42,
        -1.07790800e-46,   3.78242694e-51,   1.39790296e-54])

In [10]:
model.intercept_

220664.37505306327

In [11]:
# Four additional CSVs, parsed with the same dtypes as the main sales file
# (presumably subsets of the sales data — confirm against the data source).
set_1 = pd.read_csv('wk3_kc_house_set_1_data.csv', dtype=dtype_dict)
set_2 = pd.read_csv('wk3_kc_house_set_2_data.csv', dtype=dtype_dict)
set_3 = pd.read_csv('wk3_kc_house_set_3_data.csv', dtype=dtype_dict)
set_4 = pd.read_csv('wk3_kc_house_set_4_data.csv', dtype=dtype_dict)

In [12]:
# Degree-15 polynomial expansion of `sqft_living`, one feature frame per set.
poly15_set_1 = polynomial_dataframe(df = set_1, feature='sqft_living', degree=15)
poly15_set_2 = polynomial_dataframe(df = set_2, feature='sqft_living', degree=15)
poly15_set_3 = polynomial_dataframe(df = set_3, feature='sqft_living', degree=15)
poly15_set_4 = polynomial_dataframe(df = set_4, feature='sqft_living', degree=15)

In [13]:
# Rebinds `l2_small_penalty` (1.5e-5 earlier in the notebook) to an even
# smaller value; the per-set fits that follow use this new value.
l2_small_penalty=1e-9

In [14]:
# Ridge fit on set 1 with the near-zero penalty. The cell displays the learned
# weight on the first feature column (`sqft_living` to the first power).
model_set_1 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model_set_1.fit(poly15_set_1, set_1['price'])
model_set_1.coef_[0]

544.66939875124069

In [15]:
# Ridge fit on set 2 with the near-zero penalty. The cell displays the learned
# weight on the first feature column (`sqft_living` to the first power).
model_set_2 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model_set_2.fit(poly15_set_2, set_2['price'])
model_set_2.coef_[0]

859.36266158183673

In [16]:
# Ridge fit on set 3 with the near-zero penalty. The cell displays the learned
# weight on the first feature column (`sqft_living` to the first power).
model_set_3 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model_set_3.fit(poly15_set_3, set_3['price'])
model_set_3.coef_[0]

-755.39596179689272

In [17]:
# Ridge fit on set 4 with the near-zero penalty. The cell displays the learned
# weight on the first feature column (`sqft_living` to the first power).
model_set_4 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model_set_4.fit(poly15_set_4, set_4['price'])
model_set_4.coef_[0]

1119.4456937348741

In [18]:
# Large L2 penalty, used to refit the same four sets for comparison with the
# near-zero penalty fits.
l2_large_penalty=1.23e2


In [19]:
# Refit set 1 with the large penalty and display the first-power weight.
# This rebinds `model_set_1`, so the small-penalty model fitted earlier is no
# longer reachable under that name.
model_set_1 = linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
model_set_1.fit(poly15_set_1, set_1['price'])
model_set_1.coef_[0]

2.3280680295793248

In [20]:
# Refit set 2 with the large penalty and display the first-power weight.
# This rebinds `model_set_2`, so the small-penalty model fitted earlier is no
# longer reachable under that name.
model_set_2 = linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
model_set_2.fit(poly15_set_2, set_2['price'])
model_set_2.coef_[0]

2.097569027778555

In [21]:
# Refit set 3 with the large penalty and display the first-power weight.
# This rebinds `model_set_3`, so the small-penalty model fitted earlier is no
# longer reachable under that name.
model_set_3 = linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
model_set_3.fit(poly15_set_3, set_3['price'])
model_set_3.coef_[0]

2.2890625811892047

In [22]:
# Refit set 4 with the large penalty and display the first-power weight.
# This rebinds `model_set_4`, so the small-penalty model fitted earlier is no
# longer reachable under that name.
model_set_4 = linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
model_set_4.fit(poly15_set_4, set_4['price'])
model_set_4.coef_[0]

2.0859619409193071

In [60]:
import math
class ridge_cross_validation:
    """k-fold cross-validation of a ridge regression for a single L2 penalty.

    The rows are split, in their given order, into contiguous validation
    folds. For each fold a ridge model is fitted on the remaining rows and the
    residual sum of squares (RSS) on the held-out fold is recorded.

    Parameters
    ----------
    train_x : pandas.DataFrame
        Feature matrix; sliced positionally with `.iloc`.
    train_y : array-like
        Targets aligned row-for-row with `train_x`; stored as a NumPy array.
    penalty : float
        L2 penalty passed to the ridge estimator as `alpha`.
    k : int, default 10
        Requested number of folds.
    """

    def __init__(self, train_x, train_y, penalty, k=10):
        self.train_x = train_x
        self.train_y = np.array(train_y)
        self.k = k
        self.penalty = penalty

    def _fold_bounds(self):
        """Return the `(start, end)` row ranges of the validation folds.

        Every fold has `ceil(n / k)` rows except possibly the last, which is
        truncated at the end of the data. Only non-empty folds are returned,
        so the list may be shorter than `k` when `k` does not fit the data.

        Raises
        ------
        ValueError
            If `k < 2` or fewer than two rows are available, since no
            train/validation split is possible then.
        """
        n_rows = len(self.train_y)
        if self.k < 2 or n_rows < 2:
            raise ValueError(
                "cross-validation needs k >= 2 and at least 2 rows "
                "(got k=%r, rows=%d)" % (self.k, n_rows)
            )
        # Ceiling division in integer arithmetic. The fold size follows the
        # requested number of folds `self.k`, not a fixed constant.
        fold_size = -(-n_rows // self.k)
        return [
            (start, min(start + fold_size, n_rows))
            for start in range(0, n_rows, fold_size)
        ]

    def fit(self):
        """Run the cross-validation and return the average validation RSS.

        Returns
        -------
        float
            Sum of the per-fold validation RSS values divided by the number
            of folds evaluated (equal to `k` whenever all `k` folds are
            non-empty).
        """
        # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and
        # removed in 1.2, so this requires an older scikit-learn release.
        model = linear_model.Ridge(alpha=self.penalty, normalize=True)
        bounds = self._fold_bounds()
        total_rss = 0.0
        for start, end in bounds:
            # Training rows are everything outside the validation slice.
            train_df = pd.concat(
                [self.train_x.iloc[:start], self.train_x.iloc[end:]], axis=0
            )
            train_label = np.concatenate(
                (self.train_y[:start], self.train_y[end:])
            )
            model.fit(train_df, train_label)
            y_hat = model.predict(self.train_x.iloc[start:end])
            total_rss += self.RSS_calc(y_hat, self.train_y[start:end])
        return total_rss / len(bounds)

    def RSS_calc(self, y_hat, y):
        """Return the residual sum of squares between `y_hat` and `y`.

        Parameters
        ----------
        y_hat, y : array-like
            Predicted and observed values; they must have the same shape.

        Raises
        ------
        ValueError
            If the two inputs have different shapes.
        """
        predicted = np.asarray(y_hat, dtype=float)
        actual = np.asarray(y, dtype=float)
        if predicted.shape != actual.shape:
            raise ValueError(
                "y_hat and y must have the same shape (got %r and %r)"
                % (predicted.shape, actual.shape)
            )
        return float(np.sum(np.square(predicted - actual)))
        
        

In [61]:
# Data used for the cross-validation below (pre-shuffled, judging by the file
# name — confirm), parsed with the shared dtypes.
shuffle_data= pd.read_csv('wk3_kc_house_train_valid_shuffled.csv', dtype=dtype_dict)

In [62]:
# 13 candidate L2 penalties, log-spaced from 1e3 to 1e9 (half-decade steps).
l2_penalty = np.logspace(3, 9, num=13)

In [63]:
# Degree-15 polynomial expansion of `sqft_living` for the cross-validation data.
shuffle_poly_15 = polynomial_dataframe(df = shuffle_data, feature='sqft_living', degree=15)

1.00000000e+03


In [64]:
# Target prices as a plain NumPy array, aligned row-for-row with `shuffle_poly_15`.
train_y = np.array(shuffle_data['price'])

In [65]:
# Cross-validate every candidate penalty and collect one record per penalty.
# Records are gathered in a list and turned into a frame once, rather than
# growing the frame row by row. The helper object gets its own name so the
# notebook-level `model` is not rebound to a cross-validation wrapper.
cv_records = []
for penalty in l2_penalty:
    cv_model = ridge_cross_validation(train_x=shuffle_poly_15, train_y=train_y, penalty=penalty, k=10)
    cv_records.append({'Penalty': penalty, 'Average RSS': cv_model.fit()})

# Best (lowest average RSS) penalty first; the original row labels are kept.
summary = pd.DataFrame(cv_records, columns=['Penalty', 'Average RSS']).sort_values('Average RSS')

In [67]:
summary

Unnamed: 0,Penalty,Average RSS
0,1000.0,264977100000000.0
1,3162.278,265692700000000.0
2,10000.0,265924100000000.0
3,31622.78,265997800000000.0
4,100000.0,266021200000000.0
5,316227.8,266028600000000.0
6,1000000.0,266030900000000.0
7,3162278.0,266031700000000.0
8,10000000.0,266031900000000.0
9,31622780.0,266032000000000.0


In [70]:
# Refit on the complete cross-validation data using the penalty with the lowest
# cross-validated average RSS. The value is read from `summary` instead of
# being copied by hand from the printed table, so it cannot drift out of sync
# with the cross-validation results.
shuffle_data = pd.read_csv('wk3_kc_house_train_valid_shuffled.csv', dtype=dtype_dict)
shuffle_poly_15 = polynomial_dataframe(df=shuffle_data, feature='sqft_living', degree=15)
best_l2_penalty = float(summary.sort_values('Average RSS')['Penalty'].iloc[0])
# NOTE(review): `normalize` was removed in scikit-learn 1.2; this requires an
# older scikit-learn release.
model = linear_model.Ridge(alpha=best_l2_penalty, normalize=True)
model.fit(shuffle_poly_15, shuffle_data['price'])

Ridge(alpha=1000, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)

In [71]:
# Test data, expanded with the same degree-15 `sqft_living` features that the
# final model was fitted on.
test_data = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)
test_poly_15 = polynomial_dataframe(df = test_data, feature='sqft_living', degree=15)

In [72]:
# Predicted prices for the test rows from the current `model`.
test_y_hat = model.predict(test_poly_15)

In [75]:
# Observed test prices as a NumPy array, in the same row order as `test_y_hat`.
test_y = np.array(test_data['price'])

In [76]:
def RSS_calc(y_hat, y):
    """Return the residual sum of squares between predictions and targets.

    Parameters
    ----------
    y_hat : array-like
        Predicted values.
    y : array-like
        Observed values; must have the same shape as `y_hat`.

    Returns
    -------
    float
        Sum over all elements of `(y_hat - y) ** 2`; 0.0 for empty inputs.

    Raises
    ------
    ValueError
        If the two inputs have different shapes, so a length mismatch is
        reported instead of being silently truncated.
    """
    predicted = np.asarray(y_hat, dtype=float)
    actual = np.asarray(y, dtype=float)
    if predicted.shape != actual.shape:
        raise ValueError(
            "y_hat and y must have the same shape (got %r and %r)"
            % (predicted.shape, actual.shape)
        )
    # Vectorised over the whole array instead of a Python-level loop.
    return float(np.sum(np.square(predicted - actual)))

In [77]:
# Residual sum of squares of the final model's predictions on the test data.
test_rss = RSS_calc(test_y_hat, test_y)

In [78]:
test_rss

283856861224150.75