In [1]:
import pandas as pd
import numpy as np

In [2]:
# Column name -> dtype mapping handed to pd.read_csv so every column is parsed
# with an explicit type instead of relying on pandas' inference.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [3]:
# Read the train/test splits with the explicit column dtypes, then report
# how many rows each split contains.
train_data = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
test_data = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)
print('Number of Training examples: {}'.format(train_data.shape[0]))
print('Number of Testing examples: {}'.format(test_data.shape[0]))


Number of Training examples: 17384
Number of Testing examples: 4229


- bedrooms_squared = bedrooms × bedrooms
- bed_bath_rooms = bedrooms × bathrooms
- log_sqft_living = log(sqft_living)
- lat_plus_long = lat + long


In [4]:
# Engineered features for the training split.
# Computed with vectorised column arithmetic rather than per-row `apply`
# lambdas: same values, without a Python-level call for every row.
train_data['bedrooms_squared'] = train_data['bedrooms'] * train_data['bedrooms']
train_data['bed_bath_rooms'] = train_data['bedrooms'] * train_data['bathrooms']
train_data['log_sqft_living'] = np.log(train_data['sqft_living'])
train_data['lat_plus_long'] = train_data['lat'] + train_data['long']

In [5]:
# Engineered features for the test split.
# Computed with vectorised column arithmetic rather than per-row `apply`
# lambdas: same values, without a Python-level call for every row.
test_data['bedrooms_squared'] = test_data['bedrooms'] * test_data['bedrooms']
test_data['bed_bath_rooms'] = test_data['bedrooms'] * test_data['bathrooms']
test_data['log_sqft_living'] = np.log(test_data['sqft_living'])
test_data['lat_plus_long'] = test_data['lat'] + test_data['long']

In [6]:
# Mean of the bedrooms_squared feature over the test split.
test_data['bedrooms_squared'].mean()

12.446677701584299

In [7]:
# Mean of the bed_bath_rooms feature over the test split.
test_data['bed_bath_rooms'].mean()

7.5039016315913925

In [8]:
# Mean of the log_sqft_living feature over the test split.
test_data['log_sqft_living'].mean()

7.5502746796459377

In [9]:
# Mean of the lat_plus_long feature over the test split.
test_data['lat_plus_long'].mean()

-74.653333554031676

In [10]:
def data_prep(df, cols):
    """Build a design matrix with a leading intercept column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns.
    cols : list of str
        Names of the columns to use, in the order they should appear.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), len(cols) + 1)``: column 0 is all ones
        (the intercept term), followed by the selected columns in order.
    """
    # `DataFrame.as_matrix()` was removed in pandas 1.0; `to_numpy()` is the
    # supported way to get the underlying ndarray.
    features = df[cols].to_numpy()
    intercept = np.ones(shape=(len(features), 1), dtype=np.float32)
    return np.concatenate((intercept, features), axis=1)


In [11]:
# Regression targets: the sale price for each split.
y_train = np.array(train_data['price'])
y_test = np.array(test_data['price'])

# Each model's feature list is declared once and reused for both splits, so
# the train and test design matrices cannot drift apart column-wise.
model_1_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
model_2_features = model_1_features + ['bed_bath_rooms']
model_3_features = model_2_features + ['bedrooms_squared', 'log_sqft_living', 'lat_plus_long']

x_train_model_1 = data_prep(train_data, model_1_features)
x_train_model_2 = data_prep(train_data, model_2_features)
x_train_model_3 = data_prep(train_data, model_3_features)

x_test_model_1 = data_prep(test_data, model_1_features)
x_test_model_2 = data_prep(test_data, model_2_features)
x_test_model_3 = data_prep(test_data, model_3_features)




In [20]:
class LinearRegression:
    """Least-squares linear regression solved through the normal equations.

    The design matrices are used as given; no intercept column is added here,
    so callers must include one themselves if they want an intercept.

    Attributes set by the methods:
        w            -- fitted weight vector (after ``fit``)
        train_y_hat  -- predictions on the training matrix
        test_y_hat   -- predictions on the test matrix
        train_rss_   -- last training residual sum of squares
        test_rss_    -- last test residual sum of squares
    """

    def __init__(self, train_x, train_y, test_x, test_y):
        """Store the training/test design matrices and target vectors."""
        self.train_x = train_x
        self.train_y = train_y
        self.train_N = len(train_y)
        self.test_x = test_x
        self.test_y = test_y

    def fit(self):
        """Solve (X^T X) w = X^T y on the training data and return ``w``."""
        gram = np.dot(self.train_x.T, self.train_x)
        moment = np.dot(self.train_x.T, self.train_y)
        # np.linalg.solve needs X^T X to be non-singular; perfectly collinear
        # feature columns make it singular or badly conditioned.
        self.w = np.linalg.solve(gram, moment)
        return self.w

    def train_predict(self):
        """Return (and cache) predictions for the training matrix."""
        self.train_y_hat = np.dot(self.train_x, self.w)
        return self.train_y_hat

    def train_RSS(self):
        """Return the residual sum of squares on the training data.

        The value is cached in ``train_rss_``. It is deliberately NOT stored
        under the name ``train_RSS``: doing so would replace this method with
        a float on the instance and make any second call raise ``TypeError``.
        """
        self.train_predict()
        residuals = np.asarray(self.train_y) - self.train_y_hat
        self.train_rss_ = np.sum(np.square(residuals))
        return self.train_rss_

    def test_RSS(self):
        """Return the residual sum of squares on the test data.

        The value is cached in ``test_rss_`` (not ``test_RSS``) so the method
        stays callable after its first use.
        """
        self.test_y_hat = np.dot(self.test_x, self.w)
        residuals = np.asarray(self.test_y) - self.test_y_hat
        self.test_rss_ = np.sum(np.square(residuals))
        return self.test_rss_

In [21]:
# One regression object per feature set; all three share the same targets.
model_1 = LinearRegression(x_train_model_1, y_train, x_test_model_1, y_test)
model_2 = LinearRegression(x_train_model_2, y_train, x_test_model_2, y_test)
model_3 = LinearRegression(x_train_model_3, y_train, x_test_model_3, y_test)

In [22]:
# Fit the three models in order and keep each weight vector for inspection.
w1, w2, w3 = (model.fit() for model in (model_1, model_2, model_3))


- Model 1: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’, and ‘long’
- Model 2: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’, ‘long’, and ‘bed_bath_rooms’
- Model 3: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’, ‘long’, ‘bed_bath_rooms’, ‘bedrooms_squared’, ‘log_sqft_living’, and ‘lat_plus_long’

In [23]:
# Weights returned by model_1.fit(): the intercept first (data_prep prepends
# the ones column), then one weight per feature in the order passed to data_prep.
w1

array([ -6.90757268e+07,   3.12258646e+02,  -5.95865332e+04,
         1.57067421e+04,   6.58619264e+05,  -3.09374351e+05])

In [24]:
# Weights returned by model_2.fit(): the intercept first (data_prep prepends
# the ones column), then one weight per feature in the order passed to data_prep.
w2

array([ -6.68679689e+07,   3.06610053e+02,  -1.13446368e+05,
        -7.14613083e+04,   6.54844630e+05,  -2.94298969e+05,
         2.55796520e+04])

In [25]:
# Weights returned by model_3.fit(): the intercept first (data_prep prepends
# the ones column), then one weight per feature in the order passed to data_prep.
w3

array([ -6.20360850e+07,   5.29422820e+02,   3.45142296e+04,
         6.70607813e+04,   1.41428126e+06,   4.73444938e+05,
        -8.57050439e+03,  -6.78858667e+03,  -5.61831484e+05,
        -7.52860749e+05])

In [26]:
# Residual sum of squares of each fitted model on the training split.
train_RSS_1 = model_1.train_RSS()
train_RSS_2 = model_2.train_RSS()
train_RSS_3 = model_3.train_RSS()

print("Model 1 training RSS: {}".format(train_RSS_1))
print("Model 2 training RSS: {}".format(train_RSS_2))
print("Model 3 training RSS: {}".format(train_RSS_3))

Model 1 training RSS: 967879963049543.4
Model 2 training RSS: 958419635074069.0
Model 3 training RSS: 903436455050477.9


In [27]:
# Residual sum of squares of each fitted model on the held-out test split.
test_RSS_1 = model_1.test_RSS()
test_RSS_2 = model_2.test_RSS()
test_RSS_3 = model_3.test_RSS()

print("Model 1 testing RSS: {}".format(test_RSS_1))
print("Model 2 testing RSS: {}".format(test_RSS_2))
print("Model 3 testing RSS: {}".format(test_RSS_3))

Model 1 testing RSS: 225500469795556.9
Model 2 testing RSS: 223377462976516.28
Model 3 testing RSS: 259236319241685.0
