# Imports

In [42]:
import pandas as pd
import numpy as np
from sklearn import linear_model    as lm
from sklearn import metrics         as m
from sklearn import model_selection as ms

# Load Data


In [44]:
df_raw = pd.read_csv( 'dataset/kc_house_data.csv' )

In [45]:
df_raw.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


# Data Preparation

In [46]:
# features
X = df_raw.drop( ['price', 'date'], axis=1 )

# response variable
y = df_raw['price'].copy()

In [47]:
x_train, x_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
# features
x_train = df_raw.drop( ['price', 'date'], axis=1 )

# response variable
y_train = df_raw['price'].copy()

In [49]:
x_train.head()

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [50]:
y_train.head()

0    221900.0
1    538000.0
2    180000.0
3    604000.0
4    510000.0
Name: price, dtype: float64

# Model Training

In [51]:
# model description
model_lr = lm.LinearRegression()

# model training
model_lr.fit(x_train, y_train)

LinearRegression()

In [52]:
# prediction - training
pred_train = model_lr.predict( x_train )

# prediction - test
pred_test = model_lr.predict( x_test )

In [53]:
pred[0:100]

array([ 205766.26460963,  731219.64982974,  379097.88265425,
        457424.13432523,  444789.054718  , 1451676.800463  ,
        277965.142858  ,  135261.77208376,  329843.81624401,
        250731.25050351,  870212.88141882,  437789.72121288,
        600311.43498105,  316903.13655568,  611557.85739087,
        944453.29692629,  456573.68626286,  539004.69481037,
        335024.25629826,  210585.07434929,  466088.95603633,
       1116489.110787  ,  330386.45026626,  130784.11743749,
        545423.99906274,  310538.26026975,  802018.13764891,
        651875.20382826,  457699.20615303,  650018.4502456 ,
        541202.92170082,  332196.71884785,  655096.04733203,
        555240.33756537,  319817.17474403,  804301.35470879,
        201145.17195349,  861533.54324495,  103130.75983538,
        585745.32147519,  664511.50307337,  742662.60624443,
        863042.3984593 ,  508391.02023427,  153710.08047338,
        811721.66516478,   32096.77784449,  762323.19584882,
        417855.61065259,

# Performance Metrics

In [60]:
# training - MAE, MAPE
mae_train = m.mean_absolute_error( y_train, pred_train )
mape_train = np.mean( np.abs( ( y_train - pred_train ) / y_train ) )

# test
mae_test = m.mean_absolute_error( y_test, pred_test )
mape_test = np.mean( np.abs( ( y_test - pred_test ) / y_test ) )

In [61]:
data = {
    'Dataframe': ['training', 'test'],
    'MAE': [mae_train, mae_test],
    'MAPE': [mape_train, mape_test]
}

pd.DataFrame( data )

Unnamed: 0,Dataframe,MAE,MAPE
0,training,125921.544194,0.255805
1,test,127702.272946,0.250814
