In [16]:
# necessary imports

# data
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# modeling and metrics
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import PolynomialFeatures, RobustScaler
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import cross_validate, cross_val_predict
from pyearth import Earth
from pygam import LinearGAM, GAM, f, s, te

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [17]:
%%javascript
// Override the output area's scroll check so it always answers "no",
// whatever line count it is given.
IPython.OutputArea.prototype._should_scroll = function (_lineCount) {
    return false;
};

<IPython.core.display.Javascript object>

In [18]:
# load the cleaned data set, using the first CSV column as the row index
df = pd.read_csv(
    'kc_house_data_clean.csv',
    index_col=0,
)

# preview the first two observations
df.head(n=2)

Unnamed: 0_level_0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,zipcode,lat,long,sqft_living15,sqft_lot15,waterfront_null,waterfront_ind,yr_renovated_scheme1,yr_renovated_null,yr_renovated_ind
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,1.0,0.0,3,...,98178,47.5112,-122.257,1340,5650,1,0,0,0,0
6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,3,...,98125,47.721,-122.319,1690,7639,0,0,2,0,1


In [19]:
# columns used as model inputs; a column's position in this list is its
# column index in X
feature_columns = [
    'long',                  # 0
    'lat',                   # 1
    'sqft_living',           # 2
    'grade',                 # 3
    'bathrooms',             # 4
    'condition',             # 5
    'waterfront',            # 6
    'yr_renovated_scheme1',  # 7
    'floors',                # 8
    'sqft_lot',              # 9
]

# feature matrix and target vector as plain arrays
X = df[feature_columns].values
y = df['price'].values

# inspect both arrays
display(X, y)

array([[-1.22257e+02,  4.75112e+01,  1.18000e+03, ...,  0.00000e+00,
         1.00000e+00,  5.65000e+03],
       [-1.22319e+02,  4.77210e+01,  2.57000e+03, ...,  2.00000e+00,
         2.00000e+00,  7.24200e+03],
       [-1.22233e+02,  4.77379e+01,  7.70000e+02, ...,  1.00000e+00,
         1.00000e+00,  1.00000e+04],
       ...,
       [-1.22299e+02,  4.75944e+01,  1.02000e+03, ...,  0.00000e+00,
         2.00000e+00,  1.35000e+03],
       [-1.22069e+02,  4.75345e+01,  1.60000e+03, ...,  0.00000e+00,
         2.00000e+00,  2.38800e+03],
       [-1.22299e+02,  4.75941e+01,  1.02000e+03, ...,  0.00000e+00,
         2.00000e+00,  1.07600e+03]])

array([221900., 538000., 180000., ..., 402101., 400000., 325000.])

In [20]:
# hold out 10% of the observations for testing; the fixed random_state
# keeps the split the same from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)

In [21]:
# model terms, addressed by column index in X:
#   s(2) ... s(9)  one s-term for each of columns 2 through 9
#   te(0, 1)       one te-term over columns 0 and 1
#   te(2, 9)       one te-term over columns 2 and 9
gam_terms = (
    s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9)
    + te(0, 1)
    + te(2, 9)
)

# configure the GAM (gamma distribution, log link), then fit it on the
# training split; the result is assigned so the cell produces no output
gam = GAM(
    n_splines=25,
    terms=gam_terms,
    distribution='gamma',
    link='log',
)
gam = gam.fit(X_train, y_train)

In [22]:
# display the fit results
# NOTE: the printed summary itself warns against making inferences from the
# values it reports (see github.com/dswah/pyGAM/issues/163)
gam.summary()

GAM                                                                                                       
Distribution:                         GammaDist Effective DoF:                                    240.1386
Link Function:                          LogLink Log Likelihood:                               -242783.2238
Number of Samples:                        19110 AIC:                                           486048.7248
                                                AICc:                                           486054.914
                                                GCV:                                                0.0302
                                                Scale:                                              0.0302
                                                Pseudo R-Squared:                                   0.9046
Feature Function                  Lambda               Rank         EDoF         P > x        Sig. Code   
s(2)                              [0.

 
Please do not make inferences based on these values! 

Collaborate on a solution, and stay up to date at: 
github.com/dswah/pyGAM/issues/163 

  """Entry point for launching an IPython kernel.


In [23]:
# predict the target for the held-out observations
y_pred = gam.predict(X_test)

# score the predictions against the held-out targets
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# report each metric on its own line
for label, value in (("R^2:", r2), ("MAE:", mae), ("RMSE:", rmse)):
    print(label, value)


R^2: 0.8663281276709829
MAE: 68092.39539844771
RMSE: 124200.75651839456


In [31]:
# residuals are actual minus predicted, so a negative value means the
# prediction was higher than the actual value (an over-prediction)
over_or_under = y_test - y_pred

# count the residuals on each side; a residual of exactly zero is counted
# as "under"
total = len(over_or_under)
over = np.count_nonzero(over_or_under < 0)
under = np.count_nonzero(over_or_under >= 0)

print("Total:", total, "\nOver:", over, "\nUnder:", under)

Total: 2124 
Over: 1068 
Under: 1056
