<img src="https://prnewswire2-a.akamaihd.net/p/1893751/sp/189375100/thumbnail/entry_id/1_su9da4fu/def_height/1001/def_width/1911/version/100011/type/2/q/100"  width="300" height="200">

# Modeling

In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from prepare import prepare_zillow
from wrangle import train_validate_test, impute_values, attributes_target_split, add_scaled_columns
from explore import cluster_features
from sklearn.preprocessing import StandardScaler
from preprocess import features_for_modeling
from sklearn.linear_model import ElasticNet, LassoLars, LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from math import sqrt

from warnings import filterwarnings
filterwarnings('ignore')

In [21]:
df = prepare_zillow()
df.drop(columns=['parcelid', 'date_sold'], inplace=True)
train_set, validate, test = train_validate_test(df)
train_set, validate, test = impute_values(train_set, validate, test)

In [22]:
X_train, y_train = attributes_target_split(train_set, 'logerror')
X_validate, y_validate = attributes_target_split(validate, 'logerror')
X_test, y_test = attributes_target_split(test, 'logerror')

In [23]:
X_train.head()

Unnamed: 0,num_of_bedrooms,num_of_restrooms,living_room_area_sqft,lot_size_sqft,year_built,has_basement,has_hottub_or_spa,has_pool,pool_area_sqft,has_patio,patio_area_sqft,has_shed,basement_area_sqft,property_tax,structure_tax,land_tax,taxable_value,fips,latitude,longitude
50002,4.0,3.5,2844.0,4500.0,2003,0,0,0,0.0,0,0.0,0,0.0,6273.0,176419.0,328433.0,504852.0,6111,34.211156,-119.174179
10254,2.0,2.0,1282.0,6353.0,1986,0,0,0,0.0,0,0.0,0,0.0,2700.23,124263.0,31065.0,155328.0,6037,34.703819,-118.119752
25372,2.0,1.0,1175.0,4820.0,1950,0,0,0,0.0,0,0.0,0,0.0,5750.83,108385.0,376843.0,485228.0,6037,33.816603,-118.114331
24999,4.0,3.0,1765.0,5743.0,1943,0,0,0,0.0,0,0.0,0,0.0,4062.59,106907.0,228541.0,335448.0,6037,33.867042,-118.129712
36733,3.0,2.0,1053.0,7200.0,1954,0,0,0,0.0,0,0.0,0,0.0,2650.54,67342.0,143981.0,211323.0,6059,33.822219,-117.966434


In [24]:
scaler = StandardScaler()
X_train, X_validate, X_test = add_scaled_columns(X_train, X_validate, X_test, scaler)

In [26]:
features_for_modeling(X_train, y_train, 5)

Select K Best: 5 features
['num_of_bedrooms_scaled', 'num_of_restrooms_scaled', 'living_room_area_sqft_scaled', 'year_built_scaled', 'longitude_scaled']
Recursive Feature Elimination: 5 features
['living_room_area_sqft_scaled', 'property_tax_scaled', 'structure_tax_scaled', 'land_tax_scaled', 'taxable_value_scaled']


In [27]:
X_train, X_validate, X_test, cluster_1, cluster_2, cluster_3 = cluster_features(X_train, X_validate, X_test)

In [28]:
train_cluster_1_dummies = pd.get_dummies(X_train.cluster_1, drop_first=True, prefix='cluster_1')
validate_cluster_1_dummies = pd.get_dummies(X_validate.cluster_1, drop_first=True, prefix='cluster_1')
test_cluster_1_dummies = pd.get_dummies(X_test.cluster_1, drop_first=True, prefix='cluster_1')

In [29]:
train_cluster_2_dummies = pd.get_dummies(X_train.cluster_2, drop_first=True, prefix='cluster_2')
validate_cluster_2_dummies = pd.get_dummies(X_validate.cluster_2, drop_first=True, prefix='cluster_2')
test_cluster_2_dummies = pd.get_dummies(X_test.cluster_2, drop_first=True, prefix='cluster_2')

In [30]:
train_cluster_3_dummies = pd.get_dummies(X_train.cluster_3, drop_first=True, prefix='cluster_3')
validate_cluster_3_dummies = pd.get_dummies(X_validate.cluster_3, drop_first=True, prefix='cluster_3')
test_cluster_3_dummies = pd.get_dummies(X_test.cluster_3, drop_first=True, prefix='cluster_3')

In [31]:
X_train = pd.concat([
    X_train,
    train_cluster_1_dummies,
    train_cluster_2_dummies,
    train_cluster_3_dummies],
    axis=1
)

In [32]:
X_validate = pd.concat([
    X_validate,
    validate_cluster_1_dummies,
    validate_cluster_2_dummies,
    validate_cluster_3_dummies],
    axis=1
)

In [33]:
X_test = pd.concat([
    X_test,
    test_cluster_1_dummies,
    test_cluster_2_dummies,
    test_cluster_3_dummies],
    axis=1
)

In [34]:
features_for_modeling(X_train, y_train, 5)

Select K Best: 5 features
['num_of_bedrooms_scaled', 'num_of_restrooms_scaled', 'living_room_area_sqft_scaled', 'longitude_scaled', 'cluster_3_4']
Recursive Feature Elimination: 5 features
['structure_tax_scaled', 'land_tax_scaled', 'taxable_value_scaled', 'cluster_2_4', 'cluster_3_4']


In [37]:
X_train_m1_data = X_train[[
    
    'num_of_bedrooms_scaled',
    'num_of_restrooms_scaled',
    'living_room_area_sqft_scaled',
    'longitude_scaled',
    'cluster_3_4'
]]


X_train_m2_data = X_train[[
    
    'num_of_bedrooms_scaled',
    'num_of_restrooms_scaled',
    'living_room_area_sqft_scaled',
    'longitude_scaled',
    'cluster_3_4'
]]

# Let's some models!

## Linear Regression

In [113]:
train_set_evaluation = pd.DataFrame()

lm = LinearRegression()

lm.fit(X_train_m1_data, y_train.values)

train_set_evaluation['model_1_yhat'] = lm.predict(X_train_m1_data)

lm.fit(X_train_m1_data, y_train.values)

train_set_evaluation['model_2_yhat'] = lm.predict(X_train_m1_data)

train_set_evaluation['baseline_model'] = y_train.median()
train_set_evaluation['actual'] = y_train.values

train_set_evaluation

Unnamed: 0,model_1_yhat,model_2_yhat,baseline_model,actual
0,0.008077,0.008077,0.006832,0.332771
1,0.008114,0.008114,0.006832,0.115085
2,0.009749,0.009749,0.006832,-0.000979
3,0.018985,0.018985,0.006832,0.011115
4,0.015149,0.015149,0.006832,0.053176
...,...,...,...,...
31458,0.020065,0.020065,0.006832,0.053148
31459,0.023644,0.023644,0.006832,-0.004948
31460,0.009650,0.009650,0.006832,0.014664
31461,0.007309,0.007309,0.006832,0.077012


In [114]:
baseline_rmse = sqrt(mean_squared_error(train_set_evaluation.actual, train_set_evaluation.baseline_model))
model_1_rmse = sqrt(mean_squared_error(train_set_evaluation.actual, train_set_evaluation.model_1_yhat))
model_2_rmse = sqrt(mean_squared_error(train_set_evaluation.actual, train_set_evaluation.model_2_yhat))

# Best model
print(baseline_rmse)
print(model_1_rmse)
print(model_2_rmse )

0.1796835669834449
0.17905955045456784
0.17905955045456784


In [115]:
train_set_evaluation = pd.DataFrame()

lm = LinearRegression()

lm.fit(X_train_m2_data, y_train.values)

train_set_evaluation['model_1_yhat'] = lm.predict(X_train_m2_data)

lm.fit(X_train_m2_data, y_train.values)

train_set_evaluation['model_2_yhat'] = lm.predict(X_train_m2_data)

train_set_evaluation['baseline_model'] = y_train.median()
train_set_evaluation['actual'] = y_train.values

train_set_evaluation

Unnamed: 0,model_1_yhat,model_2_yhat,baseline_model,actual
0,0.008077,0.008077,0.006832,0.332771
1,0.008114,0.008114,0.006832,0.115085
2,0.009749,0.009749,0.006832,-0.000979
3,0.018985,0.018985,0.006832,0.011115
4,0.015149,0.015149,0.006832,0.053176
...,...,...,...,...
31458,0.020065,0.020065,0.006832,0.053148
31459,0.023644,0.023644,0.006832,-0.004948
31460,0.009650,0.009650,0.006832,0.014664
31461,0.007309,0.007309,0.006832,0.077012


In [116]:
baseline_rmse = sqrt(mean_squared_error(train_set_evaluation.actual, train_set_evaluation.baseline_model))
model_1_rmse = sqrt(mean_squared_error(train_set_evaluation.actual, train_set_evaluation.model_1_yhat))
model_2_rmse = sqrt(mean_squared_error(train_set_evaluation.actual, train_set_evaluation.model_2_yhat))

print(baseline_rmse)
print(model_1_rmse)
print(model_2_rmse)

0.1796835669834449
0.17905955045456784
0.17905955045456784


In [136]:
train_set_evaluation = pd.DataFrame()

lm = LassoLars(9)

lm.fit(X_train_m1_data, y_train.values)

train_set_evaluation['model_1_yhat'] = lm.predict(X_train_m1_data)

lm.fit(X_train_m1_data, y_train.values)

train_set_evaluation['model_2_yhat'] = lm.predict(X_train_m1_data)

train_set_evaluation['baseline_model'] = y_train.median()
train_set_evaluation['actual'] = y_train.values

train_set_evaluation

Unnamed: 0,model_1_yhat,model_2_yhat,baseline_model,actual
0,0.017259,0.017259,0.006832,0.332771
1,0.017259,0.017259,0.006832,0.115085
2,0.017259,0.017259,0.006832,-0.000979
3,0.017259,0.017259,0.006832,0.011115
4,0.017259,0.017259,0.006832,0.053176
...,...,...,...,...
31458,0.017259,0.017259,0.006832,0.053148
31459,0.017259,0.017259,0.006832,-0.004948
31460,0.017259,0.017259,0.006832,0.014664
31461,0.017259,0.017259,0.006832,0.077012


In [137]:
baseline_rmse = sqrt(mean_squared_error(train_set_evaluation.actual, train_set_evaluation.baseline_model))
model_1_rmse = sqrt(mean_squared_error(train_set_evaluation.actual, train_set_evaluation.model_1_yhat))
model_2_rmse = sqrt(mean_squared_error(train_set_evaluation.actual, train_set_evaluation.model_2_yhat))

print(baseline_rmse)
print(model_1_rmse)
print(model_2_rmse)

0.1796835669834449
0.1793807760633735
0.1793807760633735


In [141]:
train_set_evaluation = pd.DataFrame()

lm = LassoLars(10)

lm.fit(X_train_m2_data, y_train.values)

train_set_evaluation['model_1_yhat'] = lm.predict(X_train_m2_data)

lm.fit(X_train_m2_data, y_train.values)

train_set_evaluation['model_2_yhat'] = lm.predict(X_train_m2_data)

train_set_evaluation['baseline_model'] = y_train.median()
train_set_evaluation['actual'] = y_train.values

train_set_evaluation

Unnamed: 0,model_1_yhat,model_2_yhat,baseline_model,actual
0,0.017259,0.017259,0.006832,0.332771
1,0.017259,0.017259,0.006832,0.115085
2,0.017259,0.017259,0.006832,-0.000979
3,0.017259,0.017259,0.006832,0.011115
4,0.017259,0.017259,0.006832,0.053176
...,...,...,...,...
31458,0.017259,0.017259,0.006832,0.053148
31459,0.017259,0.017259,0.006832,-0.004948
31460,0.017259,0.017259,0.006832,0.014664
31461,0.017259,0.017259,0.006832,0.077012


In [142]:
baseline_rmse = sqrt(mean_squared_error(train_set_evaluation.actual, train_set_evaluation.baseline_model))
model_1_rmse = sqrt(mean_squared_error(train_set_evaluation.actual, train_set_evaluation.model_1_yhat))
model_2_rmse = sqrt(mean_squared_error(train_set_evaluation.actual, train_set_evaluation.model_2_yhat))

print(baseline_rmse)
print(model_1_rmse)
print(model_2_rmse )

0.1796835669834449
0.1793807760633735
0.1793807760633735
