In [1]:
# TODO (placeholder): RFE (recursive feature elimination)
# for home_size and lot_size

In [26]:
import pandas as pd
from prepare import Prepare
from split_get_scale import SplitGetScale
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge, TweedieRegressor
from sklearn.feature_selection import RFE

# Load the prepared Zillow data and one-hot encode `county` in a single step.
# drop_first=True drops the first county category, so only the
# county_Orange and county_Ventura indicator columns are added.
zillow = pd.get_dummies(
    Prepare().wrangle_zillow(),
    columns=["county"],
    drop_first=True,
)
zillow.head()

Unnamed: 0,parcel_id,bathroom_count,bedroom_count,home_size_square_feet,fips,latitude,longitude,lot_size_square_feet,raw_census_tract_and_block,region_id_city,region_id_county,region_id_zip,year_built,structure_tax_value_price,tax_value_dollar_count,assessment_year,land_tax_value_price,tax_amount,census_tract_and_block,log_error,transaction_date,county_Orange,county_Ventura
0,14297519,3.5,4,3100.0,6059,33634931.0,-117869207.0,4506.0,60590630.0,53571,1286,96978,1998,485713.0,1023282.0,2016,537569.0,11013.72,60590630000000.0,0.025595,2017-01-01,1,0
1,17052889,1.0,2,1465.0,6111,34449266.0,-119281531.0,12647.0,61110010.0,13091,2061,97099,1967,88000.0,464000.0,2016,376000.0,5672.48,61110010000000.0,0.055619,2017-01-01,0,1
2,14186244,2.0,3,1243.0,6059,33886168.0,-117823170.0,8432.0,60590220.0,21412,1286,97078,1962,85289.0,564778.0,2016,479489.0,6488.3,60590220000000.0,0.005383,2017-01-01,1,0
3,12177905,3.0,4,2376.0,6037,34245180.0,-118240722.0,13038.0,60373000.0,396551,3101,96330,1970,108918.0,145143.0,2016,36225.0,1777.51,60373000000000.0,-0.10341,2017-01-01,0,0
4,10887214,3.0,3,1312.0,6037,34185120.0,-118414640.0,278581.0,60371240.0,12447,3101,96451,1964,73681.0,119407.0,2016,45726.0,1533.89,60371240000000.0,0.00694,2017-01-01,0,0


In [27]:
# Split the encoded frame into train / validate / test subsets using the
# project's SplitGetScale helper (imported from the split_get_scale module).
# `sgs` is kept around because a later cell calls sgs.get_Xy on these subsets.
sgs = SplitGetScale()
train, validate, test = sgs.split(zillow)

In [28]:
# Show the columns present in the training split before choosing features.
train.columns

Index(['parcel_id', 'bathroom_count', 'bedroom_count', 'home_size_square_feet',
       'fips', 'latitude', 'longitude', 'lot_size_square_feet',
       'raw_census_tract_and_block', 'region_id_city', 'region_id_county',
       'region_id_zip', 'year_built', 'structure_tax_value_price',
       'tax_value_dollar_count', 'assessment_year', 'land_tax_value_price',
       'tax_amount', 'census_tract_and_block', 'log_error', 'transaction_date',
       'county_Orange', 'county_Ventura'],
      dtype='object')

### list of columns to include in model
- bathroom_count
- bedroom_count
- home_size_square_feet
- lot_size_square_feet
- year_built
- tax_amount
- county

### list of columns not included in model
- parcel_id
- fips
- latitude
- longitude
- raw_census_tract_and_block
- region_id_city
- region_id_county
- region_id_zip
- structure_tax_value_price
- tax_value_dollar_count
- assessment_year
- land_tax_value_price
- census_tract_and_block
- transaction_date

### target
- log_error

In [29]:
# Build an (X, y) pair for each split via the project helper; presumably X holds
# the model features and y the target `log_error` — verify against get_Xy.
# NOTE(review): the near-zero RMSE values reported further down (~1e-16 for
# LinearRegression) suggest X may still contain the target column — confirm
# which columns get_Xy keeps in X.
(X_train, y_train), (X_validate, y_validate), (X_test, y_test) = sgs.get_Xy(train, validate, test)

### Baseline

In [30]:
# Start a comparison table whose "actual" column holds the training target,
# keeping the target's original row index.
act_pred_error = y_train.to_frame(name="actual")
act_pred_error

Unnamed: 0,actual
13527,-0.091213
52447,-0.031320
2559,0.059301
33056,-0.021502
63509,0.048015
...,...
65697,-0.023112
9026,0.048371
37603,0.005313
47311,0.029311


In [31]:
# Baseline model: predict the training-target mean for every row.
act_pred_error = act_pred_error.assign(baseline_prediction=y_train.mean())
act_pred_error

Unnamed: 0,actual,baseline_prediction
13527,-0.091213,0.016579
52447,-0.031320,0.016579
2559,0.059301,0.016579
33056,-0.021502,0.016579
63509,0.048015,0.016579
...,...,...
65697,-0.023112,0.016579
9026,0.048371,0.016579
37603,0.005313,0.016579
47311,0.029311,0.016579


In [51]:
# Baseline RMSE on the training split: the error of always predicting the mean.
# The square root of the MSE is taken explicitly instead of passing
# `squared=False`, which is deprecated in scikit-learn 1.4 and removed in 1.6.
baseline_rmse = mean_squared_error(act_pred_error["actual"], act_pred_error["baseline_prediction"]) ** 0.5
baseline_rmse

0.16559772875837225

### Baseline RMSE is 0.17

### Models

In [33]:
def grid_search(X, y, model, params_dic, scoring=None, cv=None):
    """Run an exhaustive cross-validated hyperparameter search and return it fitted.

    Parameters
    ----------
    X, y
        Training features and target, passed unchanged to ``GridSearchCV.fit``.
    model
        Estimator whose hyperparameters are searched.
    params_dic : dict
        Maps parameter names to the list of candidate values to try.
    scoring : optional
        Forwarded to ``GridSearchCV``. ``None`` (the default) keeps
        GridSearchCV's own default, i.e. the estimator's ``score`` method.
        Pass e.g. ``"neg_root_mean_squared_error"`` to select on RMSE.
    cv : optional
        Forwarded to ``GridSearchCV``. ``None`` (the default) keeps
        GridSearchCV's default cross-validation strategy.

    Returns
    -------
    The result of ``GridSearchCV.fit`` (the fitted search object), exposing
    ``best_estimator_`` and ``best_params_``.
    """
    # n_jobs=-1 evaluates the candidates in parallel on all available cores.
    grid = GridSearchCV(model, params_dic, scoring=scoring, cv=cv, n_jobs=-1)
    return grid.fit(X, y)

In [37]:
# Candidate values shared by the hyperparameter grids below.
alpha_candidates = [0.25, 0.5, 0.75, 1.0]
warm_start_options = [True, False]

lasso_grid = {"alpha": alpha_candidates, "warm_start": warm_start_options}
ridge_grid = {"alpha": alpha_candidates}
# Only power=0 is searched for the Tweedie regressor.
tweedie_grid = {"power": [0], "alpha": alpha_candidates, "warm_start": warm_start_options}

# One grid search per regularised model, all on the training split.
lasso = grid_search(X_train, y_train, model=Lasso(random_state=123), params_dic=lasso_grid)
ridge = grid_search(X_train, y_train, model=Ridge(random_state=123), params_dic=ridge_grid)
# The name `tweddie` (sic) is kept because a later cell reads it.
tweddie = grid_search(X_train, y_train, model=TweedieRegressor(), params_dic=tweedie_grid)

#### grid search best estimators

In [39]:
# Best Lasso configuration found by the grid search, then its parameters.
print(lasso.best_estimator_, lasso.best_params_, sep="\n")

Lasso(alpha=0.25, random_state=123, warm_start=True)
{'alpha': 0.25, 'warm_start': True}


In [40]:
# Best Ridge configuration found by the grid search, then its parameters.
print(ridge.best_estimator_, ridge.best_params_, sep="\n")

Ridge(alpha=0.25, random_state=123)
{'alpha': 0.25}


In [41]:
# Best Tweedie configuration found by the grid search, then its parameters.
print(tweddie.best_estimator_, tweddie.best_params_, sep="\n")

TweedieRegressor(alpha=0.25, power=0, warm_start=True)
{'alpha': 0.25, 'power': 0, 'warm_start': True}


In [46]:
# Refit each model on the training split using the hyperparameters the grid
# searches reported as best. `lasso` and `ridge` are rebound here from the
# search objects to plain fitted estimators.
lasso = Lasso(alpha=0.25, random_state=123, warm_start=True)
lasso.fit(X_train, y_train)

ridge = Ridge(alpha=0.25, random_state=123)
ridge.fit(X_train, y_train)

tweedie = TweedieRegressor(alpha=0.25, power=0, warm_start=True)
tweedie.fit(X_train, y_train)

# Plain least squares, fitted with its default settings.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [53]:
# Training-split predictions from each fitted model, in the order
# lasso, ridge, tweedie, linear regression.
lass_pred_train, ridge_pred_train, tweed_pred_train, lr_pred_train = (
    fitted_model.predict(X_train) for fitted_model in (lasso, ridge, tweedie, lr)
)

In [56]:
# Report the training RMSE of every model, one line per model.
# The square root of the MSE is taken explicitly instead of passing
# `squared=False`, which is deprecated in scikit-learn 1.4 and removed in 1.6.
train_predictions = {
    "lasso": lass_pred_train,
    "ridge": ridge_pred_train,
    "tweedie": tweed_pred_train,
    "lr": lr_pred_train,
}
for model_name, predictions in train_predictions.items():
    print(f"{model_name} train rmse: {mean_squared_error(y_train, predictions) ** 0.5}")

lasso train rmse: 0.16535618032732577
ridge train rmse: 3.1980627336723105e-05
tweedie train rmse: 0.14904585144512345
lr train rmse: 1.969480651455987e-16


In [55]:
# Validation-split predictions from each fitted model, in the order
# lasso, ridge, tweedie, linear regression.
lass_pred_val, ridge_pred_val, tweed_pred_val, lr_pred_val = (
    fitted_model.predict(X_validate) for fitted_model in (lasso, ridge, tweedie, lr)
)

In [58]:
# Report the validation RMSE of every model, one line per model.
# The square root of the MSE is taken explicitly instead of passing
# `squared=False`, which is deprecated in scikit-learn 1.4 and removed in 1.6.
validate_predictions = {
    "lasso": lass_pred_val,
    "ridge": ridge_pred_val,
    "tweedie": tweed_pred_val,
    "lr": lr_pred_val,
}
for model_name, predictions in validate_predictions.items():
    print(f"{model_name} validation rmse: {mean_squared_error(y_validate, predictions) ** 0.5}")

lasso validation rmse: 0.1677281814345548
ridge validation rmse: 3.24368601888649e-05
tweedie validation rmse: 0.15117086173501681
lr validation rmse: 2.0221666857486523e-16


### No models appear to be overfit

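#### Going to use Linear Regression because it has the lowest validation RMSE, 2.0221666857486523e-16, and is not overfit

**Caveat:** an RMSE at the level of floating-point round-off (about 1e-16, against a baseline RMSE of 0.17) is not a realistic modelling result. It most likely means the target `log_error` is still present among the features in `X_train`, so the model is simply reading the answer. Confirm which columns `sgs.get_Xy` returns in `X` before relying on this comparison.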

In [59]:
# Predict on the held-out test split with the selected LinearRegression model.
lr_pred_test = lr.predict(X_test)

In [60]:
# Report the test RMSE of the selected model.
# The square root of the MSE is taken explicitly instead of passing
# `squared=False`, which is deprecated in scikit-learn 1.4 and removed in 1.6.
print(f"lr test rmse: {mean_squared_error(y_test, lr_pred_test) ** 0.5}")

lr test rmse: 1.9534102205731086e-16


### Linear Regression test RMSE is 1.9534102205731086e-16