In [1]:

import pandas as pd
from prepare import Prepare
from split_get_scale import SplitGetScale
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge, TweedieRegressor
from sklearn.feature_selection import RFE

pd.options.display.max_columns = 30

zillow = Prepare().wrangle_zillow()
zillow = pd.get_dummies(zillow, columns=["county"], drop_first=True)
zillow.head()

  df = Acquire().get_zillow_data()


Unnamed: 0,parcel_id,bathroom_count,bedroom_count,home_size_square_feet,fips,latitude,longitude,lot_size_square_feet,raw_census_tract_and_block,region_id_city,region_id_county,region_id_zip,year_built,structure_tax_value_price,tax_value_dollar_count,assessment_year,land_tax_value_price,tax_amount,census_tract_and_block,log_error,transaction_date,county_Orange,county_Ventura
0,14297519,3.5,4,3100.0,6059,33634931.0,-117869207.0,4506.0,60590630.0,53571,1286,96978,1998,485713.0,1023282.0,2016,537569.0,11013.72,60590630000000.0,0.025595,2017-01-01,1,0
1,17052889,1.0,2,1465.0,6111,34449266.0,-119281531.0,12647.0,61110010.0,13091,2061,97099,1967,88000.0,464000.0,2016,376000.0,5672.48,61110010000000.0,0.055619,2017-01-01,0,1
2,14186244,2.0,3,1243.0,6059,33886168.0,-117823170.0,8432.0,60590220.0,21412,1286,97078,1962,85289.0,564778.0,2016,479489.0,6488.3,60590220000000.0,0.005383,2017-01-01,1,0
3,12177905,3.0,4,2376.0,6037,34245180.0,-118240722.0,13038.0,60373000.0,396551,3101,96330,1970,108918.0,145143.0,2016,36225.0,1777.51,60373000000000.0,-0.10341,2017-01-01,0,0
4,10887214,3.0,3,1312.0,6037,34185120.0,-118414640.0,278581.0,60371240.0,12447,3101,96451,1964,73681.0,119407.0,2016,45726.0,1533.89,60371240000000.0,0.00694,2017-01-01,0,0


In [2]:
sgs = SplitGetScale()
train, validate, test = sgs.split(zillow)

In [3]:
train.columns

Index(['parcel_id', 'bathroom_count', 'bedroom_count', 'home_size_square_feet',
       'fips', 'latitude', 'longitude', 'lot_size_square_feet',
       'raw_census_tract_and_block', 'region_id_city', 'region_id_county',
       'region_id_zip', 'year_built', 'structure_tax_value_price',
       'tax_value_dollar_count', 'assessment_year', 'land_tax_value_price',
       'tax_amount', 'census_tract_and_block', 'log_error', 'transaction_date',
       'county_Orange', 'county_Ventura'],
      dtype='object')

### list of columns to include in model
- bathroom_count
- bedroom_count
- home_size_square_feet
- lot_size_square_feet
- year_built
- tax_amount
- county

### list of columns not included in model
- parcel_id
- fips
- latitude
- longitude
- raw_census_tract_and_block
- region_id_city
- region_id_county
- region_id_zip
- structure_tax_value_price
- tax_value_dollar_count
- assessment_year
- land_tax_value_price
- census_tract_and_block
- transaction_date

### target
- log_error

In [4]:
(X_train, y_train), (X_validate, y_validate), (X_test, y_test) = sgs.get_Xy(train, validate, test)

In [24]:
X_train

Unnamed: 0,bathroom_count,bedroom_count,home_size_square_feet,lot_size_square_feet,year_built,tax_amount,county_Orange,county_Ventura
13527,1.0,2,941.0,6338.0,1909,1025.09,0,0
52447,3.0,3,1560.0,7313.0,2007,4473.98,0,0
2559,1.0,5,2986.0,12490.0,1921,1193.71,0,0
33056,2.0,1,1271.0,19158.0,1978,5630.53,0,0
63509,3.0,3,2043.0,5001.0,1962,3887.88,0,0
...,...,...,...,...,...,...,...,...
65697,2.0,2,1188.0,7313.0,1975,2298.22,1,0
9026,2.0,3,1411.0,7480.0,1948,7555.58,0,0
37603,2.0,2,1227.0,377596.0,1984,4780.28,0,0
47311,3.0,2,1782.0,9069.0,1993,9494.27,0,0


In [5]:
X_train_scaled, X_val_scaled, X_test_scaled, _ = sgs.scale(X_train, X_validate, X_test)

### Baseline

In [6]:
act_pred_error = pd.DataFrame({"actual": y_train})
act_pred_error

Unnamed: 0,actual
13527,-0.091213
52447,-0.031320
2559,0.059301
33056,-0.021502
63509,0.048015
...,...
65697,-0.023112
9026,0.048371
37603,0.005313
47311,0.029311


In [7]:
act_pred_error["baseline_prediction"] = y_train.mean()
act_pred_error

Unnamed: 0,actual,baseline_prediction
13527,-0.091213,0.016579
52447,-0.031320,0.016579
2559,0.059301,0.016579
33056,-0.021502,0.016579
63509,0.048015,0.016579
...,...,...
65697,-0.023112,0.016579
9026,0.048371,0.016579
37603,0.005313,0.016579
47311,0.029311,0.016579


In [8]:
baseline_rmse = mean_squared_error(act_pred_error["actual"], act_pred_error["baseline_prediction"], squared=False)
baseline_rmse

0.16559772875837225

### Baseline RMSE is 0.16559772875837225

### Models

In [9]:
def grid_search(X, y, model, params_dic):
    grid = GridSearchCV(model, params_dic, n_jobs=-1)
    return grid.fit(X, y)

In [10]:
lasso_grid = {"alpha": [0.25, 0.5, 0.75, 1.0], "warm_start": [True, False]}
ridge_grid = {"alpha": [0.25, 0.5, 0.75, 1.0], }
tweedie_grid = {"power": [0], "alpha": [0.25, 0.5, 0.75, 1.0], "warm_start": [True, False]}

lasso = grid_search(X_train, y_train, Lasso(random_state=123), lasso_grid)
ridge = grid_search(X_train, y_train, Ridge(random_state=123), ridge_grid)
tweddie = grid_search(X_train, y_train, TweedieRegressor(), tweedie_grid)

#### grid search best estimators

In [11]:
print(lasso.best_estimator_)
print(lasso.best_params_)

Lasso(alpha=0.25, random_state=123, warm_start=True)
{'alpha': 0.25, 'warm_start': True}


In [12]:
print(ridge.best_estimator_)
print(ridge.best_params_)

Ridge(random_state=123)
{'alpha': 1.0}


In [13]:
print(tweddie.best_estimator_)
print(tweddie.best_params_)

TweedieRegressor(power=0, warm_start=True)
{'alpha': 1.0, 'power': 0, 'warm_start': True}


In [14]:
lasso = Lasso(alpha=0.25, random_state=123, warm_start=True).fit(X_train, y_train)
ridge = Ridge(alpha=1.0, random_state=123).fit(X_train, y_train)
tweedie = TweedieRegressor(alpha=1.0, power=0, warm_start=True).fit(X_train, y_train)
lr = LinearRegression().fit(X_train, y_train)

In [15]:
lass_pred_train = lasso.predict(X_train)
ridge_pred_train = ridge.predict(X_train)
tweed_pred_train = tweedie.predict(X_train)
lr_pred_train = lr.predict(X_train)

In [16]:
print(f"lasso train rmse: {mean_squared_error(y_train, lass_pred_train, squared=False)}\nridge train rmse: {mean_squared_error(y_train, ridge_pred_train, squared=False)}\ntweedie train rmse: {mean_squared_error(y_train, tweed_pred_train, squared=False)}\nlr train rmse: { mean_squared_error(y_train, lr_pred_train, squared=False)}")

lasso train rmse: 0.16535618032732577
ridge train rmse: 0.1653350905592181
tweedie train rmse: 0.16535447028887934
lr train rmse: 0.16533509055904103


In [17]:
lass_pred_val = lasso.predict(X_validate)
ridge_pred_val = ridge.predict(X_validate)
tweed_pred_val = tweedie.predict(X_validate)
lr_pred_val = lr.predict(X_validate)

In [18]:
print(f"lasso validation rmse: {mean_squared_error(y_validate, lass_pred_val, squared=False)}\nridge validation rmse: {mean_squared_error(y_validate, ridge_pred_val, squared=False)}\ntweedie validation rmse: {mean_squared_error(y_validate, tweed_pred_val, squared=False)}\nlr validation rmse: { mean_squared_error(y_validate, lr_pred_val, squared=False)}")

lasso validation rmse: 0.1677281814345548
ridge validation rmse: 0.1676937495191393
tweedie validation rmse: 0.16773395815860598
lr validation rmse: 0.1676937456295513


### No models appear to be overfit

### Get clusters before below

#### Going to use Linear Regression because lowest RMSE, 0.1676937456295513, and not overfit

In [19]:
lr_pred_test = lr.predict(X_test)

In [20]:
print(f"lr test rmse: { mean_squared_error(y_test, lr_pred_test, squared=False)}")

lr test rmse: 0.15953193544790714


### Linear Regression test RMSE is 0.15953193544790714

### Using RFE to check ranks

In [21]:
# The below code was taken from the curriculum as an example with home_size and lot_size

# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with. 
rfe = RFE(lm, n_features_to_select=5)

# fit the data using RFE
rfe.fit(X_train, y_train)  

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names. 
rfe_feature = X_train.iloc[:,feature_mask].columns.tolist()

In [22]:
rfe_feature

['bathroom_count',
 'bedroom_count',
 'year_built',
 'county_Orange',
 'county_Ventura']

In [23]:
# view list of columns and their ranking

# get the ranks
var_ranks = rfe.ranking_
# get the variable names
var_names = X_train.columns.tolist()
# combine ranks and names into a df for clean viewing
rfe_ranks_df = pd.DataFrame({'Var': var_names, 'Rank': var_ranks})
# sort the df by rank
rfe_ranks_df.sort_values('Rank')

Unnamed: 0,Var,Rank
0,bathroom_count,1
1,bedroom_count,1
4,year_built,1
6,county_Orange,1
7,county_Ventura,1
2,home_size_square_feet,2
5,tax_amount,3
3,lot_size_square_feet,4
