In [4]:

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr, spearmanr

import itertools
import env
import wrangle
import acquire
import prepare
import split
import exploration as exp
import modeling as md

## Wrangle

In [5]:
# Acquire and prepare the Zillow data. `cats` and `quants` are returned
# alongside the frame; they are not used in this cell.
zillow, cats, quants = wrangle.wrangle_zillow()

# Column the models below are asked to predict.
target_var = "taxvaluedollarcnt"

# Remove `taxamount` before modeling.
# NOTE(review): presumably dropped because it is derived from the assessed
# value and would leak the target — confirm.
df = zillow.drop(columns=["taxamount"])

# Modeling helper built on the cleaned frame and target column.
my_lrm = md.LRM(df, target_var)

# Run OLS twice: once with default settings, once with `use_rfe_features=True`.
# Only the second call is the cell's final expression, so only its return
# value is rendered as the cell output.
my_lrm.OLS_regression()
my_lrm.OLS_regression(use_rfe_features=True)

Index(['bedroomcnt', 'bathroomcnt', 'sqr_ft', 'taxvaluedollarcnt', 'yearbuilt',
       'taxamount'],
      dtype='object')
(52243, 10)
(51120, 10)
RMSE using Mean
Train/In-Sample:  413956.87 
Validate/Out-of-Sample:  416628.21
RMSE using Median
Train/In-Sample:  425451.6 
Validate/Out-of-Sample:  428602.01
       scaled_bedroomcnt  scaled_bathroomcnt  scaled_sqr_ft  scaled_yearbuilt  \
0                    0.0                 0.0      -0.032258         -0.142857   
1                    1.0                 1.0       1.223790          0.642857   
2                    0.0                -1.0      -0.591734         -0.392857   
3                    1.0                 1.0       0.876008          0.928571   
4                    0.0                 1.0       0.987903          0.857143   
...                  ...                 ...            ...               ...   
28621               -1.0                -1.0      -0.701613         -0.928571   
28622                0.0                 0.0

Unnamed: 0,model_name,rmse_train,rmse_validate,power,alpha,features,percent_diff,baseline_diff_percent_train,baseline_diff_percent_validate
0,OLS,336965.176532,339296.404975,,,"Index(['scaled_bedroomcnt', 'scaled_sqr_ft'], ...",-0.69,72.430091,72.5566


In [6]:
# Express the stored train baseline RMSE as a fraction of the mean target value
# in the training split, to put the error on a relative scale.
# NOTE(review): `rmse_train_mean_bl` is presumably the mean-prediction baseline
# RMSE on train — confirm against md.LRM.
baseline_rmse_train = my_lrm.rmse_train_mean_bl
mean_train_value = my_lrm.train["taxvaluedollarcnt"].mean()
baseline_rmse_train / mean_train_value

0.8897932481515467

## Tweedie GLM using all features

In [21]:
# Build a fresh LRM on the full dataset and fit a Tweedie GLM with it.
# NOTE(review): the third positional argument (4) is presumably
# `features_to_select` — a later cell passes that keyword with the same
# value — confirm against md.LRM's signature.
using_all_features = md.LRM(df, target_var, 4)
# Final expression of the cell, so the value returned by tweedie() is rendered.
using_all_features.tweedie()

RMSE using Mean
Train/In-Sample:  413956.87 
Validate/Out-of-Sample:  416628.21
RMSE using Median
Train/In-Sample:  425451.6 
Validate/Out-of-Sample:  428602.01
RMSE for GLM using Tweedie, power=1 & alpha=0
Training/In-Sample:  334989.7607975786 
Validation/Out-of-Sample:  337525.27123331523


Unnamed: 0,model_name,rmse_train,rmse_validate,power,alpha,features,percent_diff,baseline_diff_percent_train,baseline_diff_percent_validate
0,tweedie,334989.760798,337525.271233,1,0,"Index(['scaled_bedroomcnt', 'scaled_bathroomcn...",-0.76,72.005479,72.177853


## Models by county

In [22]:
# Build one LRM on the full dataset and one per county, then fit Lasso+LARS
# for Los Angeles.
#
# The raw per-county subsets live in `county_frames` so that the names
# `orange_county`, `ventura`, and `los_angeles` only ever refer to LRM objects.
# Previously each name was bound to a DataFrame and then rebound to an LRM,
# which made the meaning of the name depend on how far the cell had run.

# Number of features handed to every LRM in this comparison.
N_FEATURES = 4

COUNTY_NAMES = ('orange_county', 'ventura', 'los_angeles')
county_frames = {
    county_name: df[df.county == county_name] for county_name in COUNTY_NAMES
}

# Fail early with a clear message if a county label does not match the data,
# instead of handing an empty frame to the modeling helper.
for county_name, county_frame in county_frames.items():
    assert not county_frame.empty, f"no rows found for county {county_name!r}"

# Construction order is kept as before (all counties, Orange, Ventura, LA)
# so any output printed during construction appears in the same sequence.
all_counties = md.LRM(df, target_var, features_to_select=N_FEATURES)
orange_county = md.LRM(
    county_frames['orange_county'], target_var, features_to_select=N_FEATURES
)
ventura = md.LRM(
    county_frames['ventura'], target_var, features_to_select=N_FEATURES
)
los_angeles = md.LRM(
    county_frames['los_angeles'], target_var, features_to_select=N_FEATURES
)

# Final expression of the cell: its return value is rendered as the output.
los_angeles.lassolars_regression()

RMSE using Mean
Train/In-Sample:  413956.87 
Validate/Out-of-Sample:  416628.21
RMSE using Median
Train/In-Sample:  425451.6 
Validate/Out-of-Sample:  428602.01
RMSE using Mean
Train/In-Sample:  419338.44 
Validate/Out-of-Sample:  433694.14
RMSE using Median
Train/In-Sample:  427388.63 
Validate/Out-of-Sample:  441288.47
RMSE using Mean
Train/In-Sample:  313827.11 
Validate/Out-of-Sample:  316290.21
RMSE using Median
Train/In-Sample:  317467.01 
Validate/Out-of-Sample:  319832.1
RMSE using Mean
Train/In-Sample:  417161.1 
Validate/Out-of-Sample:  405392.99
RMSE using Median
Train/In-Sample:  432476.05 
Validate/Out-of-Sample:  419386.08
RMSE for Lasso + Lars
Training/In-Sample:  343858.7171643823 
Validation/Out-of-Sample:  341200.33554031345


Unnamed: 0,model_name,rmse_train,rmse_validate,power,alpha,features,percent_diff,baseline_diff_percent_train,baseline_diff_percent_validate
0,lasso_lars,343858.717164,341200.33554,,1,"Index(['scaled_bedroomcnt', 'scaled_bathroomcn...",0.77,79.980755,80.570717


In [23]:
# Fit models per county and show each county's model-comparison table.
#
# A notebook cell renders only the value of its final expression, so the
# Los Angeles and Ventura tables were previously computed and then discarded.
# They are now passed to display() (provided by the IPython/Jupyter kernel)
# so all three tables appear in the output. Statement order is unchanged.

# Los Angeles: OLS, Tweedie GLM, and Lasso+LARS.
los_angeles.OLS_regression()
los_angeles.tweedie()
los_angeles.lassolars_regression()
display(los_angeles.all_models_df())

# Ventura: Lasso+LARS only.
ventura.lassolars_regression()
display(ventura.all_models_df())

# Orange County: Lasso+LARS only. Left as the bare final expression so it
# remains the cell's Out[] value, as before.
orange_county.lassolars_regression()
orange_county.all_models_df()

       scaled_bedroomcnt  scaled_bathroomcnt  scaled_sqr_ft  scaled_yearbuilt
0                   -1.0                -0.5      -0.717949          1.636364
1                    1.0                 0.5       0.723523          1.000000
2                    0.0                -0.5      -0.470457         -0.136364
3                   -1.0                 0.0      -0.443701         -0.227273
4                    0.0                 0.5       0.273133          1.454545
...                  ...                 ...            ...               ...
18503                1.0                -0.5      -0.123746         -2.000000
18504               -1.0                 0.0      -0.124861          0.500000
18505                0.0                 0.0       0.693423          0.181818
18506                0.0                 1.0       1.481605         -0.318182
18507                2.0                 0.5       0.939799          1.545455

[18508 rows x 4 columns]
RMSE for OLS using LinearRegression
Tr

Unnamed: 0,model_name,rmse_train,rmse_validate,power,alpha,features,percent_diff,baseline_diff_percent_train,baseline_diff_percent_validate
0,lasso_lars,322893.695843,322918.666999,,1,"Index(['scaled_bedroomcnt', 'scaled_bathroomcn...",-0.01,58.860825,58.976976


In [24]:
# Preview the first five rows of the Los Angeles training split, including the
# scaled_* feature columns that sit alongside the raw ones.
los_angeles.train.head(n=5)

Unnamed: 0,index,parcelid,bedroomcnt,bathroomcnt,sqr_ft,taxvaluedollarcnt,yearbuilt,county,latitude,longitude,scaled_bedroomcnt,scaled_bathroomcnt,scaled_sqr_ft,scaled_yearbuilt
0,48429,11231946,2.0,1.0,900.0,152287.0,1990.0,los_angeles,34.550961,-118.031104,-1.0,-0.5,-0.717949,1.636364
1,23711,11084636,4.0,3.0,2193.0,239759.0,1976.0,los_angeles,34.254749,-118.570504,1.0,0.5,0.723523,1.0
2,25880,12265750,3.0,1.0,1122.0,117727.0,1951.0,los_angeles,33.906367,-118.248945,0.0,-0.5,-0.470457,-0.136364
3,46555,10801616,2.0,2.0,1146.0,191909.0,1949.0,los_angeles,34.17585,-118.519861,-1.0,0.0,-0.443701,-0.227273
4,9177,11204894,3.0,3.0,1789.0,225000.0,1986.0,los_angeles,34.56213,-118.088158,0.0,0.5,0.273133,1.454545
