In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn.metrics import mean_squared_error,r2_score,explained_variance_score

from math import sqrt
from scipy import stats

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import acquire
import explore
import split_scale


# Acquire the Data

In [2]:
# df = acquire.get_zillow_data_from_sql()
df = pd.read_csv('zillow.csv', index_col=0) # index_col=0 uses the CSV's first column as the row index (the unique row identifiers)

df.head()

Unnamed: 0,bathrooms,bedrooms,square_feet,fips_number,propertylandusetypeid,propertylandusedesc,home_value,tax_amount,county_name,distribution_of_tax_rates
0,2.0,3.0,1458,6037.0,261,Single Family Residential,136104.0,2319.9,Los Angeles,0.017045
1,1.0,2.0,1421,6037.0,261,Single Family Residential,35606.0,543.69,Los Angeles,0.01527
2,3.0,4.0,2541,6059.0,261,Single Family Residential,880456.0,9819.72,Orange,0.011153
3,2.0,3.0,1650,6037.0,261,Single Family Residential,614000.0,7673.19,Los Angeles,0.012497
4,1.0,2.0,693,6037.0,261,Single Family Residential,274237.0,3267.47,Los Angeles,0.011915


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(15011, 10)

In [4]:
# List the column labels of the loaded frame.
df.columns

Index(['bathrooms', 'bedrooms', 'square_feet', 'fips_number',
       'propertylandusetypeid', 'propertylandusedesc', 'home_value',
       'tax_amount', 'county_name', 'distribution_of_tax_rates'],
      dtype='object')

In [5]:
df.info()
# every column reports 15011 non-null values, so this CSV extract contains no nulls

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15011 entries, 0 to 15035
Data columns (total 10 columns):
bathrooms                    15011 non-null float64
bedrooms                     15011 non-null float64
square_feet                  15011 non-null int64
fips_number                  15011 non-null float64
propertylandusetypeid        15011 non-null int64
propertylandusedesc          15011 non-null object
home_value                   15011 non-null float64
tax_amount                   15011 non-null float64
county_name                  15011 non-null object
distribution_of_tax_rates    15011 non-null float64
dtypes: float64(6), int64(2), object(2)
memory usage: 1.3+ MB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,bathrooms,bedrooms,square_feet,fips_number,propertylandusetypeid,home_value,tax_amount,distribution_of_tax_rates
count,15011.0,15011.0,15011.0,15011.0,15011.0,15011.0,15011.0,15011.0
mean,2.326161,3.312904,1942.109653,6049.387049,261.0,541332.7,6593.327045,0.013431
std,1.021834,0.942635,1001.672617,21.238488,0.0,730626.4,8449.893492,0.006881
min,0.0,0.0,300.0,6037.0,261.0,10504.0,51.26,0.000787
25%,2.0,3.0,1275.0,6037.0,261.0,198596.0,2706.52,0.011603
50%,2.0,3.0,1679.0,6037.0,261.0,384886.0,4771.47,0.012305
75%,3.0,4.0,2346.0,6059.0,261.0,644944.0,7678.315,0.013697
max,11.0,12.0,15450.0,6111.0,261.0,23858370.0,276797.83,0.452884


In [7]:
# Count missing values per column.
df.isna().sum()

bathrooms                    0
bedrooms                     0
square_feet                  0
fips_number                  0
propertylandusetypeid        0
propertylandusedesc          0
home_value                   0
tax_amount                   0
county_name                  0
distribution_of_tax_rates    0
dtype: int64

In [8]:
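# df.square_feet.value_counts(dropna=False)
# assert(df['square_feet'] >= 0 ).all()
# An AssertionError from the check above would mean at least one square_feet value is
# negative or missing (NaN >= 0 evaluates to False). In the summary shown earlier the
# minimum square_feet is 300 and there are no nulls, so the check passes on this CSV.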

# What to do with our errors and outliers

In [9]:
# Drop every row that contains at least one missing value.
# Rebinding the result (instead of inplace=True) keeps the step explicit and chainable.
# For the current zillow.csv this removes nothing: the frame has 15011 rows before and after.
df = df.dropna()

In [10]:
df.isna().sum()
# confirms no NaNs remain; the preceding cell dropped rows with missing values (it did not impute the column mean)

bathrooms                    0
bedrooms                     0
square_feet                  0
fips_number                  0
propertylandusetypeid        0
propertylandusedesc          0
home_value                   0
tax_amount                   0
county_name                  0
distribution_of_tax_rates    0
dtype: int64

# Check data again

In [11]:
# Re-check dtypes and non-null counts after the dropna step.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15011 entries, 0 to 15035
Data columns (total 10 columns):
bathrooms                    15011 non-null float64
bedrooms                     15011 non-null float64
square_feet                  15011 non-null int64
fips_number                  15011 non-null float64
propertylandusetypeid        15011 non-null int64
propertylandusedesc          15011 non-null object
home_value                   15011 non-null float64
tax_amount                   15011 non-null float64
county_name                  15011 non-null object
distribution_of_tax_rates    15011 non-null float64
dtypes: float64(6), int64(2), object(2)
memory usage: 1.3+ MB


In [12]:
# Cast several columns in a single astype call rather than converting them one at a time:
# square_feet as a 64-bit integer and propertylandusedesc as a categorical.
df = df.astype(dict(square_feet='int64', propertylandusedesc='category'))

In [13]:
# Confirm the cast took effect: propertylandusedesc should now be listed as category.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15011 entries, 0 to 15035
Data columns (total 10 columns):
bathrooms                    15011 non-null float64
bedrooms                     15011 non-null float64
square_feet                  15011 non-null int64
fips_number                  15011 non-null float64
propertylandusetypeid        15011 non-null int64
propertylandusedesc          15011 non-null category
home_value                   15011 non-null float64
tax_amount                   15011 non-null float64
county_name                  15011 non-null object
distribution_of_tax_rates    15011 non-null float64
dtypes: category(1), float64(6), int64(2), object(1)
memory usage: 1.2+ MB


In [14]:
# Preview the first rows after the dtype conversion.
df.head()

Unnamed: 0,bathrooms,bedrooms,square_feet,fips_number,propertylandusetypeid,propertylandusedesc,home_value,tax_amount,county_name,distribution_of_tax_rates
0,2.0,3.0,1458,6037.0,261,Single Family Residential,136104.0,2319.9,Los Angeles,0.017045
1,1.0,2.0,1421,6037.0,261,Single Family Residential,35606.0,543.69,Los Angeles,0.01527
2,3.0,4.0,2541,6059.0,261,Single Family Residential,880456.0,9819.72,Orange,0.011153
3,2.0,3.0,1650,6037.0,261,Single Family Residential,614000.0,7673.19,Los Angeles,0.012497
4,1.0,2.0,693,6037.0,261,Single Family Residential,274237.0,3267.47,Los Angeles,0.011915


In [15]:
# Row count per FIPS code; the codes listed here are the ones mapped to county names in the next cell.
df.fips_number.value_counts()

6037.0    9620
6059.0    4096
6111.0    1295
Name: fips_number, dtype: int64

In [16]:

# Rebuild county_name by looking up each row's FIPS code; a code missing from the mapping becomes NaN.
df['county_name'] = df['fips_number'].map(
    {6037: 'Los Angeles', 6059: 'Orange', 6111: 'Ventura'}
)


In [17]:
# Preview the frame with county_name rebuilt from fips_number.
df.head()

Unnamed: 0,bathrooms,bedrooms,square_feet,fips_number,propertylandusetypeid,propertylandusedesc,home_value,tax_amount,county_name,distribution_of_tax_rates
0,2.0,3.0,1458,6037.0,261,Single Family Residential,136104.0,2319.9,Los Angeles,0.017045
1,1.0,2.0,1421,6037.0,261,Single Family Residential,35606.0,543.69,Los Angeles,0.01527
2,3.0,4.0,2541,6059.0,261,Single Family Residential,880456.0,9819.72,Orange,0.011153
3,2.0,3.0,1650,6037.0,261,Single Family Residential,614000.0,7673.19,Los Angeles,0.012497
4,1.0,2.0,693,6037.0,261,Single Family Residential,274237.0,3267.47,Los Angeles,0.011915


In [18]:
# Per-property tax rate: tax_amount divided element-wise by home_value.
df['distribution_of_tax_rates'] = df['tax_amount'].div(df['home_value'])
df.head()

Unnamed: 0,bathrooms,bedrooms,square_feet,fips_number,propertylandusetypeid,propertylandusedesc,home_value,tax_amount,county_name,distribution_of_tax_rates
0,2.0,3.0,1458,6037.0,261,Single Family Residential,136104.0,2319.9,Los Angeles,0.017045
1,1.0,2.0,1421,6037.0,261,Single Family Residential,35606.0,543.69,Los Angeles,0.01527
2,3.0,4.0,2541,6059.0,261,Single Family Residential,880456.0,9819.72,Orange,0.011153
3,2.0,3.0,1650,6037.0,261,Single Family Residential,614000.0,7673.19,Los Angeles,0.012497
4,1.0,2.0,693,6037.0,261,Single Family Residential,274237.0,3267.47,Los Angeles,0.011915


# Splitting the data

In [23]:
# Keep only the columns carried into the split: bathrooms, bedrooms, square_feet and home_value.
# Selecting the keepers (rather than dropping the other six) yields the same four columns in
# the same order, and the cell can be re-run without a KeyError for already-removed columns.
df = df[['bathrooms', 'bedrooms', 'square_feet', 'home_value']]

In [24]:
# Split into train and test sets using the project helper in split_scale (implementation not shown here).
# NOTE(review): confirm split_my_data fixes a random seed so the split is reproducible.
train, test = split_scale.split_my_data(df)

In [25]:
# Sanity-check the split: first training row, then the test set's (rows, columns) shape.
# Both go through a single print call, so the shape is appended to the frame's last printed line.
print(train.head(1), test.shape)

       bathrooms  bedrooms  square_feet  home_value
11921        3.0       3.0         2750   1068000.0 (4504, 4)


# Modeling

In [None]:
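# evaluating using ... (TODO: finish this cell; mean_squared_error, r2_score and explained_variance_score are already imported)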