#  Zillow Project: Data Preparation
---

## MVP Objectives
- Remove/Impute missing values.
- Create functions to reproduce prepared dataset
- Move functions to a separate file named `prepare.py`


In [1]:
# Import libraries to manipulate data structures and visualize numeric data.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from acquire import load_zillow_data

In [2]:
# Read in the zillow dataset and assign to a variable
df = load_zillow_data()

In [3]:
# Check the dimensions of the dataset (this extract has roughly 22 thousand rows)
df.shape

(21943, 62)

In [4]:
# Initial look at our zillow dataset
df.head()

Unnamed: 0,parcelid,id,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,id.1,logerror,transactiondate
0,11393337,2463969,,,,3.0,3.0,,4.0,3.0,...,43439.0,2016.0,22755.0,756.94,Y,14.0,60372350000000.0,117,0.086137,2017-06-08
1,11289917,2061546,1.0,,,2.0,3.0,,6.0,2.0,...,136104.0,2016.0,27214.0,2319.9,Y,15.0,60379010000000.0,1248,-0.362001,2017-06-23
2,11705026,1834372,,,,1.0,2.0,,6.0,1.0,...,35606.0,2016.0,23624.0,543.69,,,60372320000000.0,1772,-0.146056,2017-06-30
3,14269464,1923117,,,,3.0,4.0,,,3.0,...,880456.0,2016.0,445569.0,9819.72,,,60590640000000.0,2028,0.021085,2017-06-01
4,11389003,2121349,,,,2.0,3.0,,6.0,2.0,...,614000.0,2016.0,449000.0,7673.19,,,60377030000000.0,3273,-0.325393,2017-06-01


Our dataset has a bunch of missing values! This is only the first 5 rows! Let's see how many values are missing in each column.

In [5]:
# Build one boolean null mask and count missing / present values per column from it.
null_mask = df.isnull()
nulls = null_mask.sum()
non_nulls = (~null_mask).sum()

# Missing plus present values gives the total number of values in each column.
total_values = nulls.add(non_nulls)

# Fraction of missing values in each column, largest fraction first.
pct_missing = nulls.div(total_values).sort_values(ascending=False)

# Format every fraction as a percentage string with two decimals for display.
pct_missing_chart = pct_missing.apply("{0:.2%}".format)

# Show the per-column missing percentages to the reader.
print('Percentage of values missing per column')
print('-' * 39)
print(pct_missing_chart)

Percentage of values missing per column
---------------------------------------
buildingclasstypeid     99.98%
basementsqft            99.95%
finishedsquarefeet13    99.95%
storytypeid             99.95%
yardbuildingsqft26      99.90%
                         ...  
id                       0.00%
transactiondate          0.00%
logerror                 0.00%
id.1                     0.00%
parcelid                 0.00%
Length: 62, dtype: object


Before we move ahead let's drop columns with more than __33.58%__ of values missing. Imputing values in columns with > 33.58% of values missing is a waste of time. We need to make our dataset _Robust™_.
- If we can impute values in columns: `lotsizesquarefeet`, `finishedsquarefeet12`, great. If not, we can drop them.

In [6]:
# Collect the names of the columns whose fraction of missing values exceeds 0.3358 (33.58%).
columns_to_drop = pct_missing[pct_missing > .3358].index.to_list()

In [7]:
df_subset = df.drop(columns=columns_to_drop)

In [8]:
# My eyes are happy
df_subset.sample(10)

Unnamed: 0,parcelid,id,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,fullbathcnt,latitude,...,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock,id.1,logerror,transactiondate
8590,14386068,1280252,2.5,3.0,2.5,1923.0,1923.0,6059.0,2.0,33599415.0,...,1995.0,202116.0,759000.0,2016.0,556884.0,7565.02,60590630000000.0,40134,0.003133,2017-05-26
19776,14613733,341052,2.5,2.0,2.5,1305.0,1305.0,6059.0,2.0,33567801.0,...,1995.0,152831.0,405348.0,2016.0,252517.0,4071.24,60590630000000.0,51333,0.028,2017-06-28
8737,14689999,1519649,2.0,2.0,2.0,1451.0,1451.0,6059.0,2.0,33678886.0,...,1978.0,142739.0,362958.0,2016.0,220219.0,3855.22,60590530000000.0,40281,-0.044067,2017-05-26
20735,13921239,1875126,1.5,3.0,1.5,1232.0,1232.0,6059.0,1.0,33809538.0,...,1972.0,76956.0,242068.0,2016.0,165112.0,3115.22,60590880000000.0,52292,0.007172,2017-06-29
13408,14064175,1099911,2.5,3.0,2.5,1887.0,1887.0,6059.0,2.0,33699756.0,...,1985.0,184943.0,573597.0,2016.0,388654.0,9064.72,60590990000000.0,44958,0.013819,2017-06-09
8203,13860188,323486,3.0,5.0,3.0,2455.0,2455.0,6059.0,3.0,33677467.0,...,1973.0,143193.0,797361.0,2016.0,654168.0,9022.36,60590990000000.0,39746,0.006837,2017-05-25
16857,14220326,2114857,3.0,4.0,3.0,2928.0,2928.0,6059.0,3.0,33819760.0,...,1983.0,278036.0,597521.0,2016.0,319485.0,6448.68,60590220000000.0,48413,0.082309,2017-06-20
11036,11496601,1614740,3.0,3.0,3.0,1922.0,1922.0,6037.0,3.0,33868902.0,...,1989.0,173821.0,769667.0,2016.0,595846.0,9229.82,60376210000000.0,42582,-0.015284,2017-06-01
7191,14151405,2412497,2.0,3.0,2.0,1618.0,1618.0,6059.0,2.0,33940172.0,...,1961.0,48862.0,70667.0,2016.0,21805.0,1038.24,60590010000000.0,38732,-0.027443,2017-05-23
15658,10909919,180960,3.0,3.0,3.0,1471.0,1471.0,6037.0,3.0,34147018.0,...,1952.0,189975.0,272591.0,2016.0,82616.0,3312.63,60371440000000.0,47212,-0.048174,2017-06-16


Sweet, our data looks so much better. __We freed up $\approx$ 0.01 GB of memory__ (see the calculation below)! Let's press forward.

In [9]:
# Sum the per-column memory usage in bytes (index excluded) and convert to decimal gigabytes.
# NOTE: memory_usage() is called with its default `deep` setting, so object columns may be undercounted.
original_mem_usage = df.memory_usage(index=False).sum()/1_000_000_000
subset_mem_usage = df_subset.memory_usage(index=False).sum()/1_000_000_000

# Difference between the full dataset and the column subset, reported with two decimals.
free_memory = original_mem_usage - subset_mem_usage
print(f"{free_memory:.2f}GB of free memory!")

0.01GB of free memory!


### Subset of Zillow data
- Dropped columns missing more than 33.58% of values

In [10]:
# Will return. Retrieving and reproducing data dictionary for these columns.
df_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21943 entries, 0 to 21942
Data columns (total 29 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      21943 non-null  int64  
 1   id                            21943 non-null  int64  
 2   bathroomcnt                   21937 non-null  float64
 3   bedroomcnt                    21937 non-null  float64
 4   calculatedbathnbr             21761 non-null  float64
 5   calculatedfinishedsquarefeet  21882 non-null  float64
 6   finishedsquarefeet12          20950 non-null  float64
 7   fips                          21937 non-null  float64
 8   fullbathcnt                   21761 non-null  float64
 9   latitude                      21937 non-null  float64
 10  longitude                     21937 non-null  float64
 11  lotsizesquarefeet             19641 non-null  float64
 12  propertycountylandusecode     21937 non-null  object 
 13  p

In [11]:
# Create a function to clean our Zillow dataset
# Note: Create a wrangle function to acquire and prepare the dataset
def prepare_zillow(df, max_missing=.3358, land_use_ids=(261, 262, 273, 279)):
    '''
    Signature: prepare_zillow(df, max_missing=.3358, land_use_ids=(261, 262, 273, 279)) -> pandas.core.frame.DataFrame
    Docstring:
    Prepare the zillow dataset for EDA.

    Steps, in order:
    1. Drop every column whose fraction of missing values is greater than `max_missing`.
    2. Drop `calculatedbathnbr`, `finishedsquarefeet12` and `id` when they are still present.
    3. Keep only the rows whose `propertylandusetypeid` is one of `land_use_ids`.

    The input DataFrame is not modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        The Zillow dataset stored as `zillow.csv`. Must contain a
        `propertylandusetypeid` column.
    max_missing : float, default .3358
        Largest fraction (0-1) of missing values a column may have and still be kept.
        The default is the hard coded value from the original analysis.
    land_use_ids : iterable of int, default (261, 262, 273, 279)
        Values of `propertylandusetypeid` to keep.
        NOTE(review): presumably the single-family property types -- confirm
        against the Zillow data dictionary.

    Returns
    -------
    DataFrame of the prepared zillow dataset

    Examples
    --------
    To see the per-column missing percentages that `max_missing` is compared against:

    >>> df = pd.read_csv('zillow.csv')
    >>> pct_missing = df.isnull().mean().sort_values(ascending=False)
    >>> print('Percentage of values missing per column')
    >>> print('-' * 39)
    >>> print(pct_missing.apply("{0:.2%}".format))
    '''

    # 1. Drop columns with too many missing values.
    # The mean of the boolean null mask is the fraction of missing values per column.
    pct_missing = df.isnull().mean()
    columns_to_drop = pct_missing[pct_missing > max_missing].index.to_list()
    df = df.drop(columns=columns_to_drop)

    # 2. Drop duplicate columns and index column.
    # errors='ignore' because any of these may already have been removed by the
    # missing-value threshold above (or be absent from the input); without it the
    # function fails with a KeyError that depends on the data.
    duplicate_columns_to_drop = ['calculatedbathnbr', 'finishedsquarefeet12', 'id']
    df = df.drop(columns=duplicate_columns_to_drop, errors='ignore')

    # 3. Keep only the requested property land use types.
    df = df[df.propertylandusetypeid.isin(list(land_use_ids))]

    return df

In [12]:
test = prepare_zillow(df)

In [13]:
# Sweet, the function works!
test.sample(10)

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,fips,fullbathcnt,latitude,longitude,lotsizesquarefeet,propertycountylandusecode,...,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock,id.1,logerror,transactiondate
11027,10992903,2.0,3.0,1199.0,6037.0,2.0,34267731.0,-118316841.0,8379.0,100,...,1957.0,93000.0,465000.0,2016.0,372000.0,5701.47,60371030000000.0,42573,-0.052215,2017-06-01
2443,12483837,1.0,3.0,1041.0,6037.0,1.0,33848409.0,-118131033.0,6458.0,100,...,1950.0,109005.0,280901.0,2016.0,171896.0,3665.33,60375710000000.0,33977,0.011277,2017-05-08
8223,14097258,2.5,5.0,2288.0,6059.0,2.0,33797555.0,-118063681.0,6448.0,122,...,1965.0,188255.0,348855.0,2016.0,160600.0,4220.28,60591100000000.0,39766,0.053035,2017-05-25
10997,12273962,1.0,3.0,1016.0,6037.0,1.0,33907097.0,-118237169.0,5001.0,100,...,1950.0,75387.0,176066.0,2016.0,100679.0,3484.75,60375410000000.0,42543,-0.014532,2017-06-01
16500,12206500,2.0,3.0,1524.0,6037.0,2.0,33965679.0,-118305565.0,4716.0,100,...,1931.0,46684.0,235548.0,2016.0,188864.0,2993.5,60372380000000.0,48055,-0.046418,2017-06-19
18654,11693009,2.0,4.0,2409.0,6037.0,2.0,34075294.0,-118488434.0,7244.0,100,...,1960.0,233696.0,947788.0,2016.0,714092.0,11454.67,60372620000000.0,50211,0.018609,2017-06-23
21717,11728303,3.0,3.0,1593.0,6037.0,3.0,34028505.0,-118321833.0,6604.0,100,...,1910.0,224400.0,640000.0,2016.0,415600.0,7829.38,60372190000000.0,53277,-0.055138,2017-06-30
21932,14505389,2.5,3.0,2050.0,6059.0,2.0,33655807.0,-117648591.0,5060.0,122,...,1983.0,214065.0,329127.0,2016.0,115062.0,3409.96,60590320000000.0,53493,-0.021017,2017-06-30
21609,14362589,2.5,4.0,1919.0,6059.0,2.0,33689604.0,-117670468.0,4600.0,122,...,1993.0,214043.0,670065.0,2016.0,456022.0,7925.9,60590520000000.0,53169,0.058206,2017-06-30
3084,11205369,2.0,3.0,1639.0,6037.0,2.0,34560903.0,-118069603.0,43912.0,100,...,1976.0,99418.0,132626.0,2016.0,33208.0,2308.04,60379110000000.0,34618,0.712163,2017-05-10


In [14]:
df = prepare_zillow(df)

In [15]:
df.shape

(15036, 26)

In [16]:
df.fips.value_counts()

6037.0    9630
6059.0    4109
6111.0    1297
Name: fips, dtype: int64

In [17]:
# Data dictionary template
# Keep every column that is not of object dtype.
numeric_columns = df.select_dtypes(exclude='O')

# Print each remaining column's name followed by its smallest and largest value.
for column, values in numeric_columns.items():
    print(column, values.min(), values.max())

parcelid 10712101 167687839
bathroomcnt 0.0 11.0
bedroomcnt 0.0 12.0
calculatedfinishedsquarefeet 300.0 15450.0
fips 6037.0 6111.0
fullbathcnt 1.0 11.0
latitude 33340851.0 34779658.0
longitude -119388346.0 -117555373.0
lotsizesquarefeet 594.0 1323788.0
propertylandusetypeid 261.0 261.0
rawcensustractandblock 60371011.101011 61110091.001004
regionidcity 3491.0 396556.0
regionidcounty 1286.0 3101.0
regionidzip 95982.0 399675.0
roomcnt 0.0 14.0
yearbuilt 1878.0 2015.0
structuretaxvaluedollarcnt 129.0 7893568.0
taxvaluedollarcnt 10504.0 23858374.0
assessmentyear 2016.0 2016.0
landtaxvaluedollarcnt 2250.0 16350601.0
taxamount 51.26 276797.83
censustractandblock 60371011101011.0 61110091001004.0
id.1 1248 53501
logerror -4.45396399868 3.1756879667400004


# MVP Preparation

In [18]:
def prepare_zillow_mvp(df):
    '''
    Reduce the Zillow dataset to the four columns used by the MVP.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Zillow dataset containing at least the four columns selected below.

    Returns
    -------
    DataFrame holding only `bathroomcnt`, `bedroomcnt`,
    `calculatedfinishedsquarefeet` and `taxvaluedollarcnt`, in that order.
    Rows are not filtered and missing values are left in place.
    '''
    mvp_columns = [
        'bathroomcnt',
        'bedroomcnt',
        'calculatedfinishedsquarefeet',
        'taxvaluedollarcnt',
    ]
    return df[mvp_columns]

In [19]:
df = pd.read_csv('zillow.csv')
df = prepare_zillow_mvp(df)

In [20]:
df.shape

(21943, 4)

In [21]:
df.head()

Unnamed: 0,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt
0,3.0,3.0,1534.0,43439.0
1,2.0,3.0,1458.0,136104.0
2,1.0,2.0,1421.0,35606.0
3,3.0,4.0,2541.0,880456.0
4,2.0,3.0,1650.0,614000.0


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21943 entries, 0 to 21942
Data columns (total 4 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   bathroomcnt                   21937 non-null  float64
 1   bedroomcnt                    21937 non-null  float64
 2   calculatedfinishedsquarefeet  21882 non-null  float64
 3   taxvaluedollarcnt             21937 non-null  float64
dtypes: float64(4)
memory usage: 685.8 KB


In [23]:
df.isna().sum()

bathroomcnt                      6
bedroomcnt                       6
calculatedfinishedsquarefeet    61
taxvaluedollarcnt                6
dtype: int64

In [24]:
df = df.dropna()

In [25]:
df.head()

Unnamed: 0,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt
0,3.0,3.0,1534.0,43439.0
1,2.0,3.0,1458.0,136104.0
2,1.0,2.0,1421.0,35606.0
3,3.0,4.0,2541.0,880456.0
4,2.0,3.0,1650.0,614000.0


In [26]:
# Keep only the homes with a positive bathroom count AND a positive bedroom count.
# (The bathroom condition was previously written twice, so bedrooms were never checked.)
# .copy() returns an independent frame so that later column assignments do not
# operate on a slice of the previous DataFrame.
df = df[(df['bathroomcnt'] > 0) & (df['bedroomcnt'] > 0)].copy()

In [27]:
df.head()

Unnamed: 0,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt
0,3.0,3.0,1534.0,43439.0
1,2.0,3.0,1458.0,136104.0
2,1.0,2.0,1421.0,35606.0
3,3.0,4.0,2541.0,880456.0
4,2.0,3.0,1650.0,614000.0


In [28]:
df['more_than_two_bath'] = (df.bathroomcnt > 2).astype('int')

In [29]:
df.head()

Unnamed: 0,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,more_than_two_bath
0,3.0,3.0,1534.0,43439.0,1
1,2.0,3.0,1458.0,136104.0,0
2,1.0,2.0,1421.0,35606.0,0
3,3.0,4.0,2541.0,880456.0,1
4,2.0,3.0,1650.0,614000.0,0


# Data Preparation: Iteration #2

In [30]:
from prepare import prepare_zillow

In [31]:
df = prepare_zillow()

In [32]:
df.head()

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,fips,fullbathcnt,lotsizesquarefeet,propertycountylandusecode,propertylandusetypeid,roomcnt,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,landtaxvaluedollarcnt,taxamount
1,11289917,2.0,3.0,1458.0,6037.0,2.0,8284.0,101,261.0,0.0,1970.0,108890.0,136104.0,27214.0,2319.9
2,11705026,1.0,2.0,1421.0,6037.0,1.0,6707.0,100,261.0,0.0,1911.0,11982.0,35606.0,23624.0,543.69
3,14269464,3.0,4.0,2541.0,6059.0,3.0,4975.0,1,261.0,0.0,2003.0,434887.0,880456.0,445569.0,9819.72
4,11389003,2.0,3.0,1650.0,6037.0,2.0,7300.0,100,261.0,0.0,1949.0,165000.0,614000.0,449000.0,7673.19
5,11967869,1.0,2.0,693.0,6037.0,1.0,2908.0,100,261.0,0.0,1921.0,82416.0,274237.0,191821.0,3267.47


In [33]:
df.shape

(14562, 15)

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14562 entries, 1 to 21940
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      14562 non-null  int64  
 1   bathroomcnt                   14562 non-null  float64
 2   bedroomcnt                    14562 non-null  float64
 3   calculatedfinishedsquarefeet  14562 non-null  float64
 4   fips                          14562 non-null  float64
 5   fullbathcnt                   14562 non-null  float64
 6   lotsizesquarefeet             14562 non-null  float64
 7   propertycountylandusecode     14562 non-null  object 
 8   propertylandusetypeid         14562 non-null  float64
 9   roomcnt                       14562 non-null  float64
 10  yearbuilt                     14562 non-null  float64
 11  structuretaxvaluedollarcnt    14562 non-null  float64
 12  taxvaluedollarcnt             14562 non-null  float64
 13  l

In [42]:
df.propertylandusetypeid.value_counts()

261.0    14562
Name: propertylandusetypeid, dtype: int64

In [43]:
# Cross-tabulate county (fips) against the 7 most common county land use codes.
# value_counts() is sorted by frequency, so the first 7 index labels are the top codes.
# Both crosstab arguments must be row-aligned columns, so the rows are filtered to
# those codes first instead of passing the value_counts() result itself.
top_land_use_codes = df.propertycountylandusecode.value_counts().index[:7]
top_code_rows = df[df.propertycountylandusecode.isin(top_land_use_codes)]
pd.crosstab(top_code_rows.fips, top_code_rows.propertycountylandusecode)

In [36]:
# Put the reported total value next to the sum of its structure and land
# components so the two can be compared row by row.
df_taxable_amount = pd.DataFrame({
    'taxvaluedollarcnt': df.taxvaluedollarcnt,
    'calculated_tax': df.structuretaxvaluedollarcnt + df.landtaxvaluedollarcnt,
})

df_taxable_amount

Unnamed: 0,taxvaluedollarcnt,calculated_tax
1,136104.0,136104.0
2,35606.0,35606.0
3,880456.0,880456.0
4,614000.0,614000.0
5,274237.0,274237.0
...,...,...
21933,458903.0,458903.0
21935,115387.0,115387.0
21938,297097.0,297097.0
21939,746963.0,746963.0


In [37]:
# sns.pairplot(df);

Fips codes: 6059=Orange County, 6037=Los Angeles County, 6111=Ventura County

In [38]:
# Create separate dataframes for each county


# Note: Create a categorical column with the corresponding county names for EDA.
# Boolean masks selecting the rows of each county by its FIPS code.
in_orange_county = df['fips'].eq(6059)
in_los_angeles = df['fips'].eq(6037)
in_ventura_county = df['fips'].eq(6111)

# Orange county properties
oc_properties = df.loc[in_orange_county]

# Los Angeles properties
la_properties = df.loc[in_los_angeles]

# Ventura County properties
vc_properties = df.loc[in_ventura_county]

In [39]:
# Experimental: Need to refactor function
def plot_variable_pairs(df):
    '''
    Accepts a DataFrame
    Draws one regression plot (red fit line) for every ordered pair of distinct
    non-object columns, so each pair is shown twice with the axes swapped.
    Each plot is shown in its own figure; nothing is returned.
    '''
    plot_columns = df.select_dtypes(exclude='O').columns.values

    # Every (x, y) combination of two different columns, in column order.
    ordered_pairs = [
        (x_name, y_name)
        for x_name in plot_columns
        for y_name in plot_columns
        if x_name != y_name
    ]

    for x_name, y_name in ordered_pairs:
        ax = sns.regplot(x=x_name, y=y_name, data=df, line_kws={"color": "red"})
        ax.set_title(x_name + " and " + y_name)
        plt.show()

In [40]:
# Experimental: Need to refactor function
def plot_categorical_and_continuous_cars(categorical_var, continuous_var, df):
    '''
    Accepts the name of a categorical column, the name of a continuous column
    and the DataFrame that holds both.
    Shows three figures of the continuous column grouped by the categorical
    column: a box plot, a swarm plot and a violin plot, in that order.
    Returns nothing.
    '''
    # Same data and axes for every plot; only the plot type changes.
    for draw_plot in (sns.boxplot, sns.swarmplot, sns.violinplot):
        draw_plot(data=df, x=categorical_var, y=continuous_var)
        plt.show()

In [41]:
# Function took too long to run. We will have to plot each relationship separately
# plot_variable_pairs(df);

<div class='alert alert-block alert-success'>
Create "Decade built" as categorical variable derived from `yearbuilt`. Use a chi2 test of decade by county?
 Newer homes have higher tax values?
</div>