#  Zillow Project: Data Preparation
---

## MVP Objectives
- Remove/Impute missing values.
- Create functions to reproduce prepared dataset
- Move functions to a separate file named `prepare.py`


In [1]:
# Import libraries to manipulate data structures and visualize numeric data.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read in the zillow dataset and assign to a variable
df = pd.read_csv('zillow.csv')

In [3]:
# My first dataset with +2 million rows :)
df.shape

(2985217, 59)

In [4]:
# Initial look at our zillow dataset
df.head()

Unnamed: 0,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,0,10754147,,,,0.0,0.0,,,,...,,,,9.0,2016.0,9.0,,,,
1,1,10759547,,,,0.0,0.0,,,,...,,,,27516.0,2015.0,27516.0,,,,
2,2,10843547,,,,0.0,0.0,5.0,,,...,1.0,,660680.0,1434941.0,2016.0,774261.0,20800.37,,,
3,3,10859147,,,,0.0,0.0,3.0,6.0,,...,1.0,,580059.0,1174475.0,2016.0,594416.0,14557.57,,,
4,4,10879947,,,,0.0,0.0,4.0,,,...,1.0,,196751.0,440101.0,2016.0,243350.0,5725.17,,,


Our dataset has a bunch of missing values! This is only the first 5 rows! Let's see how many values are missing in each column.

In [5]:
# Per-column counts of missing values (`isnull()`) and present values (`notnull()`).
nulls, non_nulls = df.isnull().sum(), df.notnull().sum()

# Missing plus present gives the total number of values in each column.
total_values = non_nulls + nulls

# Fraction of each column that is missing, ordered from most to least missing.
pct_missing = nulls / total_values
pct_missing = pct_missing.sort_values(ascending=False)

# Render each fraction as a percentage with two decimals so the table is easy to read.
as_percentage = "{0:.2%}".format
pct_missing_chart = pct_missing.apply(as_percentage)

# Show the table of missing-value percentages, one row per column.
print('Percentage of values missing per column')
print('-' * 39)
print(f"{pct_missing_chart}")

Percentage of values missing per column
---------------------------------------
storytypeid                     99.95%
basementsqft                    99.95%
yardbuildingsqft26              99.91%
fireplaceflag                   99.83%
architecturalstyletypeid        99.80%
typeconstructiontypeid          99.77%
finishedsquarefeet13            99.74%
buildingclasstypeid             99.57%
pooltypeid10                    99.43%
decktypeid                      99.42%
finishedsquarefeet6             99.28%
poolsizesum                     99.06%
pooltypeid2                     98.89%
hashottuborspa                  98.32%
taxdelinquencyflag              98.11%
taxdelinquencyyear              98.11%
yardbuildingsqft17              97.27%
finishedsquarefeet15            93.63%
finishedsquarefeet50            93.17%
finishedfloor1squarefeet        93.17%
fireplacecnt                    89.51%
threequarterbathnbr             89.40%
pooltypeid7                     83.05%
poolcnt                

Before we move ahead let's drop columns with more than __33.58%__ of values missing. Imputing values in columns with > 33.58% of values missing is a waste of time. We need to make our dataset _Robust™_.
- If we can impute values in columns: `lotsizesquarefeet`, `finishedsquarefeet12`, great. If not, we can drop them.

In [6]:
# Columns missing more than this fraction of their values are dropped.
# .3358 (33.58%) is the hard-coded cutoff used throughout this notebook.
MISSING_THRESHOLD = .3358
exceeds_threshold = pct_missing > MISSING_THRESHOLD
columns_to_drop = pct_missing[exceeds_threshold].index.to_list()

In [7]:
df_subset = df.drop(columns=columns_to_drop)

In [8]:
# My eyes are happy
df_subset.sample(10)

Unnamed: 0,id,parcelid,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,fullbathcnt,latitude,...,regionidcounty,regionidzip,roomcnt,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock
1353737,1353737,14122481,1.5,3.0,1.5,1315.0,1315.0,6059.0,1.0,33838409.0,...,1286.0,96183.0,6.0,1971.0,75740.0,407771.0,2016.0,332031.0,4825.24,60591100000000.0
1170432,1170432,12380145,2.0,3.0,2.0,1301.0,1301.0,6037.0,2.0,33991290.0,...,3101.0,96192.0,0.0,1955.0,36005.0,53376.0,2016.0,17371.0,1210.86,60375320000000.0
2075459,2075459,10980538,2.0,4.0,2.0,1658.0,1658.0,6037.0,2.0,34281562.0,...,3101.0,96361.0,0.0,1961.0,46364.0,56485.0,2016.0,10121.0,903.69,60371040000000.0
2404205,2404205,11270932,2.0,3.0,2.0,1312.0,1312.0,6037.0,2.0,34668203.0,...,3101.0,97319.0,0.0,1987.0,103307.0,151804.0,2016.0,48497.0,2577.81,60379010000000.0
2350822,2350822,12937955,1.0,4.0,1.0,1663.0,1663.0,6037.0,1.0,34076537.0,...,3101.0,96523.0,0.0,1952.0,88671.0,147775.0,2016.0,59104.0,1910.9,60374060000000.0
909136,909136,14510492,0.0,0.0,,,,6059.0,,33867600.0,...,1286.0,97041.0,0.0,,5740.0,5740.0,2016.0,,,60590110000000.0
862622,862622,14383629,2.0,3.0,2.0,1240.0,1240.0,6059.0,2.0,33605440.0,...,1286.0,96974.0,6.0,1970.0,61417.0,388117.0,2016.0,326700.0,3945.66,60590630000000.0
629226,629226,11089637,4.0,4.0,4.0,3066.0,3066.0,6037.0,4.0,34244313.0,...,3101.0,96354.0,0.0,1962.0,231561.0,446942.0,2016.0,215381.0,5526.78,60371130000000.0
2692467,2692467,12684687,2.0,3.0,2.0,1878.0,1878.0,6037.0,2.0,33800949.0,...,3101.0,96121.0,0.0,1953.0,138335.0,666874.0,2016.0,528539.0,8458.44,60376700000000.0
952551,952551,14607523,2.0,2.0,2.0,808.0,808.0,6059.0,2.0,33775200.0,...,1286.0,97050.0,0.0,1988.0,98651.0,316758.0,2016.0,218107.0,3933.84,60590890000000.0


Sweet, our data looks so much better. __We freed up $\approx$ .7+ GB of memory__! Let's press forward.

In [9]:
# Bytes per (decimal) gigabyte, used to express the column memory totals in GB.
BYTES_PER_GB = 1_000_000_000

# Sum the per-column memory usage of each frame, leaving the index out.
original_mem_usage = df.memory_usage(index=False).sum() / BYTES_PER_GB
subset_mem_usage = df_subset.memory_usage(index=False).sum() / BYTES_PER_GB

# Difference between the full frame and the subset with the sparse columns dropped.
free_memory = original_mem_usage - subset_mem_usage
print(f"{free_memory:.2f}GB of free memory!")

0.79GB of free memory!


### Subset of Zillow data
- Dropped columns missing more than 33.58% of values

In [10]:
# Will return. Retrieving and reproducing data dictionary for these columns.
df_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2985217 entries, 0 to 2985216
Data columns (total 26 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   id                            int64  
 1   parcelid                      int64  
 2   bathroomcnt                   float64
 3   bedroomcnt                    float64
 4   calculatedbathnbr             float64
 5   calculatedfinishedsquarefeet  float64
 6   finishedsquarefeet12          float64
 7   fips                          float64
 8   fullbathcnt                   float64
 9   latitude                      float64
 10  longitude                     float64
 11  lotsizesquarefeet             float64
 12  propertycountylandusecode     object 
 13  propertylandusetypeid         float64
 14  rawcensustractandblock        float64
 15  regionidcity                  float64
 16  regionidcounty                float64
 17  regionidzip                   float64
 18  roomcnt               

In [11]:
# Create a function to clean our Zillow dataset
# Note: Create a wrangle function to acquire and prepare the dataset
def prepare_zillow(df, missing_threshold=.3358, land_use_type_ids=(261, 262, 273, 279)):
    '''
    Prepare the raw Zillow dataset for exploratory data analysis.

    The steps are applied in this order:

    1. Drop every column whose fraction of missing values is greater than
       `missing_threshold`.
    2. Drop the duplicate columns `calculatedbathnbr` and
       `finishedsquarefeet12` and the index column `id`, when they are
       still present.
    3. Keep only the rows whose `propertylandusetypeid` is one of
       `land_use_type_ids`.

    The DataFrame passed in is not modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        The Zillow dataset, e.g. the contents of `zillow.csv`. It must have
        a `propertylandusetypeid` column that survives step 1.
    missing_threshold : float, default .3358
        Columns with a larger fraction of missing values are dropped.
        .3358 is the hard-coded value from the original analysis; use the
        code in the Examples section to reproduce the table it was read from.
    land_use_type_ids : iterable, default (261, 262, 273, 279)
        Values of `propertylandusetypeid` to keep.

    Returns
    -------
    pandas.core.frame.DataFrame
        The filtered copy of the Zillow dataset.

    Raises
    ------
    KeyError
        If `propertylandusetypeid` is not a column after step 1.

    Examples
    --------
    Reproduce the missing-value table used to choose the threshold:

    >>> df = pd.read_csv('zillow.csv')
    >>> pct_missing = df.isnull().mean().sort_values(ascending=False)
    >>> pct_missing_chart = pct_missing.apply("{0:.2%}".format)
    >>>
    >>> print('Percentage of values missing per column')
    >>> print('-' * 39)
    >>> print(f"{pct_missing_chart}")

    Prepare the dataset:

    >>> prepared = prepare_zillow(df)
    '''

    # 1. Drop columns with too many missing values.
    # The mean of the boolean null mask is the fraction of missing values per column.
    pct_missing = df.isnull().mean()
    columns_to_drop = pct_missing[pct_missing > missing_threshold].index.to_list()
    df = df.drop(columns=columns_to_drop)

    # 2. Drop duplicate columns and the index column.
    # Only names that are still present are dropped: a column may be absent from
    # the input or may already have been removed by the missing-value filter,
    # and dropping it again would raise a KeyError.
    duplicate_columns_to_drop = ['calculatedbathnbr', 'finishedsquarefeet12', 'id']
    still_present = [name for name in duplicate_columns_to_drop if name in df.columns]
    df = df.drop(columns=still_present)

    # 3. Keep only the requested property land-use types.
    df = df[df['propertylandusetypeid'].isin(list(land_use_type_ids))]

    return df

In [12]:
test = prepare_zillow(df)

In [13]:
# Sweet, the function works!
test.sample(10)

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,fips,fullbathcnt,latitude,longitude,lotsizesquarefeet,propertycountylandusecode,...,regionidcounty,regionidzip,roomcnt,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock
1405817,11467385,2.0,4.0,1649.0,6037.0,2.0,33953903.0,-118421501.0,6642.0,100,...,3101.0,96026.0,0.0,1952.0,149241.0,746208.0,2016.0,596967.0,9042.72,60372770000000.0
2135961,11189555,3.0,5.0,2021.0,6037.0,3.0,34601478.0,-118157123.0,6938.0,101,...,3101.0,97329.0,0.0,1988.0,217561.0,271949.0,2016.0,54388.0,4317.66,60379100000000.0
1129793,14083882,2.0,3.0,1128.0,6059.0,2.0,33752693.0,-118029140.0,6100.0,122,...,1286.0,96965.0,5.0,1963.0,74950.0,185693.0,2016.0,110743.0,2393.5,60591000000000.0
86343,14217374,2.5,3.0,2256.0,6059.0,2.0,33836188.0,-117759580.0,2828.0,122,...,1286.0,97026.0,8.0,1973.0,150562.0,258388.0,2016.0,107826.0,2942.24,60590220000000.0
1114001,13839351,2.0,4.0,1442.0,6059.0,2.0,33747296.0,-117885325.0,6500.0,122,...,1286.0,97003.0,7.0,1969.0,76578.0,368717.0,2016.0,292139.0,4382.88,60590750000000.0
2455234,12977032,2.0,2.0,1224.0,6037.0,2.0,34092106.0,-117999580.0,27590.0,100,...,3101.0,96479.0,0.0,1952.0,109040.0,545206.0,2016.0,436166.0,8145.72,60374330000000.0
765567,11805911,1.0,3.0,1382.0,6037.0,1.0,34024962.0,-118207369.0,5935.0,100,...,3101.0,96004.0,0.0,1908.0,35496.0,177486.0,2016.0,141990.0,2235.26,60372050000000.0
2527432,12353684,3.0,3.0,1716.0,6037.0,3.0,33984695.0,-118214179.0,2979.0,100,...,3101.0,96110.0,0.0,1987.0,116582.0,257292.0,2016.0,140710.0,4019.97,60375330000000.0
2123538,11979338,1.0,2.0,816.0,6037.0,1.0,34116489.0,-118179526.0,2200.0,100,...,3101.0,96023.0,0.0,1940.0,32763.0,60449.0,2016.0,27686.0,802.29,60371830000000.0
1711747,14057121,2.0,5.0,1982.0,6059.0,2.0,33676054.0,-117952569.0,7000.0,122,...,1286.0,96964.0,8.0,1964.0,116268.0,587259.0,2016.0,470991.0,6659.28,60590990000000.0


In [14]:
df = prepare_zillow(df)

In [15]:
df.shape

(2152864, 23)

In [16]:
df.fips.value_counts()

6037.0    1431812
6059.0     555077
6111.0     165975
Name: fips, dtype: int64

In [17]:
# Data dictionary template: print the observed min and max of every non-object column.
numeric_columns = df.select_dtypes(exclude='O')

for column in numeric_columns.columns:
    values = numeric_columns[column]
    lowest, highest = values.min(), values.max()
    print(column, lowest, highest)

parcelid 10711725 169601949
bathroomcnt 0.0 32.0
bedroomcnt 0.0 25.0
calculatedfinishedsquarefeet 1.0 952576.0
fips 6037.0 6111.0
fullbathcnt 1.0 32.0
latitude 33339912.0 34819650.0
longitude -119475780.0 -117554316.0
lotsizesquarefeet 100.0 371000512.0
propertylandusetypeid 261.0 279.0
rawcensustractandblock 60371011.101 61110091.003010996
regionidcity 3491.0 396556.0
regionidcounty 1286.0 3101.0
regionidzip 95982.0 399675.0
roomcnt 0.0 96.0
yearbuilt 1801.0 2016.0
structuretaxvaluedollarcnt 1.0 66404932.0
taxvaluedollarcnt 1.0 98428909.0
assessmentyear 2000.0 2016.0
landtaxvaluedollarcnt 1.0 88921951.0
taxamount 1.85 1337755.86
censustractandblock -1.0 483030105084015.0


The MVP dataset keeps only the square feet of the home, the number of bedrooms, the number of bathrooms, and the tax-assessed value (`taxvaluedollarcnt`).

# MVP Preparation

In [18]:
def prepare_zillow_mvp(df):
    '''
    Reduce the Zillow dataset to the four columns used for the MVP.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Zillow dataset containing at least the columns `bathroomcnt`,
        `bedroomcnt`, `calculatedfinishedsquarefeet` and `taxvaluedollarcnt`.

    Returns
    -------
    pandas.core.frame.DataFrame
        A DataFrame with only those four columns, in that order. No rows
        are removed and no missing values are handled here.
    '''
    mvp_columns = [
        'bathroomcnt',
        'bedroomcnt',
        'calculatedfinishedsquarefeet',
        'taxvaluedollarcnt',
    ]
    return df[mvp_columns]

In [19]:
df = pd.read_csv('zillow.csv')
df = prepare_zillow_mvp(df)

In [20]:
df.shape

(2985217, 4)

In [21]:
df.head()

Unnamed: 0,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt
0,0.0,0.0,,9.0
1,0.0,0.0,,27516.0
2,0.0,0.0,73026.0,1434941.0
3,0.0,0.0,5068.0,1174475.0
4,0.0,0.0,1776.0,440101.0


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2985217 entries, 0 to 2985216
Data columns (total 4 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   bathroomcnt                   float64
 1   bedroomcnt                    float64
 2   calculatedfinishedsquarefeet  float64
 3   taxvaluedollarcnt             float64
dtypes: float64(4)
memory usage: 91.1 MB


In [23]:
df.isna().sum()

bathroomcnt                      2957
bedroomcnt                       2945
calculatedfinishedsquarefeet    45097
taxvaluedollarcnt               34266
dtype: int64

In [24]:
df = df.dropna()

In [25]:
df.head()

Unnamed: 0,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt
2,0.0,0.0,73026.0,1434941.0
3,0.0,0.0,5068.0,1174475.0
4,0.0,0.0,1776.0,440101.0
5,0.0,0.0,2400.0,287634.0
7,0.0,0.0,3611.0,698984.0


In [26]:
# Keep only properties with at least one bathroom AND at least one bedroom.
# The original condition tested `bathroomcnt > 0` twice, so rows with zero
# bedrooms were never filtered out.
df = df[(df['bathroomcnt'] > 0) & (df['bedroomcnt'] > 0)]

In [27]:
df.head()

Unnamed: 0,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt
10,4.0,4.0,3095.0,192544.0
20,2.0,4.0,3633.0,296425.0
28,4.0,5.0,2865.0,831224.0
30,2.0,2.0,1090.0,357568.0
31,4.0,3.0,1620.0,847770.0


In [28]:
# Flag properties with more than two bathrooms (1 = yes, 0 = no).
# `df` comes from boolean filtering, so adding a column in place can raise
# SettingWithCopyWarning; `assign` returns a new frame with the column added.
df = df.assign(more_than_two_bath=(df['bathroomcnt'] > 2).astype('int'))

In [29]:
df.head()

Unnamed: 0,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,more_than_two_bath
10,4.0,4.0,3095.0,192544.0,1
20,2.0,4.0,3633.0,296425.0,0
28,4.0,5.0,2865.0,831224.0,1
30,2.0,2.0,1090.0,357568.0,0
31,4.0,3.0,1620.0,847770.0,1


# Data Preparation: Iteration #2

In [30]:
from prepare import prepare_zillow

In [31]:
df = prepare_zillow()

In [32]:
df.head()

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,fips,fullbathcnt,latitude,longitude,lotsizesquarefeet,propertycountylandusecode,...,regionidcounty,regionidzip,roomcnt,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock
20,11324547,2.0,4.0,3633.0,6037.0,2.0,34560018.0,-118169806.0,9826.0,100,...,3101.0,97329.0,0.0,2005.0,222321.0,296425.0,2016.0,74104.0,6941.39,
31,11544747,4.0,3.0,1620.0,6037.0,4.0,33996200.0,-118438000.0,,100,...,3101.0,96047.0,0.0,2011.0,339532.0,847770.0,2016.0,508238.0,10244.94,
33,11585547,2.0,3.0,2077.0,6037.0,2.0,34012977.0,-118479243.0,6490.0,100,...,3101.0,96152.0,0.0,1926.0,210192.0,646760.0,2016.0,436568.0,7924.68,
110,12716947,1.0,3.0,1244.0,6037.0,1.0,33953559.0,-118083855.0,6021.0,100,...,3101.0,96201.0,0.0,1950.0,108040.0,169471.0,2016.0,61431.0,2532.88,
111,12757147,2.0,3.0,1300.0,6037.0,2.0,33897134.0,-118102953.0,4917.0,100,...,3101.0,96193.0,0.0,1950.0,77415.0,233266.0,2016.0,155851.0,3110.99,


In [33]:
df.shape

(2139825, 23)

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2139825 entries, 20 to 2982282
Data columns (total 23 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   parcelid                      int64  
 1   bathroomcnt                   float64
 2   bedroomcnt                    float64
 3   calculatedfinishedsquarefeet  float64
 4   fips                          float64
 5   fullbathcnt                   float64
 6   latitude                      float64
 7   longitude                     float64
 8   lotsizesquarefeet             float64
 9   propertycountylandusecode     object 
 10  propertylandusetypeid         float64
 11  rawcensustractandblock        float64
 12  regionidcity                  float64
 13  regionidcounty                float64
 14  regionidzip                   float64
 15  roomcnt                       float64
 16  yearbuilt                     float64
 17  structuretaxvaluedollarcnt    float64
 18  taxvaluedollarcnt    

In [None]:
# Function took more than 10 minutes to run.
# sns.pairplot(df);

In [None]:
df.fips.value_counts()

Fips codes: 6059=Orange County, 6037=Los Angeles County, 6111=Ventura County

In [None]:
# Create separate dataframes for each county, selected by FIPS code.
# Note: Create a categorical column with the corresponding county names for EDA.

# Orange County properties (FIPS 6059)
oc_properties = df[df['fips'] == 6059]

# Los Angeles County properties (FIPS 6037)
la_properties = df[df['fips'] == 6037]

# Ventura County properties (FIPS 6111)
vc_properties = df[df['fips'] == 6111]

In [36]:
# Experimental: Need to refactor function
def plot_variable_pairs(df):
    '''
    Draw a regression plot for every ordered pair of distinct non-object columns.

    Each pair is plotted twice, once with each column on the x axis. The
    regression line is drawn in red and the plot is titled
    "<x column> and <y column>". Every plot is shown as soon as it is drawn.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Data to plot; columns of object dtype are skipped.

    Returns
    -------
    None
    '''
    plottable = df.select_dtypes(exclude='O').columns.values

    for x_name in plottable:
        for y_name in plottable:
            # A column is never plotted against itself.
            if x_name == y_name:
                continue

            axes = sns.regplot(x=x_name,
                               y=y_name,
                               data=df,
                               line_kws={"color": "red"})
            axes.set_title(x_name + " and " + y_name)
            plt.show()

In [37]:
# Experimental: Need to refactor function
def plot_categorical_and_continuous_cars(categorical_var, continuous_var, df):
    '''
    Show three views of a continuous variable split by a categorical variable.

    A box plot, a swarm plot and a violin plot are drawn in that order, each
    with `categorical_var` on the x axis and `continuous_var` on the y axis,
    and each is shown before the next one is drawn.

    Parameters
    ----------
    categorical_var : column of `df` used for the x axis
    continuous_var : column of `df` used for the y axis
    df : pandas.core.frame.DataFrame
        Data passed to each seaborn plotting function.

    Returns
    -------
    None
    '''
    plotters = (sns.boxplot, sns.swarmplot, sns.violinplot)

    for draw in plotters:
        draw(data=df, x=categorical_var, y=continuous_var)
        plt.show()

In [39]:
# Function took too long to run. We will have to plot each relationship separately
# plot_variable_pairs(df);