#  Zillow Project: Data Preparation
---

## MVP Objectives
- Remove/Impute missing values.
- Create functions to reproduce prepared dataset
- Move functions to a separate file named `prepare.py`


In [1]:
# Import libraries to manipulate data structures and visualize numeric data.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the raw Zillow CSV from the project's data directory into `df`,
# the DataFrame that the following cells inspect and subset.
df = pd.read_csv('./data/raw/zillow.csv')

In [4]:
# Check the dimensions of the dataset as (rows, columns).
# My first dataset with +2 million rows :)
df.shape

(2985217, 59)

In [5]:
# Initial look at our zillow dataset: preview the first five rows.
df.head()

Unnamed: 0,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,0,10754147,,,,0.0,0.0,,,,...,,,,9.0,2016.0,9.0,,,,
1,1,10759547,,,,0.0,0.0,,,,...,,,,27516.0,2015.0,27516.0,,,,
2,2,10843547,,,,0.0,0.0,5.0,,,...,1.0,,660680.0,1434941.0,2016.0,774261.0,20800.37,,,
3,3,10859147,,,,0.0,0.0,3.0,6.0,,...,1.0,,580059.0,1174475.0,2016.0,594416.0,14557.57,,,
4,4,10879947,,,,0.0,0.0,4.0,,,...,1.0,,196751.0,440101.0,2016.0,243350.0,5725.17,,,


Our dataset has a bunch of missing values! This is only the first 5 rows! Let's see how many values are missing in each column.

In [6]:
# Tally the missing cells (`isna()`) and the populated cells (`count()`) of every column.
nulls = df.isna().sum()
non_nulls = df.count()

# A cell is either missing or populated, so the two tallies sum to the column's total.
total_values = nulls + non_nulls

# Fraction of missing values per column, ordered from most to least missing.
pct_missing = nulls.div(total_values).sort_values(ascending=False)

# Render every fraction as a percentage string with two decimals for display.
pct_missing_chart = pct_missing.map("{0:.2%}".format)

# Show the table to the user under a title and a 39-character rule.
print('Percentage of values missing per column')
print('-' * 39)
print(f"{pct_missing_chart}")

Percentage of values missing per column
---------------------------------------
storytypeid                     99.95%
basementsqft                    99.95%
yardbuildingsqft26              99.91%
fireplaceflag                   99.83%
architecturalstyletypeid        99.80%
typeconstructiontypeid          99.77%
finishedsquarefeet13            99.74%
buildingclasstypeid             99.57%
pooltypeid10                    99.43%
decktypeid                      99.42%
finishedsquarefeet6             99.28%
poolsizesum                     99.06%
pooltypeid2                     98.89%
hashottuborspa                  98.32%
taxdelinquencyflag              98.11%
taxdelinquencyyear              98.11%
yardbuildingsqft17              97.27%
finishedsquarefeet15            93.63%
finishedsquarefeet50            93.17%
finishedfloor1squarefeet        93.17%
fireplacecnt                    89.51%
threequarterbathnbr             89.40%
pooltypeid7                     83.05%
poolcnt                

Before we move ahead, let's drop columns with more than __33.58%__ of values missing. Imputing values in columns with > 33.58% of values missing is a waste of time. We need to make our dataset _Robust™_.
- If we can impute values in columns: `lotsizesquarefeet`, `finishedsquarefeet12`, great. If not, we can drop them.

In [7]:
# Collect the names of the columns whose missing fraction is strictly above 0.3358 (33.58%).
columns_to_drop = pct_missing.loc[pct_missing.gt(.3358)].index.tolist()

In [8]:
# Build a new frame without the mostly-empty columns; `df` itself is left untouched.
df_subset = df.drop(columns_to_drop, axis='columns')

In [9]:
# My eyes are happy
# Preview 10 random rows; the fixed seed shows the same rows on every Restart & Run All.
df_subset.sample(10, random_state=42)

Unnamed: 0,id,parcelid,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,fullbathcnt,latitude,...,regionidcounty,regionidzip,roomcnt,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock
1509699,1509699,11275798,3.0,5.0,3.0,2875.0,2875.0,6037.0,3.0,34716339.0,...,3101.0,97317.0,0.0,2001.0,165739.0,207172.0,2016.0,41433.0,3427.82,60379010000000.0
1851630,1851630,10906892,2.0,2.0,2.0,2263.0,2263.0,6037.0,2.0,34155541.0,...,3101.0,96449.0,0.0,1938.0,417237.0,631736.0,2016.0,214499.0,7741.16,60371430000000.0
2284214,2284214,11021933,1.0,2.0,1.0,676.0,676.0,6037.0,1.0,34268726.0,...,3101.0,96361.0,0.0,1957.0,69500.0,343000.0,2016.0,273500.0,4286.03,60371040000000.0
2789140,2789140,17181036,3.0,6.0,3.0,1685.0,1685.0,6111.0,3.0,34166791.0,...,2061.0,97104.0,9.0,1962.0,209339.0,372931.0,2016.0,163592.0,4239.56,61110040000000.0
768698,768698,12605896,3.0,4.0,3.0,1782.0,1782.0,6037.0,3.0,33814365.0,...,3101.0,96229.0,0.0,1987.0,179658.0,302935.0,2016.0,123277.0,14983.48,60375440000000.0
2128584,2128584,13934074,2.0,3.0,2.0,1969.0,1969.0,6059.0,2.0,33776865.0,...,1286.0,97047.0,8.0,1952.0,60822.0,83242.0,2016.0,22420.0,1342.54,60590890000000.0
1879339,1879339,14153950,2.5,3.0,2.5,1625.0,1625.0,6059.0,2.0,33937486.0,...,1286.0,97035.0,0.0,2013.0,236621.0,541328.0,2016.0,304707.0,7071.68,
2449147,2449147,11405405,2.0,3.0,2.0,1104.0,1104.0,6037.0,2.0,33954779.0,...,3101.0,96133.0,0.0,1925.0,63218.0,148262.0,2016.0,85044.0,2175.64,60376010000000.0
756995,756995,14085243,2.5,3.0,2.5,1367.0,1367.0,6059.0,2.0,33753769.0,...,1286.0,96990.0,6.0,1973.0,101810.0,463423.0,2016.0,361613.0,5574.58,60591000000000.0
296172,296172,14497681,2.5,4.0,2.5,1739.0,1739.0,6059.0,2.0,33655960.0,...,1286.0,96989.0,0.0,1987.0,194369.0,444234.0,2016.0,249865.0,4523.88,60590320000000.0


Sweet, our data looks so much better. __We freed up $\approx$ 0.79 GB of memory__! Let's press forward.

In [10]:
# Bytes used by the column data of each frame (index excluded), converted to decimal gigabytes.
original_mem_usage = df.memory_usage(index=False).sum() / 10**9
subset_mem_usage = df_subset.memory_usage(index=False).sum() / 10**9

# The difference is the memory no longer needed once the dropped columns are gone.
free_memory = original_mem_usage - subset_mem_usage
print(f"{free_memory:.2f}GB of free memory!")

0.79GB of free memory!


### Subset of Zillow data
- Dropped columns missing more than 33.58% of values

In [11]:
# TODO: Will return. Retrieving and reproducing data dictionary for these columns.
# For now, list the remaining columns and their dtypes.
df_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2985217 entries, 0 to 2985216
Data columns (total 26 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   id                            int64  
 1   parcelid                      int64  
 2   bathroomcnt                   float64
 3   bedroomcnt                    float64
 4   calculatedbathnbr             float64
 5   calculatedfinishedsquarefeet  float64
 6   finishedsquarefeet12          float64
 7   fips                          float64
 8   fullbathcnt                   float64
 9   latitude                      float64
 10  longitude                     float64
 11  lotsizesquarefeet             float64
 12  propertycountylandusecode     object 
 13  propertylandusetypeid         float64
 14  rawcensustractandblock        float64
 15  regionidcity                  float64
 16  regionidcounty                float64
 17  regionidzip                   float64
 18  roomcnt               

In [19]:
# Create a function to clean our Zillow dataset
# Note: Create a wrangle function to acquire and prepare the dataset
def prepare_zillow(df, missing_threshold=.3358):
    '''
    Prepare the zillow dataset for EDA.

    The input DataFrame is not modified; a new DataFrame is returned.

    Steps
    -----
    1. Drop every column whose fraction of missing values is strictly
       greater than `missing_threshold`.
    2. Drop the columns 'calculatedbathnbr', 'finishedsquarefeet12' and 'id'
       (treated by the original analysis as duplicate/index columns).
       Columns that are not present are skipped instead of raising.
    3. Keep only the rows whose `propertylandusetypeid` is one of
       261, 262, 273 or 279.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        The Zillow dataset stored as `zillow.csv`. It must contain a
        `propertylandusetypeid` column that survives step 1.
    missing_threshold : float, default .3358
        Largest fraction of missing values (0 to 1) a column may have and
        still be kept. The default is the value chosen in the original
        analysis; see Notes to reproduce the table it was read from.

    Returns
    -------
    pandas.core.frame.DataFrame
        The filtered copy of the zillow dataset.

    Examples
    --------
    >>> df = pd.read_csv('./data/raw/zillow.csv')
    >>> prepared = prepare_zillow(df)
    >>> stricter = prepare_zillow(df, missing_threshold=.10)

    Notes
    -----
    To reproduce the missing-value table behind the default threshold,
    copy the code below (hold ALT + SHIFT and drag your cursor from the
    first line to the last).

    >>> df = pd.read_csv('./data/raw/zillow.csv')
    >>>
    >>> nulls = df.isnull().sum()
    >>> non_nulls = df.notnull().sum()
    >>> total_values = nulls + non_nulls
    >>>
    >>> pct_missing = (nulls/total_values).sort_values(ascending=False)
    >>> pct_missing_chart = pct_missing.apply("{0:.2%}".format)
    >>>
    >>> print('Percentage of values missing per column')
    >>> print('-' * 39)
    >>> print(f"{pct_missing_chart}")
    '''

    # 1. Drop columns with too many missing values.
    # The mean of the boolean null mask is the fraction of missing values per column.
    pct_missing = df.isnull().mean()
    columns_to_drop = pct_missing[pct_missing > missing_threshold].index.to_list()
    df = df.drop(columns=columns_to_drop)

    # 2. Drop duplicate columns and index column.
    # errors='ignore' because any of these may already have been removed by
    # step 1, or may not exist in the frame that was passed in.
    duplicate_columns_to_drop = ['calculatedbathnbr', 'finishedsquarefeet12', 'id']
    df = df.drop(columns=duplicate_columns_to_drop, errors='ignore')

    # 3. Keep only the rows with the selected property land-use type ids.
    df = df[df.propertylandusetypeid.isin([261, 262, 273, 279])]

    return df

In [20]:
# Run the cleaning function on the raw frame; the prepared result is kept in `test`.
test = prepare_zillow(df)

In [21]:
# Sweet, the function works!
# Preview 10 random rows; the fixed seed shows the same rows on every Restart & Run All.
test.sample(10, random_state=42)

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,fips,fullbathcnt,latitude,longitude,lotsizesquarefeet,propertycountylandusecode,...,regionidcounty,regionidzip,roomcnt,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock
2146046,14691368,0.0,0.0,,6059.0,,33466022.0,-117692527.0,,34,...,1286.0,96961.0,0.0,,63662.0,98648.0,2016.0,34986.0,1374.54,60590420000000.0
819376,14457648,4.5,5.0,3711.0,6059.0,4.0,33540994.0,-117647848.0,7427.0,1,...,1286.0,96998.0,0.0,2004.0,585333.0,1040631.0,2016.0,455298.0,17113.08,60590320000000.0
785275,12992731,2.0,4.0,1234.0,6037.0,2.0,34056845.0,-117981439.0,6001.0,100,...,3101.0,96490.0,0.0,1955.0,45886.0,216449.0,2016.0,170563.0,3007.16,60374070000000.0
2832373,14353792,2.5,4.0,2514.0,6059.0,2.0,33699056.0,-117762827.0,3833.0,1,...,1286.0,96954.0,0.0,2005.0,492006.0,1085718.0,2016.0,593712.0,14186.44,60590520000000.0
1579750,10714187,3.0,4.0,2162.0,6037.0,3.0,34228935.0,-118607133.0,11303.0,101,...,3101.0,96339.0,0.0,1963.0,229850.0,486645.0,2016.0,256795.0,6009.39,60371130000000.0
1963119,12838709,2.0,3.0,1242.0,6037.0,2.0,34036057.0,-117946333.0,6494.0,100,...,3101.0,96488.0,0.0,1953.0,73654.0,102516.0,2016.0,28862.0,1578.54,60374080000000.0
2563188,14675285,1.5,3.0,1121.0,6059.0,1.0,33682442.0,-117803227.0,,34,...,1286.0,96941.0,5.0,1976.0,59977.0,440101.0,2016.0,380124.0,4624.7,60590530000000.0
1942055,11209820,3.0,3.0,1600.0,6037.0,3.0,34561854.0,-118037487.0,7365.0,100,...,3101.0,97330.0,0.0,1987.0,93700.0,210800.0,2016.0,117100.0,3479.59,60379110000000.0
674356,11233901,2.0,3.0,1244.0,6037.0,2.0,34549234.0,-118070751.0,7048.0,100,...,3101.0,97328.0,0.0,1994.0,89198.0,121830.0,2016.0,32632.0,2392.36,60379110000000.0
1108367,11709651,2.0,3.0,1224.0,6037.0,2.0,34006422.0,-118324131.0,4656.0,100,...,3101.0,95989.0,0.0,1961.0,35580.0,67241.0,2016.0,31661.0,886.85,60372340000000.0


In [22]:
# Summarize the columns and dtypes of the prepared frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2985217 entries, 0 to 2985216
Data columns (total 23 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   parcelid                      int64  
 1   bathroomcnt                   float64
 2   bedroomcnt                    float64
 3   calculatedfinishedsquarefeet  float64
 4   fips                          float64
 5   fullbathcnt                   float64
 6   latitude                      float64
 7   longitude                     float64
 8   lotsizesquarefeet             float64
 9   propertycountylandusecode     object 
 10  propertylandusetypeid         float64
 11  rawcensustractandblock        float64
 12  regionidcity                  float64
 13  regionidcounty                float64
 14  regionidzip                   float64
 15  roomcnt                       float64
 16  yearbuilt                     float64
 17  structuretaxvaluedollarcnt    float64
 18  taxvaluedollarcnt     

In [23]:
# Data dictionary template
# Keep only the non-object (numeric) columns of the prepared frame.
numeric_columns = test.select_dtypes(exclude=['object'])

# Print each column's name followed by its smallest and largest value.
for column in numeric_columns.columns:
    print(column, numeric_columns[column].min(), numeric_columns[column].max())

parcelid 10711725 169601949
bathroomcnt 0.0 32.0
bedroomcnt 0.0 25.0
calculatedfinishedsquarefeet 1.0 952576.0
fips 6037.0 6111.0
fullbathcnt 1.0 32.0
latitude 33324388.0 34819650.0
longitude -119475780.0 -117554316.0
lotsizesquarefeet 100.0 371000512.0
propertylandusetypeid 31.0 279.0
rawcensustractandblock 60371011.101 61110091.003010996
regionidcity 3491.0 396556.0
regionidcounty 1286.0 3101.0
regionidzip 95982.0 399675.0
roomcnt 0.0 96.0
yearbuilt 1801.0 2016.0
structuretaxvaluedollarcnt 1.0 255321161.0
taxvaluedollarcnt 1.0 319622473.0
assessmentyear 2000.0 2016.0
landtaxvaluedollarcnt 1.0 94011079.0
taxamount 0.24 3823175.65
censustractandblock -1.0 483030105084015.0


In [17]:
# test.to_csv('zillow_clean.csv', index=False)

In [18]:
# `prepare_zillow` already removes these columns, so tolerate their absence:
# without errors='ignore' this cell raises KeyError on a fresh top-to-bottom run
# and whenever it is executed a second time.
test = test.drop(columns=['calculatedbathnbr', 'finishedsquarefeet12', 'id'], errors='ignore')

In [None]:
# Dimensions of the prepared frame as (rows, columns).
test.shape

In [32]:
# Keep the rows whose land-use type id is 261, 262, 273 or 279.
single_family_units = test.loc[test['propertylandusetypeid'].isin([261, 262, 273, 279])]

In [34]:
# Write the filtered frame to a CSV file in the working directory, leaving out the index.
single_family_units.to_csv('single_family_residential.csv', index=False)

In [None]:
# Report the dimensions of the single-family subset.
# (The filtered frame is named `single_family_units`; `df_single_units` was never defined.)
print(single_family_units.shape)