# Acquire Walkthrough
---

## Let's import our libraries

In [1]:
import pandas as pd
from pandas_profiling import ProfileReport

import env
import acquire
import prep

## Next let's bring in our zillow data from the SQL server

***If reproducing, run the code below to get the data***

In [2]:
# Select the property attributes used in this walkthrough, plus a derived
# taxrate (taxamount as a percentage of taxvaluedollarcnt, rounded to 2 places),
# for properties whose land-use type id is in the listed set and whose joined
# prediction row has a transaction date between 2017-05-01 and 2017-06-30.
# Both date literals use single quotes: double-quoted strings are parsed as
# identifiers when the server runs in ANSI_QUOTES mode.
query = '''
SELECT p.id, p.bathroomcnt, p.bedroomcnt, p.calculatedbathnbr, p.calculatedfinishedsquarefeet, p.fips, p.fullbathcnt, p.latitude, p.longitude, p.roomcnt, p.yearbuilt, p.taxvaluedollarcnt, ROUND((p.taxamount / p.taxvaluedollarcnt) * 100, 2) AS taxrate
FROM properties_2017 AS p
JOIN predictions_2017 AS pr USING (parcelid)
WHERE p.propertylandusetypeid IN (261, 262, 263, 264, 266, 268, 273, 276, 279)
AND pr.transactiondate BETWEEN '2017-05-01' AND '2017-06-30';
'''

# Connection URL for the 'zillow' database, built by the project's acquire module.
url = acquire.get_url('zillow')

# Run the query; the property id becomes the DataFrame index.
zillow = pd.read_sql(query, url, index_col='id')

* I chose the above columns based on their lack of null values
* Fips is included for county identification
* taxrate is calculated for later use in exploration

## Now let's export it as a CSV and bring it back so that we don't have to rerun the query each time.

In [3]:
# Save the query result to disk; the frame's index (id) is written as the first CSV column.
zillow.to_csv('zillow.csv')

In [4]:
# Reload from the saved CSV. No index_col is given, so id comes back as an
# ordinary column and the frame gets a fresh 0..n-1 index.
zillow = pd.read_csv('zillow.csv')
zillow

Unnamed: 0,id,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,fips,fullbathcnt,latitude,longitude,roomcnt,yearbuilt,taxvaluedollarcnt,taxrate
0,2061546,2.0,3.0,2.0,1458.0,6037.0,2.0,34686163.0,-118113100.0,0.0,1970.0,136104.0,1.70
1,1834372,1.0,2.0,1.0,1421.0,6037.0,1.0,33999877.0,-118291863.0,0.0,1911.0,35606.0,1.53
2,1923117,3.0,4.0,3.0,2541.0,6059.0,3.0,33694636.0,-117912245.0,0.0,2003.0,880456.0,1.12
3,2121349,2.0,3.0,2.0,1650.0,6037.0,2.0,33985377.0,-118361620.0,0.0,1949.0,614000.0,1.25
4,2093710,1.0,2.0,1.0,693.0,6037.0,1.0,34112946.0,-118208106.0,0.0,1921.0,274237.0,1.19
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20343,2922089,2.0,2.0,2.0,1030.0,6037.0,2.0,34150300.0,-118468000.0,0.0,1988.0,359829.0,1.22
20344,1948691,2.0,3.0,2.0,1536.0,6037.0,2.0,34058178.0,-117948180.0,0.0,1955.0,297097.0,1.18
20345,444575,3.0,5.0,3.0,2655.0,6059.0,3.0,33836898.0,-117801369.0,0.0,1994.0,746963.0,1.08
20346,1480299,3.0,4.0,3.0,2305.0,6037.0,3.0,33952368.0,-118441155.0,0.0,1949.0,579047.0,1.21


## Next let's add the county by using the fips number as the identifier

In [5]:
def label_county(row):
    """Return the county name matching a row's FIPS code.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must support ``row['fips']``.

    Returns
    -------
    str or None
        'Los Angeles', 'Orange' or 'Ventura' for codes 6037, 6059 and 6111
        respectively; None for any other value.
    """
    # Float codes such as 6037.0 hash and compare equal to the int keys.
    county_by_fips = {
        6037: 'Los Angeles',
        6059: 'Orange',
        6111: 'Ventura',
    }
    return county_by_fips.get(row['fips'])

In [6]:
# Build the County column by running label_county on every row (axis=1).
county_names = zillow.apply(label_county, axis=1)
zillow['County'] = county_names
zillow

Unnamed: 0,id,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,fips,fullbathcnt,latitude,longitude,roomcnt,yearbuilt,taxvaluedollarcnt,taxrate,County
0,2061546,2.0,3.0,2.0,1458.0,6037.0,2.0,34686163.0,-118113100.0,0.0,1970.0,136104.0,1.70,Los Angeles
1,1834372,1.0,2.0,1.0,1421.0,6037.0,1.0,33999877.0,-118291863.0,0.0,1911.0,35606.0,1.53,Los Angeles
2,1923117,3.0,4.0,3.0,2541.0,6059.0,3.0,33694636.0,-117912245.0,0.0,2003.0,880456.0,1.12,Orange
3,2121349,2.0,3.0,2.0,1650.0,6037.0,2.0,33985377.0,-118361620.0,0.0,1949.0,614000.0,1.25,Los Angeles
4,2093710,1.0,2.0,1.0,693.0,6037.0,1.0,34112946.0,-118208106.0,0.0,1921.0,274237.0,1.19,Los Angeles
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20343,2922089,2.0,2.0,2.0,1030.0,6037.0,2.0,34150300.0,-118468000.0,0.0,1988.0,359829.0,1.22,Los Angeles
20344,1948691,2.0,3.0,2.0,1536.0,6037.0,2.0,34058178.0,-117948180.0,0.0,1955.0,297097.0,1.18,Los Angeles
20345,444575,3.0,5.0,3.0,2655.0,6059.0,3.0,33836898.0,-117801369.0,0.0,1994.0,746963.0,1.08,Orange
20346,1480299,3.0,4.0,3.0,2305.0,6037.0,3.0,33952368.0,-118441155.0,0.0,1949.0,579047.0,1.21,Los Angeles


In [7]:
# Every row gets the same constant State value.
zillow = zillow.assign(State='CA')
zillow

Unnamed: 0,id,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,fips,fullbathcnt,latitude,longitude,roomcnt,yearbuilt,taxvaluedollarcnt,taxrate,County,State
0,2061546,2.0,3.0,2.0,1458.0,6037.0,2.0,34686163.0,-118113100.0,0.0,1970.0,136104.0,1.70,Los Angeles,CA
1,1834372,1.0,2.0,1.0,1421.0,6037.0,1.0,33999877.0,-118291863.0,0.0,1911.0,35606.0,1.53,Los Angeles,CA
2,1923117,3.0,4.0,3.0,2541.0,6059.0,3.0,33694636.0,-117912245.0,0.0,2003.0,880456.0,1.12,Orange,CA
3,2121349,2.0,3.0,2.0,1650.0,6037.0,2.0,33985377.0,-118361620.0,0.0,1949.0,614000.0,1.25,Los Angeles,CA
4,2093710,1.0,2.0,1.0,693.0,6037.0,1.0,34112946.0,-118208106.0,0.0,1921.0,274237.0,1.19,Los Angeles,CA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20343,2922089,2.0,2.0,2.0,1030.0,6037.0,2.0,34150300.0,-118468000.0,0.0,1988.0,359829.0,1.22,Los Angeles,CA
20344,1948691,2.0,3.0,2.0,1536.0,6037.0,2.0,34058178.0,-117948180.0,0.0,1955.0,297097.0,1.18,Los Angeles,CA
20345,444575,3.0,5.0,3.0,2655.0,6059.0,3.0,33836898.0,-117801369.0,0.0,1994.0,746963.0,1.08,Orange,CA
20346,1480299,3.0,4.0,3.0,2305.0,6037.0,3.0,33952368.0,-118441155.0,0.0,1949.0,579047.0,1.21,Los Angeles,CA


## Now let's get a quick look at the data

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
zillow.describe()

Unnamed: 0,id,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,fips,fullbathcnt,latitude,longitude,roomcnt,yearbuilt,taxvaluedollarcnt,taxrate
count,20348.0,20348.0,20348.0,20278.0,20303.0,20348.0,20278.0,20348.0,20348.0,20348.0,20286.0,20348.0,20347.0
mean,1510457.0,2.285507,3.021181,2.292756,1776.690391,6049.483782,2.219795,34003840.0,-118196100.0,1.593228,1969.377452,505647.0,1.315918
std,859860.6,0.957624,1.029851,0.950579,936.896616,20.96093,0.931478,270947.6,363190.1,2.919491,23.27686,658603.1,0.602459
min,349.0,0.0,0.0,1.0,242.0,6037.0,1.0,33340850.0,-119388300.0,0.0,1878.0,10504.0,0.01
25%,774685.0,2.0,2.0,2.0,1176.0,6037.0,2.0,33800280.0,-118418400.0,0.0,1954.0,210768.5,1.16
50%,1525712.0,2.0,3.0,2.0,1536.0,6037.0,2.0,34018140.0,-118167000.0,0.0,1970.0,367000.0,1.22
75%,2253321.0,3.0,4.0,3.0,2103.0,6059.0,3.0,34177360.0,-117912900.0,0.0,1987.0,589514.5,1.34
max,2982274.0,11.0,12.0,11.0,15450.0,6111.0,11.0,34779660.0,-117555400.0,14.0,2015.0,23858370.0,45.29


We can see that there are some zeros in the bedroom and bathroom counts, so we will probably go ahead and get rid of those.

In [9]:
# Per-column dtype and non-null count, to spot missing values and wrong types.
zillow.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20348 entries, 0 to 20347
Data columns (total 15 columns):
id                              20348 non-null int64
bathroomcnt                     20348 non-null float64
bedroomcnt                      20348 non-null float64
calculatedbathnbr               20278 non-null float64
calculatedfinishedsquarefeet    20303 non-null float64
fips                            20348 non-null float64
fullbathcnt                     20278 non-null float64
latitude                        20348 non-null float64
longitude                       20348 non-null float64
roomcnt                         20348 non-null float64
yearbuilt                       20286 non-null float64
taxvaluedollarcnt               20348 non-null float64
taxrate                         20347 non-null float64
County                          20348 non-null object
State                           20348 non-null object
dtypes: float64(12), int64(1), object(2)
memory usage: 2.3+ MB


## Now let's build a function in the acquire file that gets the data

In [10]:
# Load the data through the project's acquire module.
# NOTE(review): presumably get_data() wraps the query/CSV/County/State steps
# shown earlier in this notebook — confirm in acquire.py.
zillow = acquire.get_data()
zillow

Unnamed: 0,id,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,fips,fullbathcnt,latitude,longitude,roomcnt,yearbuilt,taxvaluedollarcnt,taxrate,County,State
0,2061546,2.0,3.0,2.0,1458.0,6037.0,2.0,34686163.0,-118113100.0,0.0,1970.0,136104.0,1.70,Los Angeles,CA
1,1834372,1.0,2.0,1.0,1421.0,6037.0,1.0,33999877.0,-118291863.0,0.0,1911.0,35606.0,1.53,Los Angeles,CA
2,1923117,3.0,4.0,3.0,2541.0,6059.0,3.0,33694636.0,-117912245.0,0.0,2003.0,880456.0,1.12,Orange,CA
3,2121349,2.0,3.0,2.0,1650.0,6037.0,2.0,33985377.0,-118361620.0,0.0,1949.0,614000.0,1.25,Los Angeles,CA
4,2093710,1.0,2.0,1.0,693.0,6037.0,1.0,34112946.0,-118208106.0,0.0,1921.0,274237.0,1.19,Los Angeles,CA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20343,2922089,2.0,2.0,2.0,1030.0,6037.0,2.0,34150300.0,-118468000.0,0.0,1988.0,359829.0,1.22,Los Angeles,CA
20344,1948691,2.0,3.0,2.0,1536.0,6037.0,2.0,34058178.0,-117948180.0,0.0,1955.0,297097.0,1.18,Los Angeles,CA
20345,444575,3.0,5.0,3.0,2655.0,6059.0,3.0,33836898.0,-117801369.0,0.0,1994.0,746963.0,1.08,Orange,CA
20346,1480299,3.0,4.0,3.0,2305.0,6037.0,3.0,33952368.0,-118441155.0,0.0,1949.0,579047.0,1.21,Los Angeles,CA


# Preparation Walkthrough

## First let's check for duplicates

In [11]:
# keep=False flags every member of a duplicate group, so all copies are shown.
is_duplicate_row = zillow.duplicated(keep=False)
zillow.loc[is_duplicate_row]

Unnamed: 0,id,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,fips,fullbathcnt,latitude,longitude,roomcnt,yearbuilt,taxvaluedollarcnt,taxrate,County,State
93,2011553,1.5,2.0,1.5,1190.0,6059.0,1.0,33636000.0,-117679000.0,4.0,1979.0,357000.0,1.06,Orange,CA
94,2011553,1.5,2.0,1.5,1190.0,6059.0,1.0,33636000.0,-117679000.0,4.0,1979.0,357000.0,1.06,Orange,CA
1876,2456793,1.0,2.0,1.0,1025.0,6059.0,1.0,33707862.0,-117980009.0,4.0,1940.0,48107.0,1.79,Orange,CA
1877,2456793,1.0,2.0,1.0,1025.0,6059.0,1.0,33707862.0,-117980009.0,4.0,1940.0,48107.0,1.79,Orange,CA
1881,159239,3.0,2.0,3.0,1149.0,6037.0,3.0,34024472.0,-117766183.0,0.0,1985.0,218619.0,1.41,Los Angeles,CA
1882,159239,3.0,2.0,3.0,1149.0,6037.0,3.0,34024472.0,-117766183.0,0.0,1985.0,218619.0,1.41,Los Angeles,CA
2565,2415168,4.0,4.0,4.0,2459.0,6111.0,4.0,34245590.0,-119169692.0,7.0,2007.0,479000.0,2.56,Ventura,CA
2566,2415168,4.0,4.0,4.0,2459.0,6111.0,4.0,34245590.0,-119169692.0,7.0,2007.0,479000.0,2.56,Ventura,CA
6773,2595690,3.0,4.0,3.0,1816.0,6111.0,3.0,34213981.0,-118857003.0,7.0,1960.0,350701.0,1.08,Ventura,CA
6774,2595690,3.0,4.0,3.0,1816.0,6111.0,3.0,34213981.0,-118857003.0,7.0,1960.0,350701.0,1.08,Ventura,CA


### There are a few duplicates but the info is identical for each so let's get rid of the second duplicate for each

In [12]:
# Keep the first occurrence of each fully-identical row and report the
# frame's shape before and after.
shape_before = zillow.shape
zillow = zillow.drop_duplicates(keep='first')
print(shape_before)
print(zillow.shape)

(20348, 15)
(20340, 15)


## Let's see if we have any dtype or null issues

In [13]:
# Re-check dtypes and non-null counts now that duplicate rows are gone.
zillow.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20340 entries, 0 to 20347
Data columns (total 15 columns):
id                              20340 non-null int64
bathroomcnt                     20340 non-null float64
bedroomcnt                      20340 non-null float64
calculatedbathnbr               20270 non-null float64
calculatedfinishedsquarefeet    20295 non-null float64
fips                            20340 non-null float64
fullbathcnt                     20270 non-null float64
latitude                        20340 non-null float64
longitude                       20340 non-null float64
roomcnt                         20340 non-null float64
yearbuilt                       20278 non-null float64
taxvaluedollarcnt               20340 non-null float64
taxrate                         20339 non-null float64
County                          20340 non-null object
State                           20340 non-null object
dtypes: float64(12), int64(1), object(2)
memory usage: 2.5+ MB


### So there are missing values in some columns, but not enough to make a difference, so let's just drop them for now

In [14]:
# Drop every row that has at least one missing value and report the
# frame's shape before and after.
shape_before = zillow.shape
zillow = zillow.dropna(axis=0, how='any')
print(shape_before)
print(zillow.shape)

(20340, 15)
(20251, 15)


In [15]:
# Confirm that every column now has the same non-null count.
zillow.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20251 entries, 0 to 20347
Data columns (total 15 columns):
id                              20251 non-null int64
bathroomcnt                     20251 non-null float64
bedroomcnt                      20251 non-null float64
calculatedbathnbr               20251 non-null float64
calculatedfinishedsquarefeet    20251 non-null float64
fips                            20251 non-null float64
fullbathcnt                     20251 non-null float64
latitude                        20251 non-null float64
longitude                       20251 non-null float64
roomcnt                         20251 non-null float64
yearbuilt                       20251 non-null float64
taxvaluedollarcnt               20251 non-null float64
taxrate                         20251 non-null float64
County                          20251 non-null object
State                           20251 non-null object
dtypes: float64(12), int64(1), object(2)
memory usage: 2.5+ MB


### Also let's make sure columns that can be ints are changed

In [16]:
# Count taxvaluedollarcnt values with a fractional part; 0 means the column
# can be cast to int without losing information.
zillow['taxvaluedollarcnt'].mod(1).ne(0).sum()

0

In [17]:
# Cast the count/size/year/value columns to int in one pass.
int_columns = [
    'bedroomcnt',
    'calculatedfinishedsquarefeet',
    'fullbathcnt',
    'yearbuilt',
    'taxvaluedollarcnt',
]
zillow = zillow.astype({column: 'int' for column in int_columns})
zillow

Unnamed: 0,id,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,fips,fullbathcnt,latitude,longitude,roomcnt,yearbuilt,taxvaluedollarcnt,taxrate,County,State
0,2061546,2.0,3,2.0,1458,6037.0,2,34686163.0,-118113100.0,0.0,1970,136104,1.70,Los Angeles,CA
1,1834372,1.0,2,1.0,1421,6037.0,1,33999877.0,-118291863.0,0.0,1911,35606,1.53,Los Angeles,CA
2,1923117,3.0,4,3.0,2541,6059.0,3,33694636.0,-117912245.0,0.0,2003,880456,1.12,Orange,CA
3,2121349,2.0,3,2.0,1650,6037.0,2,33985377.0,-118361620.0,0.0,1949,614000,1.25,Los Angeles,CA
4,2093710,1.0,2,1.0,693,6037.0,1,34112946.0,-118208106.0,0.0,1921,274237,1.19,Los Angeles,CA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20343,2922089,2.0,2,2.0,1030,6037.0,2,34150300.0,-118468000.0,0.0,1988,359829,1.22,Los Angeles,CA
20344,1948691,2.0,3,2.0,1536,6037.0,2,34058178.0,-117948180.0,0.0,1955,297097,1.18,Los Angeles,CA
20345,444575,3.0,5,3.0,2655,6059.0,3,33836898.0,-117801369.0,0.0,1994,746963,1.08,Orange,CA
20346,1480299,3.0,4,3.0,2305,6037.0,3,33952368.0,-118441155.0,0.0,1949,579047,1.21,Los Angeles,CA


## We don't need the fips column anymore, and room count doesn't seem useful either, so let's get rid of both

In [18]:
# Remove the two columns that are no longer needed.
unused_columns = ['fips', 'roomcnt']
zillow = zillow.drop(unused_columns, axis='columns')

## Let's also clarify the names of some of the columns

In [19]:
# Shorten the square-footage column name.
# The previous mapping also listed 'Name' -> 'County', but the frame has no
# 'Name' column (County already exists), so that entry was a silent no-op
# and has been removed.
zillow = zillow.rename(columns={'calculatedfinishedsquarefeet': 'squarefeet'})
zillow

Unnamed: 0,id,bathroomcnt,bedroomcnt,calculatedbathnbr,squarefeet,fullbathcnt,latitude,longitude,yearbuilt,taxvaluedollarcnt,taxrate,County,State
0,2061546,2.0,3,2.0,1458,2,34686163.0,-118113100.0,1970,136104,1.70,Los Angeles,CA
1,1834372,1.0,2,1.0,1421,1,33999877.0,-118291863.0,1911,35606,1.53,Los Angeles,CA
2,1923117,3.0,4,3.0,2541,3,33694636.0,-117912245.0,2003,880456,1.12,Orange,CA
3,2121349,2.0,3,2.0,1650,2,33985377.0,-118361620.0,1949,614000,1.25,Los Angeles,CA
4,2093710,1.0,2,1.0,693,1,34112946.0,-118208106.0,1921,274237,1.19,Los Angeles,CA
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20343,2922089,2.0,2,2.0,1030,2,34150300.0,-118468000.0,1988,359829,1.22,Los Angeles,CA
20344,1948691,2.0,3,2.0,1536,2,34058178.0,-117948180.0,1955,297097,1.18,Los Angeles,CA
20345,444575,3.0,5,3.0,2655,3,33836898.0,-117801369.0,1994,746963,1.08,Orange,CA
20346,1480299,3.0,4,3.0,2305,3,33952368.0,-118441155.0,1949,579047,1.21,Los Angeles,CA


## The latitude and longitude are stored as whole numbers (degrees × 1,000,000), so let's convert them to decimal degrees.

In [20]:
# Latitude/longitude are stored multiplied by one million; divide to get
# decimal degrees. Not idempotent: re-running this cell divides again.
COORD_SCALE = 1000000
zillow = zillow.assign(
    latitude=zillow['latitude'] / COORD_SCALE,
    longitude=zillow['longitude'] / COORD_SCALE,
)
zillow

Unnamed: 0,id,bathroomcnt,bedroomcnt,calculatedbathnbr,squarefeet,fullbathcnt,latitude,longitude,yearbuilt,taxvaluedollarcnt,taxrate,County,State
0,2061546,2.0,3,2.0,1458,2,34.686163,-118.113100,1970,136104,1.70,Los Angeles,CA
1,1834372,1.0,2,1.0,1421,1,33.999877,-118.291863,1911,35606,1.53,Los Angeles,CA
2,1923117,3.0,4,3.0,2541,3,33.694636,-117.912245,2003,880456,1.12,Orange,CA
3,2121349,2.0,3,2.0,1650,2,33.985377,-118.361620,1949,614000,1.25,Los Angeles,CA
4,2093710,1.0,2,1.0,693,1,34.112946,-118.208106,1921,274237,1.19,Los Angeles,CA
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20343,2922089,2.0,2,2.0,1030,2,34.150300,-118.468000,1988,359829,1.22,Los Angeles,CA
20344,1948691,2.0,3,2.0,1536,2,34.058178,-117.948180,1955,297097,1.18,Los Angeles,CA
20345,444575,3.0,5,3.0,2655,3,33.836898,-117.801369,1994,746963,1.08,Orange,CA
20346,1480299,3.0,4,3.0,2305,3,33.952368,-118.441155,1949,579047,1.21,Los Angeles,CA


## Now let's make a function in our prepare file that gets the data for us and cleans it.

In [21]:
# Fetch and clean the data in one call via the project's prep module.
# NOTE(review): the rendered output still includes a roomcnt column, which the
# manual walkthrough drops in In [18] — confirm whether prep.py should drop it too.
prep.acquire_and_prep_data()

Unnamed: 0,id,bathroomcnt,bedroomcnt,calculatedbathnbr,squarefeet,fullbathcnt,latitude,longitude,roomcnt,yearbuilt,taxvaluedollarcnt,taxrate,County,State
0,2061546,2.0,3,2.0,1458,2,34.686163,-118.113100,0.0,1970,136104,1.70,Los Angeles,CA
1,1834372,1.0,2,1.0,1421,1,33.999877,-118.291863,0.0,1911,35606,1.53,Los Angeles,CA
2,1923117,3.0,4,3.0,2541,3,33.694636,-117.912245,0.0,2003,880456,1.12,Orange,CA
3,2121349,2.0,3,2.0,1650,2,33.985377,-118.361620,0.0,1949,614000,1.25,Los Angeles,CA
4,2093710,1.0,2,1.0,693,1,34.112946,-118.208106,0.0,1921,274237,1.19,Los Angeles,CA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20343,2922089,2.0,2,2.0,1030,2,34.150300,-118.468000,0.0,1988,359829,1.22,Los Angeles,CA
20344,1948691,2.0,3,2.0,1536,2,34.058178,-117.948180,0.0,1955,297097,1.18,Los Angeles,CA
20345,444575,3.0,5,3.0,2655,3,33.836898,-117.801369,0.0,1994,746963,1.08,Orange,CA
20346,1480299,3.0,4,3.0,2305,3,33.952368,-118.441155,0.0,1949,579047,1.21,Los Angeles,CA
