# Acquire and Summarize: 
### 1. Acquire data from the cloud database.

You will want to end with a single dataframe. Include the logerror field and all other fields related to the properties that are available. You will end up using all the tables in the database.

Be sure to do the correct join (inner, outer, etc.). We do not want to eliminate properties purely because they may have a null value for airconditioningtypeid. - Only include properties with a transaction in 2017, and include only the last transaction for each property (so no duplicate property ID's), along with zestimate error and date of transaction. (Hint: read the docs for the .duplicated method) - Only include properties that have a latitude and longitude value.

In [1]:
#imports:
import pandas as pd
import numpy as np
import os
import env

In [2]:
# get connection url:
def get_db_url(db, user= env.user, host=env.host, password=env.password):
    """
    This function will:
    - take credentials from env.py file
    - make a connection to the SQL database with given credentials
    - return url connection
    """
    return f'mysql+pymysql://{user}:{password}@{host}/{db}'

In [3]:
# need a table that specifically: max logerror and max transactiondate

In [4]:
# sql query: 
sql_query_2 = '''
Select *
FROM properties_2017 as pr
	JOIN (Select 
	parcelid,
    max(logerror) as logerrorr, 
    max(transactiondate) as transactiondate
From
	predictions_2017
Group by parcelid) as predictions_2017 
	on predictions_2017.parcelid = pr.parcelid
	Left JOIN airconditioningtype as air 
		on pr.airconditioningtypeid = air.airconditioningtypeid
	Left Join architecturalstyletype as ar
		on  pr.architecturalstyletypeid = ar.architecturalstyletypeid
	Left Join buildingclasstype as bu
		on pr.buildingclasstypeid = bu.buildingclasstypeid
	Left Join heatingorsystemtype as he
		on pr.heatingorsystemtypeid = he.heatingorsystemtypeid
	Left Join propertylandusetype po
		on pr.propertylandusetypeid = po.propertylandusetypeid
	Left Join storytype as st
		on pr.storytypeid = st.storytypeid
	Left Join typeconstructiontype as ty
		on pr.typeconstructiontypeid = ty.typeconstructiontypeid
    Left Join unique_properties as up
		on pr.parcelid = up.parcelid
	Where 
		YEAR(predictions_2017.transactiondate) = 2017
        AND (pr.latitude IS NOT NULL AND pr.longitude IS NOT NULL);'''

In [5]:
sql_query = """
SELECT prop.*, 
       pred.logerror, 
       pred.transactiondate, 
       air.airconditioningdesc, 
       arch.architecturalstyledesc, 
       build.buildingclassdesc, 
       heat.heatingorsystemdesc, 
       landuse.propertylandusedesc, 
       story.storydesc, 
       construct.typeconstructiondesc 

FROM   properties_2017 prop  
       INNER JOIN (SELECT parcelid,
       					  logerror,
                          Max(transactiondate) transactiondate 
                   FROM   predictions_2017 
                   GROUP  BY parcelid, logerror) pred
               USING (parcelid) 
       LEFT JOIN airconditioningtype air USING (airconditioningtypeid) 
       LEFT JOIN architecturalstyletype arch USING (architecturalstyletypeid) 
       LEFT JOIN buildingclasstype build USING (buildingclasstypeid) 
       LEFT JOIN heatingorsystemtype heat USING (heatingorsystemtypeid) 
       LEFT JOIN propertylandusetype landuse USING (propertylandusetypeid) 
       LEFT JOIN storytype story USING (storytypeid) 
       LEFT JOIN typeconstructiontype construct USING (typeconstructiontypeid) 
WHERE  prop.latitude IS NOT NULL 
       AND prop.longitude IS NOT NULL AND transactiondate <= '2017-12-31' 
"""

In [6]:
# make sure the sql query works:
df = pd.read_sql(sql_query, get_db_url('zillow'))
df.head()

Unnamed: 0,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,censustractandblock,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,1727539,14297519,,,,3.5,4.0,,,3.5,...,60590630000000.0,0.025595,2017-01-01,,,,,Single Family Residential,,
1,1387261,17052889,,,,1.0,2.0,,,1.0,...,61110010000000.0,0.055619,2017-01-01,,,,,Single Family Residential,,
2,11677,14186244,,,,2.0,3.0,,,2.0,...,60590220000000.0,0.005383,2017-01-01,,,,,Single Family Residential,,
3,2288172,12177905,,,,3.0,4.0,,8.0,3.0,...,60373000000000.0,-0.10341,2017-01-01,,,,Central,Single Family Residential,,
4,1970746,10887214,1.0,,,3.0,3.0,,8.0,3.0,...,60371240000000.0,0.00694,2017-01-01,Central,,,Central,Condominium,,


In [7]:
# are there anu duplicates?
duplicate_rows = df[df.duplicated(keep='first')]
duplicate_rows

Unnamed: 0,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,censustractandblock,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc


In [8]:
def read_zillow():
    '''
    This function return a zillow based on the following requirements:
    - Only unclude propeties with a transaction in 2017
    - Include only the last transaction for each property
    - Include only the last zestimate error and date of transaction
    - Only include properties that have a latitude and longitude value.
    '''
    sql_query: sql_query = '''
    SELECT prop.*, 
       pred.logerror, 
       pred.transactiondate, 
       air.airconditioningdesc, 
       arch.architecturalstyledesc, 
       build.buildingclassdesc, 
       heat.heatingorsystemdesc, 
       landuse.propertylandusedesc, 
       story.storydesc, 
       construct.typeconstructiondesc 

FROM   properties_2017 prop  
       INNER JOIN (SELECT parcelid,
       					  logerror,
                          Max(transactiondate) transactiondate 
                   FROM   predictions_2017 
                   GROUP  BY parcelid, logerror) pred
               USING (parcelid) 
       LEFT JOIN airconditioningtype air USING (airconditioningtypeid) 
       LEFT JOIN architecturalstyletype arch USING (architecturalstyletypeid) 
       LEFT JOIN buildingclasstype build USING (buildingclasstypeid) 
       LEFT JOIN heatingorsystemtype heat USING (heatingorsystemtypeid) 
       LEFT JOIN propertylandusetype landuse USING (propertylandusetypeid) 
       LEFT JOIN storytype story USING (storytypeid) 
       LEFT JOIN typeconstructiontype construct USING (typeconstructiontypeid) 
WHERE  prop.latitude IS NOT NULL 
       AND prop.longitude IS NOT NULL AND transactiondate <= '2017-12-31' '''
    
    # return the data frame
    return pd.read_sql(sql_query, get_db_url('zillow'))

In [9]:
def get_zillow_data():
    '''
    This function reads in data from a codeup database, writes the data to a csv file if a 
    local file does not exist, and returns a df
    '''
    
    if os.path.isfile('zillow.csv'):
        
        #if csv file exists, read in data from csv file 
        df = pd.read_csv('zillow.csv', index_col = 0)
        
    else:
        
        # read fresh data from db into a dataframe
        df = read_zillow()
        
        #write dataframe to a csv file
        df.to_csv('zillow.csv')
    
    return df

In [10]:
get_zillow_data()

  df = pd.read_csv('zillow.csv', index_col = 0)


Unnamed: 0,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,censustractandblock,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,1727539,14297519,,,,3.5,4.0,,,3.5,...,6.059063e+13,0.025595,2017-01-01,,,,,Single Family Residential,,
1,1387261,17052889,,,,1.0,2.0,,,1.0,...,6.111001e+13,0.055619,2017-01-01,,,,,Single Family Residential,,
2,11677,14186244,,,,2.0,3.0,,,2.0,...,6.059022e+13,0.005383,2017-01-01,,,,,Single Family Residential,,
3,2288172,12177905,,,,3.0,4.0,,8.0,3.0,...,6.037300e+13,-0.103410,2017-01-01,,,,Central,Single Family Residential,,
4,1970746,10887214,1.0,,,3.0,3.0,,8.0,3.0,...,6.037124e+13,0.006940,2017-01-01,Central,,,Central,Condominium,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77569,2864704,10833991,1.0,,,3.0,3.0,,8.0,3.0,...,6.037132e+13,-0.002245,2017-09-20,Central,,,Central,Condominium,,
77570,673515,11000655,,,,2.0,2.0,,6.0,2.0,...,6.037101e+13,0.020615,2017-09-20,,,,Central,Single Family Residential,,
77571,2968375,17239384,,,,2.0,4.0,,,2.0,...,6.111008e+13,0.013209,2017-09-21,,,,,Single Family Residential,,
77572,1843709,12773139,1.0,,,1.0,3.0,,4.0,1.0,...,6.037434e+13,0.037129,2017-09-21,Central,,,Central,Single Family Residential,,


### 2. Summarize your data (summary stats, info, dtypes, shape, distributions, value_counts, etc.)

In [11]:
df.describe()

Unnamed: 0,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyyear,censustractandblock,logerror
count,77574.0,77574.0,25006.0,206.0,50.0,77574.0,77574.0,15.0,49808.0,76959.0,...,17596.0,172.0,77459.0,77573.0,77574.0,77572.0,77569.0,2900.0,77327.0,77574.0
mean,1495340.0,13006300.0,1.812045,7.38835,679.72,2.298521,3.053252,3.933333,6.53383,2.316389,...,1.434246,1.0,189282.9,490147.7,2016.0,301146.8,5995.855691,14.088276,60496660000000.0,0.016805
std,860968.9,3478043.0,2.965823,2.734542,689.703546,0.996706,1.140447,0.258199,1.722041,0.97969,...,0.544518,0.0,230414.9,653805.9,0.0,492731.7,7628.912331,2.181281,1533386000000.0,0.170742
min,349.0,10711860.0,1.0,2.0,38.0,0.0,0.0,3.0,1.0,1.0,...,1.0,1.0,44.0,1000.0,2016.0,161.0,19.92,3.0,60371010000000.0,-4.65542
25%,752087.5,11538190.0,1.0,7.0,273.0,2.0,2.0,4.0,6.0,2.0,...,1.0,1.0,84182.0,206898.0,2016.0,85292.0,2712.61,14.0,60373110000000.0,-0.02431
50%,1498169.0,12530500.0,1.0,7.0,515.0,2.0,3.0,4.0,6.0,2.0,...,1.0,1.0,136407.0,358879.0,2016.0,203181.0,4448.3,15.0,60376030000000.0,0.006673
75%,2240588.0,14211240.0,1.0,7.0,796.5,3.0,4.0,4.0,8.0,3.0,...,2.0,1.0,218734.0,569000.0,2016.0,366761.0,6926.82,15.0,60590420000000.0,0.039292
max,2982274.0,167689300.0,13.0,21.0,3560.0,18.0,16.0,4.0,12.0,18.0,...,6.0,1.0,11421790.0,49061240.0,2016.0,48952200.0,586639.3,99.0,483030100000000.0,5.262999


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77574 entries, 0 to 77573
Data columns (total 68 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            77574 non-null  int64  
 1   parcelid                      77574 non-null  int64  
 2   airconditioningtypeid         25006 non-null  float64
 3   architecturalstyletypeid      206 non-null    float64
 4   basementsqft                  50 non-null     float64
 5   bathroomcnt                   77574 non-null  float64
 6   bedroomcnt                    77574 non-null  float64
 7   buildingclasstypeid           15 non-null     float64
 8   buildingqualitytypeid         49808 non-null  float64
 9   calculatedbathnbr             76959 non-null  float64
 10  decktypeid                    614 non-null    float64
 11  finishedfloor1squarefeet      6035 non-null   float64
 12  calculatedfinishedsquarefeet  77374 non-null  float64
 13  f

In [13]:
df.shape

(77574, 68)

### 3. Write a function that takes in a dataframe of observations and attributes and returns a dataframe where each row is an atttribute name, the first column is the number of rows with missing values for that attribute, and the second column is percent of total rows that have missing values for that attribute. Run the function and document takeaways from this on how you want to handle missing values.

In [14]:
# missing values in columns: 
df.isna().sum()

id                              0
parcelid                        0
airconditioningtypeid       52568
architecturalstyletypeid    77368
basementsqft                77524
                            ...  
buildingclassdesc           77559
heatingorsystemdesc         28005
propertylandusedesc             0
storydesc                   77524
typeconstructiondesc        77352
Length: 68, dtype: int64

In [15]:
# missing values in rows: 
df.isna().sum(axis=1)

0        36
1        33
2        34
3        32
4        29
         ..
77569    29
77570    33
77571    32
77572    32
77573    34
Length: 77574, dtype: int64

In [16]:
df_nulls = df.copy().T
df_nulls

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,77564,77565,77566,77567,77568,77569,77570,77571,77572,77573
id,1727539,1387261,11677,2288172,1970746,1447245,781532,870991,1246926,1585097,...,1635173,1684937,1642435,1373391,2274245,2864704,673515,2968375,1843709,1187175
parcelid,14297519,17052889,14186244,12177905,10887214,17143294,12095076,12069064,12790562,11542646,...,12892446,12666457,10858613,10722691,12412492,10833991,11000655,17239384,12773139,12826780
airconditioningtypeid,,,,,1.0,,1.0,,,,...,,,1.0,1.0,,1.0,,,1.0,
architecturalstyletypeid,,,,,,,,,,,...,,,,,,,,,,
basementsqft,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
buildingclassdesc,,,,,,,,,,,...,,,,,,,,,,
heatingorsystemdesc,,,,Central,Central,,Central,,Central,Central,...,Central,Central,Central,Central,Central,Central,Central,,Central,Central
propertylandusedesc,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Condominium,Condominium,Single Family Residential,Single Family Residential,Single Family Residential,Condominium,...,Single Family Residential,Condominium,Condominium,Single Family Residential,Single Family Residential,Condominium,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential
storydesc,,,,,,,,,,,...,,,,,,,,,,


In [17]:
df_nulls['num_rows_missing'] = df.isna().sum(axis=0)
df_nulls

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,77565,77566,77567,77568,77569,77570,77571,77572,77573,num_rows_missing
id,1727539,1387261,11677,2288172,1970746,1447245,781532,870991,1246926,1585097,...,1684937,1642435,1373391,2274245,2864704,673515,2968375,1843709,1187175,0
parcelid,14297519,17052889,14186244,12177905,10887214,17143294,12095076,12069064,12790562,11542646,...,12666457,10858613,10722691,12412492,10833991,11000655,17239384,12773139,12826780,0
airconditioningtypeid,,,,,1.0,,1.0,,,,...,,1.0,1.0,,1.0,,,1.0,,52568
architecturalstyletypeid,,,,,,,,,,,...,,,,,,,,,,77368
basementsqft,,,,,,,,,,,...,,,,,,,,,,77524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
buildingclassdesc,,,,,,,,,,,...,,,,,,,,,,77559
heatingorsystemdesc,,,,Central,Central,,Central,,Central,Central,...,Central,Central,Central,Central,Central,Central,,Central,Central,28005
propertylandusedesc,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Condominium,Condominium,Single Family Residential,Single Family Residential,Single Family Residential,Condominium,...,Condominium,Condominium,Single Family Residential,Single Family Residential,Condominium,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,0
storydesc,,,,,,,,,,,...,,,,,,,,,,77524


In [18]:
df_nulls['pct_rows_missing'] = (df.isna().sum(axis=0)/df.shape[0])

In [19]:
df_null = df_nulls[['num_rows_missing','pct_rows_missing']]
df_null

Unnamed: 0,num_rows_missing,pct_rows_missing
id,0,0.000000
parcelid,0,0.000000
airconditioningtypeid,52568,0.677650
architecturalstyletypeid,77368,0.997344
basementsqft,77524,0.999355
...,...,...
buildingclassdesc,77559,0.999807
heatingorsystemdesc,28005,0.361010
propertylandusedesc,0,0.000000
storydesc,77524,0.999355


In [20]:
def nulls_by_col(df):
    num_missing = df.isnull().sum()
    rows = df.shape[0]
    pct_missing = num_missing / rows
    cols_missing = pd.DataFrame({'number_missing_rows': num_missing, 'percent_rows_missing': pct_missing})
    return cols_missing

In [21]:
nulls_by_col(df)

Unnamed: 0,number_missing_rows,percent_rows_missing
id,0,0.000000
parcelid,0,0.000000
airconditioningtypeid,52568,0.677650
architecturalstyletypeid,77368,0.997344
basementsqft,77524,0.999355
...,...,...
buildingclassdesc,77559,0.999807
heatingorsystemdesc,28005,0.361010
propertylandusedesc,0,0.000000
storydesc,77524,0.999355


# Prepare: 

### 1. Remove any properties that are likely to be something other than single unit properties. (e.g. no duplexes, no land/lot, ...). There are multiple ways to estimate that a property is a single unit, and there is not a single "right" answer.

In [22]:
# Restrict df to only properties that meet single unit criteria

single_use = [261, 262, 263, 264, 266, 268, 273, 276, 279]
df = df[df.propertylandusetypeid.isin(single_use)]
    
    
# Restrict df to only those properties with at least 1 bath & bed and >350 sqft area
df = df[(df.bedroomcnt > 0) & (df.bathroomcnt > 0) & ((df.unitcnt<=1)|df.unitcnt.isnull()) & (df.calculatedfinishedsquarefeet>350)]


In [23]:
def handle_missing_values(df, prop_required_column = .5, prop_required_row = .70):
    threshold = int(round(prop_required_column*len(df.index),0))
    df.dropna(axis=1, thresh=threshold, inplace=True)
    threshold = int(round(prop_required_row*len(df.columns),0))
    df.dropna(axis=0, thresh=threshold, inplace=True)
    return df

In [25]:
df = handle_missing_values(df)
df.head()

Unnamed: 0,id,parcelid,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,fullbathcnt,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock,logerror,transactiondate,heatingorsystemdesc,propertylandusedesc
0,1727539,14297519,3.5,4.0,,3.5,3100.0,3100.0,6059.0,3.0,...,485713.0,1023282.0,2016.0,537569.0,11013.72,60590630000000.0,0.025595,2017-01-01,,Single Family Residential
1,1387261,17052889,1.0,2.0,,1.0,1465.0,1465.0,6111.0,1.0,...,88000.0,464000.0,2016.0,376000.0,5672.48,61110010000000.0,0.055619,2017-01-01,,Single Family Residential
2,11677,14186244,2.0,3.0,,2.0,1243.0,1243.0,6059.0,2.0,...,85289.0,564778.0,2016.0,479489.0,6488.3,60590220000000.0,0.005383,2017-01-01,,Single Family Residential
3,2288172,12177905,3.0,4.0,8.0,3.0,2376.0,2376.0,6037.0,3.0,...,108918.0,145143.0,2016.0,36225.0,1777.51,60373000000000.0,-0.10341,2017-01-01,Central,Single Family Residential
4,1970746,10887214,3.0,3.0,8.0,3.0,1312.0,1312.0,6037.0,3.0,...,73681.0,119407.0,2016.0,45726.0,1533.89,60371240000000.0,0.00694,2017-01-01,Central,Condominium


In [26]:
# columns to remove based on outliers

cols_to_remove = ['id',
       'calculatedbathnbr', 'finishedsquarefeet12', 'fullbathcnt', 'heatingorsystemtypeid'
       ,'propertycountylandusecode', 'propertylandusetypeid','propertyzoningdesc', 
        'censustractandblock', 'propertylandusedesc', 'unitcnt']

In [27]:
def remove_columns(df, cols_to_remove):  
    df = df.drop(columns=cols_to_remove)
    return df

In [28]:
df = remove_columns(df, cols_to_remove)


In [29]:
df.isnull().sum()

parcelid                            0
bathroomcnt                         0
bedroomcnt                          0
buildingqualitytypeid           26514
calculatedfinishedsquarefeet        0
fips                                0
latitude                            0
longitude                           0
lotsizesquarefeet                8034
rawcensustractandblock              0
regionidcity                     1322
regionidcounty                      0
regionidzip                        40
roomcnt                             0
yearbuilt                          35
structuretaxvaluedollarcnt         75
taxvaluedollarcnt                   1
assessmentyear                      0
landtaxvaluedollarcnt               1
taxamount                           5
logerror                            0
transactiondate                     0
heatingorsystemdesc             24943
dtype: int64

In [30]:
# messy column
df.drop(columns = 'heatingorsystemdesc', inplace = True)

In [31]:
# fill in nulls
df.buildingqualitytypeid.fillna(6.0, inplace = True)

In [33]:
# fill in nulls
df.lotsizesquarefeet.fillna(7313, inplace = True)

In [34]:
df.dropna(inplace = True)

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69919 entries, 0 to 77573
Data columns (total 22 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      69919 non-null  int64  
 1   bathroomcnt                   69919 non-null  float64
 2   bedroomcnt                    69919 non-null  float64
 3   buildingqualitytypeid         69919 non-null  float64
 4   calculatedfinishedsquarefeet  69919 non-null  float64
 5   fips                          69919 non-null  float64
 6   latitude                      69919 non-null  float64
 7   longitude                     69919 non-null  float64
 8   lotsizesquarefeet             69919 non-null  float64
 9   rawcensustractandblock        69919 non-null  float64
 10  regionidcity                  69919 non-null  float64
 11  regionidcounty                69919 non-null  float64
 12  regionidzip                   69919 non-null  float64
 13  r

In [38]:
#Remove outliers: 
df = df[df.taxvaluedollarcnt < 5_000_000]
df[df.calculatedfinishedsquarefeet < 8000]

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedfinishedsquarefeet,fips,latitude,longitude,lotsizesquarefeet,rawcensustractandblock,...,regionidzip,roomcnt,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,logerror,transactiondate
0,14297519,3.5,4.0,6.0,3100.0,6059.0,33634931.0,-117869207.0,4506.0,6.059063e+07,...,96978.0,0.0,1998.0,485713.0,1023282.0,2016.0,537569.0,11013.72,0.025595,2017-01-01
1,17052889,1.0,2.0,6.0,1465.0,6111.0,34449266.0,-119281531.0,12647.0,6.111001e+07,...,97099.0,5.0,1967.0,88000.0,464000.0,2016.0,376000.0,5672.48,0.055619,2017-01-01
2,14186244,2.0,3.0,6.0,1243.0,6059.0,33886168.0,-117823170.0,8432.0,6.059022e+07,...,97078.0,6.0,1962.0,85289.0,564778.0,2016.0,479489.0,6488.30,0.005383,2017-01-01
3,12177905,3.0,4.0,8.0,2376.0,6037.0,34245180.0,-118240722.0,13038.0,6.037300e+07,...,96330.0,0.0,1970.0,108918.0,145143.0,2016.0,36225.0,1777.51,-0.103410,2017-01-01
4,10887214,3.0,3.0,8.0,1312.0,6037.0,34185120.0,-118414640.0,278581.0,6.037124e+07,...,96451.0,0.0,1964.0,73681.0,119407.0,2016.0,45726.0,1533.89,0.006940,2017-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77569,10833991,3.0,3.0,8.0,1741.0,6037.0,34202400.0,-118502000.0,59487.0,6.037132e+07,...,96415.0,0.0,1980.0,265000.0,379000.0,2016.0,114000.0,4685.34,-0.002245,2017-09-20
77570,11000655,2.0,2.0,6.0,1286.0,6037.0,34245368.0,-118282383.0,47405.0,6.037101e+07,...,96284.0,0.0,1940.0,70917.0,354621.0,2016.0,283704.0,4478.43,0.020615,2017-09-20
77571,17239384,2.0,4.0,6.0,1612.0,6111.0,34300140.0,-118706327.0,12105.0,6.111008e+07,...,97116.0,7.0,1964.0,50683.0,67205.0,2016.0,16522.0,1107.48,0.013209,2017-09-21
77572,12773139,1.0,3.0,4.0,1032.0,6037.0,34040895.0,-118038169.0,5074.0,6.037434e+07,...,96480.0,0.0,1954.0,32797.0,49546.0,2016.0,16749.0,876.43,0.037129,2017-09-21


In [39]:
# Function to read and wrangle data:

def wrangle_zillow():
    df = pd.read_csv('zillow.csv')
    
    # Restrict df to only properties that meet single unit use criteria
    single_use = [261, 262, 263, 264, 266, 268, 273, 276, 279]
    df = df[df.propertylandusetypeid.isin(single_use)]
    
    # Restrict df to only those properties with at least 1 bath & bed and 350 sqft area
    df = df[(df.bedroomcnt > 0) & (df.bathroomcnt > 0) & ((df.unitcnt<=1)|df.unitcnt.isnull())\
            & (df.calculatedfinishedsquarefeet>350)]

    # Handle missing values i.e. drop columns and rows based on a threshold
    df = handle_missing_values(df)
    
    # Add column for counties
    df['county'] = np.where(df.fips == 6037, 'Los_Angeles',
                           np.where(df.fips == 6059, 'Orange', 
                                   'Ventura'))    
    # drop columns not needed
    df = remove_columns(df, ['id',
       'calculatedbathnbr', 'finishedsquarefeet12', 'fullbathcnt', 'heatingorsystemtypeid'
       ,'propertycountylandusecode', 'propertylandusetypeid','propertyzoningdesc', 
        'censustractandblock', 'propertylandusedesc','heatingorsystemdesc','unitcnt'
                            ,'buildingqualitytypeid'])


    # replace nulls in unitcnt with 1
#     df.unitcnt.fillna(1, inplace = True)
    
    # assume that since this is Southern CA, null means 'None' for heating system
#     df.heatingorsystemdesc.fillna('None', inplace = True)
    
    # replace nulls with median values for select columns
    df.lotsizesquarefeet.fillna(7313, inplace = True)
#     df.buildingqualitytypeid.fillna(6.0, inplace = True)

    # Columns to look for outliers
    df = df[df.taxvaluedollarcnt < 5_000_000]
    df[df.calculatedfinishedsquarefeet < 8000]
    
    # Just to be sure we caught all nulls, drop them here
    df = df.dropna()
    
    return df

# Mall Customers: 
- Acquire data from the customers table in the mall_customers database.
- Summarize the data (include distributions and descriptive statistics).
- Detect outliers using IQR.
- Split data into train, validate, and test.
- Encode categorical columns using a one hot encoder (pd.get_dummies).
- Handles missing values.
- Scaling

In [48]:
def wrangle_mall_df():
    
    # acquire data
    sql = 'select * from customers'
    mall_df = get_mall_customers(sql)
    
    # handle outliers
    mall_df = outlier_function(mall_df, ['age', 'spending_score', 'annual_income'], 1.5)
    
    # get dummy for gender column
    dummy_df = pd.get_dummies(mall_df.gender, drop_first=True)
    mall_df = pd.concat([mall_df, dummy_df], axis=1).drop(columns = ['gender'])
    mall_df.rename(columns= {'Male': 'is_male'}, inplace = True)

    # split the data in train, validate and test
    train, test = train_test_split(mall_df, train_size = 0.8, random_state = 123)
    train, validate = train_test_split(train, train_size = 0.75, random_state = 123)
    
    return min_max_scaler, train, validate, test


In [49]:
def min_max_scaler(train, valid, test):
    '''
    Uses the train & test datasets created by the split_my_data function
    Returns 3 items: mm_scaler, train_scaled_mm, test_scaled_mm
    This is a linear transformation. Values will lie between 0 and 1
    '''
    num_vars = list(train.select_dtypes('number').columns)
    scaler = MinMaxScaler(copy=True, feature_range=(0,1))
    train[num_vars] = scaler.fit_transform(train[num_vars])
    valid[num_vars] = scaler.transform(valid[num_vars])
    test[num_vars] = scaler.transform(test[num_vars])
    return scaler, train, valid, test