In [22]:
# Data is aquired from the company SQL Database, login credentials are required

#################################### Function Imports ##############################################

# OS allows us to check if the data is already stored on our computer
import os
# Pandas reads the data into the variable
import pandas as pd
# Holds login credentials for SQL Database in a seperate file not added to GitHub
# env should only be stored locally on your computer
# Add to your .gitignore file to ensure credentials not compromised by uploading online
from env import host, username, password

#################################### SQL Connection Function ##############################################

# Function uses Login credentials to create a connection to the company SQL database
# NOTE: BE SURE NOT TO ADD YOUR CREDENTIALS TO GITHUB WHEN RECREATING THE PROJECT
def get_db_url(db_name):

    '''
    Connect to the SQL database with credentials stored in env file.
    Function parameter is the name of the database to connect to.
    Returns url.
    '''
    
    # Creates the url and the function returns this url
    url = f'mysql+pymysql://{username}:{password}@{host}/{db_name}'
    return (url)

#################################### Acquire Zillow Home Data ##############################################

# Function connects to the SQL database to store the data in a variable which can be used throughout the project
# Saves the data as a .csv file, returns as a pandas data frame
def get_home_data():

    '''
    Connect to SQL Database with url function called within this function.
    Checks if database is already saved to computer in csv file.
    If no file found, saves to a csv file and assigns database to df variable.
    If file found, just assigns database to df variable.
    Returns df variable holding the  Home Value database.
    '''
    
    # data_name allows the function to work no matter what a user might have saved their file name as
    # First, we check if the data is already stored in the computer
    # First conditional runs if the data is not already stored in the computer
    if os.path.isfile('zillow_home.csv') == False:

        # Querry selects the whole dataframe, joing each table on their foriegn keys
        # We will have double columns on the foriegn keys because they are joined together
        sql_querry = '''
                        SELECT *
                        FROM properties_2017 as prop
                        JOIN predictions_2017 as pred ON pred.id = prop.id
                        WHERE prop.propertylandusetypeid IN (260, 261, 263, 264, 266, 279);
                        ;
                    '''

        # Connecting to the data base and using the querry above to select the data
        # the pandas read_sql function reads the query into a DataFrame
        df = pd.read_sql(sql_querry, get_db_url('zillow'))

        # We do not need the duplicate columns from the foriegn tables being joined
        # df.columns.duplicated() returns a boolean array, True for a duplicate or False if it is unique up to that point
        # Use ~ to flip the booleans and return the df as any columns that are not duplicated
        # df.loc accesses a group of rows and columns by label(s) or a boolean array
        df = df.loc[:,~df.columns.duplicated()]

        # The pandas to_csv function writes the data frame to a csv file
        # This allows data to be stored locally for quicker exploration and manipulation
        df.to_csv('zillow_home.csv')

    # This conditional runs if the data has already been saved as a csv (if the function has already been run on your computer)
    else:
        # Reads the csv saved from above, and assigns to the df variable
        df = pd.read_csv('zillow_home.csv', index_col=0)

    return df

    # Code to remove duplicates of a df found at:
    # https://www.interviewqs.com/ddi_code_snippets/remove_duplicate_cols

In [23]:
df = get_home_data()

In [24]:
df.shape

(70364, 61)

In [25]:
df.columns

Index(['id', 'parcelid', 'airconditioningtypeid', 'architecturalstyletypeid',
       'basementsqft', 'bathroomcnt', 'bedroomcnt', 'buildingclasstypeid',
       'buildingqualitytypeid', 'calculatedbathnbr', 'decktypeid',
       'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet',
       'finishedsquarefeet12', 'finishedsquarefeet13', 'finishedsquarefeet15',
       'finishedsquarefeet50', 'finishedsquarefeet6', 'fips', 'fireplacecnt',
       'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'hashottuborspa',
       'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet',
       'poolcnt', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7',
       'propertycountylandusecode', 'propertylandusetypeid',
       'propertyzoningdesc', 'rawcensustractandblock', 'regionidcity',
       'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt',
       'storytypeid', 'threequarterbathnbr', 'typeconstructiontypeid',
       'unitcnt', 'yardbuildingsqft17', 'yardb

In [30]:
for col in df.columns:
    print(col)
    print(df[col].isnull().sum())
    print((df[col].isnull().sum()/70364)*100)
    print(df[col].min())
    print(df[col].max())
    print('\n')

id
0
0.0
1
77613


parcelid
0
0.0
10711745
169601949


airconditioningtypeid
50658
71.99420158035359
1.0
13.0


architecturalstyletypeid
70213
99.7854016258314
2.0
21.0


basementsqft
70320
99.93746802342108
63.0
2443.0


bathroomcnt
0
0.0
0.0
20.0


bedroomcnt
0
0.0
0.0
25.0


buildingclasstypeid
70364
100.0
nan
nan


buildingqualitytypeid
25913
36.82707066113353
1.0
12.0


calculatedbathnbr
2187
3.1081234722301176
1.0
20.0


decktypeid
69877
99.30788471377409
66.0
66.0


finishedfloor1squarefeet
64695
91.94332329031892
47.0
15998.0


calculatedfinishedsquarefeet
1019
1.4481837303166392
20.0
26345.0


finishedsquarefeet12
1428
2.029446876243534
20.0
26345.0


finishedsquarefeet13
70154
99.70155192996418
224.0
2400.0


finishedsquarefeet15
70357
99.99005173099881
462.0
8348.0


finishedsquarefeet50
64695
91.94332329031892
47.0
15998.0


finishedsquarefeet6
70172
99.72713319311012
368.0
5254.0


fips
0
0.0
6037.0
6111.0


fireplacecnt
62158
88.3377863680291
1.0
9.0


fullbathcnt
2187
3.

TypeError: '<=' not supported between instances of 'str' and 'float'

In [31]:
for col in df.describe().columns:
    print(col)
    print(df[col].min())
    print(df[col].max())
    print('\n')

id
1
77613


parcelid
10711745
169601949


airconditioningtypeid
1.0
13.0


architecturalstyletypeid
2.0
21.0


basementsqft
63.0
2443.0


bathroomcnt
0.0
20.0


bedroomcnt
0.0
25.0


buildingqualitytypeid
1.0
12.0


calculatedbathnbr
1.0
20.0


decktypeid
66.0
66.0


finishedfloor1squarefeet
47.0
15998.0


calculatedfinishedsquarefeet
20.0
26345.0


finishedsquarefeet12
20.0
26345.0


finishedsquarefeet13
224.0
2400.0


finishedsquarefeet15
462.0
8348.0


finishedsquarefeet50
47.0
15998.0


finishedsquarefeet6
368.0
5254.0


fips
6037.0
6111.0


fireplacecnt
1.0
9.0


fullbathcnt
1.0
20.0


garagecarcnt
0.0
13.0


garagetotalsqft
0.0
3774.0


hashottuborspa
1.0
1.0


heatingorsystemtypeid
1.0
24.0


latitude
33339600.0
34806946.0


longitude
-119448392.0
-117555933.0


lotsizesquarefeet
167.0
6971010.0


poolcnt
1.0
1.0


poolsizesum
28.0
2176.0


pooltypeid10
1.0
1.0


pooltypeid2
1.0
1.0


pooltypeid7
1.0
1.0


propertylandusetypeid
260.0
266.0


rawcensustractandblock
60371011.101


In [35]:
df.propertyzoningdesc.value_counts()

LAR1          7157
LARS          1571
LAR3          1464
LBR1N         1184
LARA           771
              ... 
LCR3-R1200       1
CVA1*            1
LCRR1Y           1
SDSP10*          1
COPR*            1
Name: propertyzoningdesc, Length: 1788, dtype: int64