# ZILLOW

# Acquire

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from env import host, user, password

In [2]:
# Creating a string that connects me to MySQLWorkbench

def get_connection(db, user=user, host=host, password=password):
    '''
    get_connection uses login info from env.py file to access Codeup db.
    It takes in a string name of a database as an argument.
    '''
    return f'mysql+pymysql://{user}:{password}@{host}/{db}'
    

In [3]:
# Getting the data from the Codeup database

def get_zillow_data():
    '''
    zillow_data() gets the zillow (only properties_2017 table) data from Codeup db, then writes it to a csv file,
    and returns the DF.
    '''
    # Creating a SQL query
    sql_query = '''
                SELECT DISTINCT * FROM properties_2017
    LEFT JOIN propertylandusetype USING (propertylandusetypeid)
    LEFT JOIN predictions_2017 USING (parcelid)
    LEFT JOIN airconditioningtype USING (airconditioningtypeid)
    LEFT JOIN architecturalstyletype USING (architecturalstyletypeid)
    LEFT JOIN buildingclasstype USING (buildingclasstypeid)
    LEFT JOIN heatingorsystemtype USING (heatingorsystemtypeid)
    LEFT JOIN storytype USING (storytypeid)
    LEFT JOIN typeconstructiontype USING (typeconstructiontypeid)
    LEFT JOIN unique_properties USING (parcelid)
    WHERE propertylandusedesc = 'Condominium' OR 'Single Family Residential'
    OR 'Townhouse' OR 'Cluster Home' OR 'Bungalow' OR 'Patio Home'
    AND longitude IS NOT NULL
    AND latitude IS NOT NULL
    AND transactiondate BETWEEN '2017-01-01' AND '2017-12-31';
                '''
    
    # Reading in the DataFrame from Codeup db.
    df = pd.read_sql(sql_query, get_connection('zillow'))
    return df

In [4]:
# Assigning the dataframe to a variable and displaying of it to have a first look

houses = get_zillow_data()
houses.head()

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,propertylandusetypeid,id,basementsqft,...,propertylandusedesc,id.1,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,storydesc,typeconstructiondesc
0,10859051,,,2.0,,,1.0,266.0,1853,,...,Condominium,212.0,0.026735,2017-01-03,Central,,,Central,,
1,12859120,,,2.0,,,1.0,266.0,9323,,...,Condominium,369.0,-0.017184,2017-01-03,Central,,,Central,,
2,14632619,,,,,,,266.0,13031,,...,Condominium,3245.0,-0.064174,2017-01-17,,,,,,
3,11052658,,,2.0,,,1.0,266.0,17568,,...,Condominium,400.0,-0.005658,2017-01-03,Central,,,Central,,
4,11270466,,,2.0,,,1.0,266.0,18257,,...,Condominium,596.0,1.174471,2017-01-04,Central,,,Central,,


In [5]:
# Displaying number of rows and columns

houses.shape

(483837, 69)

In [6]:
# Displaying some general information about the data

houses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 483837 entries, 0 to 483836
Data columns (total 69 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   parcelid                      483837 non-null  int64  
 1   typeconstructiontypeid        4089 non-null    float64
 2   storytypeid                   16 non-null      float64
 3   heatingorsystemtypeid         321319 non-null  float64
 4   buildingclasstypeid           0 non-null       object 
 5   architecturalstyletypeid      3669 non-null    float64
 6   airconditioningtypeid         233279 non-null  float64
 7   propertylandusetypeid         483837 non-null  float64
 8   id                            483837 non-null  int64  
 9   basementsqft                  16 non-null      float64
 10  bathroomcnt                   483837 non-null  float64
 11  bedroomcnt                    483837 non-null  float64
 12  buildingqualitytypeid         286661 non-nul

In [None]:
# Writing houses to csv on my computer

houses.to_csv('houses.csv')

In [7]:
# Displaying a first statistical analysis of the data through .describe()

houses.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
parcelid,483837.0,14225630.0,13395490.0,10714520.0,11582470.0,12570710.0,14666180.0,168183300.0
typeconstructiontypeid,4089.0,6.011494,0.4049775,4.0,6.0,6.0,6.0,13.0
storytypeid,16.0,7.0,0.0,7.0,7.0,7.0,7.0,7.0
heatingorsystemtypeid,321319.0,3.916893,5.580052,1.0,2.0,2.0,2.0,24.0
architecturalstyletypeid,3669.0,7.212319,2.216204,2.0,7.0,7.0,7.0,27.0
airconditioningtypeid,233279.0,1.047338,0.6558377,1.0,1.0,1.0,1.0,13.0
propertylandusetypeid,483837.0,266.0,0.0,266.0,266.0,266.0,266.0,266.0
id,483837.0,1492971.0,860784.9,14.0,746131.0,1492125.0,2237380.0,2982283.0
basementsqft,16.0,443.0,169.014,104.0,330.0,539.0,539.25,618.0
bathroomcnt,483837.0,2.178589,0.750534,0.0,2.0,2.0,3.0,18.0


In [8]:
# Displaying statistical analysis of the data through.descibe()
# with astype('int64') to cut through all the noise of the float data

houses.describe().T.astype('int64')

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
parcelid,483837,14225634,13395488,10714522,11582473,12570713,14666180,168183267
typeconstructiontypeid,4089,6,0,4,6,6,6,13
storytypeid,16,7,0,7,7,7,7,7
heatingorsystemtypeid,321319,3,5,1,2,2,2,24
architecturalstyletypeid,3669,7,2,2,7,7,7,27
airconditioningtypeid,233279,1,0,1,1,1,1,13
propertylandusetypeid,483837,266,0,266,266,266,266,266
id,483837,1492971,860784,14,746131,1492125,2237380,2982283
basementsqft,16,443,169,104,330,539,539,618
bathroomcnt,483837,2,0,0,2,2,3,18


In [9]:
# Checking for duplicates

houses.columns.duplicated().any()

True

In [10]:
# Dropping the duplicates

houses = houses.loc[:,~houses.T.duplicated(keep=False)]

In [11]:
# Checking for duplicates again

houses.columns.duplicated().any()
# There are still duplicates

True

In [12]:
# Checking to see the number of columns left

houses.shape

(483837, 66)

In [13]:
# Running a list of the columns to detect the duplicates

houses.columns.tolist()

['parcelid',
 'typeconstructiontypeid',
 'storytypeid',
 'heatingorsystemtypeid',
 'architecturalstyletypeid',
 'airconditioningtypeid',
 'propertylandusetypeid',
 'id',
 'basementsqft',
 'bathroomcnt',
 'bedroomcnt',
 'buildingqualitytypeid',
 'calculatedbathnbr',
 'decktypeid',
 'finishedfloor1squarefeet',
 'calculatedfinishedsquarefeet',
 'finishedsquarefeet12',
 'finishedsquarefeet15',
 'finishedsquarefeet50',
 'finishedsquarefeet6',
 'fips',
 'fireplacecnt',
 'fullbathcnt',
 'garagecarcnt',
 'garagetotalsqft',
 'hashottuborspa',
 'latitude',
 'longitude',
 'lotsizesquarefeet',
 'poolcnt',
 'poolsizesum',
 'pooltypeid10',
 'pooltypeid2',
 'pooltypeid7',
 'propertycountylandusecode',
 'propertyzoningdesc',
 'rawcensustractandblock',
 'regionidcity',
 'regionidcounty',
 'regionidneighborhood',
 'regionidzip',
 'roomcnt',
 'threequarterbathnbr',
 'unitcnt',
 'yardbuildingsqft17',
 'yardbuildingsqft26',
 'yearbuilt',
 'numberofstories',
 'fireplaceflag',
 'structuretaxvaluedollarcnt',


**The duplicates are 2 id columns. They are not essential to the data exploration. I will drop them along with other id columns**

**However in case one of the columns are necessary in a different case, I have put together a function that rename the columns in order to differentiate them**

In [None]:
# # For loop to rename duplicate columns (particularly if they have the same name and not the same content)

# cols=pd.Series(df.columns)
# for dup in df.columns[df.columns.duplicated(keep=False)]: 
#     cols[df.columns.get_loc(dup)] = ([dup + '_' + str(d_idx) 
#                                      if d_idx != 0 
#                                      else dup 
#                                      for d_idx in range(df.columns.get_loc(dup).sum())]
#                                     )
# df.columns=cols

In [14]:
# Dropping id columns

houses = houses.drop(['typeconstructiontypeid',
 'storytypeid',
 'heatingorsystemtypeid',
 'architecturalstyletypeid',
 'airconditioningtypeid',
 'propertylandusetypeid',
 'id',
 'buildingqualitytypeid',
 'pooltypeid10',
 'pooltypeid2',
 'pooltypeid7',
 'decktypeid'], axis = 1)
houses.columns.tolist()

['parcelid',
 'basementsqft',
 'bathroomcnt',
 'bedroomcnt',
 'calculatedbathnbr',
 'finishedfloor1squarefeet',
 'calculatedfinishedsquarefeet',
 'finishedsquarefeet12',
 'finishedsquarefeet15',
 'finishedsquarefeet50',
 'finishedsquarefeet6',
 'fips',
 'fireplacecnt',
 'fullbathcnt',
 'garagecarcnt',
 'garagetotalsqft',
 'hashottuborspa',
 'latitude',
 'longitude',
 'lotsizesquarefeet',
 'poolcnt',
 'poolsizesum',
 'propertycountylandusecode',
 'propertyzoningdesc',
 'rawcensustractandblock',
 'regionidcity',
 'regionidcounty',
 'regionidneighborhood',
 'regionidzip',
 'roomcnt',
 'threequarterbathnbr',
 'unitcnt',
 'yardbuildingsqft17',
 'yardbuildingsqft26',
 'yearbuilt',
 'numberofstories',
 'fireplaceflag',
 'structuretaxvaluedollarcnt',
 'taxvaluedollarcnt',
 'assessmentyear',
 'landtaxvaluedollarcnt',
 'taxamount',
 'taxdelinquencyflag',
 'taxdelinquencyyear',
 'censustractandblock',
 'propertylandusedesc',
 'logerror',
 'transactiondate',
 'airconditioningdesc',
 'architectural

In [15]:
houses.shape

(483837, 53)

In [16]:
# A loop to print out the value_counts of the columns

for column in houses.columns:
    print(column)
    print(houses[column].value_counts())
    print('\n##########################\n')

parcelid
12068159    2
12641353    2
14634203    2
12541155    2
10777937    2
           ..
14743165    1
14742843    1
14742420    1
14737462    1
14675114    1
Name: parcelid, Length: 483789, dtype: int64

##########################

basementsqft
539.0    5
330.0    4
104.0    2
618.0    2
473.0    1
616.0    1
540.0    1
Name: basementsqft, dtype: int64

##########################

bathroomcnt
2.0     187481
3.0     124494
1.0      79116
2.5      63874
1.5      13393
4.0       7330
0.0       3734
3.5       3244
5.0        590
4.5        339
6.0        127
5.5         52
7.0         26
6.5         10
8.0          7
12.0         5
9.0          4
16.0         2
11.0         2
10.0         2
13.0         1
18.0         1
17.0         1
15.0         1
14.0         1
Name: bathroomcnt, dtype: int64

##########################

bedroomcnt
2.0     241600
3.0     149686
1.0      61308
4.0      21663
0.0       8284
5.0       1233
6.0         42
8.0          7
9.0          6
7.0          2
16

104023.0    346
101525.0    342
109038.0    301
200000.0    277
203050.0    268
           ... 
165039.0      1
31154.0       1
359270.0      1
500226.0      1
302845.0      1
Name: landtaxvaluedollarcnt, Length: 195174, dtype: int64

##########################

taxamount
1844.16     122
1045.88     105
1481.24      98
2264.43      88
2266.14      84
           ... 
1085.14       1
3630.22       1
5526.22       1
3868.02       1
10412.02      1
Name: taxamount, Length: 323732, dtype: int64

##########################

taxdelinquencyflag
Y    7650
Name: taxdelinquencyflag, dtype: int64

##########################

taxdelinquencyyear
15.0    3756
14.0    2243
13.0     635
12.0     408
11.0     251
10.0     191
9.0       97
8.0       35
7.0       20
6.0        5
5.0        3
98.0       2
0.0        1
2.0        1
90.0       1
3.0        1
96.0       1
Name: taxdelinquencyyear, dtype: int64

##########################

censustractandblock
6.037137e+13    881
6.059063e+13    807
6.037277e+1

In [None]:
# # This code allows me to display the count of nulls in each column mentioned
# # Still need to shape it into a function or a loop

# houses[['bedroomcnt',
#  'calculatedbathnbr',
#  'finishedfloor1squarefeet',
#  'calculatedfinishedsquarefeet',
#  'finishedsquarefeet12',
#  'finishedsquarefeet15',
#  'finishedsquarefeet50',
#  'finishedsquarefeet6']].isna().sum().reset_index(name="n").plot.bar(x='index', y='n', rot=75)

In [18]:
# Trying my function

def multi_frequency(df,vars):
    '''multi_frequency takes a dataframe in *arg and a *kwarg in the form of a list of columns
    and return a dataframe with the count and the frequency of the data
    '''
    frequency=df[vars].isnull().sum()
    percentage=df[vars].isnull().sum()*100/(len(df))
    df=pd.concat([frequency,percentage], axis=1, keys=['num_rows_missing', 'pct_rows_missing'])
    return df
multi_frequency(houses, ['latitude'])

Unnamed: 0,num_rows_missing,pct_rows_missing
latitude,0,0.0


In [19]:
# Displaying the columns and their the count and percent of missing rows ordered by percent in a descending order

multi_frequency(houses, houses.columns).sort_values(by='pct_rows_missing', ascending=False)

Unnamed: 0,num_rows_missing,pct_rows_missing
storydesc,483821,99.996693
basementsqft,483821,99.996693
finishedsquarefeet6,483781,99.988426
yardbuildingsqft26,483772,99.986566
poolsizesum,483764,99.984912
finishedsquarefeet15,483629,99.95701
hashottuborspa,483134,99.854703
fireplaceflag,481523,99.52174
architecturalstyledesc,480168,99.241687
typeconstructiondesc,479748,99.154881


In [22]:
# def multi_frequency_rows(df,vars):
#     '''multi_frequency takes a dataframe in *arg and a *kwarg in the form of a list of columns
#     and return a dataframe with the count and the frequency of the data
#     '''
#     frequency=df[vars].isnull().sum(axis=1)
#     percentage=df[vars].isnull().sum(axis=1)*100/(len(df))
#     df=pd.concat([frequency,percentage], axis=1, keys=['num_entries_missing', 'pct_entries_missing'])
#     return df
# multi_frequency(houses, ['basementsqft'])

In [23]:
# Assigning the table above to a variable and selecting a cut off for the percentage of rows missing to eliminate

frequency = multi_frequency(houses, houses.columns).sort_values(by='pct_rows_missing', ascending=False)
frequency.pct_rows_missing >= 33.589411

storydesc                        True
basementsqft                     True
finishedsquarefeet6              True
yardbuildingsqft26               True
poolsizesum                      True
finishedsquarefeet15             True
hashottuborspa                   True
fireplaceflag                    True
architecturalstyledesc           True
typeconstructiondesc             True
yardbuildingsqft17               True
taxdelinquencyflag               True
taxdelinquencyyear               True
fireplacecnt                     True
transactiondate                  True
logerror                         True
finishedsquarefeet50             True
finishedfloor1squarefeet         True
numberofstories                  True
threequarterbathnbr              True
poolcnt                          True
garagecarcnt                     True
garagetotalsqft                  True
regionidneighborhood             True
airconditioningdesc              True
lotsizesquarefeet                True
unitcnt     

In [25]:
# Percent of columns that is missing +30% of its entries

percent_nullvalues_columns = (len(frequency[frequency.pct_rows_missing >= 33.589411]))/len(frequency)

In [26]:
percent_nullvalues_columns

0.5471698113207547

## Takeaways
- 54.71% of the data is missing between 33% and 100% of its values
- This makes me think that the best cut off would be the 33% threshold
- Many columns are id columns and should be removed. They have no impact on the analysis of the data and teh dataframe already has parcelid as unique identifier. Here is the list of the columns to drop:
     'typeconstructiontypeid',
     'storytypeid',
     'heatingorsystemtypeid',
     'buildingclasstypeid',
     'architecturalstyletypeid',
     'airconditioningtypeid',
     'propertylandusetypeid',
     'id',
     'buildingqualitytypeid',
     'id_1'

# Prepare

## 1. Remove any properties that are likely to be something other than single unit properties.

**Single unit properties were selected in the SQL query. The code for it is shown in the SQL query**

## 2. Create a function that will drop rows or columns based on the percent of values that are missing

### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;A. Handling missing values

In [31]:
def handle_missing_values(df, prop_required_column, prop_required_row):
    '''
    handle_missing_values calculates the number of recquired columns and rows
    based on an arbitrary prop_required_row/column float times the number
    of columns/rows respectively as ordered in the function
    then uses the thresh argument to apply that number to the dropna
    '''
    required_columns = df.shape[0] * prop_required_column
    required_rows = df.shape[1] * prop_required_row
    
    df = df.dropna(axis=0, thresh = required_rows)
    df = df.dropna(axis=1, thresh = required_columns)
    return df

In [32]:
houses = handle_missing_values(houses, prop_required_row=.7, prop_required_column=.7) 
houses.head()

In [None]:
def examine_and_prepare()

In [None]:
def _maybe_dedup_names(self, names):
    # see gh-7160 and gh-9424: this helps to provide
    # immediate alleviation of the duplicate names
    # issue and appears to be satisfactory to users,
    # but ultimately, not needing to butcher the names
    # would be nice!
    if self.mangle_dupe_cols:
        names = list(names)  # so we can index
        counts = {}

        for i, col in enumerate(names):
            cur_count = counts.get(col, 0)

            if cur_count > 0:
                names[i] = '%s.%d' % (col, cur_count)

            counts[col] = cur_count + 1

    return names

In [None]:
figure, ax1 = plt.subplots()
ax1.plot(df.iloc[:,0],df.iloc[:,1],linewidth=0.5,zorder=1, label = )
ax1.plot(df.iloc[:,0],df.iloc[:,2],linewidth=0.5,zorder=1, label = )