### Zillow

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from summarize import df_summary
from scipy import stats
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
def keep_only_single_unit_properties(df):
    '''
    Return a new dataframe holding only the single unit properties
    (e.g. no duplexes, no land/lot, ...).

    Two fields are combined to decide whether a row is a single unit
    property: the land use description must be one of the KEEP values
    and unitcnt must equal 1.

    KEEP:
    'Single Family Residential', 'Condominium', 
    'Mobile Home', 'Manufactured, Modular, Prefabricated Homes',
    'Residential General', 'Townhouse'

    DROP
    'Commercial/Office/Residential Mixed Used','Cluster Home',
    'Planned Unit Development', 'Quadruplex (4 Units, Any Combination)',
    'Triplex (3 Units, Any Combination)', 'Cooperative', 
    'Duplex (2 Units, Any Combination)', 'Store/Office (Mixed Use)'

    Rows whose unitcnt is missing or cannot be read as a number are
    dropped as well. The input dataframe is not modified.
    '''
    keep = ('Single Family Residential', 'Condominium', 'Mobile Home', 
        'Manufactured, Modular, Prefabricated Homes', 
        'Residential General', 'Townhouse')

    is_single_unit_type = df['propertylandusedesc'].isin(keep)

    # unitcnt can arrive as text ('1') or as a number (1 / 1.0) depending
    # on how the file was written and parsed, so compare it numerically
    # instead of matching the string '1' only.
    unit_counts = pd.to_numeric(df['unitcnt'], errors='coerce')

    # copy so the caller gets an independent frame, never a view
    return df[is_single_unit_type & (unit_counts == 1)].copy()

In [3]:
def remove_columns(df, cols_to_remove):
    '''
    Return a new dataframe without the columns listed in cols_to_remove.
    The dataframe passed in is left as it was.
    '''
    return df.drop(cols_to_remove, axis=1)

def handle_missing_values(df, prop_required_column = .5, prop_required_row = .75):
    '''
    Drop sparse columns, then sparse rows, and return the result.

    prop_required_column: proportion (0-1) of rows that must be
        non-null for a column to be kept.
    prop_required_row: proportion (0-1) of the *remaining* columns that
        must be non-null for a row to be kept.

    Returns a new dataframe; the dataframe passed in is not modified.
    '''
    # minimum number of non-null values a column needs to survive
    min_filled_rows = int(round(prop_required_column*len(df.index),0))
    dense_columns = df.dropna(axis=1, thresh=min_filled_rows)

    # the row threshold is based on the columns left after the first pass
    min_filled_cols = int(round(prop_required_row*len(dense_columns.columns),0))
    return dense_columns.dropna(axis=0, thresh=min_filled_cols)

def data_prep(df, cols_to_remove=[], prop_required_column=.5, prop_required_row=.75):
    '''
    Run the two cleaning steps in order: remove the named columns, then
    drop the columns and rows that do not meet the required proportion
    of non-null values. Returns the cleaned dataframe.
    '''
    without_unwanted = remove_columns(df, cols_to_remove)
    return handle_missing_values(
        without_unwanted, prop_required_column, prop_required_row
    )

In [4]:
def amount_of_missing_values_in_columns(df):
    '''
    Report, per column, how many values are missing and what percent of
    the rows that represents.

    Three kinds of "missing" are counted separately and then totalled:
    real nulls, empty/blank strings ('' or ' '), and the literal
    strings 'nan' / 'NaN'.

    Returns a dataframe with one row per original column, restricted to
    the columns where at least one of the reported figures is non-zero.
    The input dataframe is only read, never modified.
    '''
    n_rows = df.shape[0]

    null_count = df.isnull().sum()
    null_percentage = (null_count / n_rows) * 100
    empty_count = ((df == ' ') | (df == '')).sum()
    empty_percentage = (empty_count / n_rows) * 100
    nan_count = ((df == 'nan') | (df == 'NaN')).sum()
    nan_percentage = (nan_count / n_rows) * 100
    total_count = null_count + empty_count + nan_count
    total_percentage = null_percentage + empty_percentage + nan_percentage

    missing_df = pd.DataFrame({'total_missing': total_count, 'total_percentage': total_percentage,
                        'num_missing': null_count, 'missing_percentage': null_percentage,
                        'num_empty': empty_count, 'empty_percentage': empty_percentage,
                        'nan_count': nan_count, 'nan_percentage': nan_percentage})

    # axis must be passed by keyword: pandas 2.x no longer accepts it
    # positionally for DataFrame.any
    return missing_df[(missing_df != 0).any(axis=1)]

In [5]:
def fix_bathroom_cnts(df):
    '''
    Fill the gaps in calculatedbathnbr and fullbathcnt.

    'bathroomcnt'	 Number of bathrooms in home including fractional bathrooms
    'calculatedbathnbr' (55 missing)	 Number of bathrooms in home including fractional bathroom
    'fullbathcnt'(55 missing)	 Number of full bathrooms (sink, shower + bathtub, and toilet) present in home

    Missing values in both latter fields are filled with the row's
    bathroomcnt. Returns a new dataframe; the input is not modified.
    '''
    df = df.copy()
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on df[col]: that chained in-place form does not update df when
    # pandas Copy-on-Write is active.
    for col in ('calculatedbathnbr', 'fullbathcnt'):
        df[col] = df[col].fillna(df['bathroomcnt'])
    return df

In [6]:
def convert_number_columns_to_appropriate_datatype(df):
    '''
    Accepts a zillow dataframe with no nulls in the converted columns
    and returns a new dataframe in which id-like columns are category
    type and count/year columns are 64-bit integers. Every other column
    keeps its dtype.

    Columns deliberately left as floats:
    calculatedfinishedsquarefeet, finishedsquarefeet12,
    latitude, longitude, lotsizesquarefeet,
    structuretaxvaluedollarcnt, taxvaluedollarcnt,
    landtaxvaluedollarcnt, taxamount, logerror

    Raises if an integer column still holds nulls, or if any expected
    column is absent. The input dataframe is not modified.
    '''
    # id-like columns: the numbers are labels, not quantities
    cat_cols = ['buildingqualitytypeid', 'fips', 'regionidcity', 
                'regionidcounty','regionidzip',]

    # counted / whole-number columns
    # NOTE(review): the cast truncates any fractional part (e.g. a
    # fractional bathroomcnt) -- confirm that is intended.
    int_cols = ['bathroomcnt', 'bedroomcnt', 'calculatedbathnbr',
                'fullbathcnt', 'rawcensustractandblock', 'roomcnt',
                'yearbuilt', 'assessmentyear', 'censustractandblock']

    # Use an explicit 64-bit integer rather than the platform-dependent
    # `int`: censustractandblock holds 14-digit values that do not fit
    # in 32 bits.
    target_dtypes = {col: 'category' for col in cat_cols}
    target_dtypes.update({col: 'int64' for col in int_cols})

    # astype returns a new dataframe, so the caller's frame is untouched
    return df.astype(target_dtypes)

In [7]:
def get_outliers(s, k):
    '''
    Given a numeric series and a cutoff multiplier, k, measure how far
    each observation lies outside the IQR fences of the series.

    The fences are q3 + k * iqr (upper) and q1 - k * iqr (lower).

    Returns a tuple (upper, lower) of two series aligned with s:
      upper - 0 if the point is not above the upper fence, otherwise
              the distance above it
      lower - 0 if the point is not below the lower fence, otherwise
              the distance below it

    The input series is not modified.
    '''
    q1, q3 = s.quantile([.25, .75])
    iqr = q3 - q1
    upper_bound = q3 + k * iqr
    lower_bound = q1 - k * iqr

    # vectorized distance past each fence; anything inside the fence
    # would be negative, so clip it up to 0
    upper = (s - upper_bound).clip(lower=0)
    lower = (lower_bound - s).clip(lower=0)

    return (upper, lower)

In [8]:
def add_outlier_columns(df, k):
    '''
    Return a copy of the dataframe in which every numeric column has
    two companion columns, <name>_upper_outliers and
    <name>_lower_outliers, filled from get_outliers with cutoff k.
    '''
    # work on a copy so the caller's dataframe is left alone
    result = df.copy()

    # fix the list of numeric columns before any new column is added
    numeric_names = list(result.select_dtypes('number').columns)

    for name in numeric_names:
        above, below = get_outliers(result[name], k)
        result[name + '_upper_outliers'] = above
        result[name + '_lower_outliers'] = below

    return result

In [9]:
def prepare_zillow(csv_path='zillow_data.csv'):
    '''
    Read the Zillow csv produced by the acquire step and return a
    cleaned dataframe indexed by parcelid.

    csv_path: location of the csv file; defaults to 'zillow_data.csv'
        in the working directory.

    Steps: keep single unit properties, drop unitcnt, drop sparse
    columns/rows, fill the bathroom count gaps, drop any row that still
    has a null, then convert the column datatypes.
    '''
    print('Reading Zillow data...')
    zillow_df = pd.read_csv(csv_path)
    
    print('Preparing and cleaning Zillow data...')
    # make the parcelid the index so we don't have the extra index column;
    # the id was added by SQL in creating file and is not needed
    zillow_df = zillow_df.set_index('parcelid').drop(columns='id')
    
    # keep only single unit properties
    zillow_df = keep_only_single_unit_properties(zillow_df)
    
    # unitcnt has done its job as a filter, so drop it now
    zillow_df = zillow_df.drop(columns=['unitcnt'])
    
    # dropping columns with more than 45% data missing in a column
    # and dropping rows with more than 25% data missing in the row
    zillow_df = data_prep(
        zillow_df,
        cols_to_remove=[],
        prop_required_column=.55,
        prop_required_row=.75
    )
    
    zillow_df = fix_bathroom_cnts(zillow_df)
    
    # drop the rows with nans in any fields (a row must now be 100% filled)
    zillow_df = data_prep(
        zillow_df,
        cols_to_remove=[],
        prop_required_column=.55,
        prop_required_row=1
    )
    
    # data is now clean, so convert the column datatypes appropriately
    zillow_df = convert_number_columns_to_appropriate_datatype(zillow_df)
    
    # TODO: add outliers function from Maggie
    
    print('Zillow data is now ready for analysis!')
    
    return zillow_df

In [None]:
# get the zillow data and set the parcelid as the index

zillow_df = pd.read_csv('zillow_data.csv')
# make the parcelid the index so we don't have the extra index column
zillow_df.set_index('parcelid', inplace=True)
# the id was added by SQL in creating file and is not needed
zillow_df.drop(columns='id', inplace=True) 
# DataFrame.ftypes was removed in pandas 1.0; dtypes reports the column types
print(zillow_df.dtypes)
#print(pd.concat([zillow_df.head(3), zillow_df.tail(3)]))

In [None]:
# df_summary(zillow_df)

In [None]:
# Show the five smallest landtaxvaluedollarcnt values.
# Original note: there are no missing fields in the landtaxvaluedollarcnt column.
# NOTE(review): sort_values places NaN last by default, so this head() cannot
# reveal nulls on its own -- confirm the claim with isnull().sum().
zillow_df.landtaxvaluedollarcnt.sort_values(ascending=True).head(5)

In [None]:
# filter down to single unit properties, then list what survived:
# first the unit counts, then the land use descriptions

zillow_df = keep_only_single_unit_properties(zillow_df)
print(zillow_df['unitcnt'].value_counts())
print(zillow_df['propertylandusedesc'].value_counts())

In [None]:
# unitcnt is no longer needed once the single unit filter has run
zillow_df = zillow_df.drop(columns='unitcnt')

#### Handle Missing Values
See how many values are missing in each column.

In [None]:
# missing-value report before any cleaning, worst columns first
amount_of_missing_values_in_columns(zillow_df).sort_values('total_missing', ascending=False)

In [None]:
# Note that only nulls are showing up... no empty strings, blanks, or 
# strings with the letters 'NaN' typed into them

In [None]:
# keep a column only if at least 55% of its rows are filled (drops columns
# with more than 45% missing), then keep a row only if at least 75% of the
# remaining columns are filled (drops rows with more than 25% missing)
zillow_df = data_prep(zillow_df, cols_to_remove=[], prop_required_column=.55, prop_required_row=.75)

In [None]:
# missing-value report after the sparse columns and rows were dropped
amount_of_missing_values_in_columns(zillow_df).sort_values('total_missing', ascending=False)

In [None]:
# (rows, columns) remaining at this point in the notebook
zillow_df.shape

In [None]:
# fill the gaps in calculatedbathnbr and fullbathcnt from bathroomcnt
zillow_df = fix_bathroom_cnts(zillow_df)

In [None]:
# missing-value report after the bathroom count columns were filled
amount_of_missing_values_in_columns(zillow_df).sort_values('total_missing', ascending=False)

In [None]:
# (rows, columns) after fix_bathroom_cnts, which fills values and drops nothing
zillow_df.shape

In [None]:
# drop every row that still has a nan in any field: prop_required_row=1
# means a row must be 100% filled; the 55% column rule is applied again first
zillow_df = data_prep(zillow_df, cols_to_remove=[], prop_required_column=.55, prop_required_row=1)

In [None]:
# final missing-value report, run after the rows with any nan were dropped
amount_of_missing_values_in_columns(zillow_df).sort_values('total_missing', ascending=False)

In [None]:
# names of the columns pandas currently treats as numeric
num_cols = zillow_df.select_dtypes(include='number').columns
num_cols

In [None]:
# data is now clean, so convert the column datatypes appropriately
# (the integer casts in the converter need columns without nulls),
# then print the resulting dtype of every column
zillow_df = convert_number_columns_to_appropriate_datatype(zillow_df)
print(zillow_df.dtypes)

In [None]:
# Add the columns for the outliers (IQR fences with k=1.5).
# NOTE: not idempotent -- the new *_outliers columns are numeric, so running
# this cell a second time adds outlier columns for the outlier columns too.
zillow_df = add_outlier_columns(zillow_df, k=1.5)

In [None]:
# Summarize the flagged observations in each outlier column.
# An outlier value is 0 when the point is inside the fences, otherwise it is
# the distance from the upper or lower bound, so only values above 0 are
# described here.

outlier_cols = [col for col in zillow_df.columns if col.endswith('_outliers')]
for col in outlier_cols:
    print('~~~\n' + col)
    data = zillow_df.loc[zillow_df[col] > 0, col]
    print(data.describe())

In [10]:
# run the whole pipeline end to end from the csv and preview the result
df = prepare_zillow()
df.head()

Reading Zillow data...
Preparing and cleaning Zillow data...
Zillow data is now ready for analysis!


Unnamed: 0_level_0,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,fullbathcnt,heatingorsystemdesc,latitude,...,roomcnt,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock,logerror,transactiondate
parcelid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11016594,2,3,4.0,2,1684.0,1684.0,6037.0,2,Central,34280990.0,...,0,1959,122754.0,360170.0,2015,237416.0,6735.88,60371066461001,0.0276,2016-01-01
12098116,3,2,4.0,3,2217.0,2217.0,6037.0,3,Central,34136312.0,...,0,1940,61994.0,119906.0,2015,57912.0,11484.48,60374638003004,-0.004,2016-01-01
12643413,2,2,4.0,2,839.0,839.0,6037.0,2,Central,33755800.0,...,0,1987,171518.0,244880.0,2015,73362.0,3048.74,60372963002002,0.0218,2016-01-02
11509835,4,4,1.0,4,3067.0,3067.0,6037.0,4,Central,33870089.0,...,0,1982,880650.0,2447951.0,2015,1567301.0,27126.57,60376210044006,-0.2705,2016-01-02
12286022,1,2,7.0,1,1297.0,1297.0,6037.0,1,Floor/Wall,33899475.0,...,0,1939,64549.0,111521.0,2015,46972.0,2304.97,60375416053007,0.044,2016-01-02
