In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np

# Exploring
import scipy.stats as stats

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# default pandas decimal number display format
pd.options.display.float_format = '{:20,.2f}'.format

In [2]:
import env
import acquire

In [3]:
# Load the raw dataset through the project-local `acquire` module.
# NOTE(review): presumably the Zillow property data, going by the function
# name; the data source and schema are not visible in this notebook — confirm.
df = acquire.acquire_zillow()

Force 'head' to show all columns

In [4]:
# No cap on the number of columns rendered, so wide frames are shown in full.
pd.options.display.max_columns = None

In [6]:
def remove_dup_col(df):
    """Return ``df`` restricted to the first occurrence of each column name.

    Columns whose name already appeared further left are left out of the
    result; the frame passed in is not modified.
    """
    # duplicated() flags every repeat of a name after its first appearance.
    first_occurrence = ~df.columns.duplicated()
    return df.loc[:, first_occurrence]

In [7]:
# Keep only the first column for every repeated column name.
df = remove_dup_col(df)

Inspect data types

In [8]:
df.dtypes

id                                int64
parcelid                          int64
logerror                        float64
transactiondate                  object
airconditioningtypeid           float64
architecturalstyletypeid        float64
basementsqft                    float64
bathroomcnt                     float64
bedroomcnt                      float64
buildingclasstypeid              object
buildingqualitytypeid           float64
calculatedbathnbr               float64
decktypeid                      float64
finishedfloor1squarefeet        float64
calculatedfinishedsquarefeet    float64
finishedsquarefeet12            float64
finishedsquarefeet13            float64
finishedsquarefeet15             object
finishedsquarefeet50            float64
finishedsquarefeet6             float64
fips                            float64
fireplacecnt                    float64
fullbathcnt                     float64
garagecarcnt                    float64
garagetotalsqft                 float64


Calculate the number and percent of missing values for each attribute

In [9]:
# def new_df(df):
#     num_rows_missing = df.isna().sum()
#     pct_rows_missing = num_rows_missing/len(df)*100
#     df_sum = pd.DataFrame()
#     df_sum['num_rows_missing'] = num_rows_missing
#     df_sum['pct_rows_missing'] = pct_rows_missing
#     return df_sum

In [10]:
# new_df(df)

In [11]:
def handle_missing_values(df, prop_required_column = .9, prop_required_row = .75):
    """Drop sparsely populated columns, then sparsely populated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is NOT modified; a new frame is returned.
    prop_required_column : float, default .9
        A column is kept only if at least this proportion of the rows
        (rounded to a whole row count) hold a non-null value in it.
    prop_required_row : float, default .75
        A row is kept only if at least this proportion of the columns that
        survived the column filter (rounded to a whole column count) hold a
        non-null value in it.

    Returns
    -------
    pandas.DataFrame
        The filtered frame.
    """
    # Column filter first: its result determines the row threshold below.
    min_filled_rows = int(round(prop_required_column * len(df.index), 0))
    kept_columns = df.dropna(axis=1, thresh=min_filled_rows)

    # The row threshold is relative to the columns that remain, not the
    # original column count.
    min_filled_cols = int(round(prop_required_row * len(kept_columns.columns), 0))
    return kept_columns.dropna(axis=0, thresh=min_filled_cols)


def data_prep(df, cols_to_remove=[], prop_required_column=.9, prop_required_row=.75):
    """Remove the named columns, then filter out sparse columns and rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prepare.
    cols_to_remove : list, default []
        Column labels to drop before the missing-value filtering. The default
        list is only read, never mutated.
    prop_required_column : float, default .9
        Forwarded to ``handle_missing_values``.
    prop_required_row : float, default .75
        Forwarded to ``handle_missing_values``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``handle_missing_values``.

    Raises
    ------
    KeyError
        If a label in ``cols_to_remove`` is not a column of ``df``.
    """
    # DataFrame.drop returns a new frame; the result has to be kept, otherwise
    # cols_to_remove is silently ignored.
    df = df.drop(columns=cols_to_remove)
    return handle_missing_values(df, prop_required_column, prop_required_row)

In [12]:
# Keep columns that are non-null in at least 90% of the rows, then keep rows
# that are non-null in at least 75% of the remaining columns. No columns are
# removed by name here (cols_to_remove is empty).
df = data_prep(df, cols_to_remove=[], prop_required_column=.9, prop_required_row=.75)

In [13]:
# def new_df(df):
#     num_rows_missing = df.isna().sum()
#     pct_rows_missing = num_rows_missing/len(df)*100
#     df_sum = pd.DataFrame()
#     df_sum['num_rows_missing'] = num_rows_missing
#     df_sum['pct_rows_missing'] = pct_rows_missing
#     return df_sum

In [14]:
# new_df(df).info()

In [15]:
def drop_col(df):
    """Return ``df`` without a fixed, hard-coded set of columns.

    The frame passed in is not modified. Every listed column must be present
    in ``df``; ``DataFrame.drop`` raises ``KeyError`` otherwise.
    """
    unwanted = [
        'bathroomcnt',
        'bedroomcnt',
        'finishedsquarefeet12',
        'fullbathcnt',
        'propertycountylandusecode',
        'propertylandusetypeid',
        'rawcensustractandblock',
        'regionidcity',
        'regionidzip',
        'censustractandblock',
        'propertylandusedesc',
        'transactiondate',
    ]
    return df.drop(columns=unwanted)

In [16]:
# Remove the hard-coded set of columns listed inside drop_col.
df = drop_col(df)

In [18]:
df.shape

(55720, 17)

In [20]:
# Normalise every missing-value marker to NaN. Uses numpy's `np.nan` directly:
# the `pd.np` alias was deprecated in pandas 1.0 and removed in 2.0.
# Reassigning rather than filling in place keeps the cell safe to re-run.
df = df.fillna(value=np.nan)

In [22]:
# Drop every row that still has a missing value in any remaining column.
df = df.dropna()

In [27]:
df.shape

(55293, 17)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55293 entries, 0 to 55719
Data columns (total 17 columns):
id                              55293 non-null int64
parcelid                        55293 non-null int64
logerror                        55293 non-null float64
calculatedbathnbr               55293 non-null float64
calculatedfinishedsquarefeet    55293 non-null float64
fips                            55293 non-null float64
latitude                        55293 non-null float64
longitude                       55293 non-null float64
lotsizesquarefeet               55293 non-null float64
regionidcounty                  55293 non-null float64
roomcnt                         55293 non-null float64
yearbuilt                       55293 non-null float64
structuretaxvaluedollarcnt      55293 non-null float64
taxvaluedollarcnt               55293 non-null float64
assessmentyear                  55293 non-null float64
landtaxvaluedollarcnt           55293 non-null float64
taxamount      