In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib as mpl
# Set the Agg backend's path chunk size to 10,000 so very long line paths are
# drawn in pieces, which keeps rendering of large plots fast.
mpl.rcParams['agg.path.chunksize'] = 10000

import matplotlib.pyplot as plt
# Only works inside notebook
%matplotlib inline 

# import preprocessing
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer

from sklearn.neighbors import KNeighborsClassifier

import QMCBT_00_quicktips as qt
import QMCBT_01_acquire as acquire
import QMCBT_02_prepare as prepare
#import QMCBT_03_explore as explore
#import QMCBT_04_evaluate as evaluate
import QMCBT_explore_evaluate as ee
import QMCBT_wrangle as w

from env import user, password, host
# allows import reload without needing to clear kernel and rerun
# reload(packagename) 
from importlib import reload

import warnings
# NOTE(review): this silences every warning category for the whole session,
# so deprecation and dtype warnings from the libraries above are hidden too.
# Consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [13]:
# Re-import the project helper modules so that edits made to them while testing
# take effect without restarting the kernel. QMCBT_03_explore and
# QMCBT_04_evaluate are not imported in this notebook, so they are not reloaded.
for helper_module in (qt, acquire, prepare, ee):
    reload(helper_module)

# Kept as the final expression so the cell still echoes the reloaded module.
reload(w)

<module 'QMCBT_wrangle' from '/Users/qmcbt/codeup-data-science/project-2_zillow/QMCBT_wrangle.py'>

In [2]:
# Read the Zillow data into a DataFrame from a local CSV file in the working
# directory. No database query is made here (presumably the CSV is a cached
# export of the Codeup db — confirm against the acquire helper).
df = pd.read_csv('zillow_2017_sfh_merge_all.csv')

# INITIAL PEEK AT DATAFRAME

In [4]:
# List every column label of the freshly loaded DataFrame
df.columns

Index(['parcelid', 'typeconstructiontypeid', 'storytypeid',
       'propertylandusetypeid', 'heatingorsystemtypeid', 'buildingclasstypeid',
       'architecturalstyletypeid', 'airconditioningtypeid', 'id',
       'basementsqft', 'bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid',
       'calculatedbathnbr', 'decktypeid', 'finishedfloor1squarefeet',
       'calculatedfinishedsquarefeet', 'finishedsquarefeet12',
       'finishedsquarefeet13', 'finishedsquarefeet15', 'finishedsquarefeet50',
       'finishedsquarefeet6', 'fips', 'fireplacecnt', 'fullbathcnt',
       'garagecarcnt', 'garagetotalsqft', 'hashottuborspa', 'latitude',
       'longitude', 'lotsizesquarefeet', 'poolcnt', 'poolsizesum',
       'pooltypeid10', 'pooltypeid2', 'pooltypeid7',
       'propertycountylandusecode', 'propertyzoningdesc',
       'rawcensustractandblock', 'regionidcity', 'regionidcounty',
       'regionidneighborhood', 'regionidzip', 'roomcnt', 'threequarterbathnbr',
       'unitcnt', 'yardbuildingsqft17',

# CHECK IS_NULL

In [5]:
# Count the NULL/NaN values in each column, largest count first. Columns with
# zero nulls are included; nothing is filtered out.
null_counts = df.isnull().sum().sort_values(ascending=False)

# Lift the row-display limit for this one print only, so no column is elided.
with pd.option_context('display.max_rows', None):
    print (null_counts)

finishedsquarefeet15            52442
buildingclassdesc               52442
buildingclasstypeid             52442
finishedsquarefeet13            52442
storytypeid                     52395
storydesc                       52395
basementsqft                    52395
yardbuildingsqft26              52379
architecturalstyletypeid        52372
architecturalstyledesc          52372
typeconstructiontypeid          52366
typeconstructiondesc            52366
fireplaceflag                   52361
finishedsquarefeet6             52277
decktypeid                      52053
pooltypeid10                    51998
poolsizesum                     51575
pooltypeid2                     51371
hashottuborspa                  50927
yardbuildingsqft17              50505
taxdelinquencyyear              50363
taxdelinquencyflag              50363
finishedfloor1squarefeet        48061
finishedsquarefeet50            48061
threequarterbathnbr             45718
fireplacecnt                    45199
pooltypeid7 

In [6]:
# Names of every column that contains at least one NULL/NaN value
df.columns[df.isna().any(axis=0)].tolist()

['typeconstructiontypeid',
 'storytypeid',
 'heatingorsystemtypeid',
 'buildingclasstypeid',
 'architecturalstyletypeid',
 'airconditioningtypeid',
 'basementsqft',
 'buildingqualitytypeid',
 'calculatedbathnbr',
 'decktypeid',
 'finishedfloor1squarefeet',
 'calculatedfinishedsquarefeet',
 'finishedsquarefeet12',
 'finishedsquarefeet13',
 'finishedsquarefeet15',
 'finishedsquarefeet50',
 'finishedsquarefeet6',
 'fireplacecnt',
 'fullbathcnt',
 'garagecarcnt',
 'garagetotalsqft',
 'hashottuborspa',
 'lotsizesquarefeet',
 'poolcnt',
 'poolsizesum',
 'pooltypeid10',
 'pooltypeid2',
 'pooltypeid7',
 'propertyzoningdesc',
 'regionidcity',
 'regionidneighborhood',
 'regionidzip',
 'threequarterbathnbr',
 'unitcnt',
 'yardbuildingsqft17',
 'yardbuildingsqft26',
 'yearbuilt',
 'numberofstories',
 'fireplaceflag',
 'structuretaxvaluedollarcnt',
 'taxvaluedollarcnt',
 'landtaxvaluedollarcnt',
 'taxamount',
 'taxdelinquencyflag',
 'taxdelinquencyyear',
 'censustractandblock',
 'airconditioningdes

In [7]:
# Boolean mask: True for the rows where taxamount is NULL/NaN
bool_series = df["taxamount"].isna()

# Display only the rows whose taxamount is missing
df.loc[bool_series]

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,propertylandusetypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,id,basementsqft,...,id.1,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
3598,11666665,,,261,2.0,,,,272990,,...,5366,-0.043649,2017-01-25,,,,Central,Single Family Residential,,
8300,11647249,,,261,2.0,,,,1451273,,...,12405,0.038192,2017-02-24,,,,Central,Single Family Residential,,
17546,11487838,,,261,7.0,,,,555782,,...,26165,0.689382,2017-04-13,,,,Floor/Wall,Single Family Residential,,
26352,12920381,,,261,7.0,,,,39253,,...,39122,-0.121397,2017-05-24,,,,Floor/Wall,Single Family Residential,,


In [17]:
# Boolean mask: True for the rows where regionidzip is NULL/NaN
bool_series = df["regionidzip"].isna()

# Display only the rows whose regionidzip is missing, transposed so that each
# record is a column and the wide frame is easier to read
df.loc[bool_series].T

Unnamed: 0,2794,6576,6809,8832,9001,12465,18028,20843,23350,23914,...,34702,37078,38925,41468,42087,42309,43346,48979,49222,50789
parcelid,167637054,167636468,167636512,167636496,167686999,17051025,13963237,17072012,17188942,167639152,...,167636088,167636727,17072013,17188862,79640490,167637371,17188974,167636193,167636726,167636339
propertylandusetypeid,261,261,261,261,261,261,261,261,261,261,...,261,261,261,261,261,261,261,261,261,261
id,1596111,1805230,2460645,2222347,775695,2654492,2907523,1864486,462233,74677,...,2103174,2684284,835164,447456,2133032,358149,2534994,686390,2669317,2863251
bathroomcnt,1.0,3.0,4.0,3.0,0.0,2.0,2.5,4.5,3.0,4.0,...,6.0,3.0,3.5,0.0,2.5,5.0,3.5,5.0,2.0,3.0
bedroomcnt,2,4,5,5,0,3,4,4,4,5,...,5,4,4,0,3,4,4,4,3,3
calculatedbathnbr,1.0,3.0,4.0,3.0,,2.0,2.5,4.5,3.0,4.0,...,6.0,3.0,3.5,,2.5,5.0,3.5,5.0,2.0,3.0
calculatedfinishedsquarefeet,756.0,2239.0,2608.0,2157.0,,1686.0,2734.0,3436.0,3008.0,4232.0,...,5113.0,2670.0,3586.0,,1688.0,4000.0,3098.0,5363.0,1537.0,2815.0
finishedsquarefeet12,756.0,2239.0,2608.0,2157.0,,1686.0,2734.0,3436.0,3008.0,4232.0,...,5113.0,2670.0,3586.0,,1688.0,4000.0,3098.0,5363.0,1537.0,2815.0
fips,6037,6037,6037,6037,6037,6111,6059,6111,6111,6037,...,6037,6037,6111,6111,6059,6037,6111,6037,6037,6037
fullbathcnt,1.0,3.0,4.0,3.0,,2.0,2.0,4.0,3.0,4.0,...,6.0,3.0,3.0,,2.0,5.0,3.0,5.0,2.0,3.0


In [18]:
# Preview the result of dropping all columns with more than 1,442 NULL/NaN.
# Nothing is assigned, so df itself is left unchanged.
# `thresh` is the minimum number of non-null values a column needs in order to
# be kept. It is derived from the current row count rather than hard-coded to
# 51,000 (= 52,442 rows - 1,442), so the preview stays correct if the number of
# rows in df changes.
df.dropna(axis='columns', thresh=len(df) - 1_442).isnull().sum().sort_values(ascending=False)

regionidcity                    1037
lotsizesquarefeet                369
finishedsquarefeet12             247
calculatedbathnbr                137
fullbathcnt                      137
censustractandblock              123
yearbuilt                        116
structuretaxvaluedollarcnt        84
calculatedfinishedsquarefeet      82
regionidzip                       26
taxamount                          4
landtaxvaluedollarcnt              1
taxvaluedollarcnt                  1
id.1                               0
logerror                           0
roomcnt                            0
transactiondate                    0
assessmentyear                     0
parcelid                           0
regionidcounty                     0
propertylandusetypeid              0
rawcensustractandblock             0
propertycountylandusecode          0
longitude                          0
latitude                           0
fips                               0
bedroomcnt                         0
b

In [19]:
# Drop all columns with more than 1,442 NULL/NaN.
# `thresh` is the minimum number of non-null values a column needs in order to
# be kept. It is derived from the current row count rather than hard-coded to
# 51,000 (= 52,442 rows - 1,442): with a fixed 51,000, re-running this cell
# after rows have been removed (leaving fewer than 51,000) would drop every
# column. On the first run the two forms are identical.
df = df.dropna(axis='columns', thresh=len(df) - 1_442)

In [20]:
# Check work: remaining NULL/NaN count per column, largest first
df.isna().sum().sort_values(ascending=False)

regionidcity                    1037
lotsizesquarefeet                369
finishedsquarefeet12             247
calculatedbathnbr                137
fullbathcnt                      137
censustractandblock              123
yearbuilt                        116
structuretaxvaluedollarcnt        84
calculatedfinishedsquarefeet      82
regionidzip                       26
taxamount                          4
landtaxvaluedollarcnt              1
taxvaluedollarcnt                  1
id.1                               0
logerror                           0
roomcnt                            0
transactiondate                    0
assessmentyear                     0
parcelid                           0
regionidcounty                     0
propertylandusetypeid              0
rawcensustractandblock             0
propertycountylandusecode          0
longitude                          0
latitude                           0
fips                               0
bedroomcnt                         0
b

In [21]:
# Confirm that dropping columns did not remove any records (row count only)
len(df)

52442

In [22]:
# Run the custom helper from QMCBT_explore_evaluate on the current DataFrame.
# Its output lists the NULL/NaN count per column; per the original note it
# also reports the percentage of the DataFrame affected (confirm against the
# helper's source).
ee.null_stats(df)

COUNT OF NULL/NaN PER COLUMN:
regionidcity                    1037
lotsizesquarefeet                369
finishedsquarefeet12             247
calculatedbathnbr                137
fullbathcnt                      137
censustractandblock              123
yearbuilt                        116
structuretaxvaluedollarcnt        84
calculatedfinishedsquarefeet      82
regionidzip                       26
taxamount                          4
landtaxvaluedollarcnt              1
taxvaluedollarcnt                  1
id.1                               0
logerror                           0
roomcnt                            0
transactiondate                    0
assessmentyear                     0
parcelid                           0
regionidcounty                     0
propertylandusetypeid              0
rawcensustractandblock             0
propertycountylandusecode          0
longitude                          0
latitude                           0
fips                               0
bedroomc

In [26]:
# Remove every row that still holds a NULL/NaN in any column; per the null
# statistics this affects only about 3% of the DataFrame.
df = df.dropna(axis='index', how='any')

In [27]:
# Check work: every column should now report zero NULL/NaN values
df.isna().sum().sort_values(ascending=False)

parcelid                        0
propertylandusetypeid           0
transactiondate                 0
logerror                        0
id.1                            0
censustractandblock             0
taxamount                       0
landtaxvaluedollarcnt           0
assessmentyear                  0
taxvaluedollarcnt               0
structuretaxvaluedollarcnt      0
yearbuilt                       0
roomcnt                         0
regionidzip                     0
regionidcounty                  0
regionidcity                    0
rawcensustractandblock          0
propertycountylandusecode       0
lotsizesquarefeet               0
longitude                       0
latitude                        0
fullbathcnt                     0
fips                            0
finishedsquarefeet12            0
calculatedfinishedsquarefeet    0
calculatedbathnbr               0
bedroomcnt                      0
bathroomcnt                     0
id                              0
propertylandus

# CHECK WHITESPACE

In [28]:
# Run the custom whitespace check from QMCBT_explore_evaluate. Its output
# reports how many whitespace characters were found and then lists the
# NULL/NaN count per column.
# NOTE(review): whether the helper modifies df in place is not visible here —
# confirm against the helper's source.
ee.check_whitespace(df)

There were 0 Whitespace characters found.


COUNT OF NULL/NaN PER COLUMN:
parcelid                        0
propertylandusetypeid           0
id                              0
bathroomcnt                     0
bedroomcnt                      0
calculatedbathnbr               0
calculatedfinishedsquarefeet    0
finishedsquarefeet12            0
fips                            0
fullbathcnt                     0
latitude                        0
longitude                       0
lotsizesquarefeet               0
propertycountylandusecode       0
rawcensustractandblock          0
regionidcity                    0
regionidcounty                  0
regionidzip                     0
roomcnt                         0
yearbuilt                       0
structuretaxvaluedollarcnt      0
taxvaluedollarcnt               0
assessmentyear                  0
landtaxvaluedollarcnt           0
taxamount                       0
censustractandblock             0
id.1                            0
logerror

In [29]:
# Find records whose transactiondate falls after 2017-12-31, i.e. outside the
# 2017 data set. The column is compared against the literal date string
# (presumably ISO 'YYYY-MM-DD' text as read from the CSV — confirm the dtype).
is_after_2017 = df['transactiondate'] > '2017-12-31'
rslt_df = df.loc[is_after_2017]
rslt_df

Unnamed: 0,parcelid,propertylandusetypeid,id,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,fullbathcnt,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock,id.1,logerror,transactiondate,propertylandusedesc
52441,13083743,261,2455685,2.0,3,2.0,1050.0,1050.0,6037,2.0,...,77118.0,259334.0,2016,182216.0,3345.78,60374080000000.0,77613,-0.197755,2018-05-25,Single Family Residential
