<div class="alert alert-success">


# IMPORTS

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib as mpl
mpl.rcParams['agg.path.chunksize'] = 10000 # assists with processor speed

import matplotlib.pyplot as plt
# Only works inside notebook
%matplotlib inline 

# import preprocessing
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer

from sklearn.neighbors import KNeighborsClassifier

import QMCBT_00_quicktips as qt
import QMCBT_01_acquire as acquire
import QMCBT_02_prepare as prepare
#import QMCBT_03_explore as explore
#import QMCBT_04_evaluate as evaluate
import QMCBT_explore_evaluate as ee
import QMCBT_wrangle as w

from env import user, password, host
# allows import reload without needing to clear kernel and rerun
# reload(packagename) 
from importlib import reload

import warnings
warnings.filterwarnings("ignore")

In [2]:
# This code refreshes all of my helper files (w/o the need to stop the kernel) as I continuously test and update
reload(qt)
reload(acquire)
reload(prepare)
#reload(explore)
#reload(evaluate)
reload(ee)
reload(w)

<module 'QMCBT_wrangle' from '/Users/qmcbt/codeup-data-science/project-2_zillow/QMCBT_wrangle.py'>

<div class="alert alert-warning">

    
## Get Data

In [3]:
query = """SELECT *
FROM predictions_2017
LEFT JOIN unique_properties USING (parcelid)
LEFT JOIN properties_2017 USING (parcelid)
LEFT JOIN airconditioningtype USING (airconditioningtypeid)
LEFT JOIN architecturalstyletype USING (architecturalstyletypeid)
LEFT JOIN buildingclasstype USING (buildingclasstypeid)
LEFT JOIN heatingorsystemtype USING (heatingorsystemtypeid)
LEFT JOIN propertylandusetype USING (propertylandusetypeid)
LEFT JOIN storytype USING (storytypeid)
LEFT JOIN typeconstructiontype USING (typeconstructiontypeid)
WHERE propertylandusetypeid = 261 and transactiondate LIKE '2017%'
"""

In [4]:
# Read in DataFrame from Codeup db using defined arguments.
df = pd.read_csv('zillow_2017_sfh_merge_all.csv')

<div class="alert alert-success">



# INITIAL PEEK AT DATAFRAME

In [5]:
df.shape

(52441, 69)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52441 entries, 0 to 52440
Data columns (total 69 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   typeconstructiontypeid        76 non-null     float64
 1   storytypeid                   47 non-null     float64
 2   propertylandusetypeid         52441 non-null  int64  
 3   heatingorsystemtypeid         33935 non-null  float64
 4   buildingclasstypeid           0 non-null      float64
 5   architecturalstyletypeid      70 non-null     float64
 6   airconditioningtypeid         13638 non-null  float64
 7   parcelid                      52441 non-null  int64  
 8   id                            52441 non-null  int64  
 9   logerror                      52441 non-null  float64
 10  transactiondate               52441 non-null  object 
 11  id.1                          52441 non-null  int64  
 12  basementsqft                  47 non-null     float64
 13  b

In [7]:
ee.nunique_column_objects(df)

transactiondate has 257 unique values.
propertycountylandusecode has 26 unique values.
propertyzoningdesc has 1286 unique values.
taxdelinquencyflag has 1 unique values.
airconditioningdesc has 4 unique values.
architecturalstyledesc has 5 unique values.
heatingorsystemdesc has 9 unique values.
propertylandusedesc has 1 unique values.
storydesc has 1 unique values.
typeconstructiondesc has 2 unique values.


In [8]:
ee.nunique_column_qty(df)

typeconstructiontypeid has 2 unique values.
storytypeid has 1 unique values.
propertylandusetypeid has 1 unique values.
heatingorsystemtypeid has 9 unique values.
buildingclasstypeid has 0 unique values.
architecturalstyletypeid has 5 unique values.
airconditioningtypeid has 4 unique values.
parcelid has 52320 unique values.
id has 52441 unique values.
logerror has 52180 unique values.
id.1 has 52320 unique values.
basementsqft has 41 unique values.
bathroomcnt has 22 unique values.
bedroomcnt has 14 unique values.
buildingqualitytypeid has 12 unique values.
calculatedbathnbr has 21 unique values.
decktypeid has 1 unique values.
finishedfloor1squarefeet has 1553 unique values.
calculatedfinishedsquarefeet has 4723 unique values.
finishedsquarefeet12 has 4716 unique values.
finishedsquarefeet13 has 0 unique values.
finishedsquarefeet15 has 0 unique values.
finishedsquarefeet50 has 1572 unique values.
finishedsquarefeet6 has 155 unique values.
fips has 3 unique values.
fireplacecnt has 5

In [9]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df.describe(include='all'))

        typeconstructiontypeid  storytypeid  propertylandusetypeid  \
count                76.000000         47.0                52441.0   
unique                     NaN          NaN                    NaN   
top                        NaN          NaN                    NaN   
freq                       NaN          NaN                    NaN   
mean                  5.973684          7.0                  261.0   
std                   0.229416          0.0                    0.0   
min                   4.000000          7.0                  261.0   
25%                   6.000000          7.0                  261.0   
50%                   6.000000          7.0                  261.0   
75%                   6.000000          7.0                  261.0   
max                   6.000000          7.0                  261.0   

        heatingorsystemtypeid  buildingclasstypeid  architecturalstyletypeid  \
count            33935.000000                  0.0                  70.00000   

In [10]:
# list columns
df.columns

Index(['typeconstructiontypeid', 'storytypeid', 'propertylandusetypeid',
       'heatingorsystemtypeid', 'buildingclasstypeid',
       'architecturalstyletypeid', 'airconditioningtypeid', 'parcelid', 'id',
       'logerror', 'transactiondate', 'id.1', 'basementsqft', 'bathroomcnt',
       'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr',
       'decktypeid', 'finishedfloor1squarefeet',
       'calculatedfinishedsquarefeet', 'finishedsquarefeet12',
       'finishedsquarefeet13', 'finishedsquarefeet15', 'finishedsquarefeet50',
       'finishedsquarefeet6', 'fips', 'fireplacecnt', 'fullbathcnt',
       'garagecarcnt', 'garagetotalsqft', 'hashottuborspa', 'latitude',
       'longitude', 'lotsizesquarefeet', 'poolcnt', 'poolsizesum',
       'pooltypeid10', 'pooltypeid2', 'pooltypeid7',
       'propertycountylandusecode', 'propertyzoningdesc',
       'rawcensustractandblock', 'regionidcity', 'regionidcounty',
       'regionidneighborhood', 'regionidzip', 'roomcnt', 'threequarterbat

In [11]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,52431,52432,52433,52434,52435,52436,52437,52438,52439,52440
typeconstructiontypeid,,,,,,,,,,,...,,,,,,,,,,
storytypeid,,,,,,,,,,,...,,,,,,,,,,
propertylandusetypeid,261,261,261,261,261,261,261,261,261,261,...,261,261,261,261,261,261,261,261,261,261
heatingorsystemtypeid,,,,2.0,2.0,,2.0,2.0,,,...,2.0,7.0,,2.0,2.0,2.0,2.0,,2.0,2.0
buildingclasstypeid,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
buildingclassdesc,,,,,,,,,,,...,,,,,,,,,,
heatingorsystemdesc,,,,Central,Central,,Central,Central,,,...,Central,Floor/Wall,,Central,Central,Central,Central,,Central,Central
propertylandusedesc,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,...,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential
storydesc,,,,,,,,,,,...,,,,,,,,,,


<div class="alert alert-success">


## EXPLORE & PREPARE DATA

<div class="alert alert-warning">


## Clean Whitespace

In [12]:
ee.check_whitespace(df)

Cleaning 52441 Whitespace characters found and replaced with NULL/NaN.
Resulting in 0 additional rows containing NULL/NaN.


COUNT OF NULL/NaN PER COLUMN:
buildingclassdesc               52441
buildingclasstypeid             52441
finishedsquarefeet15            52441
finishedsquarefeet13            52441
basementsqft                    52394
storydesc                       52394
storytypeid                     52394
yardbuildingsqft26              52378
architecturalstyletypeid        52371
architecturalstyledesc          52371
typeconstructiontypeid          52365
typeconstructiondesc            52365
fireplaceflag                   52360
finishedsquarefeet6             52276
decktypeid                      52052
pooltypeid10                    51997
poolsizesum                     51574
pooltypeid2                     51370
hashottuborspa                  50926
yardbuildingsqft17              50504
taxdelinquencyyear              50362
taxdelinquencyflag              50362
finisheds

<div class="alert alert-warning">


## Clean Duplicates

In [13]:
# Run custom Function that checks for duplicate Rows and Columns
ee.check_duplicates(df)

There are 0 duplicate ROWS.
No Action Needed.


There are 3 duplicate COLUMNS that need to be removed.
Copy, Paste, and Run the following Code: "df=df.T.drop_duplicates().T"

This is the list of Dupliacate Columns:
['buildingclassdesc', 'finishedsquarefeet13', 'finishedsquarefeet15']


<div class="alert alert-info">


## OBSERVATION:
    
#### Duplicated Columns
* finishedsquarefeet13
* finishedsquarefeet15
* buildingclassdesc

#### There were four completely Null/NaN Columns
#### buildingclasstypeid was kept because it was the first column
* buildingclasstypeid             52441
* finishedsquarefeet13            52441
* finishedsquarefeet15            52441
* buildingclassdesc               52441

### These will be handled when we clean Null/NaN

<div class="alert alert-warning">


## Clean ```parcelid``` Duplicates

In [14]:
df.parcelid.count()

52441

In [15]:
df.parcelid.nunique()

52320

In [16]:
df.parcelid.count() - df.parcelid.nunique()

121

In [17]:
print(df.parcelid[df.parcelid.duplicated()].drop_duplicates().count())

120


In [18]:
df.shape

(52441, 69)

In [49]:
with pd.option_context('display.max_rows', None): 
    print(df.parcelid.value_counts().head(121))

11991059     3
11957553     2
12478591     2
12035592     2
12443331     2
12621730     2
11451345     2
11420117     2
14092694     2
12870253     2
14532131     2
11603473     2
12121210     2
12057023     2
14079874     2
12114701     2
12955531     2
11717962     2
17225336     2
14236060     2
11721753     2
14008322     2
11969146     2
17165666     2
11967869     2
11552513     2
10879060     2
11446756     2
11917650     2
12982361     2
11961462     2
12196319     2
14097534     2
11429175     2
14448410     2
11696784     2
12492881     2
12847318     2
17280166     2
14012730     2
12941764     2
10833654     2
12285822     2
13071085     2
12827519     2
12575721     2
11061050     2
12178305     2
12535098     2
17098564     2
10979425     2
10739478     2
13083743     2
14088988     2
13880422     2
11733550     2
11389003     2
10779619     2
12519794     2
11499166     2
12814323     2
11705026     2
10722858     2
11797465     2
11921077     2
12137395     2
14430658  

In [51]:
column_list_original = list(df.parcelid)
column_list_original_df = pd.DataFrame(list(df.parcelid))
column_list_original_df.shape[0]

52441

In [50]:
column_list_original

[14297519,
 17052889,
 14186244,
 12177905,
 12095076,
 12069064,
 12790562,
 11104527,
 13944538,
 17110996,
 14375300,
 11830315,
 14387959,
 14349322,
 11706737,
 12531488,
 14314879,
 11130689,
 12036177,
 14333888,
 12713253,
 14338242,
 11680471,
 12155445,
 14344863,
 10852826,
 10871883,
 10900115,
 11737342,
 12106936,
 11405344,
 13043344,
 11036179,
 14455676,
 14463877,
 10828742,
 13022358,
 13007403,
 14452416,
 14448439,
 12464464,
 12840475,
 13091973,
 14432426,
 11016518,
 14434171,
 11018202,
 10808296,
 11023776,
 12871444,
 12879860,
 13069118,
 13055657,
 12481281,
 10736968,
 14191835,
 10935174,
 14284348,
 12579560,
 14201161,
 10925280,
 12218511,
 14121694,
 12227278,
 14249444,
 11652563,
 10871205,
 14210616,
 14296692,
 10901522,
 14434754,
 14179809,
 12188496,
 11878863,
 14176622,
 14260184,
 10726564,
 12346913,
 13880979,
 11309647,
 12350928,
 13872686,
 14170679,
 14172985,
 13854120,
 13850030,
 11177755,
 17134185,
 13840290,
 13101300,
 17292247,

In [20]:
column_list_unique = list(df.parcelid.unique())
column_list_unique_df = pd.DataFrame(column_list_unique)
column_list_unique_df.shape[0]

52320

In [21]:
column_list_unique

[14297519,
 17052889,
 14186244,
 12177905,
 12095076,
 12069064,
 12790562,
 11104527,
 13944538,
 17110996,
 14375300,
 11830315,
 14387959,
 14349322,
 11706737,
 12531488,
 14314879,
 11130689,
 12036177,
 14333888,
 12713253,
 14338242,
 11680471,
 12155445,
 14344863,
 10852826,
 10871883,
 10900115,
 11737342,
 12106936,
 11405344,
 13043344,
 11036179,
 14455676,
 14463877,
 10828742,
 13022358,
 13007403,
 14452416,
 14448439,
 12464464,
 12840475,
 13091973,
 14432426,
 11016518,
 14434171,
 11018202,
 10808296,
 11023776,
 12871444,
 12879860,
 13069118,
 13055657,
 12481281,
 10736968,
 14191835,
 10935174,
 14284348,
 12579560,
 14201161,
 10925280,
 12218511,
 14121694,
 12227278,
 14249444,
 11652563,
 10871205,
 14210616,
 14296692,
 10901522,
 14434754,
 14179809,
 12188496,
 11878863,
 14176622,
 14260184,
 10726564,
 12346913,
 13880979,
 11309647,
 12350928,
 13872686,
 14170679,
 14172985,
 13854120,
 13850030,
 11177755,
 17134185,
 13840290,
 13101300,
 17292247,

In [23]:
column_list_duplicates = list(df.parcelid[df.parcelid.duplicated()].drop_duplicates())
column_list_duplicates_df = pd.DataFrame(column_list_duplicates)
column_list_duplicates_df.shape[0]

120

In [24]:
column_list_duplicates

[11721753,
 11289917,
 11705026,
 14269464,
 11446756,
 10739478,
 13973642,
 11389003,
 11967869,
 11429175,
 11696784,
 11921077,
 11499166,
 11391972,
 13960284,
 11743374,
 12870253,
 11460552,
 14236060,
 12982361,
 14365030,
 14008322,
 11552513,
 12478591,
 12535098,
 12575721,
 12443331,
 12955531,
 11391577,
 13880422,
 12814323,
 12385712,
 10976131,
 11658743,
 12057023,
 11187927,
 11420117,
 11451345,
 10984661,
 12196319,
 12827519,
 11957553,
 11367981,
 11312124,
 11711539,
 12035592,
 14079874,
 13071085,
 10732347,
 10722858,
 12114701,
 13083743,
 17086759,
 13921492,
 17193966,
 11969146,
 12178305,
 11289757,
 11961462,
 12492881,
 12285822,
 12519794,
 14092694,
 11460921,
 12749741,
 11797465,
 13863275,
 11061050,
 11733550,
 10779619,
 11917650,
 14012730,
 14430658,
 11830465,
 17165634,
 10811539,
 12048224,
 17165666,
 17136356,
 14010551,
 12811794,
 10833654,
 13885693,
 12892594,
 12347492,
 12941764,
 10871677,
 14257065,
 11499751,
 11603473,
 14532131,

In [60]:
df2 = df[df['parcelid'].isin(column_list_duplicates)]
df2.T

Unnamed: 0,671,672,834,835,1195,1196,1380,1381,1795,1796,...,35441,35442,36900,36901,38620,38621,40594,40595,43477,43478
typeconstructiontypeid,,,,,,,,,,,...,,,,,,,,,,
storytypeid,,,,,,,,,,,...,,,,,,,,,,
propertylandusetypeid,261,261,261,261,261,261,261,261,261,261,...,261,261,261,261,261,261,261,261,261,261
heatingorsystemtypeid,7.0,7.0,2.0,2.0,2.0,2.0,,,7.0,7.0,...,2.0,2.0,2.0,2.0,,,2.0,2.0,,
buildingclasstypeid,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
buildingclassdesc,,,,,,,,,,,...,,,,,,,,,,
heatingorsystemdesc,Floor/Wall,Floor/Wall,Central,Central,Central,Central,,,Floor/Wall,Floor/Wall,...,Central,Central,Central,Central,,,Central,Central,,
propertylandusedesc,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,...,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential
storydesc,,,,,,,,,,,...,,,,,,,,,,


In [62]:
df2.parcelid.nunique()

120

In [57]:
isin:array = [column_list_duplicates]df.loc[df['parcelid'].isin(array)]

SyntaxError: invalid syntax (1258100158.py, line 1)

<div class="alert alert-danger">


### How can 14092694 exist in Duplicates but not in Original or Unique?</div> 

<div class="alert alert-success">

    
* Because the list being displayed for search is not complete
    # ...]</div>

In [38]:
original_list = list(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '5', '7', '8', '8'])
original_list_df = pd.DataFrame(original_list, columns = ['Numbers'])
original_list_df.shape[0]

14

In [39]:
original_list

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '5', '7', '8', '8']

In [26]:
unique_list = list(original_list_df.Numbers.unique())
unique_list_df = pd.DataFrame(unique_list)
unique_list_df.shape[0]

10

In [30]:
unique_list

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

In [28]:
duplicate_list = list(original_list_df.Numbers[original_list_df.Numbers.duplicated()].drop_duplicates())
duplicate_list_df = pd.DataFrame(duplicate_list)
duplicate_list_df.shape[0]

3

In [29]:
duplicate_list

['5', '7', '8']

<div class="alert alert-warning">


## Convert before removing Null/NaN
* basementsqft
* decktypeid
* pooltypeid10
* poolsizesum
* pooltypeid2
* pooltypeid7
* poolcnt
* hashottuborspa
* taxdelinquencyyear
* fireplacecnt
* numberofstories
* garagecarcnt
* garagetotalsqft

convert_columns_df = df(df['basementsqft', 
                        'decktypeid', 
                        'pooltypeid10', 
                        'poolsizesum', 
                        'pooltypeid2', 
                        'pooltypeid7', 
                        'poolcnt', 
                        'hashottuborspa', 
                        'taxdelinquencyyear', 
                        'fireplacecnt', 
                        'numberofstories', 
                        'garagecarcnt', 
                        'garagetotalsqft'])

In [None]:
# Convert Null/NaN to 0 in order to retain values and further convert to True/False, has/has_not, is/is_not.


# CHECK IS_NULL

In [None]:
# set temporary conditions for this instance of code
with pd.option_context('display.max_rows', None):
    # print count >0 of nulls by column
    print (df.isnull().sum().sort_values(ascending=False))

In [None]:
df.columns[df.isnull().any()].tolist()

In [None]:
# creating bool series True for NaN values
bool_series = pd.isnull(df["taxamount"])
 
# filtering data
# displaying data only with team = NaN
df[bool_series]

In [None]:
# creating bool series True for NaN values
bool_series = pd.isnull(df["regionidzip"])
 
# filtering data
# displaying data only with team = NaN
df[bool_series].T

In [None]:
# Check result of dropping all columns with more than 1,442 NULL/NaN
df.dropna(axis='columns', thresh=19_000).isnull().sum().sort_values(ascending=False)

In [None]:
# Drop all columns with more than 1,442 NULL/NaN
df = df.dropna(axis='columns', thresh=19_000)

In [None]:
# Check Work
df.isnull().sum().sort_values(ascending=False)

In [None]:
# Check NO LOSS of records
df.shape[0]

In [None]:
# Run custom Function to check remaining nulls and percentage of DataFrame
ee.null_stats(df)

In [None]:
# Drop rows with NULL/NaN since it is only 3% of DataFrame 
df = df.dropna()

In [None]:
# Check Work
df.isnull().sum().sort_values(ascending=False)

# CHECK WHITESPACE

In [None]:
rslt_df = df[df.transactiondate > '2017-12-31']
rslt_df

In [None]:
df.fips.unique()

In [None]:
df.regionidzip.unique()

Given more time...
* We could scrape Zip Code Income, Population and Demographics to include in the DataFrame
* [Name Census](https://namecensus.com/zip-codes/california/orange-county/#:~:text=Orange%20County%20makes%20up%20approximately,information%20for%20each%20zip%20code) keeps all of this data
* Here is an example: <a href="https://namecensus.com/demographics/california/90620/">90620 Zip Code Income, Population and Demographics</a>
    * REF:
    * 90620 Zip Code Income, Population and Demographics. NameCensus.com. Retrieved from https://namecensus.com/demographics/california/90620/.