# Zillow Clustering Project Scratchpad

In [1]:
import pandas as pd
import numpy as np
import acquire
import prepare
import warnings
# NOTE(review): 'ignore' with no category/module filter suppresses every
# warning for the rest of the session, including deprecation notices.
# Consider narrowing it to the specific warnings that are known noise.
warnings.filterwarnings('ignore')

### Data Acquisition

In [2]:
# Load the Zillow clustering dataset through the project's acquire module.
# NOTE(review): where the data comes from (database query vs. cached file)
# is not visible in this notebook — see acquire.py.
df = acquire.get_zillow_cluster()

#### Data Acquisition Key Findings & Takeaways:

- This dataframe has 77574 rows and 68 columns.
- Target variable: 'logerror'
- There are 11 columns of dtype "object" that need to be either dropped or converted to int
- There are numerous columns with mostly missing values that need to be either dropped or imputed
- There are numerous columns that at first glance appear to be duplicates, or similar enough that they should be either dropped or combined

### Data Preparation

In [3]:
# Per-column non-null counts and dtypes of the raw frame, to spot the
# mostly-empty columns before preparation.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77574 entries, 0 to 77573
Data columns (total 68 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   heatingorsystemtypeid         49569 non-null  float64
 1   buildingclasstypeid           15 non-null     float64
 2   architecturalstyletypeid      206 non-null    float64
 3   airconditioningtypeid         25006 non-null  float64
 4   parcelid                      77574 non-null  int64  
 5   typeconstructiontypeid        222 non-null    float64
 6   storytypeid                   50 non-null     float64
 7   propertylandusetypeid         77574 non-null  float64
 8   id                            77574 non-null  int64  
 9   basementsqft                  50 non-null     float64
 10  bathroomcnt                   77574 non-null  float64
 11  bedroomcnt                    77574 non-null  float64
 12  buildingqualitytypeid         49808 non-null  float64
 13  c

In [4]:
# Run the project's preparation step and unpack its ten return values.
# The parenthesized target list uses implicit line continuation, so no
# trailing backslashes are needed.
# NOTE(review): `df` is both the argument and the first result, so re-running
# this cell passes the already-prepared frame back in — confirm that is safe.
# Presumably X_*/y_* are the feature/target parts of each split; verify in
# prepare.py.
(
    df,
    train,
    validate,
    test,
    X_train,
    y_train,
    X_validate,
    y_validate,
    X_test,
    y_test,
) = prepare.prep_zillow_cluster(df)

In [5]:
# Print the project's summary report for `df`, which at this point is the
# prepared frame rebound by the preparation cell above.
prepare.summarize(df)

----------------------------------------------------
DataFrame Head
   parcelid  bathroomcnt  bedroomcnt  calculatedbathnbr  \
0  14297519          3.5         4.0                3.5   
1  17052889          1.0         2.0                1.0   
2  14186244          2.0         3.0                2.0   

   calculatedfinishedsquarefeet  finishedsquarefeet12    fips  fullbathcnt  \
0                        3100.0                3100.0  6059.0          3.0   
1                        1465.0                1465.0  6111.0          1.0   
2                        1243.0                1243.0  6059.0          2.0   

     latitude    longitude  ...  logerror transactiondate  \
0  33634931.0 -117869207.0  ...  0.025595      2017-01-01   
1  34449266.0 -119281531.0  ...  0.055619      2017-01-01   
2  33886168.0 -117823170.0  ...  0.005383      2017-01-01   

         propertylandusedesc  LA  Orange  Ventura   age   taxrate     acres  \
0  Single Family Residential   0       1        0  19.0  1

In [6]:
# Check non-null counts and dtypes of the training features after preparation.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27010 entries, 27396 to 18368
Data columns (total 32 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      27010 non-null  int64  
 1   bathroomcnt                   27010 non-null  float64
 2   bedroomcnt                    27010 non-null  float64
 3   calculatedbathnbr             27010 non-null  float64
 4   calculatedfinishedsquarefeet  27010 non-null  float64
 5   finishedsquarefeet12          27010 non-null  float64
 6   fips                          27010 non-null  float64
 7   fullbathcnt                   27010 non-null  float64
 8   latitude                      27010 non-null  float64
 9   longitude                     27010 non-null  float64
 10  lotsizesquarefeet             27010 non-null  float64
 11  propertycountylandusecode     27010 non-null  object 
 12  rawcensustractandblock        27010 non-null  float64
 1

In [7]:
# (rows, columns) of the full training split, for comparison with X_train.
train.shape

(27010, 33)

In [8]:
# (rows, columns) of the training features; in this run it has the same row
# count as `train` and one fewer column.
X_train.shape

(27010, 32)

In [9]:
# Transpose the first five rows so each feature reads as a row, which is
# easier to scan than a very wide table.
X_train.head().T

Unnamed: 0,27396,8743,18209,37792,47261
parcelid,11267849,14480949,10841075,10902222,11102982
bathroomcnt,1,3,3,2,2
bedroomcnt,2,3,3,3,3
calculatedbathnbr,1,3,3,2,2
calculatedfinishedsquarefeet,994,3328,1924,1890,1238
finishedsquarefeet12,994,3328,1924,1890,1238
fips,6037,6059,6037,6037,6037
fullbathcnt,1,3,3,2,2
latitude,3.46357e+07,3.36173e+07,3.4189e+07,3.41637e+07,3.44347e+07
longitude,-1.18175e+08,-1.17572e+08,-1.18435e+08,-1.18399e+08,-1.18479e+08


In [10]:
# Distinct-value count per column. Columns with a single unique value (in this
# run: assessmentyear, propertylandusedesc) are constant across the training
# rows and are candidates for dropping.
X_train.nunique()

parcelid                        27010
bathroomcnt                         7
bedroomcnt                          4
calculatedbathnbr                   7
calculatedfinishedsquarefeet     3010
finishedsquarefeet12             3006
fips                                3
fullbathcnt                         4
latitude                        26512
longitude                       26572
lotsizesquarefeet                9706
propertycountylandusecode          15
rawcensustractandblock          19810
regionidcity                      172
regionidzip                       375
roomcnt                            12
yearbuilt                         129
structuretaxvaluedollarcnt      20852
taxvaluedollarcnt               21688
assessmentyear                      1
landtaxvaluedollarcnt           20569
taxamount                       26557
censustractandblock             19784
transactiondate                   248
propertylandusedesc                 1
LA                                  2
Orange      