# Prepare

**NOTE:**
For the first iteration of your model, use only the square footage of the home, the number of bedrooms, and the number of bathrooms to estimate the property's assessed value, `taxvaluedollarcnt`. You can expand this to other fields after you have completed an MVP (minimum viable product).

### Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

from acquire_kwame import get_zillow_data

In [2]:
# Load the dataset through the project-local helper imported from acquire_kwame.
df = get_zillow_data()
# Preview the first three rows as a quick sanity check on the load.
df.head(3)

Unnamed: 0,parcelid,id,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,fips,latitude,longitude,regionidcounty,roomcnt,yearbuilt,taxvaluedollarcnt,assessmentyear,propertycountylandusecode,propertylandusetypeid
0,14297519,1727539,3.5,4.0,3.5,3100.0,6059.0,33634931.0,-117869207.0,1286.0,0.0,1998.0,1023282.0,2016.0,122,261.0
1,17052889,1387261,1.0,2.0,1.0,1465.0,6111.0,34449266.0,-119281531.0,2061.0,5.0,1967.0,464000.0,2016.0,1110,261.0
2,14186244,11677,2.0,3.0,2.0,1243.0,6059.0,33886168.0,-117823170.0,1286.0,6.0,1962.0,564778.0,2016.0,122,261.0


### Summarize the Data

In [34]:
# Report the dimensions of the frame as loaded, before any cleaning.
row_count, column_count = df.shape
print(f"The shape of the original dataframe:\n {row_count} rows and {column_count} columns.")

The shape of the original dataframe:
 77614 rows and 16 columns.


In [5]:
# Show each column's dtype and non-null count to spot columns with missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77614 entries, 0 to 77613
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      77614 non-null  int64  
 1   id                            77614 non-null  int64  
 2   bathroomcnt                   77580 non-null  float64
 3   bedroomcnt                    77580 non-null  float64
 4   calculatedbathnbr             76964 non-null  float64
 5   calculatedfinishedsquarefeet  77379 non-null  float64
 6   fips                          77580 non-null  float64
 7   latitude                      77580 non-null  float64
 8   longitude                     77580 non-null  float64
 9   regionidcounty                77580 non-null  float64
 10  roomcnt                       77580 non-null  float64
 11  yearbuilt                     77310 non-null  float64
 12  taxvaluedollarcnt             77579 non-null  float64
 13  a

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,parcelid,id,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,fips,latitude,longitude,regionidcounty,roomcnt,yearbuilt,taxvaluedollarcnt,assessmentyear,propertylandusetypeid
count,77614.0,77614.0,77580.0,77580.0,76964.0,77379.0,77580.0,77580.0,77580.0,77580.0,77580.0,77310.0,77579.0,77580.0,77580.0
mean,13007810.0,1496056.0,2.298492,3.053222,2.316388,1784.925923,6048.813998,34008370.0,-118203700.0,2534.522235,1.476257,1968.611396,490147.6,2016.0,261.824465
std,3518694.0,861344.8,0.996726,1.140472,0.979684,954.247864,20.747162,265285.6,359407.8,801.445328,2.823673,23.793037,653794.2,0.0,5.141564
min,10711860.0,349.0,0.0,0.0,1.0,128.0,6037.0,33339530.0,-119475400.0,1286.0,0.0,1824.0,1000.0,2016.0,31.0
25%,11538210.0,752595.2,2.0,2.0,2.0,1182.0,6037.0,33814630.0,-118415000.0,1286.0,0.0,1953.0,206899.0,2016.0,261.0
50%,12530060.0,1499186.0,2.0,3.0,2.0,1542.0,6037.0,34022000.0,-118181000.0,3101.0,0.0,1970.0,358878.0,2016.0,261.0
75%,14211000.0,2242084.0,3.0,4.0,3.0,2112.0,6059.0,34174310.0,-117928600.0,3101.0,0.0,1987.0,569000.0,2016.0,266.0
max,167689300.0,2985182.0,18.0,16.0,18.0,35640.0,6111.0,34818770.0,-117554600.0,3101.0,15.0,2016.0,49061240.0,2016.0,275.0


In [24]:
# List the column names available in the frame.
df.columns

Index(['parcelid', 'id', 'bathroomcnt', 'bedroomcnt', 'calculatedbathnbr',
       'calculatedfinishedsquarefeet', 'fips', 'latitude', 'longitude',
       'regionidcounty', 'roomcnt', 'yearbuilt', 'taxvaluedollarcnt',
       'assessmentyear', 'propertycountylandusecode', 'propertylandusetypeid'],
      dtype='object')

### Find and Handle Missing Values

In [7]:
# Count the missing values in each column.
df.isna().sum()

parcelid                          0
id                                0
bathroomcnt                      34
bedroomcnt                       34
calculatedbathnbr               650
calculatedfinishedsquarefeet    235
fips                             34
latitude                         34
longitude                        34
regionidcounty                   34
roomcnt                          34
yearbuilt                       304
taxvaluedollarcnt                35
assessmentyear                   34
propertycountylandusecode        34
propertylandusetypeid            34
dtype: int64

In [30]:
# Frequency of each bathroom count; dropna=False keeps the NaN bucket visible.
df["bathroomcnt"].value_counts(dropna=False)

2.0     31576
3.0     17354
1.0     12945
2.5      6607
4.0      3356
1.5      1419
3.5      1036
5.0      1026
4.5       696
0.0       599
6.0       419
5.5       224
7.0       114
8.0       108
6.5        47
NaN        34
9.0        23
7.5        16
10.0        7
8.5         3
11.0        3
13.0        1
18.0        1
Name: bathroomcnt, dtype: int64

In [37]:
# Preview the frame with every row that has at least one missing value removed.
# dropna() returns a new DataFrame; the result is only displayed here and is not
# assigned, so `df` itself still contains the rows with missing values.
# NOTE(review): confirm that a later cell persists the cleaned frame if that is intended.
# NOTE(review): this drops rows with a NaN in ANY column, not only the MVP fields
# (square feet, bedrooms, bathrooms, taxvaluedollarcnt) — confirm that is desired.
df.dropna()

Unnamed: 0,parcelid,id,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,fips,latitude,longitude,regionidcounty,roomcnt,yearbuilt,taxvaluedollarcnt,assessmentyear,propertycountylandusecode,propertylandusetypeid
0,14297519,1727539,3.5,4.0,3.5,3100.0,6059.0,33634931.0,-117869207.0,1286.0,0.0,1998.0,1023282.0,2016.0,122,261.0
1,17052889,1387261,1.0,2.0,1.0,1465.0,6111.0,34449266.0,-119281531.0,2061.0,5.0,1967.0,464000.0,2016.0,1110,261.0
2,14186244,11677,2.0,3.0,2.0,1243.0,6059.0,33886168.0,-117823170.0,1286.0,6.0,1962.0,564778.0,2016.0,122,261.0
3,12177905,2288172,3.0,4.0,3.0,2376.0,6037.0,34245180.0,-118240722.0,3101.0,0.0,1970.0,145143.0,2016.0,0101,261.0
4,10887214,1970746,3.0,3.0,3.0,1312.0,6037.0,34185120.0,-118414640.0,3101.0,0.0,1964.0,119407.0,2016.0,010C,266.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77609,11000655,673515,2.0,2.0,2.0,1286.0,6037.0,34245368.0,-118282383.0,3101.0,0.0,1940.0,354621.0,2016.0,0100,261.0
77610,17239384,2968375,2.0,4.0,2.0,1612.0,6111.0,34300140.0,-118706327.0,2061.0,7.0,1964.0,67205.0,2016.0,1111,261.0
77611,12773139,1843709,1.0,3.0,1.0,1032.0,6037.0,34040895.0,-118038169.0,3101.0,0.0,1954.0,49546.0,2016.0,0100,261.0
77612,12826780,1187175,2.0,3.0,2.0,1762.0,6037.0,33937685.0,-117996709.0,3101.0,0.0,1955.0,522000.0,2016.0,0100,261.0
