# Prepare

### Imports and Acquire Data

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

from acquire_kwame import get_zillow_data
from prepare_kwame import prepare_zillow

In [2]:
# Load the Zillow data through the project's acquire module and preview
# the first three rows.
df = get_zillow_data()
df.head(n=3)

Unnamed: 0,parcelid,id,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,fips,latitude,longitude,regionidcounty,roomcnt,yearbuilt,taxvaluedollarcnt,assessmentyear,propertycountylandusecode,propertylandusetypeid
0,14297519,1727539,3.5,4.0,3.5,3100.0,6059.0,33634931.0,-117869207.0,1286.0,0.0,1998.0,1023282.0,2016.0,122,261.0
1,17052889,1387261,1.0,2.0,1.0,1465.0,6111.0,34449266.0,-119281531.0,2061.0,5.0,1967.0,464000.0,2016.0,1110,261.0
2,14186244,11677,2.0,3.0,2.0,1243.0,6059.0,33886168.0,-117823170.0,1286.0,6.0,1962.0,564778.0,2016.0,122,261.0


In [3]:
# Report the size of the frame as acquired, before any columns or rows are removed.
print(f"The shape of the original dataframe:\n {df.shape[0]} rows and {df.shape[1]} columns.")

The shape of the original dataframe:
 77614 rows and 16 columns.


### Drop features to create an MVP (first iteration)

**Note from curriculum:**
For the first iteration of your model, use only square feet of the home, number of bedrooms, and number of bathrooms to estimate the property's assessed value, taxvaluedollarcnt. You can expand this to other fields after you have completed an mvp (minimally viable product).

In [4]:
# List every column that is available before narrowing down to the MVP set.
df.columns

Index(['parcelid', 'id', 'bathroomcnt', 'bedroomcnt', 'calculatedbathnbr',
       'calculatedfinishedsquarefeet', 'fips', 'latitude', 'longitude',
       'regionidcounty', 'roomcnt', 'yearbuilt', 'taxvaluedollarcnt',
       'assessmentyear', 'propertycountylandusecode', 'propertylandusetypeid'],
      dtype='object')

In [5]:
# Keep only the two identifier columns, the three MVP features
# (bathrooms, bedrooms, finished square feet) and the target.
# Selecting by name, rather than dropping the unwanted columns, keeps this
# cell idempotent: it can be re-run without a KeyError, and it is unaffected
# if the acquire step later returns additional columns.
df = df[[
    'parcelid',
    'id',
    'bathroomcnt',
    'bedroomcnt',
    'calculatedfinishedsquarefeet',
    'taxvaluedollarcnt',
]]
df.head(3)

Unnamed: 0,parcelid,id,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt
0,14297519,1727539,3.5,4.0,3100.0,1023282.0
1,17052889,1387261,1.0,2.0,1465.0,464000.0
2,14186244,11677,2.0,3.0,1243.0,564778.0


### Summarize the Data

In [6]:
# Row count should be unchanged here; only the number of columns shrinks.
print(f"The shape of the dataframe with features dropped:\n {df.shape[0]} rows and {df.shape[1]} columns.")

The shape of the dataframe with features dropped:
 77614 rows and 6 columns.


In [7]:
# Per-column non-null counts and dtypes; a non-null count below the row total
# means that column has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77614 entries, 0 to 77613
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      77614 non-null  int64  
 1   id                            77614 non-null  int64  
 2   bathroomcnt                   77580 non-null  float64
 3   bedroomcnt                    77580 non-null  float64
 4   calculatedfinishedsquarefeet  77379 non-null  float64
 5   taxvaluedollarcnt             77579 non-null  float64
dtypes: float64(4), int64(2)
memory usage: 3.6 MB


In [8]:
# Confirm which columns remain in the reduced frame.
df.columns

Index(['parcelid', 'id', 'bathroomcnt', 'bedroomcnt',
       'calculatedfinishedsquarefeet', 'taxvaluedollarcnt'],
      dtype='object')

In [9]:
# Frequency of each bathroom count; missing values are tallied as their own row.
df['bathroomcnt'].value_counts(dropna=False)

2.0     31576
3.0     17354
1.0     12945
2.5      6607
4.0      3356
1.5      1419
3.5      1036
5.0      1026
4.5       696
0.0       599
6.0       419
5.5       224
7.0       114
8.0       108
6.5        47
NaN        34
9.0        23
7.5        16
10.0        7
8.5         3
11.0        3
13.0        1
18.0        1
Name: bathroomcnt, dtype: int64

There is a single-unit home out there with 18 bathrooms — what a life!

In [10]:
# Frequency of each bedroom count; missing values are tallied as their own row.
df['bedroomcnt'].value_counts(dropna=False)

3.0     30437
2.0     19223
4.0     17551
5.0      4550
1.0      3391
6.0      1000
0.0       837
8.0       253
7.0       208
9.0        70
NaN        34
10.0       31
12.0       16
11.0        9
13.0        2
16.0        1
14.0        1
Name: bedroomcnt, dtype: int64

In [11]:
# Frequency of each assessed value (the target); missing values are included in the tally.
df['taxvaluedollarcnt'].value_counts(dropna=False)

400000.0     56
600000.0     50
350000.0     49
450000.0     45
420000.0     44
             ..
124876.0      1
247786.0      1
820528.0      1
302891.0      1
3925848.0     1
Name: taxvaluedollarcnt, Length: 50950, dtype: int64

### Find and Handle Missing Values

I'm going to drop every row that has a NaN or null value in any column. These rows make up a very small portion of the data (236 of 77,614 rows, roughly 0.3%), and in my opinion incomplete observations are not helpful in this case, where we have so few features.

In [12]:
# Number of missing values in each column (summing the boolean mask down the rows).
df.isna().sum(axis=0)

parcelid                          0
id                                0
bathroomcnt                      34
bedroomcnt                       34
calculatedfinishedsquarefeet    235
taxvaluedollarcnt                35
dtype: int64

In [13]:
# Frequency of each finished-square-feet value; missing values are tallied as their own row.
df['calculatedfinishedsquarefeet'].value_counts(dropna=False)

NaN       235
1200.0    182
1120.0    169
1440.0    148
1080.0    133
         ... 
5333.0      1
5178.0      1
5396.0      1
4288.0      1
6758.0      1
Name: calculatedfinishedsquarefeet, Length: 4973, dtype: int64

In [14]:
# Discard every row that has a missing value in at least one column,
# then report how many rows are left.
df = df.dropna(axis=0, how='any')
print(f"The shape of the dataframe with NaNs dropped:\n {df.shape[0]} rows and {df.shape[1]} columns.")

The shape of the dataframe with NaNs dropped:
 77378 rows and 6 columns.


### Split the Data

Splitting the data into train, validate, and test datasets (roughly 56% / 24% / 20% of the rows, respectively).

In [15]:
# First cut: hold out 20% of all rows as the final test set.
train_validate, test = train_test_split(
    df,
    test_size=0.2,
    random_state=666,
)
# Second cut: 30% of the remaining rows become validate, the rest train.
# Net proportions are roughly 56% train / 24% validate / 20% test.
train, validate = train_test_split(
    train_validate,
    test_size=0.3,
    random_state=666,
)

print(f'shape of train: {train.shape}')
print(f'shape of validate: {validate.shape}')
print(f'shape of test: {test.shape}')

shape of train: (43331, 6)
shape of validate: (18571, 6)
shape of test: (15476, 6)


In [16]:
# Columns carried into the train split; the target (taxvaluedollarcnt) is
# still part of the frame at this point.
train.columns

Index(['parcelid', 'id', 'bathroomcnt', 'bedroomcnt',
       'calculatedfinishedsquarefeet', 'taxvaluedollarcnt'],
      dtype='object')

In [17]:
# Separate each split into a feature frame (X_*) and a target series (y_*),
# where the target is the assessed value, taxvaluedollarcnt.
# NOTE(review): only the target is removed, so parcelid and id remain in X_*.
# Confirm they are excluded before fitting a model on the three MVP features.
X_train = train.drop(columns=['taxvaluedollarcnt'])
y_train = train['taxvaluedollarcnt']

X_validate = validate.drop(columns=['taxvaluedollarcnt'])
y_validate = validate['taxvaluedollarcnt']

X_test = test.drop(columns=['taxvaluedollarcnt'])
y_test = test['taxvaluedollarcnt']

print(f'shape of X_train: {X_train.shape}')
print(f'shape of X_validate : {X_validate.shape}')
print(f'shape of X_test : {X_test.shape}')

shape of X_train: (43331, 5)
shape of X_validate : (18571, 5)
shape of X_test : (15476, 5)


### I would typically scale the data here, but to reach our MVP sooner I will save that for the second iteration of Prepare.

In [18]:
# Preview the train split; the shuffled index values show rows were sampled at random.
train.head(n=5)

Unnamed: 0,parcelid,id,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt
55990,10794022,2161904,2.0,2.0,920.0,291160.0
6478,11017636,207554,3.0,4.0,2159.0,429683.0
231,14394667,1369676,3.0,4.0,3309.0,1452674.0
8648,10729885,717282,2.0,4.0,1570.0,67681.0
32526,14138791,369561,2.0,3.0,1325.0,358482.0


---

### The next and final step for the first iteration of the Prepare stage is to turn the code in this notebook into a .py file with functions.

In [20]:
# prepare_zillow() is imported from prepare_kwame.py and returns ten objects,
# unpacked here in order. Presumably it wraps the steps worked through in this
# notebook - verify against that module.
(
    df,
    train, validate, test,
    X_train, X_validate, X_test,
    y_train, y_validate, y_test,
) = prepare_zillow()
df.head(n=5)

Unnamed: 0,parcelid,id,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt
0,14297519,1727539,3.5,4.0,3100.0,1023282.0
1,17052889,1387261,1.0,2.0,1465.0,464000.0
2,14186244,11677,2.0,3.0,1243.0,564778.0
3,12177905,2288172,3.0,4.0,2376.0,145143.0
4,10887214,1970746,3.0,3.0,1312.0,119407.0
