# Prepping the Data
We need to
- [x] remove high null columns and rows
- [x] remove unnecessary columns
- [x] remove outliers
- [x] impute/remove leftover nulls
- [x] create new features
- [ ] convert to appropriate data types
- [X] reorder/rename columns
- [X] put into one function

Preprocessing
- [ ] split into train, validate, test (stratify)
- [ ] scale the data (fit on train)

In [1]:
import Acquire

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = Acquire.get_home_data()

In [3]:
df

Unnamed: 0,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,logerror,transactiondate
0,1,10759547,,,,0.0,0.0,,,,...,,27516.0,2015.0,27516.0,,,,,0.055619,2017-01-01
1,6,10933547,,,,0.0,0.0,,,,...,404013.0,563029.0,2016.0,159016.0,6773.34,,,,-0.001011,2017-01-01
2,14,11142747,,,,0.0,0.0,,,,...,,4265.0,2015.0,4265.0,,,,,-0.008935,2017-01-02
3,15,11193347,,,,0.0,0.0,,,,...,,10.0,2016.0,10.0,,,,,0.008669,2017-01-02
4,16,11215747,,,,0.0,0.0,,,,...,,10.0,2016.0,10.0,,,,,-0.021896,2017-01-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70359,77609,11212539,1.0,,,3.0,4.0,,8.0,3.0,...,129566.0,162019.0,2016.0,32453.0,2860.33,,,6.037911e+13,0.020615,2017-09-20
70360,77610,11212639,1.0,,,3.0,4.0,,8.0,3.0,...,100744.0,125923.0,2016.0,25179.0,2394.26,,,6.037911e+13,0.013209,2017-09-21
70361,77611,11212962,1.0,,,2.0,3.0,,6.0,2.0,...,149241.0,198988.0,2016.0,49747.0,3331.81,,,6.037911e+13,0.037129,2017-09-21
70362,77612,11213162,1.0,,,3.0,3.0,,8.0,3.0,...,118900.0,148600.0,2016.0,29700.0,2510.53,,,6.037911e+13,0.007204,2017-09-25


# Remove Nulls in Columns/Rows

In [4]:
# sets thresh hold to 75 percent nulls
threshold = df.shape[0] * .75

# remove columns with high nulls
df = df.dropna(axis=1, thresh=threshold)

df.columns

Index(['id', 'parcelid', 'bathroomcnt', 'bedroomcnt', 'calculatedbathnbr',
       'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'fips',
       'fullbathcnt', 'latitude', 'longitude', 'lotsizesquarefeet',
       'propertycountylandusecode', 'propertylandusetypeid',
       'rawcensustractandblock', 'regionidcity', 'regionidcounty',
       'regionidzip', 'roomcnt', 'yearbuilt', 'structuretaxvaluedollarcnt',
       'taxvaluedollarcnt', 'assessmentyear', 'landtaxvaluedollarcnt',
       'taxamount', 'censustractandblock', 'logerror', 'transactiondate'],
      dtype='object')

In [5]:
# sets thresh hold to 25 percent nulls
thresh_hold = df.shape[1] * .75

# remove rows with high nulls
df = df.dropna(axis=0,thresh=thresh_hold)

### Removing repeated/unecessary columns
- multiple sqft columns
- multiple county ids
- multiple census tract and block ids

In [6]:
# remove additional sqft
df = df.drop(columns=['finishedsquarefeet12'])

In [7]:
# removing one of the county id columns
df.regionidcounty.value_counts()

3101.0    45697
1286.0    18281
2061.0     5900
Name: regionidcounty, dtype: int64

In [8]:
df.fips.value_counts()

6037.0    45697
6059.0    18281
6111.0     5900
Name: fips, dtype: int64

In [9]:
# region county id and fips are the same, removing longer columns name
df = df.drop('regionidcounty',axis=1)

In [10]:
df.rawcensustractandblock.value_counts()

6.059052e+07    36
6.059052e+07    34
6.037920e+07    28
6.037274e+07    25
6.037142e+07    25
                ..
6.037575e+07     1
6.037503e+07     1
6.037482e+07     1
6.037431e+07     1
6.037431e+07     1
Name: rawcensustractandblock, Length: 43095, dtype: int64

In [11]:
df.censustractandblock.value_counts()

6.037142e+13    25
6.037277e+13    25
6.037920e+13    23
6.037920e+13    23
6.037920e+13    23
                ..
6.111001e+13     1
6.037400e+13     1
6.037238e+13     1
6.037111e+13     1
6.037911e+13     1
Name: censustractandblock, Length: 42741, dtype: int64

In [12]:
# region county id and fips are the same, removing longer columns name
df = df.drop('rawcensustractandblock',axis=1)

# Removing Outliers Using IQR Rule
- Before outliers have 70,364 rows
- After, we have 63,084

In [13]:
columns = ['calculatedfinishedsquarefeet','lotsizesquarefeet','structuretaxvaluedollarcnt',
           'landtaxvaluedollarcnt','taxamount']

for x in columns:
    
    Q1 = df[x].quantile(0.25)
    Q3 = df[x].quantile(0.75)
    IQR = (Q3 - Q1) * 1.5
    upper = Q3 + (1.5 * IQR)
    lower = Q1 - (1.5 * IQR)
    
    print('column:', x,'\nIQR:', IQR, '\nUpper bound:', upper, '\nLower bound:', lower, '\n')

column: calculatedfinishedsquarefeet 
IQR: 1317.0 
Upper bound: 4049.5 
Lower bound: -779.5 

column: lotsizesquarefeet 
IQR: 6632.25 
Upper bound: 20205.875 
Lower bound: -4112.375 

column: structuretaxvaluedollarcnt 
IQR: 188069.25 
Upper bound: 485437.125 
Lower bound: -204150.125 

column: landtaxvaluedollarcnt 
IQR: 374597.625 
Upper bound: 889017.4375 
Lower bound: -484507.1875 

column: taxamount 
IQR: 5588.126249999998 
Upper bound: 14526.511874999997 
Lower bound: -5963.284374999997 



In [14]:
for x in columns:
    
    Q1 = df[x].quantile(0.25)
    Q3 = df[x].quantile(0.75)
    IQR = (Q3 - Q1) * 1.5
    upper = Q3 + (1.5 * IQR)
    lower = Q1 - (1.5 * IQR)
    
    df = df[(df[x] > (lower)) | (df[x] < (upper))]

In [15]:
df.shape

(63084, 25)

# Exploring leftover nulls to determine if dropping or imputing with mean, median, mode

In [16]:
df.isnull().sum()

id                                 0
parcelid                           0
bathroomcnt                        0
bedroomcnt                         0
calculatedbathnbr                166
calculatedfinishedsquarefeet       0
fips                               0
fullbathcnt                      166
latitude                           0
longitude                          0
lotsizesquarefeet                  0
propertycountylandusecode          0
propertylandusetypeid              0
regionidcity                    1055
regionidzip                       41
roomcnt                            0
yearbuilt                         35
structuretaxvaluedollarcnt         0
taxvaluedollarcnt                  0
assessmentyear                     0
landtaxvaluedollarcnt              0
taxamount                          0
censustractandblock              237
logerror                           0
transactiondate                    0
dtype: int64

### Calculatedbathnbdr will be dealt with when creating features later
### Fullbathcnt

In [17]:
df.fullbathcnt.value_counts()

2.0     31523
3.0     14541
1.0     12661
4.0      2738
5.0       931
6.0       322
7.0       123
8.0        39
9.0        24
10.0        9
11.0        3
20.0        1
13.0        1
19.0        1
12.0        1
Name: fullbathcnt, dtype: int64

- Fullbathcnt has 166 nulls
- Most common value 2 has 35,694 observations
- Will impute the nulls with this

In [18]:
mode = df.fullbathcnt.mode()[0]

df['fullbathcnt'] = df.fullbathcnt.fillna(mode)

In [19]:
df['fullbathcnt'] = df.fullbathcnt.fillna(2)

### Region ID City
- has 1,055 nulls
- we already have latitude and longitude with no nulls for location
- will drop this column

In [20]:
df = df.drop('regionidcity',axis=1)

### Region ID Zip
- will replace nulls with 90000 to represent no known zip code (but not create outliers by using 0)
- can use latitude/longitude or clustering if necessary to determine actual values
- however, for only 41 nulls it is not a significant amount to worry about

In [21]:
df.regionidzip.value_counts()

96193.0    519
96368.0    517
97118.0    512
97319.0    479
96361.0    473
          ... 
95991.0      4
96226.0      2
96467.0      1
97177.0      1
96963.0      1
Name: regionidzip, Length: 385, dtype: int64

In [22]:
df['regionidzip'] = df.regionidzip.fillna(90_000)

### Year Built
- 35 nulls
- not many nulls to be significant, will replace with mean

In [23]:
df.yearbuilt.value_counts(), df.yearbuilt.mean()

(1955.0    2174
 1950.0    1850
 1954.0    1843
 1956.0    1704
 1953.0    1596
           ... 
 1862.0       1
 1880.0       1
 1889.0       1
 2016.0       1
 1893.0       1
 Name: yearbuilt, Length: 133, dtype: int64,
 1963.5353931069485)

In [24]:
df['yearbuilt'] = df.yearbuilt.fillna(1964)

### censustractandblock
- 237 nulls is not significant
- will replace with mode

In [25]:
df.censustractandblock.value_counts(), df.censustractandblock.mode()

(6.037142e+13    24
 6.037920e+13    23
 6.037920e+13    23
 6.037920e+13    23
 6.037277e+13    23
                 ..
 6.037409e+13     1
 6.037430e+13     1
 6.037620e+13     1
 6.111004e+13     1
 6.037540e+13     1
 Name: censustractandblock, Length: 40846, dtype: int64,
 0    6.037142e+13
 dtype: float64)

In [26]:
mode = df.censustractandblock.mode()[0]

df['censustractandblock'] = df.censustractandblock.fillna(mode)

# Create Features

### calculate our own bath_bed
- we have a column from the database, however it has 215 nulls
- bathroom and bedroom count by themselves have no nulls
- calculate our own and drop the original

In [27]:
df['bed_plus_bath'] = df.bathroomcnt + df.bedroomcnt
df = df.drop('calculatedbathnbr',axis=1)

In [28]:
df.isnull().sum()

id                              0
parcelid                        0
bathroomcnt                     0
bedroomcnt                      0
calculatedfinishedsquarefeet    0
fips                            0
fullbathcnt                     0
latitude                        0
longitude                       0
lotsizesquarefeet               0
propertycountylandusecode       0
propertylandusetypeid           0
regionidzip                     0
roomcnt                         0
yearbuilt                       0
structuretaxvaluedollarcnt      0
taxvaluedollarcnt               0
assessmentyear                  0
landtaxvaluedollarcnt           0
taxamount                       0
censustractandblock             0
logerror                        0
transactiondate                 0
bed_plus_bath                   0
dtype: int64

### All our nulls and outliers are removed, what other features could we create?

## How many years has the house been around?
- calculate by todays year - year built

In [29]:
df['age'] = 2020 - df.yearbuilt

## Dummy variables for year assessed?
- could say assesed in 2016 or not
- however, only 20 properties were not assessed in 2016
- we can drop this feature

In [30]:
df.assessmentyear.value_counts()

2016.0    63064
2014.0       18
2015.0        2
Name: assessmentyear, dtype: int64

In [31]:
df = df.drop('assessmentyear',axis=1)

## Transaction date by months? Range of 1-12

In [32]:
df.transactiondate.value_counts()

2017-06-30    1061
2017-05-31     845
2017-04-28     768
2017-07-28     763
2017-05-26     712
              ... 
2017-07-09       1
2017-04-08       1
2017-08-05       1
2017-09-25       1
2017-09-10       1
Name: transactiondate, Length: 256, dtype: int64

In [33]:
# converting to string to use split method
df['transactiondate'] = df.transactiondate.astype('str')

# creating new feature as the second index (month) of the transacion date split
df['transaction_month'] = df.transactiondate.str.split('-',expand=True)[1]

## Tax Rate
- tax paid for house / tax value of house * 100 = tax rate %

In [34]:
df['tax_rate'] = (df.taxamount / df.taxvaluedollarcnt) * 100

# Reorder/Rename Columns for Better Intuition

In [35]:
df.columns

Index(['id', 'parcelid', 'bathroomcnt', 'bedroomcnt',
       'calculatedfinishedsquarefeet', 'fips', 'fullbathcnt', 'latitude',
       'longitude', 'lotsizesquarefeet', 'propertycountylandusecode',
       'propertylandusetypeid', 'regionidzip', 'roomcnt', 'yearbuilt',
       'structuretaxvaluedollarcnt', 'taxvaluedollarcnt',
       'landtaxvaluedollarcnt', 'taxamount', 'censustractandblock', 'logerror',
       'transactiondate', 'bed_plus_bath', 'age', 'transaction_month',
       'tax_rate'],
      dtype='object')

In [37]:
df.columns = ['index_id','parcel_id','bathrooms','bedrooms','property_sqft','county_id','full_bathrooms','latitude',
             'longitude','lot_sqft','land_use_code','land_use_type','zip_code','room_count','year_built',
             'structure_tax_value','tax_value','land_tax_value','tax_amount', 'census_id','log_error',
              'transaction_date','bed_plus_bath','property_age','transaction_month','tax_rate'
             ]

In [38]:
df.columns

Index(['index_id', 'parcel_id', 'bathrooms', 'bedrooms', 'property_sqft',
       'county_id', 'full_bathrooms', 'latitude', 'longitude', 'lot_sqft',
       'land_use_code', 'land_use_type', 'zip_code', 'room_count',
       'year_built', 'structure_tax_value', 'tax_value', 'land_tax_value',
       'tax_amount', 'census_id', 'log_error', 'transaction_date',
       'bed_plus_bath', 'property_age', 'transaction_month', 'tax_rate'],
      dtype='object')

In [39]:
# Reordering columns
df = df[['index_id', 'parcel_id',
    'log_error', 'tax_value', 'structure_tax_value', 'land_tax_value', 'tax_amount',
    'county_id', 'zip_code', 'latitude', 'longitude', 'census_id',
    'bathrooms', 'bedrooms', 'full_bathrooms', 'bed_plus_bath', 'room_count',
    'property_sqft', 'lot_sqft',
    'land_use_code', 'land_use_type',
    'year_built', 'property_age', 'transaction_date', 'transaction_month'
   ]]

## Convert Data Types to Correct Types

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63084 entries, 6 to 70363
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   index_id             63084 non-null  int64  
 1   parcel_id            63084 non-null  int64  
 2   log_error            63084 non-null  float64
 3   tax_value            63084 non-null  float64
 4   structure_tax_value  63084 non-null  float64
 5   land_tax_value       63084 non-null  float64
 6   tax_amount           63084 non-null  float64
 7   county_id            63084 non-null  float64
 8   zip_code             63084 non-null  float64
 9   latitude             63084 non-null  float64
 10  longitude            63084 non-null  float64
 11  census_id            63084 non-null  float64
 12  bathrooms            63084 non-null  float64
 13  bedrooms             63084 non-null  float64
 14  full_bathrooms       63084 non-null  float64
 15  bed_plus_bath        63084 non-null 

- county id needs to be integer
- zip code needs to be integer
- room counts need to be integers
- land use type needs to be integer
- transaction month needs to be integer

In [45]:
df['county_id'] = df.county_id.astype('int')

df['zip_code'] = df.zip_code.astype('int')

df['bathrooms'] = df.bathrooms.astype('int')

df['bedrooms'] = df.bedrooms.astype('int')

df['full_bathrooms'] = df.full_bathrooms.astype('int')

df['bed_plus_bath'] = df.bed_plus_bath.astype('int')

df['room_count'] = df.room_count.astype('int')

df['land_use_type'] = df.land_use_type.astype('int')

df['transaction_month'] = df.transaction_month.astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63084 entries, 6 to 70363
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   index_id             63084 non-null  int64  
 1   parcel_id            63084 non-null  int64  
 2   log_error            63084 non-null  float64
 3   tax_value            63084 non-null  float64
 4   structure_tax_value  63084 non-null  float64
 5   land_tax_value       63084 non-null  float64
 6   tax_amount           63084 non-null  float64
 7   county_id            63084 non-null  int64  
 8   zip_code             63084 non-null  int64  
 9   latitude             63084 non-null  float64
 10  longitude            63084 non-null  float64
 11  census_id            63084 non-null  float64
 12  bathrooms            63084 non-null  int64  
 13  bedrooms             63084 non-null  int64  
 14  full_bathrooms       63084 non-null  int64  
 15  bed_plus_bath        63084 non-null 

# Put all together into a function for the Prepare.py module

In [None]:
def prepare_zillow():
    '''
    Acquire and prepare the zillow data obtained from the SQL database.
    Nulls are removed/replaced, outliers are removed, new features are created,
    and columns are renamed/rearranged. Returns the prepped df.
    '''
    # acquire the data from module
    df = acquire.get_home_data()
    
    # Removing Nulls from Columns
    # sets thresh hold to 75 percent nulls, if more than %25 nulls it will be removed
    threshold = df.shape[0] * .75

    # remove columns with specified threshold
    df = df.dropna(axis=1, thresh=threshold)
    
    # Removing Nulls from Rows
    # sets thresh hold to 75 percent nulls, if more than %25 nulls it will be removed
    thresh_hold = df.shape[1] * .75

    # remove rows with specified threshold
    df = df.dropna(axis=0,thresh=thresh_hold)
    
    # Removing Columns with Repeated Data/Unecessary Data
    # don't need additional sqft, county id/city, assessment year, and census columns
    df = df.drop(columns=['finishedsquarefeet12', 'regionidcounty', 'rawcensustractandblock',
                          'regionidcity','assessmentyear'], axis=1)
    
    # Removing Outliers from Continuous Variables
    # assigning columns to remove outliers
    columns = ['calculatedfinishedsquarefeet','lotsizesquarefeet','structuretaxvaluedollarcnt',
           'landtaxvaluedollarcnt','taxamount']
    
    # looping through continuous variables to remove outliers
    for x in columns:
    
        # calculate IQR
        Q1 = df[x].quantile(0.25)
        Q3 = df[x].quantile(0.75)
        IQR = (Q3 - Q1) * 1.5
        
        # calculate upper and lower bounds, outlier if above or below these
        upper = Q3 + (1.5 * IQR)
        lower = Q1 - (1.5 * IQR)
    
        # creates df of values that are within the outlier bounds
        df = df[(df[x] > (lower)) | (df[x] < (upper))]
        
    # Filling Leftover Nulls by Columns
    # Full Bathroom Count Nulls
    # mode of bathroomcnt
    fullbath_mode = df.fullbathcnt.mode()[0]
    # filling nulls with the mode
    df['fullbathcnt'] = df.fullbathcnt.fillna(fullbath_mode)
    
    # Region Zip Code Nulls
    # filling with 90000 to represent no known zipcode (0 would skew the data)
    df['regionidzip'] = df.regionidzip.fillna(90_000)
    
    # Year Built Nulls
    # average of property year built
    year_avg = round(df.yearbuilt.mean())
    # filling nulls with average year built
    df['yearbuilt'] = df.yearbuilt.fillna(year_avg)
    
    # Census Tract and Block Nulls
    # mode of census tract and block
    census_mode = df.censustractandblock.mode()[0]
    # filling nulls with mode
    df['censustractandblock'] = df.censustractandblock.fillna(mode)
    
    # Feature Engineering - creating columns
    # calculating bed+bath from 0 null columns of bedroom/bathroom count
    df['bed_plus_bath'] = df.bathroomcnt + df.bedroomcnt
    # droping original calculated field that had nulls
    df = df.drop('calculatedbathnbr',axis=1)
    
    # Property Age
    # current year minus year built
    df['age'] = 2020 - df.yearbuilt
    
    # Transaction Month
    # converting date to string to use split method
    df['transactiondate'] = df.transactiondate.astype('str')
    # creating new feature as the second index (month) of the transaction date split
    df['transaction_month'] = df.transactiondate.str.split('-',expand=True)[1]
    
    # Calculating Tax Rate for Property
    # Tax paid / tax value * 100 = tax rate %
    df['tax_rate'] = (df.taxamount / df.taxvaluedollarcnt) * 100
    
    # Renaming Columns
    df.columns = ['index_id', 'parcel_id', 'bathrooms', 'bedrooms', 'property_sqft', 'county_id', 'full_bathrooms',
                  'latitude', 'longitude', 'lot_sqft', 'land_use_code', 'land_use_type', 'zip_code', 'room_count',
                  'year_built', 'structure_tax_value', 'tax_value', 'land_tax_value', 'tax_amount', 'census_id',
                  'log_error', 'transaction_date', 'bed_plus_bath', 'property_age', 'transaction_month','tax_rate'
             ]
    
    # Reordering Columns
    df = df[['index_id', 'parcel_id',
        'log_error', 'tax_value', 'structure_tax_value', 'land_tax_value', 'tax_amount', 'tax_rate',
        'county_id', 'zip_code', 'latitude', 'longitude', 'census_id',
        'bathrooms', 'bedrooms', 'full_bathrooms', 'bed_plus_bath', 'room_count',
        'property_sqft', 'lot_sqft',
        'land_use_code', 'land_use_type',
        'year_built', 'property_age', 'transaction_date', 'transaction_month'
       ]]
    
    return df

# Preprocessing Function - Preparing Data For Explore/Modeling

## split the data

In [None]:
from sklearn.model_selection import train_test_split

def train_validate_test(df): 
    
    '''
    Split a df into train, validate, and test.
    70% - 20% - 10% respectively.
    Returns the three split dfs.
    '''
    
    # split into train, validate, and test sets
    train_and_validate, test = train_test_split(df, test_size = .10, random_state=123)
    train, validate = train_test_split(train_and_validate, test_size = .22, random_state=123)

    # These two print functions allow us to ensure the date is properly split
    # Will print the shape of each variable when running the function
    print("train shape: ", train.shape, ", validate shape: ", validate.shape, ", test shape: ", test.shape)

    # Will print the shape of eachvariable as a percentage of the total data set
    # Varialbe to hold the sum of all rows (total observations in the data)
    total = df.count()[0]
    print("\ntrain percent: ", round(((train.shape[0])/total),2) * 100, 
            ", validate percent: ", round(((validate.shape[0])/total),2) * 100, 
            ", test percent: ", round(((test.shape[0])/total),2) * 100)
    
    return train, validate, test

In [None]:
train, validate, test = train_validate_test(df)

In [None]:
train.head(1).T

In [None]:
X_train = train.drop('log_error', axis=1)
y_train = train[['log_error']]

X_validate = validate.drop('log_error', axis=1)
y_validate = validate[['log_error']]

X_test = test.drop('log_error',axis=1)
y_test = test[['log_error']]

## scale the data

In [None]:
from sklearn.model_selection import train_test_split 
import sklearn.preprocessing

def scale_data(train, validate, test):

    columns_to_scale = ['tax_value','structure_tax_value','land_tax_value','tax_amount',
                       'bathrooms','bedrooms','bed_plus_bath','room_count','property_sqft',
                       'lot_sqft']
    
    # 1. Create the Scaling Object
    scaler = sklearn.preprocessing.StandardScaler()

    # 2. Fit to the train data only
    scaler.fit(train[columns_to_scale])

    # 3. use the object on the whole df
    # this returns an array, so we convert to df in the same line
    train_scaled = pd.DataFrame(scaler.transform(train[columns_to_scale]))
    validate_scaled = pd.DataFrame(scaler.transform(validate[columns_to_scale]))
    test_scaled = pd.DataFrame(scaler.transform(test[columns_to_scale]))

    # the result of changing an array to a df resets the index and columns
    # for each train, validate, and test, we change the index and columns back to original values

    # Train
    train_scaled.index = train[columns_to_scale].index
    train_scaled.columns = train[columns_to_scale].columns

    # Validate
    validate_scaled.index = validate[columns_to_scale].index
    validate_scaled.columns = validate[columns_to_scale].columns

    # Test
    test_scaled.index = test[columns_to_scale].index
    test_scaled.columns = test[columns_to_scale].columns

    return train_scaled, validate_scaled, test_scaled

In [None]:
train_scaled, validate_scaled, test_scaled = scale_data(train, test, validate)

In [None]:
train_scaled

## Put together into the prepare function
- split will go inside the prep_zillow() function
- scale will go in a seperate function in the prepare.py