In [165]:
import pandas as pd
import numpy as np

In [166]:
path = '/Users/ianwe/Documents/GitHub/PlanningAhead/Wake County Data/' ## Update this to the path to where the repo is cloned


wakeCounty = pd.read_csv(path + 'WakeCountyHousing.csv')

wakeCounty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308292 entries, 0 to 308291
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Real_Estate_Id      308292 non-null  int64  
 1   Deeded_Acreage      308292 non-null  float64
 2   Total_Sale_Price    308292 non-null  int64  
 3   Total_Sale_Date     308292 non-null  object 
 4   Month_Year_of_Sale  308292 non-null  object 
 5   Year_of_Sale        308292 non-null  int64  
 6   Year_Built          308292 non-null  int64  
 7   Year_Remodeled      308292 non-null  int64  
 8   Heated_Area         308292 non-null  int64  
 9   Num_Stories         308292 non-null  object 
 10  Design_Style        308292 non-null  object 
 11  Bath                308275 non-null  object 
 12  Utilities           306324 non-null  object 
 13  Physical_City       308183 non-null  object 
 14  Physical_Zip        308146 non-null  float64
dtypes: float64(2), int64(6), object(7)

### Wake County
#### Clean the Data (deal with missing values)

In [167]:
wakeCounty.rename(columns={'Real_Estate_Id':'ID',
                            'Deeded_Acreage':'acreage',
                            'Total_Sale_Price':'total_sale_price',
                            'Total_Sale_Date':'total_sale_date',
                            'Month_Year_Sale':'month_year_sale',
                            'Year_of_Sale':'year_of_sale',
                            'Yeah_Build':'year_built',
                            'Year_Remodeled':'year_remodeled',
                            'Headed_Area':'headed_area',
                            'Num_Stories':'num_stories',
                            'Design_Style':'design_style',
                            'Bath':'bath',
                            'Utilities':'utilities',
                            'Physical_City':'city',
                            'Physical_Zip':'zip'
                          },inplace=True)

wakeCounty["bath"] = wakeCounty["bath"].fillna("None of Fixtures")
wakeCounty["utilities"] = wakeCounty["utilities"].fillna("N/A")
wakeCounty["city"] = wakeCounty["city"].fillna("Outside of City Limits")
wakeCounty = wakeCounty.dropna()

wakeCounty.isnull().sum()

ID                    0
acreage               0
total_sale_price      0
total_sale_date       0
Month_Year_of_Sale    0
year_of_sale          0
Year_Built            0
year_remodeled        0
Heated_Area           0
num_stories           0
design_style          0
bath                  0
utilities             0
city                  0
zip                   0
dtype: int64

#### Use One-Hot Encoder

In [169]:
from sklearn.preprocessing import OneHotEncoder

#One hot encode 'design_style'
ohe = OneHotEncoder(sparse=False)
X = ohe.fit_transform(wakeCounty[['design_style']])

X

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Making a Custom Transformer to use Ordinal Encoding

In [170]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [171]:
class customEncoderTranformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        cols_to_transform = list(X.columns)
        
        if self.columns:
            cols_to_transform = self.columns
                   
        encoder = OrdinalEncoder()
        encoder.fit(X[cols_to_transform])
        X[cols_to_transform] = encoder.transform(X[cols_to_transform])
        return X

In [172]:
pipe = Pipeline(
    steps=[
        ("ordinal_encoder", customEncoderTranformer(columns=['bath']))
    ]
)

transformed_df = pipe.fit_transform(wakeCounty)

In [173]:
transformed_df.head()

Unnamed: 0,ID,acreage,total_sale_price,total_sale_date,Month_Year_of_Sale,year_of_sale,Year_Built,year_remodeled,Heated_Area,num_stories,design_style,bath,utilities,city,zip
0,19,0.21,34500,1/1/1974,January 1974,1974,1964,1964,1828,One Story,Split level,2.0,ALL,Raleigh,27610.0
1,20,0.46,35500,5/18/1983,May 1983,1983,1970,1970,1240,One Story,Conventional,0.0,E,Raleigh,27610.0
2,22,0.46,37500,9/16/2004,September 2004,2004,1900,1900,2261,One Story,Conventional,2.0,WSE,Wendell,27591.0
3,25,0.96,70000,1/1/1971,January 1971,1971,1971,1971,3770,One Story,Conventional,7.0,WGE,Raleigh,27613.0
4,30,0.47,380000,8/12/2015,August 2015,2015,1946,2017,1789,One Story,Conventional,2.0,ALL,Raleigh,27607.0


##### Key for 'bath'
- 0 : 1 Bath
- 1 : 1 ½ Bath
- 2 : 2 Bath
- 3 : 2½ Bath
- 4 : 3 Bath
- 5 : 3½ Bath
- 6 : None of Fixtures
- 7 : Other

#### Scaling with Sklearn.preprocessing

In [193]:
from sklearn import preprocessing

db = wakeCounty['acreage']
db_scaled = preprocessing.scale(db)

db_scaled

array([-0.12608609,  0.00892712,  0.00892712, ..., -0.13688715,
        2.14753641, -0.17469085])

In [194]:
print(db_scaled.mean(axis=0))
print(db_scaled.std(axis=0))

-1.0367164071503153e-16
0.9999999999999997
