In [None]:
import os

import numpy as np
import pandas as pd

In [42]:
# Directory that holds the Wake County CSV files.
# Set the WAKE_COUNTY_DATA_DIR environment variable to point at your own clone
# of the repo; when it is unset the original author's local path is used, so
# update that default if it does not exist on your machine.
path = os.path.join(
    os.environ.get(
        'WAKE_COUNTY_DATA_DIR',
        '/Users/ianwe/Documents/GitHub/PlanningAhead/Wake County Data/',
    ),
    '',  # guarantees a trailing separator so `path + filename` keeps working
)

# Load the raw housing sales records.
wakeCounty = pd.read_csv(path + 'WakeCountyHousing.csv')

# Summarise column names, dtypes and non-null counts.
wakeCounty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308292 entries, 0 to 308291
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Real_Estate_Id      308292 non-null  int64  
 1   Deeded_Acreage      308292 non-null  float64
 2   Total_Sale_Price    308292 non-null  int64  
 3   Total_Sale_Date     308292 non-null  object 
 4   Month_Year_of_Sale  308292 non-null  object 
 5   Year_of_Sale        308292 non-null  int64  
 6   Year_Built          308292 non-null  int64  
 7   Year_Remodeled      308292 non-null  int64  
 8   Heated_Area         308292 non-null  int64  
 9   Num_Stories         308292 non-null  object 
 10  Design_Style        308292 non-null  object 
 11  Bath                308275 non-null  object 
 12  Utilities           306324 non-null  object 
 13  Physical_City       308183 non-null  object 
 14  Physical_Zip        308146 non-null  float64
dtypes: float64(2), int64(6), object(7)

### Wake County
#### Clean the Data (deal with missing values)

In [43]:
# Normalise column names to short snake_case labels.
#
# NOTE(review): the original mapping also listed 'Month_Year_Sale',
# 'Yeah_Build' and 'Headed_Area'. None of those match a real column (the data
# has 'Month_Year_of_Sale', 'Year_Built' and 'Heated_Area'), and
# DataFrame.rename silently ignores unknown keys, so those entries never had
# any effect. They are removed here and the three columns keep their original
# names, because the feature-selection cell further down drops
# 'Month_Year_of_Sale' by that exact name. Rename them only together with
# every later reference.
column_names = {
    'Real_Estate_Id': 'ID',
    'Deeded_Acreage': 'acreage',
    'Total_Sale_Price': 'total_sale_price',
    'Total_Sale_Date': 'total_sale_date',
    'Year_of_Sale': 'year_of_sale',
    'Year_Remodeled': 'year_remodeled',
    'Num_Stories': 'num_stories',
    'Design_Style': 'design_style',
    'Bath': 'bath',
    'Utilities': 'utilities',
    'Physical_City': 'city',
    'Physical_Zip': 'zip',
}
# Reassign instead of using inplace=True so the cell reads as a plain
# input -> output step.
wakeCounty = wakeCounty.rename(columns=column_names)

# Replace missing categorical values with explicit placeholder labels, then
# drop any row that still contains a null in some other column.
placeholder_labels = {
    "bath": "None of Fixtures",
    "utilities": "N/A",
    "city": "Outside of City Limits",
}
wakeCounty = wakeCounty.fillna(placeholder_labels).dropna()

# Confirm that no nulls remain (every count should be 0).
wakeCounty.isnull().sum()

ID                    0
acreage               0
total_sale_price      0
total_sale_date       0
Month_Year_of_Sale    0
year_of_sale          0
Year_Built            0
year_remodeled        0
Heated_Area           0
num_stories           0
design_style          0
bath                  0
utilities             0
city                  0
zip                   0
dtype: int64

#### Use One-Hot Encoder

In [44]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode 'design_style' into a dense indicator matrix.
# scikit-learn renamed OneHotEncoder's `sparse` argument to `sparse_output`
# (added in 1.2, old name removed in 1.4), so try the new keyword first and
# fall back to the old one for older installations.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
encodedDesign = ohe.fit_transform(wakeCounty[['design_style']])

### Making a Custom Transformer to use Ordinal Encoding

In [45]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder

In [46]:
class customEncoderTranformer(BaseEstimator, TransformerMixin):
    """Ordinal-encode selected DataFrame columns inside a scikit-learn pipeline.

    Parameters
    ----------
    columns : list of str, optional
        Names of the columns to encode. ``None`` (or an empty list) means
        every column of the frame passed in.

    Attributes
    ----------
    columns_ : list of str
        Columns resolved during ``fit``.
    encoder_ : OrdinalEncoder
        Encoder fitted during ``fit`` and reused by every ``transform`` call,
        so the category -> integer mapping is the same for all data sets.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def _resolve_columns(self, X):
        """Return the explicit column list, or all columns of ``X`` if none was given."""
        if self.columns:
            return list(self.columns)
        return list(X.columns)

    def fit(self, X, y=None):
        """Learn the category -> integer mapping from ``X`` and return ``self``."""
        self.columns_ = self._resolve_columns(X)
        self.encoder_ = OrdinalEncoder().fit(X[self.columns_])
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the selected columns replaced by ordinal codes.

        The input frame is left untouched.
        """
        encoder = getattr(self, "encoder_", None)
        if encoder is None:
            # Backward compatibility: earlier versions fitted a fresh encoder
            # on whatever was passed to transform(). Keep that behaviour when
            # fit() was never called, without storing any state.
            cols_to_transform = self._resolve_columns(X)
            encoder = OrdinalEncoder().fit(X[cols_to_transform])
        else:
            cols_to_transform = self.columns_

        # Work on a copy so the caller's DataFrame is not modified in place.
        encoded = X.copy()
        encoded[cols_to_transform] = encoder.transform(X[cols_to_transform])
        return encoded

In [47]:
# Categorical columns whose labels are converted to integer codes.
ordinal_columns = ['bath', 'design_style', 'utilities', 'num_stories']

# Single-step pipeline wrapping the custom ordinal encoder.
ordinal_step = ("ordinal_encoder", customEncoderTranformer(columns=ordinal_columns))
pipe = Pipeline(steps=[ordinal_step])

# Fit the pipeline on the cleaned frame and encode it in one call.
transformed_df = pipe.fit_transform(wakeCounty)

In [48]:
# Continue under the original name with the encoded frame, then preview it.
# A bare expression is only rendered when it is the last line of a cell, so
# the preview has to come after the assignment.
wakeCounty = transformed_df
wakeCounty.head()

##### Key for 'bath' 
- 0 : 1 Bath
- 1 : 1 ½ Bath
- 2 : 2 Bath
- 3 : 2½ Bath
- 4 : 3 Bath
- 5 : 3½ Bath
- 6 : None of Fixtures
- 7 : Other

#### Scaling with `sklearn.preprocessing`

In [49]:
from sklearn import preprocessing

# Standardise the acreage column to zero mean and unit variance.
db = wakeCounty.loc[:, 'acreage']
db_scaled = preprocessing.scale(db, axis=0, with_mean=True, with_std=True)

# Show the scaled values.
db_scaled

array([-0.12608609,  0.00892712,  0.00892712, ..., -0.13688715,
        2.14753641, -0.17469085])

In [50]:
# Sanity check: print the mean, then the standard deviation, of the scaled
# acreage (expected to be ~0 and ~1 respectively).
for summary_stat in (db_scaled.mean(axis=0), db_scaled.std(axis=0)):
    print(summary_stat)

-1.0367164071503153e-16
0.9999999999999997


In [51]:
from sklearn.model_selection import train_test_split
# Feature matrix: everything except the target and the remaining text columns
# (sale date strings and the city name).
# NOTE(review): 'ID' and 'zip' stay in the features as plain numbers; confirm
# that is intended.
x = wakeCounty.drop(columns=['total_sale_price','total_sale_date','Month_Year_of_Sale','city'])

# Target: the sale price.
y = wakeCounty['total_sale_price']

# Fix the seed so the train/test partition is identical on every run.
trainX, testX, trainY, testY = train_test_split(x, y, random_state=42)

In [None]:
from sklearn.linear_model import SGDClassifier, SGDRegressor

# The target ('total_sale_price') is a continuous amount, so this is a
# regression problem. A hinge-loss SGDClassifier would treat every distinct
# price as its own class, which is neither meaningful nor tractable here, so
# a linear SGD regressor is used instead.
# The variable keeps the name `clf` because the following cells call
# clf.fit(...) and clf.predict(...).
clf = SGDRegressor(random_state=42)

In [53]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply that
# same transformation to both splits.
scaler = StandardScaler()
trainX = scaler.fit_transform(trainX)
testX = scaler.transform(testX)

In [None]:
# Train the model on the scaled training features and their sale prices.
clf.fit(X=trainX, y=trainY)

In [None]:
# Predict on the held-out split. The original referenced `textX`, a name that
# is never defined; the scaled test features are stored in `testX`.
y_pred = clf.predict(testX)