In [1]:
import pandas as pd
import numpy as np

In [2]:
path = '/Users/ianwe/Documents/GitHub/PlanningAhead/Wake County Data/'

wakeCounty = pd.read_csv(path + 'WakeCountyHousing.csv')

wakeCounty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308292 entries, 0 to 308291
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Real_Estate_Id      308292 non-null  int64  
 1   Deeded_Acreage      308292 non-null  float64
 2   Total_Sale_Price    308292 non-null  int64  
 3   Total_Sale_Date     308292 non-null  object 
 4   Month_Year_of_Sale  308292 non-null  object 
 5   Year_of_Sale        308292 non-null  int64  
 6   Year_Built          308292 non-null  int64  
 7   Year_Remodeled      308292 non-null  int64  
 8   Heated_Area         308292 non-null  int64  
 9   Num_Stories         308292 non-null  object 
 10  Design_Style        308292 non-null  object 
 11  Bath                308275 non-null  object 
 12  Utilities           306324 non-null  object 
 13  Physical_City       308183 non-null  object 
 14  Physical_Zip        308146 non-null  float64
dtypes: float64(2), int64(6), object(7)

### Wake County
#### Clean the Data (deal with missing values)

In [3]:
wakeCounty.rename(columns={'Real_Estate_Id':'ID',
                            'Deeded_Acreage':'acreage',
                            'Total_Sale_Price':'total_sale_price',
                            'Total_Sale_Date':'total_sale_date',
                            'Month_Year_Sale':'month_year_sale',
                            'Year_of_Sale':'year_of_sale',
                            'Yeah_Build':'year_built',
                            'Year_Remodeled':'year_remodeled',
                            'Headed_Area':'headed_area',
                            'Num_Stories':'num_stories',
                            'Design_Style':'design_style',
                            'Bath':'bath',
                            'Utilities':'utilities',
                            'Physical_City':'city',
                            'Physical_Zip':'zip'
                          },inplace=True)
wakeCountyCopy = wakeCounty.copy() #This will be used at the end for the pipeline that does the entire process


wakeCounty["bath"] = wakeCounty["bath"].fillna("None of Fixtures")
wakeCounty["utilities"] = wakeCounty["utilities"].fillna("N/A")
wakeCounty["city"] = wakeCounty["city"].fillna("Outside of City Limits")
wakeCounty = wakeCounty.dropna()

wakeCounty.isnull().sum()

ID                    0
acreage               0
total_sale_price      0
total_sale_date       0
Month_Year_of_Sale    0
year_of_sale          0
Year_Built            0
year_remodeled        0
Heated_Area           0
num_stories           0
design_style          0
bath                  0
utilities             0
city                  0
zip                   0
dtype: int64

#### Use One-Hot Encoder

In [4]:
from sklearn.preprocessing import OneHotEncoder

#One hot encode 'design_style'
ohe = OneHotEncoder(sparse=False)
encodedDesign = ohe.fit_transform(wakeCounty[['design_style']])

### Making a Custom Transformer to use Ordinal Encoding

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder

In [6]:
class customEncoderTranformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        cols_to_transform = list(X.columns)
        
        if self.columns:
            cols_to_transform = self.columns
                   
        encoder = OrdinalEncoder()
        encoder.fit(X[cols_to_transform])
        X[cols_to_transform] = encoder.transform(X[cols_to_transform])
        return X

In [7]:
pipe = Pipeline(
    steps=[
        ("ordinal_encoder", customEncoderTranformer(columns=['bath','design_style','utilities','num_stories']))
    ]
)

transformed_df = pipe.fit_transform(wakeCounty)

In [8]:
transformed_df.head()
wakeCounty = transformed_df

#### Split the dataset to allow for future models to be made


In [9]:
from sklearn.model_selection import train_test_split
x = wakeCounty.drop(['total_sale_price','total_sale_date','Month_Year_of_Sale','city'], axis=1)
y = wakeCounty['total_sale_price']

trainX, testX, trainY, testY = train_test_split(x,y)

##### Key for 'bath' 
- 0 : 1 Bath
- 1 : 1 ½ Bath
- 2 : 2 Bath
- 3 : 2½ Bath
- 4 : 3 Bath
- 5 : 3½ Bath
- 6 : None of Fixtures
- 7 : Other

#### Scaling with Sklearn.preprocessing

In [10]:
from sklearn import preprocessing

db = wakeCounty['acreage']
db_scaled = preprocessing.scale(db)

#Pt 2
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(trainX)
trainX = scaler.transform(trainX)
testX = scaler.transform(testX)

In [11]:
print(db_scaled.mean(axis=0))
print(db_scaled.std(axis=0))

-1.0367164071503153e-16
0.9999999999999997


#### Use SGDClassifier

In [12]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

clf = SGDClassifier(loss="log")

In [13]:
#clf.fit(testX, testY)            #Note that by fitting trainX and trainY take a significantly longer amount of time
#y_pred = clf.predict(trainX)     # so for this we swapped which sets to fit, however this also greatly affects the 
                                 # accuracy since there far fewer rows of data to train with
#accuracy_score(trainY, y_pred)

#### Use sklearn.linear_model.LinearRegression

In [14]:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

lReg = LinearRegression()
  
lReg.fit(trainX, trainY)
print(lReg.score(testX, testY))

0.6831536395045472


#### Use sklearn.tree.DecisionTreeRegressor

In [15]:
from sklearn.tree import DecisionTreeRegressor

dtReg = DecisionTreeRegressor()

dtReg.fit(trainX, trainY)
print(dtReg.score(testX, testY))

0.7034362437627133


#### Create a single pipeline that does full process from data preparation to final prediction.

In [16]:
class clean(BaseEstimator, TransformerMixin):
    def __init__(self, drop=None, encode=None):
        self.drop = drop
        self.encode = encode
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        if self.drop:
            cols_to_drop = self.drop
            X = X.drop(cols_to_drop,axis=1)
        
        X = X.dropna()   
        
        if self.encode:
            cols_to_encode = self.encode           
            encoder = OrdinalEncoder()
            encoder.fit(X[cols_to_encode])
            X[cols_to_encode] = encoder.transform(X[cols_to_encode])
        return X

In [21]:
class model(BaseEstimator, TransformerMixin):
    def __init__(self, target=None):
        self.target = target
        
    def fit(self, X, y=None, target=None):
        if self.target:
            target = self.target
            x = X
            y = X.drop([target], axis=1)
            trainX, testX, trainY, testY = train_test_split(x,y)
            scaler = StandardScaler()
            scaler.fit(trainX)
            trainX = scaler.transform(trainX)
            testX = scaler.transform(testX)
            lReg = LinearRegression()
  
            lReg.fit(trainX, trainY)
            print(lReg.score(testX, testY))
        return self
    
    def transform(self, X, y=None):
        return X

In [22]:
 wakeCountyCopy.describe()

Unnamed: 0,ID,acreage,total_sale_price,year_of_sale,Year_Built,year_remodeled,Heated_Area,zip
count,308292.0,308292.0,308292.0,308292.0,308292.0,308292.0,308292.0,308146.0
mean,244524.805863,0.443446,273841.0,2009.583593,1994.992666,1995.810647,2267.37589,27572.340936
std,129858.542506,1.851256,190400.3,10.517202,19.671737,19.321347,997.488231,41.80028
min,19.0,0.0,0.0,1956.0,0.0,0.0,220.0,27501.0
25%,140864.75,0.14,150000.0,2004.0,1986.0,1986.0,1532.0,27526.0
50%,244362.5,0.24,235000.0,2013.0,1999.0,2000.0,2087.0,27591.0
75%,346481.25,0.42,350000.0,2018.0,2007.0,2008.0,2794.0,27610.0
max,484520.0,307.91,6100200.0,2021.0,2022.0,2209.0,21993.0,27713.0


In [23]:
pipe1 = Pipeline(steps=[
            ("clean_and_encode", clean(drop=['total_sale_date','Month_Year_of_Sale','city'], encode=['bath','design_style','utilities','num_stories'])),
            ("scale_and_model", model(target='total_sale_price'))
        ])
        

In [24]:
td = pipe1.fit_transform(wakeCountyCopy)

cleaning...
clean
modeling...
1.0
modeled
T
