In [4]:
import pandas as pd
import numpy as np

In [5]:
path = '/Users/ianwe/Documents/GitHub/PlanningAhead/Wake County Data/'

wakeCounty = pd.read_csv(path + 'WakeCountyHousing.csv')

wakeCounty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308292 entries, 0 to 308291
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Real_Estate_Id      308292 non-null  int64  
 1   Deeded_Acreage      308292 non-null  float64
 2   Total_Sale_Price    308292 non-null  int64  
 3   Total_Sale_Date     308292 non-null  object 
 4   Month_Year_of_Sale  308292 non-null  object 
 5   Year_of_Sale        308292 non-null  int64  
 6   Year_Built          308292 non-null  int64  
 7   Year_Remodeled      308292 non-null  int64  
 8   Heated_Area         308292 non-null  int64  
 9   Num_Stories         308292 non-null  object 
 10  Design_Style        308292 non-null  object 
 11  Bath                308275 non-null  object 
 12  Utilities           306324 non-null  object 
 13  Physical_City       308183 non-null  object 
 14  Physical_Zip        308146 non-null  float64
dtypes: float64(2), int64(6), object(7)

### Wake County
#### Clean the Data (deal with missing values)

In [6]:
wakeCounty.rename(columns={'Real_Estate_Id':'ID',
                            'Deeded_Acreage':'acreage',
                            'Total_Sale_Price':'total_sale_price',
                            'Total_Sale_Date':'total_sale_date',
                            'Month_Year_Sale':'month_year_sale',
                            'Year_of_Sale':'year_of_sale',
                            'Yeah_Build':'year_built',
                            'Year_Remodeled':'year_remodeled',
                            'Headed_Area':'headed_area',
                            'Num_Stories':'num_stories',
                            'Design_Style':'design_style',
                            'Bath':'bath',
                            'Utilities':'utilities',
                            'Physical_City':'city',
                            'Physical_Zip':'zip'
                          },inplace=True)
wakeCountyCopy = wakeCounty.copy() #This will be used at the end for the pipeline that does the entire process


wakeCounty["bath"] = wakeCounty["bath"].fillna("None of Fixtures")
wakeCounty["utilities"] = wakeCounty["utilities"].fillna("N/A")
wakeCounty["city"] = wakeCounty["city"].fillna("Outside of City Limits")
wakeCounty = wakeCounty.dropna()

wakeCounty.isnull().sum()

ID                    0
acreage               0
total_sale_price      0
total_sale_date       0
Month_Year_of_Sale    0
year_of_sale          0
Year_Built            0
year_remodeled        0
Heated_Area           0
num_stories           0
design_style          0
bath                  0
utilities             0
city                  0
zip                   0
dtype: int64

#### Use One-Hot Encoder

In [7]:
from sklearn.preprocessing import OneHotEncoder

#One hot encode 'design_style'
ohe = OneHotEncoder(sparse=False)
encodedDesign = ohe.fit_transform(wakeCounty[['design_style']])

#### Making a Custom Transformer to use Ordinal Encoding

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder

In [9]:
class customEncoderTranformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        cols_to_transform = list(X.columns)
        
        if self.columns:
            cols_to_transform = self.columns
                   
        encoder = OrdinalEncoder()
        encoder.fit(X[cols_to_transform])
        X[cols_to_transform] = encoder.transform(X[cols_to_transform])
        return X

In [10]:
pipe = Pipeline(
    steps=[
        ("ordinal_encoder", customEncoderTranformer(columns=['bath','design_style','utilities','num_stories']))
    ]
)

transformed_df = pipe.fit_transform(wakeCounty)

In [11]:
transformed_df.head()
wakeCounty = transformed_df

#### Split the dataset to allow for future models to be made


In [12]:
from sklearn.model_selection import train_test_split
x = wakeCounty.drop(['total_sale_price','total_sale_date','Month_Year_of_Sale','city'], axis=1)
y = wakeCounty['total_sale_price']

trainX, testX, trainY, testY = train_test_split(x,y)

#### Scaling with Sklearn.preprocessing

In [13]:
from sklearn import preprocessing

db = wakeCounty['acreage']
db_scaled = preprocessing.scale(db)

#Pt 2
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(trainX)
trainX = scaler.transform(trainX)
testX = scaler.transform(testX)

In [14]:
print(db_scaled.mean(axis=0))
print(db_scaled.std(axis=0))

-1.0367164071503153e-16
0.9999999999999997


#### Use SGDClassifier

In [15]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

clf = SGDClassifier(loss="log")

In [16]:
#clf.fit(testX, testY)            #Note that by fitting trainX and trainY take a significantly longer amount of time
#y_pred = clf.predict(trainY)     # so for this we swapped which sets to fit, however this also greatly affects the 
                                 # accuracy since there far fewer rows of data to train with
#accuracy_score(trainY, y_pred)

#### Use sklearn.linear_model.LinearRegression

In [17]:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

lReg = LinearRegression()
  
lReg.fit(trainX, trainY)
print(lReg.score(testX, testY))

0.6793503747000617


#### Use sklearn.tree.DecisionTreeRegressor

In [18]:
from sklearn.tree import DecisionTreeRegressor

dtReg = DecisionTreeRegressor()

dtReg.fit(trainX, trainY)
print(dtReg.score(testX, testY))

0.715392326327501


#### Use k-fold Cross Validation (cross_val_score)

In [24]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score, KFold

lasso = linear_model.Lasso()
cv = KFold(n_splits=10, random_state=1, shuffle=True)
print(cross_val_score(lasso, trainX, trainY, cv=cv))    
 

[0.69811941 0.65653376 0.67692802 0.67754133 0.6856723  0.69891819
 0.66540438 0.69873716 0.67841995 0.69826682]


#### Use StratifiedKFold cross validation

In [25]:
from sklearn.model_selection import StratifiedKFold
  
X = np.array(wakeCounty['acreage'])
y = np.array(wakeCounty['total_sale_price'])
skf = StratifiedKFold(n_splits=2)
skf.get_n_splits(X, y)

for train_index, test_index in skf.split(X, y):
    lasso = linear_model.Lasso()
    print(cross_val_score(lasso, trainX, trainY, cv=cv))    




[0.69811941 0.65653376 0.67692802 0.67754133 0.6856723  0.69891819
 0.66540438 0.69873716 0.67841995 0.69826682]
[0.69811941 0.65653376 0.67692802 0.67754133 0.6856723  0.69891819
 0.66540438 0.69873716 0.67841995 0.69826682]


#### Create a single pipeline that does full process from data preparation to final prediction.

In [20]:
class clean(BaseEstimator, TransformerMixin):
    def __init__(self, drop=None, encode=None):
        self.drop = drop
        self.encode = encode
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        if self.drop:
            cols_to_drop = self.drop
            X = X.drop(cols_to_drop,axis=1)
        
        X = X.dropna()   
        
        if self.encode:
            cols_to_encode = self.encode           
            encoder = OrdinalEncoder()
            encoder.fit(X[cols_to_encode])
            X[cols_to_encode] = encoder.transform(X[cols_to_encode])
        return X

In [21]:
class model(BaseEstimator, TransformerMixin):
    def __init__(self, target=None):
        self.target = target
        
    def fit(self, X, y=None, target=None):
        if self.target:
            target = self.target
            x = X
            y = X.drop([target], axis=1)
            trainX, testX, trainY, testY = train_test_split(x,y)
            scaler = StandardScaler()
            scaler.fit(trainX)
            trainX = scaler.transform(trainX)
            testX = scaler.transform(testX)
            lReg = LinearRegression()
  
            lReg.fit(trainX, trainY)
            print("Score: ",lReg.score(testX, testY))
        return self
    
    def transform(self, X, y=None):
        return X

In [22]:
pipe1 = Pipeline(steps=[
            ("clean_and_encode", clean(drop=['total_sale_date','Month_Year_of_Sale','city'], encode=['bath','design_style','utilities','num_stories'])),
            ("scale_and_model", model(target='total_sale_price'))
        ])
        

In [23]:
td = pipe1.fit_transform(wakeCountyCopy)

Score:  1.0
