In [1]:
import pandas as pd
import numpy as np
from datetime import datetime as dt

In [2]:
# Read the raw Zillow export into a DataFrame; column dtypes are left for pandas to infer.
zillow_csv_path = "Zillow_data_20191023_(Raleigh Durham Cary).csv"
df = pd.read_csv(zillow_csv_path)

In [3]:
# Preview the first five raw rows.
df.head(n=5)

Unnamed: 0,@id,@name,@type,FIPScounty,bathrooms,bedrooms,city,finishedSqFt,lastSoldDate,lastSoldPrice,...,street,taxAssessment,taxAssessmentYear,totalRooms,useCode,yearBuilt,zestimate,zindexValue,zipcode,zpid
0,270325,Southeast Raleigh,neighborhood,37183.0,3.0,3.0,Raleigh,1618.0,4/2/2019,19000.0,...,3915 Alder Grove Ln,28000.0,2018.0,,SingleFamily,2019.0,,149300,27610,79886745
1,54047,Raleigh,city,37183.0,3.0,4.0,Raleigh,2537.0,,,...,3207 Britmass Dr,40000.0,2018.0,,SingleFamily,2019.0,,221100,27616,215491685
2,343869,Northeast Raleigh,neighborhood,37183.0,3.0,4.0,Raleigh,1773.0,12/1/1989,102500.0,...,2004 Carthage Cir,159269.0,2018.0,7.0,SingleFamily,1989.0,224901.0,171900,27604,6504197
3,54047,Raleigh,city,37183.0,4.0,3.0,Raleigh,4617.0,5/16/2003,575000.0,...,9924 Waterview Rd,618168.0,2018.0,12.0,SingleFamily,1994.0,869696.0,221100,27615,6483992
4,270318,North Raleigh,neighborhood,37183.0,3.0,4.0,Raleigh,3066.0,5/27/2004,364000.0,...,2047 Hornbeck Ct,435002.0,2018.0,14.0,SingleFamily,2003.0,469902.0,270100,27614,65332699


In [4]:
# Keep the raw frame `df` untouched and do all cleaning on a separate deep copy.
# The classification target is derived later from the `zestimate` column;
# the remaining columns are candidate model inputs.
housing_data = df.copy(deep=True)

In [5]:
## Clean the dataset: drop data points where `zestimate` is not available,
## because the price bins used as the prediction target are built from it.

print(f"Shape of dataset before cleaning is {housing_data.shape}")
# Reassign instead of mutating in place; .copy() gives an independent frame so
# adding columns to it later does not raise SettingWithCopyWarning.
housing_data = housing_data.dropna(subset=['zestimate']).copy()
assert housing_data['zestimate'].notna().all(), "zestimate still contains missing values"
print(f"Shape of dataset after cleaning is {housing_data.shape}")

Shape of dataset before cleaning is (1683, 25)
Shape of dataset after cleaning is (1594, 25)


In [6]:
# Summary statistics for the numeric columns (quartiles spelled out explicitly).
housing_data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,@id,FIPScounty,bathrooms,bedrooms,finishedSqFt,lastSoldPrice,latitude,longitude,lotSizeSqFt,taxAssessment,taxAssessmentYear,totalRooms,yearBuilt,zestimate,zipcode,zpid
count,1594.0,1405.0,1589.0,1589.0,1593.0,1242.0,1594.0,1594.0,1491.0,1391.0,1396.0,980.0,1518.0,1594.0,1594.0,1594.0
mean,311732.245922,37140.024911,3.231278,3.697294,2722.768362,337951.0,35.877602,-78.776342,23278.84,330118.3,2017.984241,9.121429,1991.719368,469912.3,27623.569636,299435200.0
std,299234.568474,57.835237,1.248111,0.87557,1338.966652,400311.6,0.104047,0.130304,56234.12,280254.6,0.124588,39.792651,25.185985,305905.4,71.687343,657493800.0
min,24457.0,37037.0,1.0,1.0,544.0,2500.0,35.59441,-79.033768,871.0,4800.0,2017.0,2.0,1888.0,75526.0,27511.0,6375533.0
25%,51297.0,37063.0,2.5,3.0,1811.0,162000.0,35.801305,-78.893846,7448.5,165807.0,2018.0,6.0,1980.0,290714.0,27603.0,6533989.0
50%,270318.0,37183.0,3.0,4.0,2485.0,265500.0,35.87686,-78.799619,11369.0,259210.0,2018.0,8.0,1998.0,390623.5,27614.0,50028110.0
75%,343876.0,37183.0,4.0,4.0,3272.0,399000.0,35.956491,-78.665335,20037.0,401979.5,2018.0,10.0,2009.0,553791.0,27703.0,98212940.0
max,816439.0,37183.0,10.0,7.0,12174.0,8528500.0,36.130052,-78.454965,1176120.0,2741296.0,2018.0,1250.0,2019.0,3695643.0,27713.0,2141154000.0


In [7]:
# Report the range of the Zestimate values that remain after cleaning.
zestimates = housing_data['zestimate']
highest_estimate = zestimates.max()
lowest_estimate = zestimates.min()
print(f"Maximum Estimated price is {highest_estimate}")
print(f"Minimum Estimated price is {lowest_estimate}")

Maximum Estimated price is 3695643.0
Minimum Estimated price is 75526.0


In [8]:
# First quartile (25th percentile) of the Zestimate values.
housing_data['zestimate'].quantile(0.25)

290714.0

In [9]:
# Turn the continuous Zestimate into a 5-class target: split it at the
# 20/40/60/80th percentiles so every class holds roughly the same number of
# homes, labelled '1' (lowest estimates) through '5' (highest estimates).
# The model will try to predict which of these groups a home falls into.
bin_labels = [str(rank) for rank in range(1, 6)]
quintile_edges = [0, .2, .4, .6, .8, 1]
housing_data['price_bin'] = pd.qcut(
    housing_data['zestimate'],
    q=quintile_edges,
    labels=bin_labels,
)

In [10]:
# Preview the cleaned rows together with their new price_bin label.
housing_data.head(n=5)

Unnamed: 0,@id,@name,@type,FIPScounty,bathrooms,bedrooms,city,finishedSqFt,lastSoldDate,lastSoldPrice,...,taxAssessment,taxAssessmentYear,totalRooms,useCode,yearBuilt,zestimate,zindexValue,zipcode,zpid,price_bin
2,343869,Northeast Raleigh,neighborhood,37183.0,3.0,4.0,Raleigh,1773.0,12/1/1989,102500.0,...,159269.0,2018.0,7.0,SingleFamily,1989.0,224901.0,171900,27604,6504197,1
3,54047,Raleigh,city,37183.0,4.0,3.0,Raleigh,4617.0,5/16/2003,575000.0,...,618168.0,2018.0,12.0,SingleFamily,1994.0,869696.0,221100,27615,6483992,5
4,270318,North Raleigh,neighborhood,37183.0,3.0,4.0,Raleigh,3066.0,5/27/2004,364000.0,...,435002.0,2018.0,14.0,SingleFamily,2003.0,469902.0,270100,27614,65332699,4
5,343864,West Raleigh,neighborhood,37183.0,3.0,4.0,Raleigh,2979.0,7/31/2007,381500.0,...,343669.0,2018.0,8.0,SingleFamily,1974.0,442602.0,173400,27606,6382773,3
6,270318,North Raleigh,neighborhood,37183.0,4.0,4.0,Raleigh,4977.0,7/8/2014,665000.0,...,673517.0,2018.0,8.0,SingleFamily,1997.0,760869.0,270100,27615,6551268,5


In [11]:
###### Drop `zestimate`: the column was needed only to generate the price
###### bins, and keeping it as a feature would leak the target into the inputs.
###### The model trained below is a classifier over those bins, not a regressor.
# drop() returns a new frame, so housing_data itself is left unchanged.
ml_house_prices = housing_data.drop(columns="zestimate")
ml_house_prices.head()

Unnamed: 0,@id,@name,@type,FIPScounty,bathrooms,bedrooms,city,finishedSqFt,lastSoldDate,lastSoldPrice,...,street,taxAssessment,taxAssessmentYear,totalRooms,useCode,yearBuilt,zindexValue,zipcode,zpid,price_bin
2,343869,Northeast Raleigh,neighborhood,37183.0,3.0,4.0,Raleigh,1773.0,12/1/1989,102500.0,...,2004 Carthage Cir,159269.0,2018.0,7.0,SingleFamily,1989.0,171900,27604,6504197,1
3,54047,Raleigh,city,37183.0,4.0,3.0,Raleigh,4617.0,5/16/2003,575000.0,...,9924 Waterview Rd,618168.0,2018.0,12.0,SingleFamily,1994.0,221100,27615,6483992,5
4,270318,North Raleigh,neighborhood,37183.0,3.0,4.0,Raleigh,3066.0,5/27/2004,364000.0,...,2047 Hornbeck Ct,435002.0,2018.0,14.0,SingleFamily,2003.0,270100,27614,65332699,4
5,343864,West Raleigh,neighborhood,37183.0,3.0,4.0,Raleigh,2979.0,7/31/2007,381500.0,...,4404 Driftwood Dr,343669.0,2018.0,8.0,SingleFamily,1974.0,173400,27606,6382773,3
6,270318,North Raleigh,neighborhood,37183.0,4.0,4.0,Raleigh,4977.0,7/8/2014,665000.0,...,421 Chatterson Dr,673517.0,2018.0,8.0,SingleFamily,1997.0,270100,27615,6551268,5


In [12]:
# Class names for the target are the same labels that were used to build the bins.
target_names = bin_labels
# Prediction target: the price bin assigned to each home.
target = ml_house_prices.loc[:, 'price_bin']

In [13]:
##### price_bin is the prediction target, so remove it from the feature frame.

ml_house_prices.drop(columns=["price_bin"], inplace=True)
ml_house_prices.head(n=5)

Unnamed: 0,@id,@name,@type,FIPScounty,bathrooms,bedrooms,city,finishedSqFt,lastSoldDate,lastSoldPrice,...,state,street,taxAssessment,taxAssessmentYear,totalRooms,useCode,yearBuilt,zindexValue,zipcode,zpid
2,343869,Northeast Raleigh,neighborhood,37183.0,3.0,4.0,Raleigh,1773.0,12/1/1989,102500.0,...,NC,2004 Carthage Cir,159269.0,2018.0,7.0,SingleFamily,1989.0,171900,27604,6504197
3,54047,Raleigh,city,37183.0,4.0,3.0,Raleigh,4617.0,5/16/2003,575000.0,...,NC,9924 Waterview Rd,618168.0,2018.0,12.0,SingleFamily,1994.0,221100,27615,6483992
4,270318,North Raleigh,neighborhood,37183.0,3.0,4.0,Raleigh,3066.0,5/27/2004,364000.0,...,NC,2047 Hornbeck Ct,435002.0,2018.0,14.0,SingleFamily,2003.0,270100,27614,65332699
5,343864,West Raleigh,neighborhood,37183.0,3.0,4.0,Raleigh,2979.0,7/31/2007,381500.0,...,NC,4404 Driftwood Dr,343669.0,2018.0,8.0,SingleFamily,1974.0,173400,27606,6382773
6,270318,North Raleigh,neighborhood,37183.0,4.0,4.0,Raleigh,4977.0,7/8/2014,665000.0,...,NC,421 Chatterson Dr,673517.0,2018.0,8.0,SingleFamily,1997.0,270100,27615,6551268


In [14]:
###### Remove the columns that are not used as training features.
###### The trimmed result is a separate DataFrame, so ml_house_prices keeps every column.

unused_columns = [
    "@id", "@name", "FIPScounty", "city",
    "lastSoldDate", "lastSoldPrice", "links",
    'state', 'street', 'taxAssessmentYear',
    'useCode', 'zindexValue', 'zipcode', 'zpid',
]
ml_house_prices_2 = ml_house_prices.drop(columns=unused_columns)

####### Print a list of all columns in the dataframe.
print(ml_house_prices_2.columns)

Index(['@type', 'bathrooms', 'bedrooms', 'finishedSqFt', 'latitude',
       'longitude', 'lotSizeSqFt', 'taxAssessment', 'totalRooms', 'yearBuilt'],
      dtype='object')


In [15]:
# Encode the textual "@type" column as integer category codes ("type-codes")
# and remove the original text column from the feature frame.
type_column = ml_house_prices_2.pop("@type")
ml_house_prices_2["type-codes"] = type_column.astype("category").cat.codes

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows (train_test_split's default share) for evaluation;
# the fixed seed keeps the split reproducible.
split_parts = train_test_split(
    ml_house_prices_2,
    target,
    test_size=0.25,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [17]:
# Impute missing feature values with per-column medians learned from the
# training split only. A plain zero fill (np.nan_to_num alone) turns missing
# values in columns such as yearBuilt and lotSizeSqFt into literal 0s, which
# are extreme outliers for a kernel-based model.
train_features = pd.DataFrame(X_train)  # also accepts an ndarray, so the cell can be re-run
train_medians = train_features.median()
# nan_to_num converts to a float ndarray and zero-fills only what the median
# could not cover (a column that is entirely missing in the training split).
X_train = np.nan_to_num(train_features.fillna(train_medians))

# Give the held-out split the same treatment, using the *training* medians to
# avoid leaking test information; X_test stays a DataFrame.
X_test = pd.DataFrame(X_test).fillna(train_medians)

# The labels have no missing values (rows without a zestimate were dropped
# before binning), so they only need converting to a plain array.
y_train = np.asarray(y_train)

In [18]:
# Log the wall-clock time just before the model-fitting cell runs.
print(f"Model Fit activities started at {dt.now():%d-%b-%Y %I:%M:%S %p}")

Model Fit activities started at 26-Oct-2019 11:53:27 PM


In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Fit a polynomial-kernel SVM classifier on the training split and log the
# wall-clock start/end times of the fit.
start_date_time = dt.now()
print(f"Model fit activities started at {start_date_time.strftime('%d-%b-%Y %I:%M:%S %p')}")

# SVMs are not scale invariant, and the features span very different
# magnitudes (latitude is around 35 while taxAssessment reaches the millions).
# Fitting the polynomial kernel on the raw values is what the recorded run of
# this cell did, and it printed a start message but never an end message.
# Standardising inside a pipeline keeps the fit tractable, and the same
# scaling is then applied automatically whenever `model` predicts or scores.
model = make_pipeline(
    StandardScaler(),
    SVC(kernel='poly', random_state=42),
)
model.fit(X_train, y_train)

end_date_time = dt.now()
print(f"Model fit ended at {end_date_time.strftime('%d-%b-%Y %I:%M:%S %p')}")

Model fit activities started at 26-Oct-2019 11:53:28 PM


