In [None]:
import pandas as pd
import numpy as np
from datetime import datetime as dt

In [None]:
# Load the raw Zillow export. No dtype= override is passed (an earlier attempt
# with ['int32', 'S10'] is not in use), so pandas infers every column type.
df = pd.read_csv(filepath_or_buffer="Zillow_data_20191023_(Raleigh Durham Cary).csv")

In [None]:
# Preview the first rows of the frame that was just loaded.
df.head()

In [None]:
#### Work on an independent copy so the raw frame `df` stays untouched by the
#### cleaning steps. The price bins used as the prediction target are derived
#### from the 'zestimate' column; the remaining columns are candidate inputs.

housing_data = df.copy(deep=True)

In [None]:
## Discard rows that have no 'zestimate' value; that column is what the price
## bins are computed from, so rows without it cannot be labelled.

print(f"Shape of dataset before cleaning is {housing_data.shape}")
housing_data = housing_data.dropna(subset=['zestimate'])
print(f"Shape of dataset after cleaning is {housing_data.shape}")

In [None]:
# Summary statistics of the cleaned frame.
housing_data.describe()

In [None]:
# Report the range of the estimated prices.
highest_estimate = housing_data['zestimate'].max()
lowest_estimate = housing_data['zestimate'].min()
print(f"Maximum Estimated price is {highest_estimate}")
print(f"Minimum Estimated price is {lowest_estimate}")

In [None]:
# 25th percentile of the estimated price, as reported by describe().
housing_data['zestimate'].describe().loc['25%']

In [None]:
#### Turn the price into a 5-class label for classification: 'zestimate' is cut
#### at its 20th/40th/60th/80th percentiles, and each home receives the label
#### ('1' .. '5') of the quantile bin it falls into.

bin_labels = ['1', '2', '3', '4', '5']
housing_data = housing_data.assign(
    price_bin=pd.qcut(
        housing_data['zestimate'],
        q=[0, .2, .4, .6, .8, 1],
        labels=bin_labels,
    )
)

In [None]:
# Preview the frame, which now carries the price_bin column.
housing_data.head()

In [None]:
###### 'zestimate' was only needed to build the bins, so it is left out of the
###### modelling frame; the task is classification of the bin, not regression
###### on the price. drop() returns a new frame, so housing_data is unchanged.
ml_house_prices = housing_data.drop(columns="zestimate")
ml_house_prices.head()

In [None]:
# Prediction target: the quantile-bin label of every home.
# The column is read from housing_data (the frame price_bin was added to, and of
# which ml_house_prices is a copy) instead of from ml_house_prices, because
# price_bin is removed from ml_house_prices further down; reading it from there
# made this cell raise KeyError when re-run after that removal.
target = housing_data['price_bin']
# Class names, in bin order; this is the same list object as bin_labels.
target_names = bin_labels

In [None]:
##### Remove price_bin from the feature frame, since it is the prediction target.
##### A non-mutating drop with errors="ignore" is used so that running this cell
##### a second time is a no-op instead of a KeyError on the missing column.

ml_house_prices = ml_house_prices.drop(columns="price_bin", errors="ignore")
ml_house_prices.head()

In [None]:
###### Build the training frame by leaving out the columns that are not used as
###### features. drop() returns a new DataFrame, so ml_house_prices keeps all of
###### its columns.

ml_house_prices_2 = ml_house_prices.drop(
    columns=[
        "@id",
        "@name",
        "FIPScounty",
        "city",
        "lastSoldDate",
        "lastSoldPrice",
        "links",
        "state",
        "street",
        "taxAssessmentYear",
        "useCode",
        "zindexValue",
        "zipcode",
        "zpid",
    ]
)

####### Show which columns remain as model inputs.
print(ml_house_prices_2.columns)

In [None]:
# Replace the "@type" column with an integer encoding stored in "type-codes".
# pandas category codes are used, so missing "@type" values receive the code -1.
# The guard makes the cell safe to re-run: once "@type" has been dropped, a
# second execution used to fail with KeyError; now it leaves the frame as is.
if "@type" in ml_house_prices_2.columns:
    ml_house_prices_2["type-codes"] = (
        ml_house_prices_2["@type"].astype("category").cat.codes
    )
    ml_house_prices_2 = ml_house_prices_2.drop(columns="@type")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation. No test_size is given, so
# scikit-learn's default split proportion applies; the fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    ml_house_prices_2,
    target,
    random_state=42,
)

In [None]:
# Convert the training split to NumPy arrays, letting np.nan_to_num replace
# NaN / infinite entries with finite numbers.
# NOTE(review): only the training split is cleaned here; X_test / y_test are
# left as returned by train_test_split and presumably need the same treatment
# before they are passed to the model -- confirm against the later cells.
# NOTE(review): y_train holds the string bin labels, so this call is presumably
# just a conversion to ndarray for it -- verify.
X_train, y_train = np.nan_to_num(X_train), np.nan_to_num(y_train)

In [None]:
# Log the wall-clock time at which the model-fitting section begins.
print(f"Model Fit activities started at {dt.now():%d-%b-%Y %I:%M:%S %p}")

In [None]:
# Fit a polynomial-kernel support vector classifier on the training split and
# log wall-clock timestamps before and after, since the fit can take a while.
start_date_time = dt.now()
print(f"Model fit activities started at {start_date_time:%d-%b-%Y %I:%M:%S %p}")

from sklearn.svm import SVC

# NOTE(review): X_train is passed without any feature scaling in the visible
# cells; scikit-learn's SVM guide recommends scaling inputs -- consider it if
# fitting is slow or accuracy is poor.
# fit() returns the estimator itself, so `model` is the fitted SVC.
model = SVC(kernel='poly', random_state=42).fit(X_train, y_train)

end_date_time = dt.now()
print(f"Model fit ended at {end_date_time:%d-%b-%Y %I:%M:%S %p}")