In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mpl_toolkits
from scipy.stats import skew,norm

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder 
from sklearn import metrics 

%matplotlib inline
pd.options.display.float_format = '{:.1f}'.format
from scipy import stats

import tensorflow as tf
from tensorflow import keras

In [None]:
def display(pandas):
    """Render a pandas object in full, without row or column truncation.

    The display limits are lifted only for the duration of the call and are
    restored afterwards.

    Parameters
    ----------
    pandas : object
        The object to show (typically a DataFrame or Series).
    """
    # This function shadows the notebook's built-in ``display``, so the real
    # renderer is imported under a private alias; calling ``display`` here
    # would recurse forever.
    try:
        from IPython.display import display as _render
    except ImportError:  # plain Python session without IPython
        _render = print
    # ``pd.set_option`` is not a context manager; ``pd.option_context`` is.
    with pd.option_context('display.max_columns', None, 'display.max_rows', None):
        _render(pandas)



In [None]:
# Mount Google Drive at /content/drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

### Read  Parcel Data

In [None]:
# Read the parcel CSV from the mounted Drive folder and preview the frame.
parcel_csv_path = "/content/drive/MyDrive/Colab Notebooks/Parceldata.csv"
housing = pd.read_csv(parcel_csv_path)
housing

### Check parcel columns 

In [None]:
# Show the column labels of the loaded parcel frame.
housing.columns

### Check count of features for Crescent and Wedge 

In [None]:
# Bar chart of the number of rows per 'Divide' category.
ax = housing['Divide'].value_counts().plot(kind='bar')
ax.set_title('Geographic Variation')
ax.set_xlabel('Divide')
ax.set_ylabel('Count')
# The original referenced `sns.despine` without calling it, which did nothing.
sns.despine()

### Check number of null values in each of the variables 

In [None]:
# pd.set_option('display.max_columns', None, 'display.max_rows', None)

In [None]:
# Percentage of missing values per column, listed in column-name order.
missing_counts = housing.isnull().sum()
Nullpercent = missing_counts / len(housing) * 100
Nullpercent.sort_index()

#### Drop  variables that have null values over 30%

In [None]:
# Keep only the columns whose non-null count reaches 70% of the row count.
min_non_null = 0.70 * len(housing)
housing = housing.dropna(axis='columns', thresh=min_non_null)
housing

### Create an Age column (2020 minus year built)

In [None]:
# Age is measured relative to 2020; 'yearbuilt' is dropped once Age exists.
building_age = 2020 - housing["yearbuilt"]
housing = housing.assign(Age=building_age).drop(columns="yearbuilt")
housing

###  Drop unwanted columns 

In [None]:
# Columns removed before modelling. Each label is listed once; the original
# list repeated 'ownerno', 'TotalFamilies', 'codemunicipality' and 'cdebuilding'.
unwanted_columns = [
    'FULL_ADDRESS', 'OID_', 'ORIG_FID', 'city', 'mailaddr1',
    'map_block', 'map_book', 'map_page', 'municipality', 'ownerlastname',
    'ownerfirstname', 'ownertype', 'parcel_type',
    'pid', 'state', 'stdir', 'stname', 'stsuffix', 'sttype', 'taxfire',
    'accounttype', 'descbuildingtype',
    'propertyusecode', 'netbldgvalue', 'extravaluefeature', 'landvalue', 'totalvalue',
    'lot_num', 'nc_pin', 'legal_from', 'commonpid', 'taxpid', 'houseno', 'cardno',
    'deedbook', 'deedpage', 'legalreference',
    'neighbourhood', 'parlegaldesc', 'landsequenceno', 'FamilyPovertyRate',
    'Total_Households', 'FamiliesInPoverty', 'FID_Parcels',
    'TotalFamilies', 'typeofdeed', 'taxmun', 'landusecode', 'physicaldepcode',
    'ownerno', 'codemunicipality', 'zipcode',
    'vacantorimproved', 'foundation', 'extwall', 'FID_HHincome',
    'condo_town_flag', 'cdebuilding', 'actype', 'aheatingtype', 'fireplaces',
]
housing = housing.drop(columns=unwanted_columns)


In [None]:
# Show the column labels currently in housing.
housing.columns

### Check statistics for each variable

In [None]:
# Preview the first rows of housing.
housing.head()

In [None]:
# Drop every row that has a missing value in any column.
housing = housing.dropna()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
housing.describe().transpose() 

In [None]:
# Percentage of missing values per column.
cleandata = housing.isnull().sum().div(len(housing)).mul(100)
cleandata

In [None]:
# Show the dtype of each column in housing.
housing.dtypes

### Review and convert categorical variables

In [None]:
# Display housing.
housing 

In [None]:
 housing.groupby("storyheight").size()  # number of rows per storyheight value

In [None]:
# Both replacements use a single `.loc[row_mask, column]` assignment.
# The original chained form (`housing.col[mask] = value`) can write to a
# temporary object and leave `housing` unchanged.

## Rename GOOD 06 as GOOD
housing.loc[housing["bldggrade"] == "GOOD 06", "bldggrade"] = "GOOD"

## Rename Blank story height as 1 STORY
housing.loc[housing["storyheight"] == " ", "storyheight"] = "1 STORY"

#### Use LabelEncoder to convert categorical variables to numeric

In [None]:
# One encoder instance is re-fitted on each source column in turn; every
# encoded result is stored in a new '<column>_transformed' column.
Labelencoder = LabelEncoder()

for source_column in ("Divide", "bldggrade", "heatedfuel", "storyheight"):
    housing[source_column + "_transformed"] = Labelencoder.fit_transform(housing[source_column])
housing


In [None]:
# ## Convert divide to 1 and 0, Crescent = 1, Wedge = 0
# housing.Divide[housing.Divide == "Crescent"] = 1
# housing.Divide[housing.Divide == "Wedge"] = 0

# ###Convert building grade 
# housing.bldggrade[housing.bldggrade == "AVERAGE"] = 1
# housing.bldggrade[housing.bldggrade == "CUSTOM"] = 2
# housing.bldggrade[housing.bldggrade == "EXCELLENT"] = 3
# housing.bldggrade[housing.bldggrade == "FAIR"] = 4
# housing.bldggrade[housing.bldggrade == "GOOD"] = 5
# housing.bldggrade[housing.bldggrade == "GOOD 06"] = 5
# housing.bldggrade[housing.bldggrade == "MINIMUM"] = 6
# housing.bldggrade[housing.bldggrade == "VERY GOOD"] = 7

# ##convert heated fuel
# housing.heatedfuel[housing.heatedfuel == "ELECTRIC"] = 1
# housing.heatedfuel[housing.heatedfuel == "GAS"] = 2 
# housing.heatedfuel[housing.heatedfuel == "NONE"] = 3
# housing.heatedfuel[housing.heatedfuel == " "] = 3
# housing.heatedfuel[housing.heatedfuel == "OIL/WD/COAL"] = 4
# housing.heatedfuel[housing.heatedfuel == "SOLAR/GEOTHRM"] = 5

In [None]:
 housing.groupby("descpropertyuse").size()  # number of rows per descpropertyuse value

### We are only interested in single family, Multi family, Condos, and Townhomes

In [None]:
### Keep only Single-Family, Multi-Family and Condo/Townhome rows

property_use = housing["descpropertyuse"]
singlefamily = housing.loc[property_use == "Single-Family"]
multifamily = housing.loc[property_use == "Multi-Family"]
Condo = housing.loc[property_use == "Condo/Townhome"]

# Rows are stacked group by group, in the order listed here.
housing = pd.concat([singlefamily, multifamily, Condo])

In [None]:
# Correlate numeric/boolean columns only. Calling `.corr()` on a frame that
# still holds string columns raises on pandas >= 2.0.
numeric_housing = housing.select_dtypes(include=['number', 'bool'])
corrmat = numeric_housing.corr()
f, ax = plt.subplots(figsize=(12, 9))
# `fmt` and `annot_kws` only take effect when `annot=True` is also passed.
sns.heatmap(corrmat, vmax=0.9, square=True, fmt='.2f', annot_kws={'size': 10}, ax=ax)
plt.show();

In [None]:
# Show the column labels currently in housing.
housing.columns

In [None]:
### Drop the original building grade, Divide, heatedfuel, storyheight
### and descpropertyuse columns

raw_text_columns = ['bldggrade', 'heatedfuel', 'Divide', 'descpropertyuse', 'storyheight']
housing = housing.drop(raw_text_columns, axis=1)
housing

In [None]:
# Show the column labels currently in housing.
housing.columns

In [None]:
# Columns kept for modelling: the target 'price' plus 14 features.
# 'storyheight_transformed' is not in this list.
model_columns = [
    'totalac', 'price', 'heatedarea', 'numfireplaces',
    'fullbaths', 'halfbaths', 'bedrooms', 'units',
    'Median_Household_Income', 'POINT_X', 'POINT_Y', 'Age', 'Divide_transformed',
    'bldggrade_transformed', 'heatedfuel_transformed',
]
# A list is the conventional column indexer and `astype` is the direct way to
# cast. The int64 cast discards any fractional part of float columns.
housing = housing.loc[:, model_columns].astype(np.int64)
housing.dtypes

In [None]:
# Show the dtype of each column in housing.
housing.dtypes

In [None]:
# kernel density plot
# NOTE(review): `distplot` is deprecated in recent seaborn releases; consider
# `histplot`/`displot` when upgrading.
sns.distplot(housing.price, fit=norm);
# Call the pyplot functions. The original assigned to them
# (`plt.ylabel = (...)`, `plt.title = (...)`), which replaced both functions
# with strings and broke every later `plt.title(...)` / `plt.ylabel(...)` call.
plt.ylabel('Frequency')
plt.title('Price Distribution');
#Get the fitted parameters used by the function
(mu, sigma) = norm.fit(housing['price']);
#QQ plot
fig = plt.figure()
res = stats.probplot(housing['price'], plot=plt)
plt.show()
print("skewness: %f" % housing['price'].skew())
print("kurtosis: %f" % housing['price'].kurt())

### Implementation of Neural Network on our housing data


In [None]:
# Feature matrix: every column of housing except the target 'price'.
X = housing.drop(columns=['price'])

In [None]:
# Show the (rows, columns) shape of X.
X.shape

In [None]:
# Target values: a copy of 'price' reshaped into a single-column 2-D array.
T = housing['price'].copy().values.reshape(-1, 1)
T

In [None]:
from sklearn.model_selection import train_test_split

# Only the first 10000 rows of X and T are used: 70% train, 30% test,
# with a fixed random_state.
# NOTE(review): housing is built by concatenating property-use groups in
# order, so a head slice may not be representative. Confirm this is intended.
X_train, X_test, T_train, T_test = train_test_split(
    X[:10000],
    T[:10000],
    test_size=0.3,
    random_state=40,
)
# Convert the feature frames to plain arrays.
X_train, X_test = X_train.values, X_test.values

In [None]:

from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
from keras.models import Sequential
from keras.initializers import glorot_normal
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, Activation
from keras.optimizers import RMSprop, Adam
from keras.preprocessing.image import ImageDataGenerator
from keras.losses import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [None]:
# Fully connected regression network: 30 -> 15 -> 1 units, ReLU on the hidden
# layers, no activation on the single output unit.
# The input width comes from the training matrix rather than a hard-coded 14,
# so the model stays valid if the feature set changes.
n_features = X_train.shape[1]

model = Sequential()
model.add(Dense(30, input_dim=n_features, activation="relu"))
model.add(Dense(15, activation="relu"))
model.add(Dense(1))
# Only a loss is registered here (no metrics).
model.compile(optimizer='rmsprop', loss='mse')

In [None]:
#history = model.fit(X_train, T_train, epochs=5, batch_size=1,verbose=2)
# Train for 100 epochs, with the held-out split passed as validation data.
history = model.fit(
    X_train,
    T_train,
    epochs=100,
    validation_data=(X_test, T_test),
    verbose=2,
)

In [None]:

import matplotlib.pyplot as pyplot

# Training and validation loss per epoch. The explicit Axes methods are used
# so the figure carries its own title and axis labels.
fig, ax = pyplot.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='test')
ax.set_title('Loss / Mean Squared Error')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss (MSE)')
ax.legend()
pyplot.show()

#predict = model.predict(test_data)

In [None]:
# The model is compiled with a loss only (no metrics), so `evaluate` returns
# a single scalar. Unpacking it into (test_loss, test_acc) raises, and there
# is no accuracy to report for this regression.
test_loss = model.evaluate(X_test, T_test)
print("Test MSE: %f, RMSE: %f" % (test_loss, np.sqrt(test_loss)))