In [195]:
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
# ignore warnings
import warnings
import math

warnings.filterwarnings('ignore')

In [196]:
column_names = [
"address",
"price",
"gross_tax",
"strata_maintenance_fees",
"bedrooms",
"bathrooms",
"property_type",
"property_age",
"title",
"style",
"heating_type",
"feature",
"amenities",
"appliances",
"community",
"days_on_rew",
"property_views",
"mls®_number",
"source",
'frontage', 
'lot_size', 
'year_built', 
'depth',
'half_bathrooms'
]



In [197]:

def get_df(file_path, apply_columns=True):

    sys.path.append(file_path)

    data = pd.read_csv(file_path, on_bad_lines='skip')

    # data.dropna(inplace=True)
    if apply_columns:
        data.columns = column_names
    return data



folder = "../data/raw_2023_04_19/"

filenames = {
    "burnaby": "burnaby_real_estate_data.csv",
    "langley":"langley_real_estate_data.csv",
    "richmond":"richmond_real_estate_data.csv",
    "maple_ridge":"maple-ridge_real_estate_data.csv",
    "surrey":"surrey_real_estate_data.csv",
    "new_west": "new-westminster_real_estate_data.csv",
    "vancouver":"vancouver_real_estate_data.csv",
    "coquitlam":"coquitlam_real_estate_data.csv",
    "north_vancouver":"north-vancouver_real_estate_data.csv",
    "west_vancouver":"west-vancouver_real_estate_data.csv",
    "delta":"delta_real_estate_data.csv",
    "pitt_meadows":"pitt-meadows_real_estate_data.csv",
    "kelowna":"kelowna_real_estate_data.csv",
    "port_coquitlam":"port-coquitlam_real_estate_data.csv"
}

dataframes = {}

for k,v in filenames.items():
    dataframes[k] = get_df(folder + v)

dataframes.keys()



dict_keys(['burnaby', 'langley', 'richmond', 'maple_ridge', 'surrey', 'new_west', 'vancouver', 'coquitlam', 'north_vancouver', 'west_vancouver', 'delta', 'pitt_meadows', 'kelowna', 'port_coquitlam'])

In [198]:
# get neighbourhoods per city

neighbourhoods_city = {}

for city, df in dataframes.items():
    
    neighbourhoods = df['community'].unique()
    for neighbourhood in neighbourhoods:

        neighbourhoods_city[neighbourhood] = city


In [199]:
# find out which city the row is in
clean_combined_data = get_df(file_path=folder + "clean_combined_data.csv", apply_columns=False)

clean_combined_data['city'] = clean_combined_data['community'].apply(lambda x: neighbourhoods_city.get(x, 'unknown'))

# Select Features

In [212]:
# Select the specified columns
# selected_columns = ['address', 'price', 'gross_tax', 'bedrooms',
#                     'bathrooms', 'property_type', 
#                     'community', 'lot_size', 'half_bathrooms', 'lot_width', 'lot_length',
#                     'age', 'bungalow', 'storey', 'basement', 'laneway_house', 'garage',
#                     'split_entry', 'city']

features = [
    'price', 
    'city',
    'gross_tax',
    'community', 
    'bedrooms',
    'bathrooms', 
    'lot_width',
    'lot_length',
    'age',
    'bungalow',
    'storey', 
    'basement', 
    'laneway_house',
    'garage',
    'split_entry',
]

df = clean_combined_data[features]
df.columns



Index(['price', 'city', 'gross_tax', 'community', 'bedrooms', 'bathrooms',
       'lot_width', 'lot_length', 'age', 'bungalow', 'storey', 'basement',
       'laneway_house', 'garage', 'split_entry'],
      dtype='object')

# Code Cleanup

In [201]:
# Drop 0 Rows
# df = df[df['gross_tax'] != 0]
# df = df[df['lot_size'] != 0]
df = df[df['lot_width'] != 0]
df = df[df['lot_length'] != 0]
# before (2104, 294)
df = df.dropna()
# after (1379, 294)

# round price for better prediction
# df['price'] = df['price'].apply(lambda x: math.ceil(x / 500000) * 500000)

df.shape


(1382, 15)

In [219]:
pd.set_option('max_colwidth', 1000)

In [221]:
# Identify the categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Apply one-hot encoding to the categorical columns
df = df.drop_duplicates()

df = pd.get_dummies(df, columns=categorical_columns)
# set max number of columns and rows to display


pd.set_option('max_colwidth', 1000)
df.style.set_properties(subset=None, **{'width': '200px'})

print(df.columns)
arr = []
for feature in df.columns:
    arr.append(feature)

print(arr)

Index(['price', 'gross_tax', 'bedrooms', 'bathrooms', 'lot_width',
       'lot_length', 'age', 'bungalow', 'storey', 'basement',
       ...
       'community_whitby estates', 'community_white rock',
       'community_whonnock', 'community_whytecliff',
       'community_willingdon heights', 'community_willoughby heights',
       'community_woodland acres', 'community_woodlands-sunshine-cascade',
       'community_woodwards', 'community_yaletown'],
      dtype='object', length=299)
['price', 'gross_tax', 'bedrooms', 'bathrooms', 'lot_width', 'lot_length', 'age', 'bungalow', 'storey', 'basement', 'laneway_house', 'garage', 'split_entry', 'city_burnaby', 'city_coquitlam', 'city_delta', 'city_kelowna', 'city_langley', 'city_maple_ridge', 'city_new_west', 'city_north_vancouver', 'city_pitt_meadows', 'city_port_coquitlam', 'city_richmond', 'city_surrey', 'city_vancouver', 'city_west_vancouver', 'community_albion', 'community_aldergrove', 'community_altamont', 'community_ambleside', 'community

In [203]:
# Export training data
df.to_csv('../data/training_data_april.csv', index=False)


# Neural Network

## Training

In [204]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import math
import tensorflow as tf
import keras


data = df

# Define the target variable (price) and feature columns
target = 'price'
features = data.drop(columns=[target]).columns

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], test_size=0.2, random_state=42)

# Normalize the data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# lstm
# Define the neural network model
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=500, batch_size=32, verbose=1, callbacks=[keras.callbacks.EarlyStopping(patience=3)])


Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x179e9a220>

In [205]:
print(data.shape)
print(X_train.shape)
print(X_train_scaled.shape)
X_train_scaled[0].shape

(1382, 268)
(1105, 267)
(1105, 267)


(267,)

## Evaluation

In [206]:

# Evaluate the model
loss = model.evaluate(X_test_scaled, y_test)
mse = loss[0]
formatted_mse = "{:,.2f}".format(mse)
print(f'Mean Squared Error: {formatted_mse}')

rmse = math.sqrt(mse)
formatted_mse = "{:,.2f}".format(rmse)
print(f'Root Mean Squared Error: {formatted_mse}')
# Root Mean Squared Error: 1,226,026.20

Mean Squared Error: 1,524,868,579,328.00
Root Mean Squared Error: 1,234,855.69


## Predict

In [207]:
X_test_scaled.shape
# prediction = model.predict(X_test_scaled[0:])
# prediction

(277, 267)

In [208]:

# Make predictions
predictions = model.predict(X_test_scaled)



In [209]:
# Concatenate predicted and actual values
results = np.concatenate((predictions, y_test.values.reshape(-1, 1)), axis=1)

# Create DataFrame to show it side by side with predicted, actual and their differences
results_df = pd.DataFrame(results, columns=['Predicted', 'Actual'])
results_df['Difference'] = abs(results_df['Actual'] - results_df['Predicted'])

formated_df = results_df.copy()
formated_df['Predicted'] = results_df['Predicted'].apply(lambda x: '{:,.2f}'.format(x))
formated_df['Actual'] = results_df['Actual'].apply(lambda x: '{:,.2f}'.format(x))
formated_df['Difference'] = results_df['Difference'].apply(lambda x: '{:,.2f}'.format(x))

mean = results_df['Difference'].mean()
formatted_mean = "{:,.2f}".format(mean)
print("average difference",  formatted_mean)
# Print results
formated_df.head()


average difference 808,323.13


Unnamed: 0,Predicted,Actual,Difference
0,2659563.5,2499999.0,159564.5
1,1843018.88,2379312.0,536293.12
2,2287282.5,3300000.0,1012717.5
3,1393322.38,1798800.0,405477.62
4,1798154.38,1899000.0,100845.62


In [210]:
# Concatenate predicted and actual values
results = np.concatenate((predictions, y_test.values.reshape(-1, 1)), axis=1)

# Create DataFrame
results_df = pd.DataFrame(results, columns=['Predicted', 'Actual'])

# Print results
results_df.head()

Unnamed: 0,Predicted,Actual
0,2659563.5,2499999.0
1,1843018.875,2379312.0
2,2287282.5,3300000.0
3,1393322.375,1798800.0
4,1798154.375,1899000.0


In [211]:
model.save('housingNNmodel')

INFO:tensorflow:Assets written to: housingNNmodel/assets
