In [18]:
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
# ignore warnings
import warnings
import math

warnings.filterwarnings('ignore')

In [19]:
column_names = [
"address",
"price",
"gross_tax",
"strata_maintenance_fees",
"bedrooms",
"bathrooms",
"property_type",
"property_age",
"title",
"style",
"heating_type",
"feature",
"amenities",
"appliances",
"community",
"days_on_rew",
"property_views",
"mls®_number",
"source",
'frontage', 
'lot_size', 
'year_built', 
'depth',
'half_bathrooms'
]



In [20]:

def get_df(file_path, apply_columns=True):

    sys.path.append(file_path)

    data = pd.read_csv(file_path, on_bad_lines='skip')

    # data.dropna(inplace=True)
    if apply_columns:
        data.columns = column_names
    return data



folder = "../data/raw_2023_04_19/"

filenames = {
    "burnaby": "burnaby_real_estate_data.csv",
    "langley":"langley_real_estate_data.csv",
    "richmond":"richmond_real_estate_data.csv",
    "maple_ridge":"maple-ridge_real_estate_data.csv",
    "surrey":"surrey_real_estate_data.csv",
    "new_west": "new-westminster_real_estate_data.csv",
    "vancouver":"vancouver_real_estate_data.csv",
    "coquitlam":"coquitlam_real_estate_data.csv",
    "north_vancouver":"north-vancouver_real_estate_data.csv",
    "west_vancouver":"west-vancouver_real_estate_data.csv",
    "delta":"delta_real_estate_data.csv",
    "pitt_meadows":"pitt-meadows_real_estate_data.csv",
    "kelowna":"kelowna_real_estate_data.csv",
    "port_coquitlam":"port-coquitlam_real_estate_data.csv"
}

dataframes = {}

for k,v in filenames.items():
    dataframes[k] = get_df(folder + v)

dataframes.keys()



dict_keys(['burnaby', 'langley', 'richmond', 'maple_ridge', 'surrey', 'new_west', 'vancouver', 'coquitlam', 'north_vancouver', 'west_vancouver', 'delta', 'pitt_meadows', 'kelowna', 'port_coquitlam'])

In [21]:
# get neighbourhoods per city

neighbourhoods_city = {}

for city, df in dataframes.items():
    
    neighbourhoods = df['community'].unique()
    for neighbourhood in neighbourhoods:

        neighbourhoods_city[neighbourhood] = city


In [22]:
# find out which city the row is in
clean_combined_data = get_df(file_path=folder + "clean_combined_data.csv", apply_columns=False)

clean_combined_data['city'] = clean_combined_data['community'].apply(lambda x: neighbourhoods_city.get(x, 'unknown'))

In [23]:
# Select the specified columns
selected_columns = ['address', 'price', 'gross_tax', 'bedrooms',
                    'bathrooms', 'property_type', 
                    'community', 'lot_size', 'half_bathrooms', 'lot_width', 'lot_length',
                    'age', 'bungalow', 'storey', 'basement', 'laneway_house', 'garage',
                    'split_entry', 'city']

df = clean_combined_data[selected_columns]



In [24]:
# Drop rows where the 'gross_tax' column is equal to 0
df = df[df['gross_tax'] != 0]
df = df[df['lot_size'] != 0]

In [25]:
# Identify the categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Apply one-hot encoding to the categorical columns
df = df.drop_duplicates()

df = pd.get_dummies(df, columns=categorical_columns)
df

Unnamed: 0,price,gross_tax,bedrooms,bathrooms,lot_size,half_bathrooms,lot_width,lot_length,age,bungalow,...,city_langley,city_maple_ridge,city_new_west,city_north_vancouver,city_pitt_meadows,city_port_coquitlam,city_richmond,city_surrey,city_vancouver,city_west_vancouver
0,3488000,4504,7.0,7.0,7875.0,2.0,63.0,63.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
1,1100000,3316,3.0,3.0,,0.0,,,43.0,0,...,0,0,0,0,0,0,0,0,0,0
2,1450000,3867,4.0,2.0,,0.0,,,41.0,0,...,0,0,0,0,0,0,0,0,0,0
3,1379000,3789,4.0,2.0,,1.0,,,42.0,0,...,0,0,0,0,0,0,0,0,0,0
4,2099900,5129,6.0,2.0,9648.0,0.0,72.0,72.0,64.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,2258000,6531,5.0,3.0,9325.0,1.0,69.0,69.0,46.0,0,...,0,0,0,0,0,0,0,0,0,0
2186,2999800,7579,5.0,4.0,6100.0,1.0,50.0,50.0,2.0,0,...,0,0,0,0,0,0,0,0,0,0
2187,3280000,9413,9.0,6.0,7695.0,1.0,57.0,57.0,11.0,0,...,0,0,0,0,0,0,0,0,0,0
2188,2898000,4560,7.0,4.0,6000.0,2.0,50.0,50.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

df['price'] = df['price'].apply(lambda x: math.ceil(x / 500000) * 500000)

df = df.dropna()
scaler = StandardScaler()

X = df.drop(['price'], axis=1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.1)
train_data = X_train.join(y_train)
test_data = X_test.join(y_test)

X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.fit_transform(X_test)





In [27]:
from sklearn.ensemble import RandomForestClassifier
forest_s = RandomForestClassifier()
forest_s.fit(X_train_s, y_train)
forest_s.score(X_test_s, y_test)

0.42748091603053434

In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import math
import tensorflow as tf
import keras


data = df

# Define the target variable (price) and feature columns
target = 'price'
features = data.drop(columns=[target]).columns

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], test_size=0.2, random_state=42)

# Normalize the data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the neural network model
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, verbose=1, callbacks=[keras.callbacks.EarlyStopping(patience=3)])

# Evaluate the model
loss = model.evaluate(X_test_scaled, y_test)
print(f'Mean Squared Error: {loss}')

# Make predictions
predictions = model.predict(X_test_scaled)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [44]:
# Concatenate predicted and actual values
results = np.concatenate((predictions, y_test.values.reshape(-1, 1)), axis=1)

# Create DataFrame
results_df = pd.DataFrame(results, columns=['Predicted', 'Actual'])

# Print results
results_df.head()

Unnamed: 0,Predicted,Actual
0,5261825.5,5000000.0
1,7375059.0,7000000.0
2,2144078.25,3000000.0
3,2994023.75,2000000.0
4,5493119.5,6000000.0


In [48]:
# Concatenate predicted and actual values
results = np.concatenate((predictions, y_test.values.reshape(-1, 1)), axis=1)

# Create DataFrame
results_df = pd.DataFrame(results, columns=['Predicted', 'Actual'])

# Print results
results_df.head()

Unnamed: 0,Predicted,Actual
0,5312828.5,5000000.0
1,7495261.0,7000000.0
2,2159935.25,3000000.0
3,3023623.25,2000000.0
4,5538542.0,6000000.0


In [30]:
tf.saved_model.save(model, 'housingNNmodel')

INFO:tensorflow:Assets written to: housingNNmodel/assets
