# **"Regression for Structured Data"**
This dataset was introduced and published in the 2016 paper "House Price Estimation from Visual and Textual Features".
https://github.com/emanhamed/Houses-dataset

https://arxiv.org/pdf/1609.08399.pdf



In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
import pandas as pd
import numpy as np
import glob
import cv2
import os
import locale

In [None]:
# Column names to assign, since the file is read without a header row.
cols = ["bedrooms", "bathrooms", "area", "zipcode", "price"]

# Read the space-separated house attributes directly from the dataset's
# GitHub repository.
df = pd.read_csv(
    "https://raw.githubusercontent.com/emanhamed/Houses-dataset/master/Houses%20Dataset/HousesInfo.txt",
    names=cols,
    header=None,
    sep=" ",
)


In [None]:
# Preview the first 10 rows of the loaded data.
df.head(10)

Unnamed: 0,bedrooms,bathrooms,area,zipcode,price
0,4,4.0,4053,85255,869500
1,4,3.0,3343,36372,865200
2,3,4.0,3923,85266,889000
3,5,5.0,4022,85262,910000
4,3,4.0,4116,85266,971226
5,4,5.0,4581,85266,1249000
6,3,4.0,2544,85262,799000
7,4,5.0,5524,85266,1698000
8,3,4.0,4229,85255,1749000
9,4,5.0,3550,85262,1500000


In [None]:
# Count the distinct zip codes present in the dataset.
df['zipcode'].nunique()

49

In [None]:
# The zip code counts for the housing dataset are *extremely* unbalanced
# (some zip codes have only 1 or 2 houses), so drop every house whose zip
# code has fewer than MIN_HOUSES_PER_ZIPCODE listings.
MIN_HOUSES_PER_ZIPCODE = 25

# Number of houses per zip code, indexed by zip code.
zipcode_counts = df["zipcode"].value_counts()
rare_zipcodes = zipcode_counts[zipcode_counts < MIN_HOUSES_PER_ZIPCODE].index

# Remove all affected rows with one in-place drop instead of one drop per
# zip code; df keeps its original index labels for the remaining rows.
df.drop(df.index[df["zipcode"].isin(rare_zipcodes)], inplace=True)
df.shape

(362, 5)

In [None]:
# Hold out 25% of the rows as the test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.
train, test = train_test_split(df, test_size=0.25, random_state=42)
for subset in (train, test):
    print(subset.shape)

(271, 5)
(91, 5)


In [None]:
# Scale the regression targets by the largest price seen in the training
# split so training targets are at most 1, which helps training converge.
# The same divisor is applied to the test targets, so a test price above
# the training maximum maps to a value greater than 1.
maxPrice = train["price"].max()
trainY, testY = (subset["price"] / maxPrice for subset in (train, test))

In [14]:
# Binarize the categorical 'zipcode' column. The binarizer learns its set of
# classes from the training split only and is then applied unchanged to
# both splits.
zipBinarizer = LabelBinarizer().fit(train["zipcode"])
trainCategorical, testCategorical = (
    zipBinarizer.transform(subset["zipcode"]) for subset in (train, test)
)


In [15]:
# Names of the continuous (numeric) feature columns.
continuous = ["bedrooms", "bathrooms", "area"]

# Min-max scale the continuous features. The per-column minimum and maximum
# are learned from the training split only, then reused for the test split.
cs = MinMaxScaler().fit(train[continuous])
trainContinuous, testContinuous = (
    cs.transform(subset[continuous]) for subset in (train, test)
)

In [16]:
# Build the final feature matrices by placing the binarized zip code columns
# and the scaled continuous columns side by side (column-wise).
trainX = np.concatenate((trainCategorical, trainContinuous), axis=1)
testX = np.concatenate((testCategorical, testContinuous), axis=1)
for features in (trainX, testX):
    print(features.shape)

(271, 10)
(91, 10)


In [17]:
# Model architecture: a small MLP regressor.
# The input width equals the number of feature columns in trainX.
dim = trainX.shape[1]

# Two ReLU hidden layers (8 and 4 units) followed by a single linear output
# unit that emits the predicted (scaled) price.
model = Sequential([
    Dense(8, input_dim=dim, activation="relu"),
    Dense(4, activation="relu"),
    Dense(1, activation="linear"),
])

In [19]:
# Compile the model: Adam optimizer with a 1e-3 learning rate, minimizing the
# mean absolute percentage error between predicted and actual prices.
opt = Adam(learning_rate=1e-3)
model.compile(
    optimizer=opt,
    loss="mean_absolute_percentage_error",
)

In [20]:
# Train for 200 epochs in mini-batches of 8. The test split is passed as
# validation data, so its loss is reported after every epoch.
model.fit(
    x=trainX,
    y=trainY,
    validation_data=(testX, testY),
    epochs=200,
    batch_size=8,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.src.callbacks.History at 0x7f052d81e740>

In [21]:
# Predict the (scaled) house prices for the test features.
# NOTE(review): the evaluation cell below recomputes preds with the same call.
preds = model.predict(testX)



In [25]:
# Model evaluation: compare predicted and actual prices on the test set.

# 1. Predictions: run the trained model on the test features (testX).
preds = model.predict(testX)

# 2. Per-house error: signed difference between prediction and target,
#    expressed as a percentage of the target, plus its absolute value.
#    preds is flattened to 1-D so it lines up element-wise with testY.
#    The division assumes testY contains no zeros.
diff = preds.flatten() - testY
percentDiff = (diff / testY) * 100
absPercentDiff = np.abs(percentDiff)

# 3. Summary statistics: the mean of the absolute percentage errors (MAPE)
#    and their standard deviation, i.e. the typical error and its spread.
mean = np.mean(absPercentDiff)
std = np.std(absPercentDiff)

# 4. Report: average and standard deviation of the house prices in df,
#    followed by the error statistics.
#    The prices are formatted with a plain "${:,.2f}" spec rather than
#    locale.currency(), which raises ValueError under the default "C" locale
#    unless locale.setlocale() has been called first.
print("avg. house price: ${:,.2f}, std house price: ${:,.2f}".format(
    df["price"].mean(),
    df["price"].std()))
print("mean: {:.2f}%, std: {:.2f}%".format(mean, std))

avg. house price: $533,388.27, std house price: $493,403.08
mean: 22.90%, std: 21.57%
