# Pasig House Prices Prediction 🏠

# Install Dependencies

In [1]:
!pip install -q tensorflow tensorflow_decision_forests
!pip install -q prettytable
!pip install -q ydf -U -qq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [13]:
import tensorflow_decision_forests as tfdf
import pandas as pd

# Prepare Data

In [14]:
# Load a dataset in a Pandas dataframe
train_data = pd.read_csv("./project/train.csv")
test_data = pd.read_csv("./project/test.csv")

In [15]:
# Check if there are any non-numeric values in the 'Price_PHP' column
if train_data['Price_PHP'].apply(lambda x: isinstance(x, (int, float))).all():
    print("✅ All values in 'Price_PHP' are numeric.")
else:
    print("❌ Non-numeric values found in 'Price_PHP'.")

❌ Non-numeric values found in 'Price_PHP'.


In [16]:
print(train_data['Price_PHP'].dtype)

object


In [17]:
# Remove commas in 'Price_PHP' and convert it to float
train_data['Price_PHP'] = train_data['Price_PHP'].str.replace(',', '').astype(float)

In [18]:
# Check if there are any non-numeric values in the 'Price_PHP' column
if train_data['Price_PHP'].apply(lambda x: isinstance(x, (int, float))).all():
    print("✅ All values in 'Price_PHP' are numeric.")
else:
    print("❌ Non-numeric values found in 'Price_PHP'.")

✅ All values in 'Price_PHP' are numeric.


In [19]:
print(train_data['Price_PHP'].dtype)

float64


In [20]:
# Exclude 'Location' from the training data
train_data = train_data.drop(['Location'], axis=1)

In [23]:
# hundred million
train_data['Price_PHP'].max()

175265000.0

In [24]:
# million
train_data['Price_PHP'].min()

3297501.0

In [22]:
# Calculate the range of PHP_Price
price_range = train_data['Price_PHP'].max() - train_data['Price_PHP'].min()
print('Price range:', price_range)

Price range: 171967499.0


In [11]:
# Convert the train data to tf.data.Dataset
train_data = tfdf.keras.pd_dataframe_to_tf_dataset(train_data, task=tfdf.keras.Task.REGRESSION, label="Price_PHP")

# Select and train model


In [None]:
# Create and train the model
model = tfdf.keras.GradientBoostedTreesModel(task=tfdf.keras.Task.REGRESSION)
model.fit(train_data)

Use /tmp/tmpzsxkt3ho as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.315761. Found 281 examples.
Training model...
Model trained in 0:00:00.134474
Compiling model...
Model compiled.


<keras.src.callbacks.History at 0x78ea2925fa00>

The summary shows the model name, target column (Label) and source columns (a.k.a. input features).

In [None]:
# Summary of the model structure
model.summary()

Model: "gradient_boosted_trees_model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1 (1.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 1 (1.00 Byte)
_________________________________________________________________
Type: "GRADIENT_BOOSTED_TREES"
Task: REGRESSION
Label: "__LABEL"

Input Features (5):
	Bath
	Bedrooms
	Floor_area_sqm
	Latitude
	Longitude

No weights

Variable Importance: INV_MEAN_MIN_DEPTH:
    1. "Floor_area_sqm"  0.625945 ################
    2.       "Latitude"  0.278390 ##
    3.           "Bath"  0.274422 ##
    4.      "Longitude"  0.223111 
    5.       "Bedrooms"  0.201398 

Variable Importance: NUM_AS_ROOT:
    1. "Floor_area_sqm" 29.000000 ################
    2.           "Bath" 25.000000 #############
    3.       "Latitude" 16.000000 ########
    4.      "Longitude"  1.000000 

Variable Importance: NUM_NODES:
    1. "Floor_area_sqm" 474.00000

## Predict on test data - never before seen

In [None]:
from prettytable import PrettyTable

# List of indices you want to predict on
indices = [0]  # replace with your indices

for index in indices:
    # Initialize the PrettyTable
    table = PrettyTable()
    table.field_names = ["Index", "Location", "Bedrooms", "Bath", "Floor_area_sqm", "Latitude", "Longitude", "Prediction (Price_PHP)"]

    single_test_data = test_data.iloc[[index]]
    single_test_data_tf = tfdf.keras.pd_dataframe_to_tf_dataset(single_test_data, task=tfdf.keras.Task.REGRESSION)
    prediction = model.predict(single_test_data_tf)
    table.add_row([
        index,
        single_test_data['Location'].values[0],
        single_test_data['Bedrooms'].values[0],
        single_test_data['Bath'].values[0],
        single_test_data['Floor_area_sqm'].values[0],
        single_test_data['Latitude'].values[0],
        single_test_data['Longitude'].values[0],
        prediction[0][0]
    ])

    print(table)

+-------+---------------+----------+------+----------------+-----------+------------+------------------------+
| Index |    Location   | Bedrooms | Bath | Floor_area_sqm |  Latitude | Longitude  | Prediction (Price_PHP) |
+-------+---------------+----------+------+----------------+-----------+------------+------------------------+
|   0   | Oranbo, Pasig |    2     |  2   |      104       | 14.575822 | 121.064324 |       32212794.0       |
+-------+---------------+----------+------+----------------+-----------+------------+------------------------+


# Save the model

In [None]:
model.save("./project/pasig-model")

# Evaluate the model

- **Model**: The model tab shows the model summary which are model's name, target column and source columns (a.k.a. input features).

- **Training (Quality)**: The training tab shows how good the model performs by reporting the model’s evaluation metrics computed during training on the validation data (or something equivalent).

- **Dataspec**: The dataset tab shows data specification or statistics about the columns in the dataset.

- **Variable importance**: What input features matter to the model.

- **Structure**: The structure tab shows the representation of the model.


In [None]:
model.describe()

Calculating the range of your target variable (Price_PHP) was computed earlier in order to help interpret the Root Mean Squared Error (RMSE) of our model.

The RMSE is a measure of the differences between the values predicted by a model and the actual values. However, RMSE alone doesn't tell you much about the performance of your model. It's a "relative" measure, meaning its interpretation depends on the context, specifically the scale of the target variable.

By knowing the range of your target variable (the difference between the maximum and minimum house prices), you can better understand the RMSE. For example, an RMSE of 1000 is much more significant if your house prices range from 500 to 1500 (because the error is quite large compared to the range of your target variable), compared to if your house prices range from 3,000,000 to 100,000,000 (where the same RMSE would be relatively small compared to the range of your target variable).

So, the range can provide context that helps you interpret the RMSE and understand how well your model is performing.

In [None]:
test_data_drop_location = test_data.drop(['Location'], axis=1)

# Drop 'Location' column because it is not used in training
test_data_eval = test_data_drop_location.astype('float32')

# Convert the test data to tf.data.Dataset
test_data_eval = tfdf.keras.pd_dataframe_to_tf_dataset(test_data_eval, task=tfdf.keras.Task.REGRESSION)

model.evaluate(test_data_eval)



0.0

# Test saved (exported) model

In [None]:
# Load the model with YDF
import ydf

model = ydf.from_tensorflow_decision_forests("/content/project/pasig-model")

# Make predictions with the model
examples = {
  "Bedrooms" : [2],
  "Bath" : [2],
  "Floor_area_sqm" : [104],
  "Latitude" : [14.575822],
  "Longitude" : [121.064324],
}
model.predict(examples)

array([32212794.], dtype=float32)

# Download project folder (if using Google Colab)

In [None]:
!zip -r /content/project.zip /content/project

  adding: content/project/ (stored 0%)
  adding: content/project/test.csv (deflated 91%)
  adding: content/project/train.csv (deflated 88%)
  adding: content/project/pasig-model/ (stored 0%)
  adding: content/project/pasig-model/keras_metadata.pb (deflated 75%)
  adding: content/project/pasig-model/assets/ (stored 0%)
  adding: content/project/pasig-model/assets/85c3972e9b9b4742nodes-00000-of-00001 (deflated 61%)
  adding: content/project/pasig-model/assets/85c3972e9b9b4742header.pb (deflated 27%)
  adding: content/project/pasig-model/assets/85c3972e9b9b4742done (stored 0%)
  adding: content/project/pasig-model/assets/85c3972e9b9b4742data_spec.pb (deflated 9%)
  adding: content/project/pasig-model/assets/85c3972e9b9b4742gradient_boosted_trees_header.pb (deflated 40%)
  adding: content/project/pasig-model/fingerprint.pb (stored 0%)
  adding: content/project/pasig-model/variables/ (stored 0%)
  adding: content/project/pasig-model/variables/variables.data-00000-of-00001 (deflated 54%)
  a

In [None]:
from google.colab import files
files.download("/content/project.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>