## Mumbai House Price Prediction using Decision Trees (Regression Trees)

In [3]:
# Import all the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Load the preprocessed Mumbai house price dataset from its relative path
data = "../assets/data/modified_mumbai_house_prices.csv"
house_price = pd.read_csv(data)

# See assets/scripts/house-price-dataset.py for the preprocessing steps

In [5]:
# Summarise the columns, dtypes and non-null counts of the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
house_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76038 entries, 0 to 76037
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   bhk        76038 non-null  int64  
 1   type       76038 non-null  float64
 2   area       76038 non-null  int64  
 3   price      76038 non-null  float64
 4   status     76038 non-null  int64  
 5   age        76038 non-null  float64
 6   latitude   76038 non-null  float64
 7   longitude  76038 non-null  float64
dtypes: float64(5), int64(3)
memory usage: 4.6 MB
None


In [6]:
# Preview the first rows to sanity-check the loaded columns and values
house_price.head()

Unnamed: 0,bhk,type,area,price,status,age,latitude,longitude
0,3,0.25,685,2.5,1,1.0,19.112122,72.867676
1,2,0.25,640,0.5251,0,1.0,18.969048,72.821182
2,2,0.25,610,1.73,0,1.0,18.563005,73.906578
3,2,0.25,876,0.5998,0,1.0,18.999653,73.126328
4,2,0.25,659,0.9411,0,1.0,18.969048,72.821182


In [7]:
# (rows, columns) of the full dataset, including the "price" target column
house_price.shape

(76038, 8)

In [8]:
# Fractions of the data held out for testing and validation; the remainder trains the tree
test_ratio = 0.05
val_ratio = 0.05

n_rows = len(house_price)
indices = list(house_price.index)

# Row counts of the two held-out sets (int() truncates the fractional part)
test_size = int(test_ratio * n_rows)
val_size = int(val_ratio * n_rows)
holdout_end = test_size + val_size

# Rows are assigned in their existing order (no shuffling):
# test block first, then validation, then everything else for training
test_indices = indices[:test_size]
val_indices = indices[test_size:holdout_end]
train_indices = indices[holdout_end:]

# Feature frames for each partition, selected by index label
X_train, X_val, X_test = (
    house_price.loc[rows] for rows in (train_indices, val_indices, test_indices)
)

# pop() removes "price" from each feature frame and hands it back as the target
y_train, y_val, y_test = (
    features.pop("price") for features in (X_train, X_val, X_test)
)

print("The size of X_train is: ", X_train.shape)
print("The size of y_train is: ", y_train.shape)
print("The size of X_val is: ", X_val.shape)
print("The size of y_val is: ", y_val.shape)
print("The size of X_test is: ", X_test.shape)
print("The size of y_test is: ", y_test.shape)

The size of X_train is:  (68436, 7)
The size of y_train is:  (68436,)
The size of X_val is:  (3801, 7)
The size of y_val is:  (3801,)
The size of X_test is:  (3801, 7)
The size of y_test is:  (3801,)


In [126]:
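# Node loss: mean absolute deviation from the mean (MAE-style); the MSE form is kept below as a commented-out alternative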
def calculate_loss(x):
    """Return the mean absolute deviation of ``x`` around its own mean.

    ``x`` must support ``len()``, ``.mean()`` and element-wise arithmetic
    (for example a pandas Series or a NumPy array). An empty input has a
    loss of 0 by definition.
    """
    if len(x) > 0:
        centre = x.mean()
        # MSE alternative: ((x - centre) ** 2).mean()
        return np.abs(x - centre).mean()
    return 0

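# The MSE loss does not scale well with this dataset, so the absolute-deviation loss above is used instead.
# A dynamically scaled MSE variant (`calculate_loss_dynamic`, referenced in a commented-out line of `get_best_split`) is not defined in the cells shown here.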

In [127]:
def get_best_split(X, y, depth):
    """Find the single-feature threshold that minimises the weighted child loss.

    For every column of ``X``, each midpoint between two consecutive distinct
    (non-missing) values is tried as a threshold. Rows with
    ``X[col] <= threshold`` go left, rows with ``X[col] > threshold`` go right,
    and the candidate is scored as the size-weighted sum of
    ``calculate_loss`` over the two target subsets. The first candidate with
    the lowest score wins.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows of the current node.
    y : pandas.Series
        Targets, index-aligned with ``X``.
    depth : int
        Depth of the current node. Not used by the search; kept so existing
        callers keep working.

    Returns
    -------
    tuple
        ``(split_dim, split_val, X_left, y_left, X_right, y_right)``.
        If no threshold separates the rows (every column is constant, or the
        node is empty), a degenerate split is returned instead of failing:
        the first column with threshold ``+inf``, all rows on the left and an
        empty right side. Callers can detect this by ``len(y_right) == 0``.

    Raises
    ------
    ValueError
        If ``X`` has no columns at all.
    """
    if len(X.columns) == 0:
        raise ValueError("X must contain at least one feature column")

    n_total = len(X)
    best_loss = float('inf')
    best_split_dim = None
    best_split_val = None

    for col in X.columns:
        feature = X[col]
        # Distinct values in ascending order. Missing values are left out:
        # a NaN threshold would send every row to neither side.
        candidates = np.sort(feature.dropna().unique())

        # Try the midpoint of every pair of neighbouring values
        for lower, upper in zip(candidates[:-1], candidates[1:]):
            split_val = (lower + upper) / 2

            left_mask = feature <= split_val
            right_mask = feature > split_val

            # Count rows from the masks instead of materialising sub-frames of X
            n_left = int(left_mask.sum())
            n_right = int(right_mask.sum())
            if n_left == 0 or n_right == 0:
                # Not a real partition (e.g. the midpoint rounded onto a value)
                continue

            left_loss = calculate_loss(y[left_mask])
            right_loss = calculate_loss(y[right_mask])
            # left_loss, right_loss, pre_loss  = calculate_loss_dynamic(y_left, y_right)

            total_loss = (n_left / n_total) * left_loss + (n_right / n_total) * right_loss

            if total_loss < best_loss:
                best_loss = total_loss
                best_split_val = split_val
                best_split_dim = col

    if best_split_dim is None:
        # Nothing separates the rows: report an explicit "everything left" split
        # rather than indexing X with a placeholder column name.
        nothing = np.zeros(n_total, dtype=bool)
        return X.columns[0], float('inf'), X, y, X[nothing], y[nothing]

    # Partition the node with the winning threshold
    left_mask = X[best_split_dim] <= best_split_val
    right_mask = X[best_split_dim] > best_split_val

    return (
        best_split_dim,
        best_split_val,
        X[left_mask],
        y[left_mask],
        X[right_mask],
        y[right_mask],
    )


In [128]:
# First we define a decision node using class
class Node:
    """Internal decision node: compares one feature against a threshold."""

    def __init__(self, dim, val, left, right):
        # Feature name and threshold used for the comparison
        self.dim, self.val = dim, val
        # Subtrees for rows at or below the threshold (left) and above it (right)
        self.left, self.right = left, right

# Defining class for a leaf node
class Leaf:
    """Terminal node holding the value predicted for rows that reach it."""

    def __init__(self, data):
        # Prediction stored at this leaf
        self.data = data

In [129]:
# Building the decision tree
def BuildTree(X_train, y_train, K, max_depth, min_instances):
    """Recursively grow a regression tree and return its root.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature rows belonging to the current node.
    y_train : pandas.Series
        Targets, index-aligned with ``X_train``.
    K : int
        Depth of the current node (the root is built with ``K = 0``).
    max_depth : int
        A node becomes a leaf once ``K > max_depth``; splits therefore happen
        at depths ``0 .. max_depth`` inclusive.
    min_instances : int
        A node holding this many rows or fewer becomes a leaf.

    Returns
    -------
    Node or Leaf
        A ``Leaf`` storing the mean target of the node, or a ``Node`` whose
        children were built from the best split of the node's rows.
    """
    # Stopping rules: too few rows, or depth budget exhausted
    if len(X_train) <= min_instances or K > max_depth:
        return Leaf(np.mean(y_train))

    # If every feature is constant there is no threshold that separates the
    # rows, so searching for a split is pointless (and used to fail).
    if not (X_train.nunique() > 1).any():
        return Leaf(np.mean(y_train))

    split_dim, split_val, left_X, left_y, right_X, right_y = get_best_split(X_train, y_train, K)

    # A split that leaves one side empty adds nothing: stop here instead of
    # recursing on identical data and creating a leaf with no rows.
    if len(left_y) == 0 or len(right_y) == 0:
        return Leaf(np.mean(y_train))

    left_child = BuildTree(left_X, left_y, K + 1, max_depth, min_instances)
    right_child = BuildTree(right_X, right_y, K + 1, max_depth, min_instances)
    return Node(split_dim, split_val, left_child, right_child)

In [130]:
# Traversing the tree using depth first search
def traverse_tree(node):
    """Print the tree rooted at ``node`` in pre-order (node, left, right).

    Leaves print their stored value; decision nodes print their split
    feature and threshold before their subtrees. Any other object is
    ignored silently.
    """
    if isinstance(node, Leaf):
        print(f"Leaf value: {node.data}")
        return

    if not isinstance(node, Node):
        return

    print(f"Node dim: {node.dim}, val: {node.val}")
    for child in (node.left, node.right):
        traverse_tree(child)

In [131]:
# Tree hyper-parameters (tuned against the validation loss)
max_depth = 10      # nodes deeper than this become leaves
min_instances = 50  # nodes with this many rows or fewer become leaves

# Grow the tree on the training split, starting from depth 0
root = BuildTree(
    X_train,
    y_train,
    K=0,
    max_depth=max_depth,
    min_instances=min_instances,
)
# traverse_tree(root)

In [132]:
# Evaluate a given instance
def evaluate_instance(X, root):
    """Predict the target for one row by walking the tree from ``root``.

    ``X`` is a single row that can be indexed by feature name. At each
    decision node the row moves left when its feature value is at or below
    the node's threshold and right otherwise. The value stored on the node
    where the walk stops is returned; if ``root`` is already a leaf, that is
    its own value.
    """
    node = root

    # Descend until the current node is no longer a decision node
    while isinstance(node, Node):
        goes_left = X[node.dim] <= node.val
        node = node.left if goes_left else node.right

    return node.data

In [136]:
def evaluate(X, y, tree=None):
    """Return the mean absolute error of a tree's predictions on ``(X, y)``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows to predict for.
    y : pandas.Series
        True targets, in the same row order as ``X``.
    tree : Node or Leaf, optional
        Tree to evaluate. When omitted, the notebook-level ``root`` is used,
        which keeps existing ``evaluate(X, y)`` calls working while letting
        other trees be scored without rebinding that global.

    Returns
    -------
    float
        Mean of ``|prediction - target|`` over all rows.
    """
    if tree is None:
        # Backward-compatible default: the tree built earlier in the notebook
        tree = root

    y_hat = np.zeros(y.shape)
    for i in range(len(X)):
        # Each row is routed through the tree independently
        y_hat[i] = evaluate_instance(X.iloc[i], tree)

    # loss = ((y_hat - y)**2).mean()
    loss = (np.abs(y_hat - y)).mean()
    return loss


In [137]:
# Score the tree on the validation split; this loss is what max_depth and
# min_instances are tuned against
val_loss = evaluate(X_val, y_val)
print(f"Loss on validation set: {val_loss}")

Loss on validation set: 0.272673669539347


In [138]:
# Score the tree on the held-out test split
test_loss = evaluate(X_test, y_test)
print(f"Loss on test set: {test_loss}")

Loss on test set: 0.33680512218810493
