In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing

In [11]:
# Load the California housing data; with as_frame=True the feature table and
# the target come back as pandas objects.
data = fetch_california_housing(as_frame=True)
target = data.target

# assign() returns a new frame holding the features plus the target column,
# instead of adding the column to an already-constructed frame in place.
features = data.data.assign(MedHouseVal=target)

# Work on a 20% subsample to keep the pure-Python tree below tractable.
# The seed is fixed (same value as the train/test split further down) so the
# subsample, and every metric computed from it, is reproducible on re-run.
sample_data = features.sample(frac=0.2, random_state=41).reset_index(drop=True)

In [12]:
# Separate the predictors from the regression target.
TARGET_COLUMN = 'MedHouseVal'
X = sample_data.drop(columns=[TARGET_COLUMN])
y = sample_data[TARGET_COLUMN]

In [13]:
class Node:
  """A single node of the regression tree.

  A decision node carries ``feature_index``, ``threshold``, the ``left`` and
  ``right`` children and the ``var_red`` recorded for its split; a leaf node
  carries only ``value``. Every attribute defaults to ``None``.
  """

  def __init__(
    self,
    feature_index=None,
    threshold=None,
    left=None,
    right=None,
    var_red=None,
    value=None,
  ):
    # Decision-node fields.
    self.feature_index, self.threshold = feature_index, threshold
    self.left, self.right = left, right
    self.var_red = var_red

    # Leaf-node field.
    self.value = value

In [14]:
class DecisionTreeRegression:
  """Regression tree grown greedily by maximising variance reduction.

  Parameters
  ----------
  min_samples_split : int
    A node is only considered for splitting when it holds at least this
    many samples.
  max_depth : int
    A node at depth ``d`` (root is depth 0) is split only while
    ``d <= max_depth``, so the deepest leaves can sit at depth
    ``max_depth + 1``.

  Attributes
  ----------
  root : Node or None
    Root of the fitted tree; ``None`` until ``fit`` has been called.
  """

  def __init__(self, min_samples_split=2, max_depth=2):
    self.root = None
    self.min_samples_split = min_samples_split
    self.max_depth = max_depth

  def build_tree(self, dataset, curr_depth=0):
    """Recursively build the subtree for ``dataset`` and return its root Node.

    ``dataset`` is a 2-D array whose last column is the target and whose
    remaining columns are the features.
    """
    X, Y = dataset[:, :-1], dataset[:, -1]
    num_samples, num_features = np.shape(X)

    if num_samples >= self.min_samples_split and curr_depth <= self.max_depth:
      best_split = self.get_best_split(dataset, num_samples, num_features)
      # get_best_split returns {} when no threshold yields two non-empty
      # children (e.g. all feature rows identical); treat that like a
      # non-improving split instead of raising KeyError.
      if best_split.get("var_red", 0) > 0:
        left_subtree = self.build_tree(best_split["dataset_left"], curr_depth + 1)
        right_subtree = self.build_tree(best_split["dataset_right"], curr_depth + 1)
        return Node(
          best_split["feature_index"],
          best_split["threshold"],
          left_subtree,
          right_subtree,
          best_split["var_red"],
        )

    return Node(value=self.calculate_leaf_value(Y))

  def get_best_split(self, dataset, num_samples, num_features):
    """Search every (feature, threshold) pair for the largest variance reduction.

    Returns a dict with keys ``feature_index``, ``threshold``,
    ``dataset_left``, ``dataset_right`` and ``var_red``, or an empty dict
    when no candidate produces two non-empty children. Ties keep the first
    candidate found. ``num_samples`` is accepted for interface compatibility
    and is not used.
    """
    best_split = {}
    max_var_red = -float("inf")
    # The parent targets are the same for every candidate split.
    y = dataset[:, -1]
    for feature_index in range(num_features):
      for threshold in np.unique(dataset[:, feature_index]):
        dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
        if len(dataset_left) == 0 or len(dataset_right) == 0:
          continue
        curr_var_red = self.variance_reduction(y, dataset_left[:, -1], dataset_right[:, -1])
        if curr_var_red > max_var_red:
          max_var_red = curr_var_red
          best_split = {
            "feature_index": feature_index,
            "threshold": threshold,
            "dataset_left": dataset_left,
            "dataset_right": dataset_right,
            "var_red": curr_var_red,
          }

    return best_split

  def split(self, dataset, feature_index, threshold):
    """Partition rows into (``feature <= threshold``, everything else).

    The right part is the complement of the left mask, so a row whose
    feature value is NaN goes right - the same branch ``make_prediction``
    takes for it - rather than being dropped from both parts.
    """
    goes_left = dataset[:, feature_index] <= threshold
    return dataset[goes_left], dataset[~goes_left]

  def variance_reduction(self, parent, l_child, r_child):
    """Return var(parent) minus the size-weighted variance of the two children."""
    weight_l = len(l_child) / len(parent)
    weight_r = len(r_child) / len(parent)
    children_var = weight_l * np.var(l_child) + weight_r * np.var(r_child)
    return np.var(parent) - children_var

  def calculate_leaf_value(self, y):
    """Return the prediction stored in a leaf: the mean of its targets."""
    return np.mean(y)

  def fit(self, X, Y):
    """Grow the tree from features ``X`` and targets ``Y``.

    ``X`` is a 2-D array-like of shape (n_samples, n_features). ``Y`` may be
    a column of shape (n_samples, 1) or a flat sequence of length n_samples;
    a flat target is reshaped into a column before being appended to ``X``.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    if Y.ndim == 1:
      Y = Y.reshape(-1, 1)
    dataset = np.concatenate((X, Y), axis=1)
    self.root = self.build_tree(dataset)

  def predict(self, X):
    """Return a list with one prediction per row of ``X``."""
    # np.asarray makes iteration row-wise even for a DataFrame, whose plain
    # iteration would yield column labels.
    return [self.make_prediction(x, self.root) for x in np.asarray(X)]

  def make_prediction(self, x, tree):
    """Walk from ``tree`` down to a leaf for sample ``x`` and return its value."""
    # Identity test: a leaf is any node whose value was set.
    if tree.value is not None:
      return tree.value
    if x[tree.feature_index] <= tree.threshold:
      return self.make_prediction(x, tree.left)
    return self.make_prediction(x, tree.right)

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Hold out 20% of the sampled rows for evaluation; the seed pins the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=41,
)

In [16]:
model = DecisionTreeRegression(max_depth=3, min_samples_split=3)
# fit() appends the target to the features column-wise, so it is passed as an (n, 1) column.
model.fit(
    X_train.to_numpy(),
    y_train.to_numpy().reshape(-1, 1),
)

In [17]:
y_pred = model.predict(X_test.to_numpy())

# Root-mean-squared error on the held-out test split.
mse = mean_squared_error(y_test, y_pred)
np.sqrt(mse)

0.7542399154330638