In [2]:
import csv
import random
import numpy as np
from sklearn.linear_model import LinearRegression

In [23]:
class DecisionNode:
    """Internal (non-leaf) node of the tree.

    Stores the split question together with the two subtrees that rows are
    sent to depending on how they answer it.

    Attributes:
        question: the split stored at this node, kept exactly as passed in.
        true_branch: child node followed when the question is answered True.
        false_branch: child node followed when the question is answered False.
    """
    def __init__(self, question, true_branch, false_branch):
        # Plain value holder: the three constructor arguments are stored as-is.
        self.question, self.true_branch, self.false_branch = (
            question,
            true_branch,
            false_branch,
        )


class Leaf:
    """Terminal node of the regression tree.

    ``predictions`` is a dict with the mean (``'average'``) and median
    (``'median'``) of the targets of the rows given to the leaf. When the rows
    carry at least one feature column, a ``LinearRegression`` fitted on those
    rows is stored as well under ``'fitting'``.

    Args:
        rows: non-empty sequence of rows; the last element of each row is the
            target value, everything before it is treated as features.
    """
    def __init__(self, rows):
        # Last column of every row is the target.
        targets = [record[-1] for record in rows]
        self.predictions = {
            'average': np.mean(targets),
            'median': np.median(targets)
        }

        # A row of length 1 holds only the target: nothing to regress on.
        if len(rows[0]) <= 1:
            return

        # Local linear model on the feature columns of this leaf's rows.
        feature_matrix = np.array([record[:-1] for record in rows])
        target_vector = np.array(targets)
        self.predictions['fitting'] = LinearRegression().fit(feature_matrix, target_vector)


class DecisionTreeRegressor:
    """Regression tree grown greedily by variance reduction.

    Internally every row is a list whose last element is the target value and
    whose other elements are feature values. A split "question" is a
    ``(column, threshold)`` tuple; a row is sent to the true branch when
    ``row[column] >= threshold`` and to the false branch otherwise.
    """

    def __init__(self, max_depth=float('inf'), min_samples_split=2):
        """
        Args:
            max_depth: nodes at this depth (root is depth 0) or deeper become leaves.
            min_samples_split: nodes holding fewer rows than this become leaves.
        """
        self.tree = None
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        # Number of feature columns seen when the tree was built; lets
        # classify() accept rows with or without a trailing target column.
        self.n_features_ = None

    def fit(self, X, y):
        """Build the tree from feature rows ``X`` and targets ``y``.

        Args:
            X: indexable sequence of feature rows (lists, tuples or 1-D arrays).
            y: indexable sequence of targets, aligned with ``X`` by position.

        Raises:
            ValueError: if ``X`` is empty.
        """
        if len(X) == 0:
            raise ValueError("Cannot fit a DecisionTreeRegressor on an empty dataset.")
        # list(...) so that tuple rows work and numpy rows are concatenated
        # with the target instead of being added to it element-wise.
        data = [list(X[i]) + [y[i]] for i in range(len(X))]
        self.tree = self.build_tree(data)

    def predict(self, rows, method='average'):
        """Predict one value per row.

        Args:
            rows: iterable of rows; a trailing target column is allowed and ignored.
            method: key of the leaf prediction to return
                (``'average'``, ``'median'`` or ``'fitting'``).

        Returns:
            list with one prediction per input row.

        Raises:
            ValueError: if the tree has not been built yet.
        """
        if self.tree is None:
            raise ValueError("predict() called before fit().")
        return [self.classify(row, self.tree, method) for row in rows]

    def build_tree(self, rows, depth=0):
        """Recursively build the tree with pre-pruning.

        A leaf is produced when the node has fewer than ``min_samples_split``
        rows, when ``depth`` reached ``max_depth``, or when no split yields a
        positive gain.
        """
        if depth == 0 and rows:
            # Remember the feature count at the root for classify().
            self.n_features_ = len(rows[0]) - 1

        if len(rows) < self.min_samples_split or depth >= self.max_depth:
            return Leaf(rows)

        gain, question = self.find_best_split(rows)
        if gain == 0:
            return Leaf(rows)

        true_rows, false_rows = self.partition(rows, question)
        true_branch = self.build_tree(true_rows, depth + 1)
        false_branch = self.build_tree(false_rows, depth + 1)

        return DecisionNode(question, true_branch, false_branch)

    def find_best_split(self, rows):
        """Search every (column, observed value) pair for the best split.

        Returns:
            ``(best_gain, best_question)``. ``best_question`` is ``None`` when
            ``rows`` is empty or no candidate separates the rows into two
            non-empty groups. Among candidates with equal gain the one visited
            last wins.
        """
        best_gain = 0
        best_question = None
        if not rows:
            return best_gain, best_question

        current_variance = self.variance(rows)
        n_features = len(rows[0]) - 1  # last column is the target

        for col in range(n_features):
            values = set(row[col] for row in rows)  # candidate thresholds
            for val in values:
                question = (col, val)
                true_rows, false_rows = self.partition(rows, question)

                # A split that leaves one side empty separates nothing.
                if len(true_rows) == 0 or len(false_rows) == 0:
                    continue

                gain = self.info_gain(true_rows, false_rows, current_variance)

                if gain >= best_gain:
                    best_gain, best_question = gain, question

        return best_gain, best_question

    def variance(self, rows):
        """Population variance of the target column; 0 for an empty dataset."""
        if not rows:
            return 0
        targets = [row[-1] for row in rows]
        mean_value = np.mean(targets)
        return sum((x - mean_value) ** 2 for x in targets) / len(targets)

    def info_gain(self, left, right, current_variance):
        """Variance reduction: parent variance minus the size-weighted child variances."""
        p = float(len(left)) / (len(left) + len(right))
        return current_variance - p * self.variance(left) - (1 - p) * self.variance(right)

    def _feature_values(self, row):
        """Return the feature part of ``row`` for the per-leaf linear models."""
        n_features = getattr(self, 'n_features_', None)
        if n_features is None:
            # Tree was not built through build_tree(): keep the historical
            # assumption that the row ends with a target column.
            return list(row[:-1])
        # Works for both feature-only rows and rows with a trailing target.
        return list(row[:n_features])

    def classify(self, row, node, method='average'):
        """Route ``row`` from ``node`` down to a leaf and return its prediction.

        Args:
            row: feature values, optionally followed by a target value.
            node: subtree root to start from (a ``DecisionNode`` or ``Leaf``).
            method: key into the leaf's ``predictions`` dict. With
                ``'fitting'`` the leaf's linear model is evaluated on the row.

        Raises:
            KeyError: if the leaf has no prediction stored under ``method``.
        """
        # Iterative descent: no recursion limit issues on very deep trees.
        while not isinstance(node, Leaf):
            col, val = node.question
            node = node.true_branch if row[col] >= val else node.false_branch

        if method == 'fitting' and 'fitting' in node.predictions:
            features = self._feature_values(row)
            return node.predictions['fitting'].predict([features])[0]
        return node.predictions[method]

    def partition(self, rows, question):
        """Split ``rows`` into ``(true_rows, false_rows)`` by ``row[col] >= val``."""
        true_rows, false_rows = [], []
        col, val = question
        for row in rows:
            if row[col] >= val:
                true_rows.append(row)
            else:
                false_rows.append(row)
        return true_rows, false_rows

    @staticmethod
    def load_dataset(filename):
        """Read a CSV file and return its rows without the header line.

        Values are returned as the strings produced by ``csv.reader``.
        An empty file yields an empty list.
        """
        # newline='' is what the csv module requires for correct handling of
        # newlines embedded in quoted fields.
        with open(filename, 'r', newline='') as file:
            csv_reader = csv.reader(file)
            next(csv_reader, None)  # skip the header row if there is one
            return list(csv_reader)

    @staticmethod
    def split_dataset_train_test(dataset, split_ratio):
        """Randomly split ``dataset`` into ``(train_data, test_data)``.

        ``split_ratio`` is the fraction of rows placed in the training part.
        The caller's ``dataset`` is left untouched; a shuffled copy is split.
        """
        shuffled = list(dataset)
        random.shuffle(shuffled)
        split_index = int(len(shuffled) * split_ratio)
        return shuffled[:split_index], shuffled[split_index:]

    @staticmethod
    def print_tree(node, spacing=""):
        """Print the tree rooted at ``node``, indenting each level by two spaces."""
        if isinstance(node, Leaf):
            print(spacing + "Predict", node.predictions)
            return

        print(spacing + str(node.question))

        print(spacing + '--> True:')
        DecisionTreeRegressor.print_tree(node.true_branch, spacing + "  ")

        print(spacing + '--> False:')
        DecisionTreeRegressor.print_tree(node.false_branch, spacing + "  ")

    def accuracy_on_combined_data(self, rows):
        """Fraction of rows whose ``'average'`` prediction matches the target.

        Predictions and targets are compared with ``np.isclose`` because leaf
        means are floating-point values. Returns 0 for an empty ``rows``.
        """
        if not rows:
            return 0
        correct = sum(
            1 for row in rows
            if np.isclose(self.classify(row[:-1], self.tree), row[-1])
        )
        return correct / len(rows)

    def _squared_error(self, node, rows):
        """Sum of squared errors of the subtree ``node`` on ``rows`` ('average' predictions)."""
        return sum((self.classify(row, node) - row[-1]) ** 2 for row in rows)

    def _prune_subtree(self, node, rows):
        """Prune ``node`` bottom-up using only the validation ``rows`` that reach it."""
        # Without validation rows there is no evidence to prune on.
        if isinstance(node, Leaf) or not rows:
            return node

        # Each child is judged on the validation rows routed to it.
        true_rows, false_rows = self.partition(rows, node.question)
        node.true_branch = self._prune_subtree(node.true_branch, true_rows)
        node.false_branch = self._prune_subtree(node.false_branch, false_rows)

        if not (isinstance(node.true_branch, Leaf) and isinstance(node.false_branch, Leaf)):
            return node

        # Candidate replacement leaf, built from the validation rows that
        # reach this node.
        merged_leaf = Leaf(rows)
        error_before = self._squared_error(node, rows)
        error_after = self._squared_error(merged_leaf, rows)
        return merged_leaf if error_after <= error_before else node

    def prune(self, node, validation_rows):
        """Reduced-error post-pruning of the subtree rooted at ``node``.

        A decision node whose children are both leaves is replaced by a single
        leaf when that does not increase the squared error on the validation
        rows reaching the node. Exact-match accuracy is not used because it is
        not informative for continuous targets.

        Args:
            node: subtree root to prune (typically ``self.tree``).
            validation_rows: rows (features followed by target) held out from training.

        Returns:
            The pruned subtree root. When ``node`` is ``self.tree``,
            ``self.tree`` is updated to the returned node as well.
        """
        pruned = self._prune_subtree(node, validation_rows)
        if node is self.tree:
            self.tree = pruned
        return pruned

In [24]:
# Preprocess the data (handling missing values and normalize features, ...)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv('housing.csv')

# Handling missing values (example: using median to fill missing values)
# NOTE(review): the imputer (and the scaler below) are fit on the full dataset
# before the train/test split, so test-set statistics leak into preprocessing.
# Consider splitting first and fitting both on the training rows only.
imputer = SimpleImputer(strategy='median')
df_num = df.drop('ocean_proximity', axis=1)  # Assuming 'ocean_proximity' is categorical
imputed_df = imputer.fit_transform(df_num)
# fit_transform returns a plain array; rebuild a DataFrame with the original column names.
df_num = pd.DataFrame(imputed_df, columns=df_num.columns)

# Normalize features
# NOTE(review): df_num still contains 'median_house_value' at this point, so the
# target column is standardized together with the features; everything
# downstream (y_train, y_test, predictions, error metrics) is therefore in
# standardized units rather than the column's original units.
scaler = StandardScaler()
df_num_scaled = scaler.fit_transform(df_num)
df_num_scaled = pd.DataFrame(df_num_scaled, columns=df_num.columns)

# Split dataset into training and testing set
# 80/20 split with a fixed random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(df_num_scaled.drop('median_house_value', axis=1), df_num_scaled['median_house_value'], test_size=0.2, random_state=42)

# Convert data to lists for compatibility with our DecisionTreeRegressor:
# each row is the feature values followed by the target as the last element.
train_data = [list(x) + [y] for x, y in zip(X_train.values.tolist(), y_train.tolist())]
test_data = [list(x) + [y] for x, y in zip(X_test.values.tolist(), y_test.tolist())]


In [26]:
# Use the code on data
# Pre-pruned tree: at most 5 levels of splits, and nodes with fewer than 10 rows become leaves.
regressor = DecisionTreeRegressor(max_depth=5, min_samples_split=10)
regressor.fit(X_train.values.tolist(), y_train.tolist())

# test_data rows carry the target as their last element; with method="average"
# only the feature columns are consulted for routing, and the prediction is the
# mean target stored in the leaf each row lands in.
predictions = regressor.predict(test_data, method="average")

In [27]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluate the 'average' predictions against the held-out targets.
# NOTE(review): y_test comes from df_num_scaled, i.e. the target was standardized
# during preprocessing, so MAE / MSE / RMSE below are in standardized units.
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)  # RMSE is reported in the same units as the (scaled) target
r2 = r2_score(y_test, predictions)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")


Mean Absolute Error (MAE): 0.4594402174223271
Mean Squared Error (MSE): 0.4226878346002094
Root Mean Squared Error (RMSE): 0.6501444720984786
R-squared (R²): 0.570492783284815
