### Automotive Efficiency Problem

a) **Decision Tree Usage**  
   - Show the usage of your decision tree for the **automotive efficiency problem**.  
   **[0.5 marks]**


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tree.base import DecisionTree
from metrics import *
from tree.utils import *
from tree.add_utils import *

np.random.seed(42)

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
data = pd.read_csv(url, delim_whitespace=True, header=None,
                 names=["mpg", "cylinders", "displacement", "horsepower", "weight",
                        "acceleration", "model_year", "origin", "car_name"])


#Clean the data

data['horsepower'] = pd.to_numeric(data['horsepower'], errors='coerce')

data = data.dropna()

categorical_cols = ['cylinders', 'model_year', 'origin']
for col in categorical_cols:
    data[col] = data[col].astype('category')

data = data.drop(columns=['car_name'])

In [3]:
def check_if_real(attr: pd.Series) -> bool:
    if pd.api.types.is_categorical_dtype(attr) or attr.dtype == object:
        return False

    if pd.api.types.is_numeric_dtype(attr):
        unique_vals = attr.nunique()
        if unique_vals <= 5:
            return False
        else:
            return True

    return False

In [4]:
def info_gain_mse_real_real(x: pd.Series, y: pd.Series, threshold: float) -> float:
    """
    Compute information gain (based on MSE reduction) for a real-valued split.
    
    Args:
        x : feature column (numeric)
        y : target column (numeric)
        threshold : numeric split value
        
    Returns:
        gain : float
    """
    df = pd.DataFrame({"x": x, "y": y})
    
    left_y = df[df["x"] <= threshold]["y"]
    right_y = df[df["x"] > threshold]["y"]
    
    mse_total = ((y - y.mean())**2).mean()
    mse_left = ((left_y - left_y.mean())**2).mean() if not left_y.empty else 0
    mse_right = ((right_y - right_y.mean())**2).mean() if not right_y.empty else 0
    
    # Weighted MSE after split
    w_mse = (len(left_y)/len(y)) * mse_left + (len(right_y)/len(y)) * mse_right
    
    gain = mse_total - w_mse
    return gain

In [5]:
def best_real_attribute(X: pd.DataFrame, y: pd.Series):
    best_attr, best_split, best_gain = None, None, -float("inf")

    for col in X.columns:
        if pd.api.types.is_numeric_dtype(X[col]):
            values = sorted(X[col].unique())
            for i in range(1, len(values)):
                split = (values[i-1] + values[i]) / 2
                gain = info_gain_mse_real_real(X[col], y, split)
                if gain > best_gain:
                    best_gain = gain
                    best_attr = col
                    best_split = split

    return best_attr, best_split, best_gain



def best_discrete_attribute(X: pd.DataFrame, y: pd.Series):
    best_attr, best_gain = None, -float("inf")

    for col in X.columns:
        if not pd.api.types.is_numeric_dtype(X[col]):
            gain = info_gain_mse_disc_real(X[col], y)  
            if gain > best_gain:
                best_gain = gain
                best_attr = col

    return best_attr, best_gain


def best_attribute(X: pd.DataFrame, y: pd.Series):
    real_attr, real_split, real_gain = best_real_attribute(X, y)
    disc_attr, disc_gain = best_discrete_attribute(X, y)

    if real_gain >= disc_gain:
        return real_attr, real_split, real_gain, "real"
    else:
        return disc_attr, None, disc_gain, "discrete"




In [6]:
import pandas as pd
import numpy as np

class DecisionTreeRealOutput:
    def __init__(self, max_depth=10):
        self.max_depth = max_depth
        self.root = None

    class Node:
        def __init__(self, attribute=None, threshold=None, left=None, right=None, value=None, is_real=True):
            self.attribute = attribute      
            self.threshold = threshold      
            self.left = left
            self.right = right
            self.value = value              
            self.is_real = is_real          

    def fit(self, data: pd.DataFrame, target_col: str):
        self.root = self._build_tree(data, target_col, depth=0)

    def _build_tree(self, data, target_col, depth):
        X = data.drop(columns=[target_col])
        y = data[target_col]

       
        if depth >= self.max_depth or len(y.unique()) == 1 or X.empty:
            return self.Node(value=y.mean())

    
        real_attr, real_split, real_gain = best_real_attribute(X, y)
        disc_attr, disc_gain = best_discrete_attribute(X, y)

        
        if real_gain is None and disc_gain is None:
            return self.Node(value=y.mean())
        elif disc_gain is None or (real_gain is not None and real_gain >= disc_gain):
            attr, split, is_real = real_attr, real_split, True
        else:
            attr, split, is_real = disc_attr, disc_gain, False

        if attr is None:
            return self.Node(value=y.mean())

       
        if is_real:
            threshold = split
            left_mask = X[attr] <= threshold
            right_mask = X[attr] > threshold
        else:  
            threshold = split
            left_mask = X[attr] == threshold
            right_mask = X[attr] != threshold

        left_child = self._build_tree(data[left_mask], target_col, depth + 1)
        right_child = self._build_tree(data[right_mask], target_col, depth + 1)

        return self.Node(attribute=attr, threshold=threshold, left=left_child,
                         right=right_child, is_real=is_real)

    def predict_row(self, row, node):
        if node.value is not None:
            return node.value

        if node.is_real:
            if row[node.attribute] <= node.threshold:
                return self.predict_row(row, node.left)
            else:
                return self.predict_row(row, node.right)
        else:  # discrete
            if row[node.attribute] == node.threshold:  
                return self.predict_row(row, node.left)
            else:
                return self.predict_row(row, node.right)

    def predict(self, data: pd.DataFrame):
        return data.apply(lambda row: self.predict_row(row, self.root), axis=1)


In [None]:
target_col = "mpg"


from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.3, random_state=42)


tree = DecisionTreeRealOutput(max_depth=5)
tree.fit(train, target_col=target_col)


y_pred = tree.predict(test.drop(columns=[target_col]))
y_true = test[target_col]

rmse_val = rmse(y_true, y_pred)
mae_val = mae(y_true, y_pred)

print("DecisionTreeRealOutput Results on Auto MPG dataset:")
print(f"RMSE: {rmse_val:.3f}")
print(f"MAE:  {mae_val:.3f}")




DecisionTreeRealOutput Results on Auto MPG dataset:
RMSE: 7.301
MAE:  6.134


b) **Model Comparison**  
   - Compare the performance of your decision tree model with the **Decision Tree module from scikit-learn**.  
   **[0.5 marks]**

In [8]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error


sk_tree = DecisionTreeRegressor(max_depth=5, random_state=42)
sk_tree.fit(train.drop(columns=[target_col]), train[target_col])


y_pred_sk = sk_tree.predict(test.drop(columns=[target_col]))
y_true = test[target_col]


rmse_sk = mean_squared_error(y_true, y_pred_sk)
mae_sk  = mean_absolute_error(y_true, y_pred_sk)

print("Scikit-learn DecisionTreeRegressor Results:")
print(f"RMSE: {rmse_sk:.3f}")
print(f"MAE:  {mae_sk:.3f}")


rmse_custom = mean_squared_error(y_true, y_pred)
mae_custom  = mean_absolute_error(y_true, y_pred)

print("\nCustom DecisionTreeRealOutput Results:")
print(f"RMSE: {np.sqrt(rmse_custom):.3f}")
print(f"MAE:  {mae_custom:.3f}")

Scikit-learn DecisionTreeRegressor Results:
RMSE: 10.081
MAE:  2.296

Custom DecisionTreeRealOutput Results:
RMSE: 7.301
MAE:  6.134
