In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tree.base import DecisionTree
from metrics import *
from sklearn.tree import DecisionTreeRegressor

np.random.seed(42)

Q3.

a) Show the usage of your decision tree for the automotive efficiency problem. [0.5 marks]

b) Compare the performance of your model with the decision tree module from scikit-learn. [0.5 marks]

In [2]:
!pip install ucimlrepo --quiet
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
auto_mpg = fetch_ucirepo(id=9) 
  
# data (as pandas dataframes) 
X = auto_mpg.data.features 
y = auto_mpg.data.targets 
  
# metadata 
print(auto_mpg.metadata) 
  
# variable information 
print(auto_mpg.variables)


{'uci_id': 9, 'name': 'Auto MPG', 'repository_url': 'https://archive.ics.uci.edu/dataset/9/auto+mpg', 'data_url': 'https://archive.ics.uci.edu/static/public/9/data.csv', 'abstract': 'Revised from CMU StatLib library, data concerns city-cycle fuel consumption', 'area': 'Other', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 398, 'num_features': 7, 'feature_types': ['Real', 'Categorical', 'Integer'], 'demographics': [], 'target_col': ['mpg'], 'index_col': ['car_name'], 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1993, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5859H', 'creators': ['R. Quinlan'], 'intro_paper': None, 'additional_info': {'summary': 'This dataset is a slightly modified version of the dataset provided in the StatLib library.  In line with the use by Ross Quinlan (1993) in predicting the attribute "mpg", 8 of the original instances were removed because they had unknown values for th

In [3]:
# Place the features and the target side by side in a single frame so that
# missing values can be inspected across all columns at once.
data = pd.concat([X, y], axis="columns")
data.shape

(398, 8)

In [4]:
# Data-quality summary, computed before any row is removed.
# NOTE: the messages say "training data", but at this point `data` is the
# complete dataset (the train/test split happens later in the notebook).
total_missing = data.isna().sum().sum()
total_duplicates = data.duplicated().sum()
horsepower_missing = data['horsepower'].isna().sum()

print("Number of NaN/Null values in training data:", total_missing)
print("Number of duplicated samples in training data: ", total_duplicates)

print("\nHorsepower column has missing values: ", horsepower_missing)

# Remove every row that contains at least one missing value.
data = data.dropna(axis=0, how='any')
print("Shape of data after dropping missing values: ", data.shape)

Number of NaN/Null values in training data: 6
Number of duplicated samples in training data:  0

Horsepower column has missing values:  6
Shape of data after dropping missing values:  (392, 8)


In [5]:
# Split the cleaned frame back into the feature matrix (every column except
# 'mpg') and the regression target (the 'mpg' column as a Series).
y = data['mpg']
X = data.drop(columns='mpg')

In [6]:
# Print the feature matrix followed by its (rows, columns) shape.
feature_shape = X.shape
print(X)
print("\nShape of X: ", feature_shape)

     displacement  cylinders  horsepower  weight  acceleration  model_year  \
0           307.0          8       130.0    3504          12.0          70   
1           350.0          8       165.0    3693          11.5          70   
2           318.0          8       150.0    3436          11.0          70   
3           304.0          8       150.0    3433          12.0          70   
4           302.0          8       140.0    3449          10.5          70   
..            ...        ...         ...     ...           ...         ...   
393         140.0          4        86.0    2790          15.6          82   
394          97.0          4        52.0    2130          24.6          82   
395         135.0          4        84.0    2295          11.6          82   
396         120.0          4        79.0    2625          18.6          82   
397         119.0          4        82.0    2720          19.4          82   

     origin  
0         1  
1         1  
2         1  
3      

In [7]:
# Print the target series followed by its shape.
target_shape = y.shape
print(y)
print("\nShape of y: ", target_shape)

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
       ... 
393    27.0
394    44.0
395    32.0
396    28.0
397    31.0
Name: mpg, Length: 392, dtype: float64

Shape of y:  (392,)


# Our decision tree implementation 

In [8]:
# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report the size of each partition.
print(
    "X_train size: ", X_train.shape
)
print(
    "y_train size: ", y_train.shape
)
print(
    "X_test size: ", X_test.shape
)
print(
    "y_test size: ", y_test.shape
)

X_train size:  (274, 7)
y_train size:  (274,)
X_test size:  (118, 7)
y_test size:  (118,)


In [9]:
# Build our own DecisionTree with a depth cap of 5 and fit it on the
# training partition only.
model = DecisionTree(
    criterion="information_gain",
    max_depth=5,
)
model.fit(X_train, y_train)

In [10]:
# Predictions of our tree on both partitions.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# Metrics are called as metric(prediction, ground_truth).
train_rmse, train_mae = rmse(y_train_pred, y_train), mae(y_train_pred, y_train)

print(
    "Train Metrics:",
    f"Root Mean Squared Error: {train_rmse:.4f}",
    f"Mean Absolute Error: {train_mae:.4f}",
    sep="\n",
)

test_rmse, test_mae = rmse(y_test_pred, y_test), mae(y_test_pred, y_test)

print(
    "\nTest Metrics:",
    f"Root Mean Squared Error: {test_rmse:.4f}",
    f"Mean Absolute Error: {test_mae:.4f}",
    sep="\n",
)

Train Metrics:
Root Mean Squared Error: 3.6662
Mean Absolute Error: 2.5947

Test Metrics:
Root Mean Squared Error: 3.8458
Mean Absolute Error: 2.9042


# scikit-learn Decision Tree Regressor

In [11]:
# Baseline: scikit-learn's regression tree with a depth cap of 5.
# random_state is fixed explicitly: scikit-learn permutes the features at every
# split, so without it ties between equally good splits are broken using the
# global NumPy RNG, whose state depends on which cells ran before this one.
sklearn_model = DecisionTreeRegressor(max_depth=5, random_state=42)
sklearn_model.fit(X_train, y_train)

In [12]:
# Predictions of the scikit-learn tree on both partitions (NumPy arrays).
y_train_pred_sklearn, y_test_pred_sklearn = (
    sklearn_model.predict(X_train),
    sklearn_model.predict(X_test),
)

# The arrays are wrapped in pd.Series before being handed to rmse / mae.
# NOTE(review): pd.Series(<ndarray>) gets a fresh 0..n-1 index, while
# y_train / y_test carry the row labels of the original frame. This presumably
# relies on rmse / mae comparing values by position - confirm in `metrics`.
train_rmse_sklearn, train_mae_sklearn = (
    rmse(pd.Series(y_train_pred_sklearn), y_train),
    mae(pd.Series(y_train_pred_sklearn), y_train),
)

print(
    "Train Metrics (Sklearn):",
    f"Root Mean Squared Error: {train_rmse_sklearn:.4f}",
    f"Mean Absolute Error: {train_mae_sklearn:.4f}",
    sep="\n",
)

test_rmse_sklearn, test_mae_sklearn = (
    rmse(pd.Series(y_test_pred_sklearn), y_test),
    mae(pd.Series(y_test_pred_sklearn), y_test),
)

print(
    "\nTest Metrics (Sklearn):",
    f"Root Mean Squared Error: {test_rmse_sklearn:.4f}",
    f"Mean Absolute Error: {test_mae_sklearn:.4f}",
    sep="\n",
)

Train Metrics (Sklearn):
Root Mean Squared Error: 1.9938
Mean Absolute Error: 1.4468

Test Metrics (Sklearn):
Root Mean Squared Error: 3.2169
Mean Absolute Error: 2.3546


# Performance Comparison

In [15]:
# Side-by-side RMSE summary of both models, using the values computed in the
# two evaluation cells.
print("\nPerformance Comparison:\n")
print(
    f"Our Decision Tree - Train RMSE: {train_rmse:.4f}",
    f"Our Decision Tree - Test RMSE: {test_rmse:.4f}",
    f"Scikit-Learn Decision Tree - Train RMSE: {train_rmse_sklearn:.4f}",
    f"Scikit-Learn Decision Tree - Test RMSE: {test_rmse_sklearn:.4f}",
    sep="\n",
)


Performance Comparison:

Our Decision Tree - Train RMSE: 3.6662
Our Decision Tree - Test RMSE: 3.8458
Scikit-Learn Decision Tree - Train RMSE: 1.9938
Scikit-Learn Decision Tree - Test RMSE: 3.2169


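**Observations.** Both trees were limited to `max_depth=5` and evaluated on the same 70/30 train/test split.

- scikit-learn's `DecisionTreeRegressor` has the lower error on both partitions: train RMSE 1.9938 vs 3.6662 and test RMSE 3.2169 vs 3.8458 (test MAE 2.3546 vs 2.9042).
- Our tree's train and test RMSE are close to each other (3.6662 vs 3.8458), so it shows little sign of overfitting, but it fits the training data less tightly than the scikit-learn tree.
- The scikit-learn tree has a larger train–test gap (1.9938 vs 3.2169): it fits the training set much more closely, yet still generalizes better on this split.

**TODO:** explain the cause of the gap between the two implementations (for example, differences in how candidate splits are chosen) after inspecting `tree/base.py`.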