## Modeling Vehicle Fuel Consumption Based on Car Specifications  
### Regression Tree Built from First Principles

In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 
from scipy.stats import boxcox

import sys
sys.path.append('..')

from models.tree_models.regression_tree import DecisionTreeRegressor

import os
# Load the raw car dataset from ../data/raw relative to the notebook's
# working directory, then preview the first rows.
df = pd.read_csv(os.path.join(os.pardir, 'data', 'raw', 'car.csv'))
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [2]:
# Drop the free-text "car name" column. Rebinding `df` (rather than using
# inplace=True) together with errors="ignore" keeps this cell safe to
# re-run: a second execution no longer raises KeyError for the column
# that was already removed.
df = df.drop(columns=["car name"], errors="ignore")

# Turn "?" placeholders into NaN, then coerce horsepower to a numeric
# dtype (anything unparsable also becomes NaN).
df = df.replace("?", np.nan)
df["horsepower"] = pd.to_numeric(df["horsepower"], errors="coerce")

# NOTE(review): rows with missing horsepower are only counted here, not
# dropped or imputed, so they still reach model.fit() in the next cell --
# confirm the custom regression tree handles NaN.

# Count of missing values per column.
missing = df.isna().sum()

for col, num in missing.items():
    print(col, num)


mpg 0
cylinders 0
displacement 0
horsepower 6
weight 0
acceleration 0
model year 0
origin 0


In [3]:
from sklearn.model_selection import train_test_split

# Separate the regression target (mpg) from the predictor columns.
y = df["mpg"]
X = df.drop(columns="mpg")

# Hold out 20% of the rows for evaluation; the fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the project's own regression tree and predict on the held-out rows.
# NOTE(review): max_depth=None presumably means "no depth limit" -- confirm
# against models/tree_models/regression_tree.py.
model = DecisionTreeRegressor(min_samples_split=2, max_depth=None)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the held-out predictions. No target transform is applied in the
# visible cells, so y_test and y_pred are compared directly.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Assemble the report and emit it with a single print call.
report_lines = [
    "--- Metrics on Original Scale (e.g., Actual MPG) ---",
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R-squared (R^2): {r2:.4f}",
]
print("\n".join(report_lines))