## Modeling Vehicle Fuel Consumption Based on Car Specifications  
### Random Forest of Regression Trees Built from First Principles

In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 
from scipy.stats import boxcox

import sys
sys.path.append('..')

from models.tree_models.random_forest import RandomForest 

import os
# Location of the raw car dataset, relative to this notebook's directory.
car_csv_path = os.path.join('..', 'data', 'raw', 'car.csv')

# Load the raw data and preview the first rows.
df = pd.read_csv(car_csv_path)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [2]:
# Clean the loaded frame and report missing values per column.
#
# The frame is rebound instead of mutated with inplace=True so this cell can
# be re-run safely: the previous in-place drop raised KeyError on a second
# execution because "car name" had already been removed.
df = df.drop(columns=["car name"], errors="ignore")

# "?" is used as a missing-value placeholder; convert it to NaN everywhere,
# then force horsepower to a numeric dtype (anything unparseable becomes NaN).
df = df.replace("?", np.nan)
df["horsepower"] = pd.to_numeric(df["horsepower"], errors="coerce")

# Count of NaN entries per column.
# NOTE(review): nothing in this cell imputes or drops the rows with missing
# horsepower, so any NaNs are still present in `df` afterwards — confirm the
# downstream model is meant to receive them.
missing = df.isna().sum()

for col, num in missing.items():
    print(col, num)


mpg 0
cylinders 0
displacement 0
horsepower 6
weight 0
acceleration 0
model year 0
origin 0


In [3]:
from sklearn.model_selection import train_test_split

# Separate the target ("mpg") from the predictor columns.
y = df.loc[:, "mpg"]
X = df.drop(columns="mpg")

# Hold out 20% of the rows for evaluation, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train Random Forest Regressor (project implementation) with OOB scoring on.
forest_settings = {
    "n_estimators": 10,
    "max_depth": 5,
    "task": "regression",
    "oob_score": True,
    "random_state": 42,
}
rf_reg = RandomForest(**forest_settings)
rf_reg.fit(X_train, y_train)

# Predictions on the held-out rows.
predictions_reg = rf_reg.predict(X_test)

# Report held-out and out-of-bag scores, plus a small sample of predictions
# next to the corresponding true values.
# NOTE(review): the recorded run shows a strongly negative OOB score (-14.992)
# alongside a test R² of 0.814. That gap is suspicious — check how
# RandomForest computes oob_score_ (cause unconfirmed from this notebook).
print(f"Test R² Score: {rf_reg.score(X_test, y_test):.3f}")
print(f"OOB Score: {rf_reg.oob_score_:.3f}")
print(f"First 5 predictions: {predictions_reg[:5]}")
print(f"First 5 true values: {y_test[:5]}")

Test R² Score: 0.814
OOB Score: -14.992
First 5 predictions: [32.62194922 28.69625366 19.72583137 15.6046707  14.54010836]
First 5 true values: 198    33.0
396    28.0
33     19.0
208    13.0
93     14.0
Name: mpg, dtype: float64
