## 06 Decision Tree for predicting the car fuel efficiency

### 6.1 Data Preparation

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor

In [3]:
# URL of the car fuel-efficiency CSV used throughout this notebook.
data = (
    "https://raw.githubusercontent.com/alexeygrigorev/datasets/"
    "master/car_fuel_efficiency.csv"
)

In [4]:
!wget $data -O car_fuel_efficiency.csv

--2025-11-04 16:06:34--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv’


2025-11-04 16:06:35 (7.45 MB/s) - ‘car_fuel_efficiency.csv’ saved [874188/874188]



In [6]:
# Load the downloaded CSV and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="car_fuel_efficiency.csv")
df.head(n=5)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [16]:
# Replace every missing value with 0 (the preview shows NaNs in
# num_cylinders and horsepower).
df = df.fillna(value=0)

In [17]:
# Target variable: fuel efficiency (mpg) as a NumPy array.
y = df["fuel_efficiency_mpg"].to_numpy()

In [18]:
# Feature frame: every column except the target.
df_features = df.drop("fuel_efficiency_mpg", axis=1)

In [19]:
# Preview the NaN-filled frame together with its (rows, columns) shape.
df.head(n=5), df.shape

(   engine_displacement  num_cylinders  ...  num_doors  fuel_efficiency_mpg
 0                  170            3.0  ...        0.0            13.231729
 1                  130            5.0  ...        0.0            13.688217
 2                  170            0.0  ...        0.0            14.246341
 3                  220            4.0  ...        2.0            16.912736
 4                  210            1.0  ...        2.0            12.488369
 
 [5 rows x 11 columns],
 (9704, 11))

### 6.2 60/20/20 Split

In [20]:
# First split: hold out 40% of the rows as a temporary pool, keep 60% for training.
df_train, df_temp, y_train, y_temp = train_test_split(
    df_features,
    y,
    test_size=0.4,
    random_state=1,
)

In [21]:
# Second split: halve the 40% temporary pool into validation (20%) vs test (20%).
df_val, df_test, y_val, y_test = train_test_split(
    df_temp,
    y_temp,
    test_size=0.5,
    random_state=1,
)

In [22]:
# Row counts of the train / validation / test splits.
tuple(len(part) for part in (df_train, df_val, df_test))

(5822, 1941, 1941)

### 6.3 Turn DataFrames into Matrices

In [23]:
# Vectorize the record dicts. The vectorizer is fitted on the training split
# only and then reused, unchanged, for validation and test.
dv = DictVectorizer(sparse=True)

train_dicts, val_dicts, test_dicts = (
    frame.to_dict(orient="records") for frame in (df_train, df_val, df_test)
)

X_train = dv.fit_transform(train_dicts)
X_val, X_test = dv.transform(val_dicts), dv.transform(test_dicts)

X_train.shape, X_val.shape, X_test.shape

((5822, 14), (1941, 14), (1941, 14))

### 6.4 Q1: Train Decision Stump and find split feature

In [24]:
# A depth-1 tree makes a single split, so node 0 (the root) holds the answer.
dt = DecisionTreeRegressor(max_depth=1, random_state=1).fit(X_train, y_train)

# Translate the root node's feature index into the vectorizer's column name.
feature_names = dv.get_feature_names_out()
root_feature_index = dt.tree_.feature[0]
root_feature_name = feature_names[root_feature_index]
root_feature_name

'vehicle_weight'

Q1: 'vehicle_weight'

### 6.5 Q2: Train Random Forest Regressor (n_estimators=10) and compute validation RMSE

In [25]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [26]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Computed as the square root of sklearn's ``mean_squared_error``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [30]:
# Q2 baseline: a 10-tree forest with a fixed seed.
rf10 = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf10.fit(X_train, y_train)

# Validation RMSE, displayed to three decimals.
y_val_pred = rf10.predict(X_val)
val_rmse_10 = rmse(y_val, y_val_pred)
round(val_rmse_10, 3)

np.float64(0.46)

Q2: 0.45 (the computed validation RMSE rounds to 0.46)

### 6.6 Q3: When does RMSE stop improving?

In [31]:
# Sweep the forest size from 10 to 200 trees (step 10) and record the
# validation RMSE, rounded to three decimals, for each setting.
results = []
for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1).fit(
        X_train, y_train
    )
    y_pred = rf.predict(X_val)
    score = rmse(y_val, y_pred)
    results += [(n, round(score, 3))]
    print(f"n_estimators: {n}, RMSE: {round(score, 3)}")

n_estimators: 10, RMSE: 0.46
n_estimators: 20, RMSE: 0.446
n_estimators: 30, RMSE: 0.44
n_estimators: 40, RMSE: 0.438
n_estimators: 50, RMSE: 0.437
n_estimators: 60, RMSE: 0.436
n_estimators: 70, RMSE: 0.436
n_estimators: 80, RMSE: 0.436
n_estimators: 90, RMSE: 0.435
n_estimators: 100, RMSE: 0.435
n_estimators: 110, RMSE: 0.435
n_estimators: 120, RMSE: 0.435
n_estimators: 130, RMSE: 0.435
n_estimators: 140, RMSE: 0.435
n_estimators: 150, RMSE: 0.435
n_estimators: 160, RMSE: 0.435
n_estimators: 170, RMSE: 0.435
n_estimators: 180, RMSE: 0.435
n_estimators: 190, RMSE: 0.435
n_estimators: 200, RMSE: 0.435


Q3: 80 (RMSE rounded to 3 decimals last changes at n_estimators=90, then stays at 0.435)

### 6.7 Q4: XGBoost Parameter Tuning (max_depth)

In [32]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np

In [33]:
def rmse(y_true, y_pred):
    """Root mean squared error of ``y_pred`` against ``y_true``.

    Same definition as the ``rmse`` helper in section 6.5, repeated here.
    """
    return np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))

In [34]:
# Wrap the train / validation matrices and their targets as XGBoost DMatrix objects.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_val, label=y_val)

In [35]:
max_depth_values = [10, 15, 20, 25]
eta = 0.1  # Learning rate
max_rounds = 200
round_grid = range(10, max_rounds + 1, 10)  # 10, 20, ..., 200 boosting rounds
results = []

for depth in max_depth_values:
    # Boosting adds trees one after another, so with a fixed seed and no
    # subsampling configured the first n trees of a 200-round model are the
    # n-round model. Train once per depth and score each prefix through
    # `iteration_range`, instead of refitting 20 boosters from scratch.
    model = xgb.train(
        params={
            'objective': 'reg:squarederror',
            'max_depth': depth,
            'eta': eta,
            'seed': 1,
        },
        dtrain=dtrain,
        num_boost_round=max_rounds,
    )
    scores = [
        rmse(y_val, model.predict(dval, iteration_range=(0, n)))
        for n in round_grid
    ]
    # Average validation RMSE across all round counts for this depth.
    mean_rmse = np.mean(scores)
    results.append((depth, round(mean_rmse, 3)))

results

[(10, np.float64(0.477)),
 (15, np.float64(0.489)),
 (20, np.float64(0.489)),
 (25, np.float64(0.489))]

Q4: 10

### 6.8 Q5: Most Important Feature (Random Forest)

In [36]:
from sklearn.ensemble import RandomForestRegressor

# Forest used to read off feature importances: 10 trees, depth capped at 20.
rf_feat = RandomForestRegressor(
    n_estimators=10, max_depth=20, random_state=1, n_jobs=-1
)
rf_feat.fit(X_train, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [37]:
import numpy as np

feature_importances = rf_feat.feature_importances_
names = dv.get_feature_names_out()

# Pair each vectorizer column with its importance, most important first.
feat_df = pd.DataFrame({'feature': names, 'importance': feature_importances})
feat_df = feat_df.sort_values('importance', ascending=False)

feat_df.head(n=10)

Unnamed: 0,feature,importance
13,vehicle_weight,0.959878
6,horsepower,0.015933
0,acceleration,0.011442
3,engine_displacement,0.003159
7,model_year,0.003066
8,num_cylinders,0.002323
9,num_doors,0.001576
12,origin=USA,0.000496
10,origin=Asia,0.000431
11,origin=Europe,0.000419


Q5: 'vehicle_weight'

### 6.9 Q6: Train the model with XGBoost and find best eta

In [38]:
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error

In [39]:
def rmse(y_true, y_pred):
    """Square root of sklearn's mean squared error for the given predictions.

    Repeats the ``rmse`` helper already defined in earlier sections.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

In [40]:
# Rebuild the train / validation DMatrix pair used by the eta comparison.
dtrain, dval = (
    xgb.DMatrix(X_train, label=y_train),
    xgb.DMatrix(X_val, label=y_val),
)

In [41]:
# Params per homework (eta = 0.3 run).
xgb_params = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
    eval_metric='rmse',
)

In [42]:
# (DMatrix, label) pairs passed to xgb.train as `evals`.
watchlist = list(zip((dtrain, dval), ('train', 'val')))

In [43]:
# Train 100 rounds with the eta = 0.3 parameter set; per-round eval logging is silenced.
model_eta03 = xgb.train(
    xgb_params, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=False
)

# Validation RMSE after 100 rounds
y_val_pred03 = model_eta03.predict(dval)
rmse_eta03 = rmse(y_val, y_val_pred03)
round(rmse_eta03, 3)

np.float64(0.443)

In [44]:
# Params per homework, with eta lowered to 0.1 (all other settings as in the 0.3 run).
xgb_params_01 = dict(
    eta=0.1,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
    eval_metric='rmse',
)

In [45]:
# Train 100 rounds with the eta = 0.1 parameter set; per-round eval logging is silenced.
model_eta01 = xgb.train(
    xgb_params_01, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=False
)

# Validation RMSE after 100 rounds, for comparison with the eta = 0.3 run.
y_val_pred_01 = model_eta01.predict(dval)
rmse_01 = rmse(y_val, y_val_pred_01)
round(rmse_01, 3)

np.float64(0.417)

Q6: eta = 0.1