# Decision Trees and Ensemble Learning

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [2]:
# Read the car fuel-efficiency table from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='car_fuel_efficiency.csv')

In [3]:
# Replace every missing entry, in every column, with 0.
df = df.fillna(value=0)

In [4]:
# Separate the predictor columns (X) from the regression target (y).
X = df.drop(columns=['fuel_efficiency_mpg'])
y = df['fuel_efficiency_mpg']

In [5]:
# 60/20/20 split: hold out 40% of the rows first, then cut that hold-out in half
# to obtain the validation and test parts. Both calls use random_state=1.
X_train_full, X_temp, y_train_full, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=1
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=1
)

In [6]:
# Vectorize, step 1: turn each row into a {column: value} dict
# (one dict per record) for the training and validation frames.
val_dicts = X_val.to_dict('records')
train_dicts = X_train_full.to_dict('records')

In [7]:
# Vectorize, step 2: learn the feature mapping on the training dicts only,
# then apply that same mapping to the validation dicts.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)
# NOTE: X_val is rebound here from the validation DataFrame to its encoded
# matrix, so the cell that builds val_dicts cannot be re-run after this one
# without redoing the split first.
X_val = dv.transform(val_dicts)

In [8]:
# Question 1 - Decision Tree Regressor
# A depth-1 tree makes a single split; look up which feature that split uses.
dt = DecisionTreeRegressor(max_depth=1, random_state=1).fit(X_train, y_train_full)
root_feature_index = dt.tree_.feature[0]  # entry 0 is the tree's first node
feature_split = dv.feature_names_[root_feature_index]
print('Q1 - Feature used for splitting:', feature_split)

Q1 - Feature used for splitting: vehicle_weight


In [9]:
# Question 2 - Random Forest Regressor
# Baseline forest with 10 trees, scored by RMSE on the validation split.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train_full)
y_pred = rf.predict(X_val)
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where it raises
# TypeError. Taking the square root works on every version.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print('Q2 - RMSE:', round(rmse, 3))

Q2 - RMSE: 0.46




In [10]:
# Question 3 - n_estimators tuning
# Fit one forest per n_estimators in 10, 20, ..., 200 and record its
# validation RMSE so the trend can be inspected.
scores = []
for n in range(10, 210, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train_full)
    y_pred = rf.predict(X_val)
    # sqrt(MSE) instead of `squared=False`, which scikit-learn removed in 1.6
    # (deprecated since 1.4); this form works on every version.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    scores.append((n, rmse))

# One row per setting: (n_estimators, rmse).
rmse_df = pd.DataFrame(scores, columns=['n_estimators', 'rmse'])
print('Q3 - RMSE trend by n_estimators:\n', rmse_df)



Q3 - RMSE trend by n_estimators:
     n_estimators      rmse
0             10  0.460282
1             20  0.446157
2             30  0.439778
3             40  0.438394
4             50  0.437170
5             60  0.435591
6             70  0.436112
7             80  0.436055
8             90  0.435410
9            100  0.435277
10           110  0.434897
11           120  0.435467
12           130  0.434923
13           140  0.435107
14           150  0.435191
15           160  0.435237
16           170  0.435208
17           180  0.435240
18           190  0.435398
19           200  0.435003




In [11]:
# Smallest RMSE decrease (relative to the previous n_estimators setting) that
# still counts as an improvement.
min_improvement = 0.001

# Signed drop in RMSE from one row to the next: positive means the error went
# down. The first row has no predecessor, so its value is NaN.
rmse_drop = -rmse_df['rmse'].diff()

# Compare the signed drop rather than its absolute value: with abs(), an RMSE
# *increase* larger than the threshold would be treated as "still improving".
# NaN compares as False, so the first row is never selected.
# best_stop holds every n_estimators value whose RMSE failed to beat the
# previous setting by at least min_improvement; the first one is reported.
best_stop = rmse_df.loc[rmse_drop.lt(min_improvement), 'n_estimators']
if not best_stop.empty:
    print('RMSE stops improving after:', best_stop.iloc[0])
else:
    # Report the largest setting actually tried instead of a hard-coded value.
    print(f"RMSE keeps improving up to {rmse_df['n_estimators'].max()} estimators.")

RMSE stops improving after: 70


In [12]:
# Question 4 - Best max_depth
# For each candidate depth, sweep n_estimators over 10, 20, ..., 200 and
# average the validation RMSE across the sweep.
results = []
for depth in [10, 15, 20, 25]:
    rmses = []
    for n in range(10, 210, 10):
        rf = RandomForestRegressor(n_estimators=n, max_depth=depth, random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train_full)
        y_pred = rf.predict(X_val)
        # sqrt(MSE) instead of `squared=False`, which scikit-learn removed in
        # 1.6 (deprecated since 1.4); this form works on every version.
        rmses.append(np.sqrt(mean_squared_error(y_val, y_pred)))
    results.append((depth, np.mean(rmses)))

# One row per depth: (max_depth, mean_rmse).
results_df = pd.DataFrame(results, columns=['max_depth', 'mean_rmse'])
print('Q4 - Mean RMSE by depth:\n', results_df)



Q4 - Mean RMSE by depth:
    max_depth  mean_rmse
0         10   0.436247
1         15   0.437825
2         20   0.437693
3         25   0.437653




In [13]:
# Locate the row with the lowest mean RMSE, then read off its max_depth.
best_row_label = results_df['mean_rmse'].idxmin()
best_depth = results_df.at[best_row_label, 'max_depth']
print('Best max_depth:', best_depth)

Best max_depth: 10


In [14]:
# Question 5 - Feature Importance
# Fit a 10-tree forest capped at depth 20 and rank the vectorised features by
# the forest's feature_importances_, highest first.
rf_imp = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf_imp.fit(X_train, y_train_full)
feat_importances = (
    pd.Series(rf_imp.feature_importances_, index=dv.feature_names_)
    .sort_values(ascending=False)
)
top_features = feat_importances.head(4)
print('Q5 - Most important feature:', top_features)

Q5 - Most important feature: vehicle_weight         0.959878
horsepower             0.015933
acceleration           0.011442
engine_displacement    0.003159
dtype: float64


In [15]:
# Question 6 - XGBoost tuning
# Wrap the training and validation matrices, with their labels, as DMatrix objects.
dtrain = xgb.DMatrix(data=X_train, label=y_train_full)
dval = xgb.DMatrix(data=X_val, label=y_val)

# Base parameter set (eta=0.3); the other eta value is derived from a copy of it.
xgb_params_1 = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=0,
)

In [16]:
# Second parameter set: identical to the first except for a smaller eta.
xgb_params_2 = xgb_params_1.copy()
xgb_params_2['eta'] = 0.1

# Train one booster per eta for 100 rounds, tracking the validation set silently.
model_03 = xgb.train(xgb_params_1, dtrain, num_boost_round=100, evals=[(dval, 'val')], verbose_eval=False)
model_01 = xgb.train(xgb_params_2, dtrain, num_boost_round=100, evals=[(dval, 'val')], verbose_eval=False)

y_pred_03 = model_03.predict(dval)
y_pred_01 = model_01.predict(dval)

# sqrt(MSE) instead of `squared=False`, which scikit-learn removed in 1.6
# (deprecated since 1.4); this form works on every version.
rmse_03 = np.sqrt(mean_squared_error(y_val, y_pred_03))
rmse_01 = np.sqrt(mean_squared_error(y_val, y_pred_01))

# The eta with the strictly lower validation RMSE wins; an exact tie is reported as such.
best_eta = 0.3 if rmse_03 < rmse_01 else 0.1 if rmse_01 < rmse_03 else 'both equal'
print(f'Q6 - RMSE eta=0.3: {rmse_03:.4f}, eta=0.1: {rmse_01:.4f}, Best eta: {best_eta}')

Q6 - RMSE eta=0.3: 0.4434, eta=0.1: 0.4167, Best eta: 0.1


