In [None]:
# !wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

--2025-11-01 19:35:59--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv.1’


2025-11-01 19:35:59 (55.9 MB/s) - ‘car_fuel_efficiency.csv.1’ saved [874188/874188]



In [57]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

In [3]:
# Load the car fuel-efficiency dataset from the CSV in the working directory.
df = pd.read_csv('./car_fuel_efficiency.csv')

In [4]:
# Count missing values per column. In this dataset num_cylinders, horsepower,
# acceleration and num_doors are the columns that contain nulls.
df.isnull().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [5]:
# Replace every missing value with 0 (rebinds `df` to the filled frame).
df = df.fillna(0)

In [6]:
# Number of distinct values in each column.
df.nunique()

engine_displacement      36
num_cylinders            14
horsepower              193
vehicle_weight         9704
acceleration            163
model_year               24
origin                    3
fuel_type                 2
drivetrain                2
num_doors                 9
fuel_efficiency_mpg    9704
dtype: int64

In [7]:
# Column dtypes: origin, fuel_type and drivetrain are object columns,
# everything else is numeric.
df.dtypes

engine_displacement      int64
num_cylinders          float64
horsepower             float64
vehicle_weight         float64
acceleration           float64
model_year               int64
origin                  object
fuel_type               object
drivetrain              object
num_doors              float64
fuel_efficiency_mpg    float64
dtype: object

In [8]:
# Preview the first five distinct values of every column.
# (A stray `df.engine_displacement.unique()[:5]` expression used to sit here;
# its result was discarded because it was not the last expression of the cell,
# and the loop below already covers that column.)
for col in df.columns:
    print(f"{col}:")
    print(df[col].unique()[:5])
    print()

engine_displacement:
[170 130 220 210 190]

num_cylinders:
[3. 5. 0. 4. 1.]

horsepower:
[159.  97.  78.   0. 140.]

vehicle_weight:
[3413.43375861 3149.66493422 3079.03899737 2542.39240183 3460.87098999]

acceleration:
[17.7 17.8 15.1 20.2 14.4]

model_year:
[2003 2007 2018 2009 2008]

origin:
['Europe' 'USA' 'Asia']

fuel_type:
['Gasoline' 'Diesel']

drivetrain:
['All-wheel drive' 'Front-wheel drive']

num_doors:
[ 0.  2. -1.  1. -2.]

fuel_efficiency_mpg:
[13.23172891 13.68821744 14.246341   16.9127356  12.48836912]



## Train/Val/Test datasets

In [11]:
# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (i.e. 20% of the full data) for validation. random_state=1 fixes both shuffles.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Pop the target out of each feature frame: this returns the label column and
# removes it from the frame, so the features can no longer leak the target.
y_train = df_train.pop("fuel_efficiency_mpg").values
y_val = df_val.pop("fuel_efficiency_mpg").values
y_test = df_test.pop("fuel_efficiency_mpg").values

In [20]:
# Turn each split into a list of {column: value} dicts, the input format
# expected by DictVectorizer.
d_train, d_val, d_test = (
    frame.to_dict(orient='records') for frame in (df_train, df_val, df_test)
)

In [21]:
# Learn the feature mapping (column order and one-hot categories) from the
# training records only, then apply that same mapping to validation and test.
# Calling fit_transform on every split would refit the vectorizer each time:
# a split with a missing or extra category would get different columns than
# the ones the models were trained on, and `dv` would end up describing the
# test split instead of the training features.
dv = DictVectorizer(sparse=True)

X_train = dv.fit_transform(d_train)
X_val = dv.transform(d_val)
X_test = dv.transform(d_test)

## Question 1

In [53]:
# Depth-1 tree (a single split): the feature it picks is the root-node split.
dt = DecisionTreeRegressor(max_depth=1)
# Left as a bare expression so the fitted estimator is displayed by the cell.
dt.fit(X_train, y_train)

In [None]:
# Render the fitted tree as text, labelling splits with the vectorizer's
# feature names.
feature_names = dv.get_feature_names_out()
tree_rules = export_text(dt, feature_names=feature_names)
print(tree_rules)
# Q1 answer: vehicle_weight — it is the feature used for the split at the root node.

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



## Question 2

In [56]:
# Baseline forest for Question 2: 10 trees, fixed seed, all CPU cores.
rf = RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=1)

In [None]:
# Fit the forest on the training matrix and target.
rf.fit(X_train, y_train)

In [59]:
# Predict fuel efficiency for the validation split.
y_pred = rf.predict(X_val)

In [None]:
# Validation RMSE of the 10-tree forest.
rmse = root_mean_squared_error(y_true=y_val, y_pred=y_pred)
print(rmse)
# Q2 answer: 0.4595777223092726 (about 0.46)

0.4595777223092726


## Question 3

In [93]:
def train(X_train, y_train, est, rs, depth):
    """Fit a RandomForestRegressor and return the fitted model.

    Parameters
    ----------
    X_train : feature matrix passed to ``fit``.
    y_train : target values passed to ``fit``.
    est : value for ``n_estimators`` (number of trees).
    rs : value for ``random_state``.
    depth : value for ``max_depth``; the Q3 sweep passes ``None``.

    Returns
    -------
    The fitted RandomForestRegressor instance.
    """
    settings = dict(
        n_estimators=est,
        max_depth=depth,
        random_state=rs,
        n_jobs=-1,  # use all available cores
    )
    forest = RandomForestRegressor(**settings)
    forest.fit(X_train, y_train)
    return forest

def predict(model, X):
    """Return ``model.predict(X)``; a thin wrapper used by the tuning loops."""
    return model.predict(X)

In [None]:
# Sweep n_estimators from 10 to 200 inclusive, in steps of 10.
# range() excludes its stop value, so the stop is 200 + 10; the previous
# range(10, 200, 10) ended at 190 and never evaluated 200 trees, unlike the
# depth/estimator grid search further down.
for est in range(10, 200 + 10, 10):
    print(f"Estimator: {est}")
    rf = train(X_train, y_train, est=est, rs=1, depth=None)
    y_pred = predict(model=rf, X=X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    print(f"RMSE: {round(rmse, 3)}")

Estimator: 10
RMSE: 0.46
Estimator: 20
RMSE: 0.454
Estimator: 30
RMSE: 0.452
Estimator: 40
RMSE: 0.449
Estimator: 50
RMSE: 0.447
Estimator: 60
RMSE: 0.445
Estimator: 70
RMSE: 0.445
Estimator: 80
RMSE: 0.445
Estimator: 90
RMSE: 0.445
Estimator: 100
RMSE: 0.445
Estimator: 110
RMSE: 0.444
Estimator: 120
RMSE: 0.444
Estimator: 130
RMSE: 0.444
Estimator: 140
RMSE: 0.443
Estimator: 150
RMSE: 0.443
Estimator: 160
RMSE: 0.443
Estimator: 170
RMSE: 0.443
Estimator: 180
RMSE: 0.442
Estimator: 190
RMSE: 0.442


In [None]:
# RMSE keeps decreasing as n_estimators grows, but the gains become marginal: beyond ~60 trees each step improves it by 0.001 at most.

## Question 4

In [95]:
def param_tuning(depth_range, estimator_range):
    """Grid-search max_depth x n_estimators and collect validation RMSEs.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``
    and trains one forest per (depth, n_estimators) pair with ``rs=1``.

    Parameters
    ----------
    depth_range : iterable of ``max_depth`` values (outer loop).
    estimator_range : iterable of ``n_estimators`` values (inner loop); it is
        iterated once per depth.

    Returns
    -------
    list of ``(depth, n_estimators, rmse)`` tuples in evaluation order, with
    ``rmse`` rounded to 3 decimals.
    """
    results = []

    for depth in depth_range:
        print(f"Training with d={depth}")
        for n_trees in estimator_range:
            model = train(X_train, y_train, est=n_trees, rs=1, depth=depth)
            val_pred = predict(model=model, X=X_val)
            val_rmse = root_mean_squared_error(y_val, val_pred)
            results.append((depth, n_trees, round(val_rmse, 3)))

    return results

In [96]:
# Grid for Question 4: max_depth in {10, 15, 20, 25} and n_estimators from
# 10 to 200 inclusive (step 10). The "+ step" keeps the upper bound inclusive.
depth_values = range(10, 25 + 5, 5)
estimator_values = range(10, 200 + 10, 10)

scores = param_tuning(depth_range=depth_values, estimator_range=estimator_values)

Training with d=10
Training with d=15
Training with d=20
Training with d=25


In [97]:
# Peek at the first five (depth, n_estimators, rmse) tuples.
scores[:5]

[(10, 10, 0.45),
 (10, 20, 0.447),
 (10, 30, 0.445),
 (10, 40, 0.443),
 (10, 50, 0.442)]

In [98]:
# Tabulate the grid-search results: one row per (depth, n_estimators) pair.
df_scores = pd.DataFrame(scores, columns=['depth', 'estimators', 'rmse'])
df_scores.head()

Unnamed: 0,depth,estimators,rmse
0,10,10,0.45
1,10,20,0.447
2,10,30,0.445
3,10,40,0.443
4,10,50,0.442


In [99]:
# Sort by RMSE, highest (worst) first, so the best configurations end up at
# the bottom of the table.
df_scores.sort_values(by='rmse', ascending=False)

Unnamed: 0,depth,estimators,rmse
60,25,10,0.459
40,20,10,0.459
20,15,10,0.458
41,20,20,0.454
61,25,20,0.454
...,...,...,...
14,10,150,0.440
16,10,170,0.440
17,10,180,0.440
18,10,190,0.440


In [100]:
# Average the validation RMSE over all n_estimators values for each max_depth;
# groupby already returns the result ordered by depth.
df_scores_mean = (
    df_scores
    .groupby('depth')['rmse']
    .mean()
)
df_scores_mean

depth
10    0.44185
15    0.44550
20    0.44615
25    0.44590
Name: rmse, dtype: float64

In [None]:
# Answer: max_depth = 10, which has the lowest mean RMSE (0.44185) across the tested n_estimators values.

## Question 5

## Question 6