In [1]:
import pandas as pd

In [2]:
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


#### Fill missing values with zeros

In [3]:
df.fillna(0, inplace=True)

#### Do train/validation/test split with 60%/20%/20% distribution

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

In [5]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.2, random_state=1)

In [6]:
y_train = df_train.fuel_efficiency_mpg.values
y_val= df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values

del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

#### Use DictVectorizer(sparse=True) to turn the dataframes into matrices

In [7]:
dv = DictVectorizer(sparse=True)
train_dicts = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

### Question 1 - Decision Tree
Train a model with `max_depth=1`

In [8]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor(max_depth=1)

In [9]:
dt.fit(X_train, y_train)

In [10]:
dv.get_feature_names_out()[dt.tree_.feature[0]]

'vehicle_weight'

### Question 2 - Random Forest

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
rmse

np.float64(0.4640452243766381)

### Question 3 - n_estimators tuning

In [12]:
scores = []
for n in range(10, 210, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    scores.append((n, rmse))

for s in scores:
    print(s)

(10, np.float64(0.4640452243766381))
(20, np.float64(0.45394078338988))
(30, np.float64(0.4511874909100202))
(40, np.float64(0.4470876467218648))
(50, np.float64(0.44459848214707653))
(60, np.float64(0.44376227648508065))
(70, np.float64(0.4427864551838832))
(80, np.float64(0.4423230261348978))
(90, np.float64(0.4420677079068682))
(100, np.float64(0.4415586004532457))
(110, np.float64(0.4415679075609138))
(120, np.float64(0.44093159191944886))
(130, np.float64(0.44072442396506706))
(140, np.float64(0.44066723990074963))
(150, np.float64(0.4406871361428503))
(160, np.float64(0.4406477480368662))
(170, np.float64(0.4403813290709333))
(180, np.float64(0.4404173371447007))
(190, np.float64(0.44067092074806763))
(200, np.float64(0.44039172974669116))


### Question 4 - Best max_depth (mean RMSE)
Loop over depths `[10, 15, 20, 25]` and average RMSE across n_estimators

In [13]:
results = {}
for d in [10, 15, 20, 25]:
    rmses = []
    for n in range(10, 210, 10):
        rf = RandomForestRegressor(max_depth=d, n_estimators=n, random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmses.append(np.sqrt(mean_squared_error(y_val, y_pred)))
    results[d] = np.mean(rmses)

print(results)

{10: np.float64(0.44031634567908673), 15: np.float64(0.44328658142224303), 20: np.float64(0.4443558428195181), 25: np.float64(0.44413777057027265)}


### Question 5 - Feature Importance

In [14]:
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

importances = rf.feature_importances_
feature_names = dv.get_feature_names_out()

sorted(zip(importances, feature_names), reverse=True)[:10]

[(np.float64(0.9595255248046702), 'vehicle_weight'),
 (np.float64(0.01585927515569577), 'horsepower'),
 (np.float64(0.01141458837336726), 'acceleration'),
 (np.float64(0.0033546270454755944), 'engine_displacement'),
 (np.float64(0.003269351633819643), 'model_year'),
 (np.float64(0.0022157645123981828), 'num_cylinders'),
 (np.float64(0.0014720822666615604), 'num_doors'),
 (np.float64(0.0005473393575927223), 'origin=USA'),
 (np.float64(0.0005192240643857286), 'origin=Europe'),
 (np.float64(0.00047719266529298534), 'origin=Asia')]

### Question 6 — XGBoost (eta comparison)

In [17]:
import xgboost as xgb

features = list(dv.get_feature_names_out())

dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

watchlist = [(dtrain, 'train'), (dval, 'val')]

xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1
}

for eta in [0.3, 0.1]:
    print(f'ETA={eta}')
    xgb_params['eta'] = eta
    model = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=False)
    y_pred = model.predict(dval)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f'RMSE={rmse:.4f}')

ETA=0.3
RMSE=0.4526
ETA=0.1
RMSE=0.4305
