In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

--2025-11-03 23:40:58--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv’


2025-11-03 23:40:58 (95.4 MB/s) - ‘car_fuel_efficiency.csv’ saved [874188/874188]



In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

# Read the CSV and replace every missing value with 0 in one step
df = pd.read_csv('car_fuel_efficiency.csv').fillna(0)

# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (i.e. 20% of the whole) for validation
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Pull the target column out of each split, then remove it from the features
target = 'fuel_efficiency_mpg'
y_train, y_val, y_test = (part[target] for part in (df_train, df_val, df_test))
df_train, df_val, df_test = (
    part.drop(columns=[target]) for part in (df_train, df_val, df_test)
)

# Turn each row into a dict and vectorize it; the vectorizer is fitted on the
# training split only and then reused unchanged for validation and test
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(df_train.to_dict(orient='records'))
X_val = dv.transform(df_val.to_dict(orient='records'))
X_test = dv.transform(df_test.to_dict(orient='records'))

# Question 1: Decision Tree Regressor

In [5]:
from sklearn.tree import DecisionTreeRegressor

# Fit a depth-1 tree, i.e. a tree that makes exactly one split
dt = DecisionTreeRegressor(random_state=1, max_depth=1).fit(X_train, y_train)

# Node 0 of the fitted tree is the root; translate its feature index back to
# the column name produced by the vectorizer
feature_names = dv.get_feature_names_out()
root_feature_index = dt.tree_.feature[0]
split_feature = feature_names[root_feature_index]
print(f"Feature used for splitting: {split_feature}")

Feature used for splitting: vehicle_weight


# Question 2: Random Forest Regressor

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
# Fit a 10-tree random forest on the training split
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(X_train, y_train)

# Score it on the validation split
y_pred = rf.predict(X_val)

# RMSE is the square root of the mean squared error
rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
print(f"Validation RMSE: {rmse}")

Validation RMSE: 0.4595777223092726


# Question 3: Experiment with n_estimators

In [15]:
# Experiment with n_estimators from 10 to 200 in steps of 10.
#
# Grow ONE forest incrementally instead of refitting from scratch for every n.
# With warm_start=True, each fit() keeps the trees that already exist and only
# trains the missing ones. Because random_state is fixed, scikit-learn advances
# the RNG past the existing trees, so the ensemble at each n matches a freshly
# trained forest of that size while training 200 trees in total, not 2,100.
rf = RandomForestRegressor(random_state=1, n_jobs=-1, warm_start=True)

results = []
for n in range(10, 201, 10):
    rf.set_params(n_estimators=n)  # request n trees; only the 10 new ones are trained
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    results.append((n, rmse))

# Print the whole curve; the point where RMSE stops improving is read off
# this listing rather than computed
for n, rmse in results:
    print(f"n_estimators={n}, RMSE={rmse}")

n_estimators=10, RMSE=0.4595777223092726
n_estimators=20, RMSE=0.45359067251247054
n_estimators=30, RMSE=0.45168672575457125
n_estimators=40, RMSE=0.44872083017369974
n_estimators=50, RMSE=0.4466568972416094
n_estimators=60, RMSE=0.44545970260811213
n_estimators=70, RMSE=0.44512632449869965
n_estimators=80, RMSE=0.4449843119777284
n_estimators=90, RMSE=0.4448614906399875
n_estimators=100, RMSE=0.44465186808680424
n_estimators=110, RMSE=0.44357876439860233
n_estimators=120, RMSE=0.4439118681233817
n_estimators=130, RMSE=0.443702590396687
n_estimators=140, RMSE=0.4433549955101688
n_estimators=150, RMSE=0.44289761494219454
n_estimators=160, RMSE=0.4427612219659299
n_estimators=170, RMSE=0.44280146504730905
n_estimators=180, RMSE=0.44236195357041347
n_estimators=190, RMSE=0.4424939711220692
n_estimators=200, RMSE=0.4424785084688597


# Question 4: Best max_depth

In [16]:
# Experiment with max_depth and n_estimators.
#
# For each depth, grow a single warm-started forest from 10 to 200 trees
# instead of refitting a brand-new forest for every n. With warm_start=True
# each fit() only adds the missing trees, and with a fixed random_state the
# ensemble at each n matches a forest trained from scratch, so every depth
# costs 200 trees rather than 2,100.
depth_results = []
for max_depth in [10, 15, 20, 25]:
    rf = RandomForestRegressor(
        max_depth=max_depth, random_state=1, n_jobs=-1, warm_start=True
    )
    for n in range(10, 201, 10):
        rf.set_params(n_estimators=n)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        depth_results.append((max_depth, n, rmse))

# Group the validation RMSEs by depth
depth_rmse = {}
for max_depth, n, rmse in depth_results:
    depth_rmse.setdefault(max_depth, []).append(rmse)

# The best depth is the one with the lowest RMSE averaged over all n_estimators
mean_rmse = {k: sum(v) / len(v) for k, v in depth_rmse.items()}
best_max_depth = min(mean_rmse, key=mean_rmse.get)
print(f"Best max_depth: {best_max_depth}")

Best max_depth: 10


# Question 5: Feature Importance

In [17]:
# Fit a 10-tree forest capped at depth 20
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1).fit(X_train, y_train)

# Pair each vectorizer column name with its importance and rank the pairs
# from most to least important
importances = rf.feature_importances_
named_importances = zip(dv.get_feature_names_out(), importances)
feature_importance = sorted(named_importances, key=lambda pair: pair[1], reverse=True)

# The first entry of the ranking is the most important feature
most_important_feature = feature_importance[0][0]
print(f"Most important feature: {most_important_feature}")

Most important feature: vehicle_weight


# Question 6: XGBoost

In [20]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Wrap the train and validation matrices (with labels) in XGBoost's DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Sets evaluated after every boosting round. When several sets are given,
# XGBoost uses the LAST one ('val') for early stopping.
watchlist = [(dtrain, 'train'), (dval, 'val')]

# Shared parameters; 'eta' is overwritten for each run below
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

# Train once per learning rate and keep each run's evaluation history, so the
# comparison is computed here instead of being read off the training logs.
# After the loop, `model` and `xgb_params` describe the eta=0.1 run.
eta_scores = {}
for eta in (0.3, 0.1):
    xgb_params['eta'] = eta
    history = {}  # filled by xgb.train: history[set_name][metric] -> per-round values
    model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=100,
        evals=watchlist,
        early_stopping_rounds=10,
        evals_result=history,
    )
    # Best (lowest) validation RMSE reached during this run
    eta_scores[eta] = min(history['val']['rmse'])

# Compare RMSE for both eta values
for eta, score in eta_scores.items():
    print(f"eta={eta}: best validation RMSE = {score:.5f}")
best_eta = min(eta_scores, key=eta_scores.get)
print(f"Best eta: {best_eta}")

[0]	train-rmse:1.81393	val-rmse:1.85444
[1]	train-rmse:1.31919	val-rmse:1.35353
[2]	train-rmse:0.98120	val-rmse:1.01316
[3]	train-rmse:0.75443	val-rmse:0.78667
[4]	train-rmse:0.60680	val-rmse:0.64318
[5]	train-rmse:0.51381	val-rmse:0.55664
[6]	train-rmse:0.45470	val-rmse:0.50321
[7]	train-rmse:0.41881	val-rmse:0.47254
[8]	train-rmse:0.39534	val-rmse:0.45509
[9]	train-rmse:0.38038	val-rmse:0.44564
[10]	train-rmse:0.37115	val-rmse:0.43896
[11]	train-rmse:0.36361	val-rmse:0.43594
[12]	train-rmse:0.35850	val-rmse:0.43558
[13]	train-rmse:0.35365	val-rmse:0.43394
[14]	train-rmse:0.35025	val-rmse:0.43349
[15]	train-rmse:0.34666	val-rmse:0.43362
[16]	train-rmse:0.34459	val-rmse:0.43378
[17]	train-rmse:0.34128	val-rmse:0.43405
[18]	train-rmse:0.33822	val-rmse:0.43391


[19]	train-rmse:0.33709	val-rmse:0.43374
[20]	train-rmse:0.33553	val-rmse:0.43376
[21]	train-rmse:0.33243	val-rmse:0.43453
[22]	train-rmse:0.33031	val-rmse:0.43510
[23]	train-rmse:0.32815	val-rmse:0.43601
[24]	train-rmse:0.32670	val-rmse:0.43592
[0]	train-rmse:2.28944	val-rmse:2.34561
[1]	train-rmse:2.07396	val-rmse:2.12434
[2]	train-rmse:1.88066	val-rmse:1.92597
[3]	train-rmse:1.70730	val-rmse:1.74987
[4]	train-rmse:1.55163	val-rmse:1.59059
[5]	train-rmse:1.41247	val-rmse:1.44988
[6]	train-rmse:1.28796	val-rmse:1.32329
[7]	train-rmse:1.17660	val-rmse:1.20930
[8]	train-rmse:1.07736	val-rmse:1.10830
[9]	train-rmse:0.98883	val-rmse:1.02009
[10]	train-rmse:0.91008	val-rmse:0.94062
[11]	train-rmse:0.84030	val-rmse:0.87100
[12]	train-rmse:0.77874	val-rmse:0.80916
[13]	train-rmse:0.72417	val-rmse:0.75465
[14]	train-rmse:0.67626	val-rmse:0.70780
[15]	train-rmse:0.63402	val-rmse:0.66672
[16]	train-rmse:0.59690	val-rmse:0.63062
[17]	train-rmse:0.56447	val-rmse:0.60016
[18]	train-rmse:0.53619	va