In [24]:
import pandas as pd
import numpy as np

In [6]:
# Source CSV for the car fuel-efficiency dataset (fetched over HTTP on every run).
DATA_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
data = pd.read_csv(DATA_URL)

In [20]:
# List the columns that contain at least one missing value.
data.columns[data.isnull().any()]

Index(['num_cylinders', 'horsepower', 'acceleration', 'num_doors'], dtype='object')

In [21]:
# Median of the 'horsepower' column, computed on the full (unsplit) dataset.
data['horsepower'].median()

np.float64(149.0)

In [30]:
# Partition sizes: 20% validation, 20% test, and whatever remains for training
# (so rounding losses from int() end up in the training set).
n = len(data)
n_val = n_test = int(n * 0.2)
n_train = n - (n_val + n_test)

In [37]:
# Show the train / validation / test partition sizes.
n_train,n_val,n_test

(5824, 1940, 1940)

In [38]:
# Build a reproducible random ordering of the row positions 0..n-1.
np.random.seed(42)      # fix the legacy global RNG so the shuffle is repeatable
idx = np.arange(n)
np.random.shuffle(idx)  # shuffles in place

In [39]:
# Sanity check: the three partition sizes must add up to the total row count.
n, n_train+n_val+n_test

(9704, 9704)

In [40]:
# Cut the shuffled positions at the train/val and val/test boundaries and pull
# the matching rows for each partition.
df_train, df_val, df_test = (
    data.iloc[part]
    for part in np.split(idx, [n_train, n_train + n_val])
)

In [41]:
# Confirm the resulting frames have the expected number of rows.
len(df_train), len(df_val), len(df_test)

(5824, 1940, 1940)

In [42]:
# Column the models predict.
target = 'fuel_efficiency_mpg'

# Input columns used by every model in this notebook.
features = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
]

In [49]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [43]:
# Imputation variant A: missing horsepower values become 0.
# .assign() returns a new frame, so df_train / df_val are left untouched.
train_0 = df_train.assign(horsepower=df_train['horsepower'].fillna(0))
val_0 = df_val.assign(horsepower=df_val['horsepower'].fillna(0))

In [44]:
# Imputation variant B: missing horsepower values become the training-set mean.
# The mean comes from df_train only and is reused for the validation frame,
# so no validation information leaks into the imputation value.
mean_hp = df_train['horsepower'].mean()
train_mean = df_train.assign(horsepower=df_train['horsepower'].fillna(mean_hp))
val_mean = df_val.assign(horsepower=df_val['horsepower'].fillna(mean_hp))

In [50]:
def train_and_evaluate(train_df, val_df):
    """Fit a plain linear regression on ``train_df`` and score it on ``val_df``.

    Both frames are indexed with the notebook-level ``features`` list (model
    inputs) and ``target`` name (column to predict), so they must contain
    those columns.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Rows used to fit the model.
    val_df : pandas.DataFrame
        Rows used to compute the error.

    Returns
    -------
    float
        Root mean squared error of the predictions on ``val_df``.
    """
    model = LinearRegression()
    X_train = train_df[features]
    y_train = train_df[target]
    X_val = val_df[features]
    y_val = val_df[target]

    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn
    # 1.4 and removed in 1.6; taking the square root explicitly gives the same
    # RMSE and works on every scikit-learn version.
    return np.sqrt(mean_squared_error(y_val, preds))


In [51]:
# Validation RMSE for each imputation strategy: zero-fill vs. mean-fill.
rmse_0 = train_and_evaluate(train_df=train_0, val_df=val_0)
rmse_mean = train_and_evaluate(train_df=train_mean, val_df=val_mean)




In [52]:
# Compare the two imputation strategies (zero-fill, mean-fill), rounded to 2 decimals.
(round(rmse_0, 2), round(rmse_mean, 2))

(np.float64(0.52), np.float64(0.46))

In [53]:
from sklearn.linear_model import Ridge


In [54]:
def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``.

    Both arguments must support element-wise subtraction, and the squared
    difference must expose a ``.mean()`` method (e.g. NumPy arrays or pandas
    Series).
    """
    residuals = y - y_pred
    return np.sqrt((residuals ** 2).mean())

In [55]:
# Zero-filled copies of the train / validation frames for the ridge experiments
# (every missing value in every column becomes 0).
df_train_r, df_val_r = df_train.fillna(0), df_val.fillna(0)

In [56]:
# Feature matrices and target vectors for the ridge experiments.
X_train, y_train = df_train_r[features], df_train_r[target]
X_val, y_val = df_val_r[features], df_val_r[target]

In [57]:
# Regularisation strengths to try, in ascending order.
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
# Filled by the sweep below: regularisation strength -> rounded validation RMSE.
rmse_scores = dict()

In [58]:
# Fit one ridge model per regularisation strength and record its validation
# RMSE rounded to 2 decimals.
for r in r_values:
    model = Ridge(alpha=r).fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse_scores[r] = round(rmse(y_val, y_pred), 2)

for r, score in rmse_scores.items():
    print(f"r={r}: RMSE={score}")

# Lowest rounded RMSE wins; ties are broken in favour of the smallest r.
best_r = min(rmse_scores.items(), key=lambda pair: (pair[1], pair[0]))[0]
best_r

r=0: RMSE=0.52
r=0.01: RMSE=0.52
r=0.1: RMSE=0.52
r=1: RMSE=0.52
r=5: RMSE=0.52
r=10: RMSE=0.52
r=100: RMSE=0.52


0

In [62]:
# Measure how sensitive the validation RMSE is to the random split: repeat the
# shuffle / split / zero-fill / fit cycle for ten different seeds.
# NOTE: the loop rebinds the notebook-level df_train, df_val, X_*, y_*, model
# and idx names on every iteration.
seeds = list(range(10))
rmse_list = []
for seed in seeds:
    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)

    df_train = data.iloc[idx[:n_train]].fillna(0)
    df_val = data.iloc[idx[n_train:n_train + n_val]].fillna(0)

    X_train, y_train = df_train[features], df_train[target]
    X_val, y_val = df_val[features], df_val[target]

    # alpha=0 means no regularisation penalty is applied.
    model = Ridge(alpha=0).fit(X_train, y_train)
    y_pred = model.predict(X_val)
    score = rmse(y_val, y_pred)
    rmse_list.append(score)


In [63]:
# Spread of the validation RMSE across the seeds (population std, ddof=0).
std_rmse = np.asarray(rmse_list).std()
std_rmse

np.float64(0.006989446427777364)

In [64]:
# Final evaluation: re-split with seed 9, fit on train + validation combined,
# and report the RMSE on the held-out test partition.
np.random.seed(9)
idx = np.arange(n)
np.random.shuffle(idx)

df_train, df_val, df_test = (
    data.iloc[part].reset_index(drop=True)
    for part in np.split(idx, [n_train, n_train + n_val])
)

# Missing values are zero-filled in both the training and the test frame.
df_full_train = pd.concat([df_train, df_val]).reset_index(drop=True).fillna(0)
df_test = df_test.fillna(0)

X_train, y_train = df_full_train[features], df_full_train[target]
X_test, y_test = df_test[features], df_test[target]

model = Ridge(alpha=0.001).fit(X_train, y_train)
y_pred = model.predict(X_test)
rmse(y_test, y_pred)


np.float64(0.5156132022009574)