In [128]:
import pandas as pd
import numpy as np

df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv')

In [129]:
df.columns
used = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']
df_used = df[used]
df_used.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [130]:
horsepower_median = df_used['horsepower'].median()
horsepower_median

np.float64(149.0)

In [131]:
from sklearn.model_selection import train_test_split

x_columns = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
y_column = 'fuel_efficiency_mpg'
X = df_used[x_columns]
y = df_used[y_column]
X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [132]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

X_train['horsepower'] = X_train['horsepower'].fillna(0)
X_val['horsepower'] = X_val['horsepower'].fillna(0)
X_test['horsepower'] = X_test['horsepower'].fillna(0)

reg_0 = LinearRegression().fit(X_train, y_train)
y_pred = reg_0.predict(X_val)
rmse_0 = root_mean_squared_error(y_val, y_pred)
rmse_0

0.5302460163624645

In [133]:
X_train['horsepower'] = X_train['horsepower'].fillna(horsepower_median)
X_val['horsepower'] = X_val['horsepower'].fillna(horsepower_median)
X_test['horsepower'] = X_test['horsepower'].fillna(horsepower_median)

reg_median = LinearRegression().fit(X_train, y_train)
y_pred = reg_median.predict(X_val)
rmse_median = root_mean_squared_error(y_val, y_pred)
rmse_median

0.5302460163624645

In [134]:
rmse_0 == rmse_median

True

In [135]:
from sklearn.linear_model import Ridge

rmse_scores = {}
X_train['horsepower'] = X_train['horsepower'].fillna(0)
X_val['horsepower'] = X_val['horsepower'].fillna(0)
X_test['horsepower'] = X_test['horsepower'].fillna(0)

for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    model = Ridge(alpha=r, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    rmse_scores[r] = rmse
best_r = min(rmse_scores, key=rmse_scores.get)
best_r

100

In [136]:
rmses = {}

for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=seed)
    X_train['horsepower'] = X_train['horsepower'].fillna(0)
    X_val['horsepower'] = X_val['horsepower'].fillna(0)
    X_test['horsepower'] = X_test['horsepower'].fillna(0)
    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    rmse_scores[seed] = rmse
rmse_std = np.std(list(rmse_scores.values()))
round(rmse_std, 3)

np.float64(0.009)

In [137]:
X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)
X_full_train['horsepower'] = X_full_train['horsepower'].fillna(0)
X_test['horsepower'] = X_test['horsepower'].fillna(0)
model = Ridge(alpha=0.01, random_state=9)
model.fit(X_full_train, y_train)
y_pred = model.predict(X_test)
rmse = root_mean_squared_error(y_val, y_pred)
rmse

0.5210335981836984