In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error

# Location of the car fuel-efficiency CSV
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'

# Columns kept for modelling: four feature columns plus the target column
selected_columns = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]

# Read the file, keep only the selected columns (in the order listed above),
# and shorten the target column's name to `mpg` for convenience.
df = (
    pd.read_csv(url)[selected_columns]
    .rename(columns={'fuel_efficiency_mpg': 'mpg'})
)

In [11]:
# Count of missing values in each column
print(df.isna().sum())

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
mpg                      0
dtype: int64


In [13]:
# Median of the horsepower column
print(df.loc[:, 'horsepower'].median())

149.0


In [14]:
# Splitting function
def split_data(df, seed, val_frac=0.2, test_frac=0.2):
    """Shuffle the rows of ``df`` and split them into train/validation/test frames.

    The shuffle is driven by a private ``np.random.RandomState(seed)`` so the
    process-wide NumPy random state is left untouched; the resulting
    permutation is the same one produced by ``np.random.seed(seed)`` followed
    by ``np.random.shuffle``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are to be split.
    seed : int
        Seed for the shuffle; the same seed always yields the same split.
    val_frac : float, default 0.2
        Fraction of rows assigned to the validation set (rounded down).
    test_frac : float, default 0.2
        Fraction of rows assigned to the test set (rounded down).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_val, df_test)``, each an independent copy. The original
        index labels are preserved (the index is not reset).

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    if val_frac < 0 or test_frac < 0 or val_frac + test_frac > 1:
        raise ValueError(
            "val_frac and test_frac must be non-negative and sum to at most 1"
        )

    n = len(df)
    n_val = int(val_frac * n)
    n_test = int(test_frac * n)
    # The training set absorbs whatever is left after rounding down the others.
    n_train = n - n_val - n_test

    # Local generator: reproducible without reseeding the global NumPy RNG.
    rng = np.random.RandomState(seed)
    idx = np.arange(n)
    rng.shuffle(idx)

    df_shuffled = df.iloc[idx]
    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
    df_test = df_shuffled.iloc[n_train + n_val:n_train + n_val + n_test].copy()
    return df_train, df_val, df_test

# Helper functions
def prepare_X(df, fill_value):
    """Build the feature matrix from ``df`` without modifying it.

    Missing ``horsepower`` entries are replaced by ``fill_value``; the returned
    array has one column per feature, in the order engine_displacement,
    horsepower, vehicle_weight, model_year.
    """
    feature_columns = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
    # Column selection yields a new frame, and assign() returns another one,
    # so the caller's DataFrame is never touched.
    features = df[feature_columns].assign(
        horsepower=lambda frame: frame['horsepower'].fillna(fill_value)
    )
    return features.to_numpy()

def rmse(y, y_pred):
    """Return the root mean squared error between targets ``y`` and predictions ``y_pred``."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

# Seed-42 split and the corresponding targets
df_train, df_val, df_test = split_data(df, seed=42)
y_train, y_val = df_train['mpg'].values, df_val['mpg'].values

# Option 1: missing horsepower replaced by 0
X_train_0, X_val_0 = prepare_X(df_train, 0), prepare_X(df_val, 0)
model_0 = LinearRegression()
model_0.fit(X_train_0, y_train)
y_pred_0 = model_0.predict(X_val_0)
score_0 = rmse(y_val, y_pred_0)

# Option 2: missing horsepower replaced by the mean.
# The mean is taken from the training rows only and reused for validation.
horsepower_mean = df_train['horsepower'].mean()
X_train_mean, X_val_mean = (
    prepare_X(df_train, horsepower_mean),
    prepare_X(df_val, horsepower_mean),
)
model_mean = LinearRegression()
model_mean.fit(X_train_mean, y_train)
y_pred_mean = model_mean.predict(X_val_mean)
score_mean = rmse(y_val, y_pred_mean)

# Validation RMSE of both options, rounded to two decimals
print(f"RMSE with 0: {round(score_0, 2)}")
print(f"RMSE with mean: {round(score_mean, 2)}")

RMSE with 0: 0.52
RMSE with mean: 0.46


In [15]:
# Zero-imputed feature matrices; df_train / df_val / y_train / y_val are
# whatever the most recent split_data call left in the namespace.
X_train = prepare_X(df_train, 0)
X_val = prepare_X(df_val, 0)

# Regularisation strengths to compare, in increasing order
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
best_score = float('inf')
best_r = None

for r in r_values:
    # scikit-learn advises against Ridge(alpha=0) for numerical reasons and
    # recommends LinearRegression for the unregularised case instead.
    model = LinearRegression() if r == 0 else Ridge(alpha=r)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    score = rmse(y_val, y_pred)
    print(f"r={r}, RMSE={round(score, 2)}")
    # Strict '<' on the unrounded score: on an exact tie the earlier
    # (smaller) r is kept.
    if score < best_score:
        best_score = score
        best_r = r

print(f"\nBest r is {best_r}")

r=0, RMSE=0.52
r=0.01, RMSE=0.52
r=0.1, RMSE=0.52
r=1, RMSE=0.52
r=5, RMSE=0.52
r=10, RMSE=0.52
r=100, RMSE=0.52

Best r is 0


In [16]:
# Repeat the zero-imputation experiment over ten different split seeds
seeds = list(range(10))
scores = []

for seed in seeds:
    df_train, df_val, df_test = split_data(df, seed)
    y_train, y_val = df_train['mpg'].values, df_val['mpg'].values

    X_train, X_val = prepare_X(df_train, 0), prepare_X(df_val, 0)

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    score = rmse(y_val, y_pred)
    scores.append(score)

# Spread of the validation RMSE across seeds
print(f"Standard deviation of scores: {round(np.std(scores), 3)}")

Standard deviation of scores: 0.007


In [17]:
# Fresh split with seed 9
df_train, df_val, df_test = split_data(df, seed=9)

# Train and validation rows are stacked into one fitting set
df_full_train = pd.concat([df_train, df_val])
y_full_train = df_full_train['mpg'].values
X_full_train = prepare_X(df_full_train, 0)

# Held-out test rows, imputed the same way
y_test = df_test['mpg'].values
X_test = prepare_X(df_test, 0)

# Ridge regression with r=0.001, fitted on train + validation
model = Ridge(alpha=0.001)
model.fit(X_full_train, y_full_train)

# Score the fitted model on the test rows
y_pred = model.predict(X_test)
test_rmse = rmse(y_test, y_pred)

print(f"RMSE on the test dataset: {round(test_rmse, 2)}")

RMSE on the test dataset: 0.52
