In [8]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from math import sqrt
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [9]:
# Remote CSV holding the car fuel-efficiency records used by every cell below.
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
df = pd.read_csv(filepath_or_buffer=url)

In [10]:
# Summarise the raw frame: per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9704 entries, 0 to 9703
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   engine_displacement  9704 non-null   int64  
 1   num_cylinders        9222 non-null   float64
 2   horsepower           8996 non-null   float64
 3   vehicle_weight       9704 non-null   float64
 4   acceleration         8774 non-null   float64
 5   model_year           9704 non-null   int64  
 6   origin               9704 non-null   object 
 7   fuel_type            9704 non-null   object 
 8   drivetrain           9704 non-null   object 
 9   num_doors            9202 non-null   float64
 10  fuel_efficiency_mpg  9704 non-null   float64
dtypes: float64(6), int64(2), object(3)
memory usage: 834.1+ KB


In [11]:
# Columns retained for modelling: four numeric predictors followed by the
# fuel-efficiency column that the regressions further down predict.
dataset_columns = [
    'engine_displacement', 'horsepower', 'vehicle_weight', 'model_year',
    'fuel_efficiency_mpg',
]

# Label-based selection of exactly those columns, all rows.
df_new = df.loc[:, dataset_columns]

In [12]:
# Summarise the reduced frame: per-column non-null counts and dtypes.
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9704 entries, 0 to 9703
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   engine_displacement  9704 non-null   int64  
 1   horsepower           8996 non-null   float64
 2   vehicle_weight       9704 non-null   float64
 3   model_year           9704 non-null   int64  
 4   fuel_efficiency_mpg  9704 non-null   float64
dtypes: float64(3), int64(2)
memory usage: 379.2 KB


In [13]:
# Tally nulls per column, then show only the columns that actually have gaps.
missing_counts = df_new.isna().sum()
print(missing_counts.loc[missing_counts.gt(0)])

horsepower    708
dtype: int64


In [14]:
# Median horsepower over the whole (unsplit) modelling frame.
median_hp = df_new.horsepower.median()
print(median_hp)

149.0


In [15]:
# Stage 1: hold out 40% of the rows; the remaining 60% is the training set.
# (Shuffling is train_test_split's default, so it is not spelled out.)
train_df, temp_df = train_test_split(
    df_new,
    test_size=0.4,
    random_state=42,
)

# Stage 2: halve the hold-out into validation and test (20% of the data each).
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
)

# Sanity-check the resulting partition sizes.
print(f"Train set shape: {train_df.shape}")
print(f"Validation set shape: {val_df.shape}")
print(f"Test set shape: {test_df.shape}")


Train set shape: (5822, 5)
Validation set shape: (1941, 5)
Test set shape: (1941, 5)


In [16]:
features = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
target = 'fuel_efficiency_mpg'


def impute_horsepower(frame, fill_value):
    """Return a new frame with missing ``horsepower`` replaced by ``fill_value``.

    The input frame is not modified.
    """
    return frame.assign(horsepower=frame['horsepower'].fillna(fill_value))


def fit_and_validate(train, val):
    """Fit an unregularised linear regression on ``train`` and score it on ``val``.

    Parameters
    ----------
    train, val : DataFrame
        Frames containing the ``features`` columns and the ``target`` column,
        with no missing values in the features.

    Returns
    -------
    tuple
        ``(model, predictions, rmse)``; ``rmse`` is the unrounded validation RMSE.
    """
    model = LinearRegression()
    model.fit(train[features], train[target])
    predictions = model.predict(val[features])
    rmse = sqrt(mean_squared_error(val[target], predictions))
    return model, predictions, rmse


# Option 1: fill missing horsepower with 0
train_0 = impute_horsepower(train_df, 0)
val_0 = impute_horsepower(val_df, 0)
lr_0, preds_0, rmse_0 = fit_and_validate(train_0, val_0)
rmse_0 = round(rmse_0, 2)

# Option 2: fill missing horsepower with the mean. The mean comes from the
# training split only, so the validation rows do not influence the imputation.
mean_hp = train_df['horsepower'].mean()
train_mean = impute_horsepower(train_df, mean_hp)
val_mean = impute_horsepower(val_df, mean_hp)
lr_mean, preds_mean, rmse_mean = fit_and_validate(train_mean, val_mean)
rmse_mean = round(rmse_mean, 2)

# Compare results (on the 2-decimal rounded scores)
print(f"RMSE (fill with 0): {rmse_0}")
print(f"RMSE (fill with mean): {rmse_mean}")

if rmse_mean < rmse_0:
    print("Filling with mean gives better RMSE.")
elif rmse_0 < rmse_mean:
    print("Filling with 0 gives better RMSE.")
else:
    # Without this branch a tie would be reported as a win for option 1.
    print("Both options give the same RMSE.")


RMSE (fill with 0): 0.52
RMSE (fill with mean): 0.46
Filling with mean gives better RMSE.


In [17]:
# Impute missing horsepower with 0 on new frames; the original splits stay untouched.
train_r = train_df.assign(horsepower=train_df['horsepower'].fillna(0))
val_r = val_df.assign(horsepower=val_df['horsepower'].fillna(0))

# Regularisation strengths to evaluate on the validation split.
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_scores = {}

for r in r_values:
    # alpha=0 is ordinary least squares; scikit-learn advises using
    # LinearRegression for that case rather than Ridge(alpha=0).
    model = Ridge(alpha=r) if r > 0 else LinearRegression()
    model.fit(train_r[features], train_r[target])
    preds = model.predict(val_r[features])
    rmse = sqrt(mean_squared_error(val_r[target], preds))
    rmse_scores[r] = round(rmse, 2)

# Result
for r, rmse in rmse_scores.items():
    print(f"r = {r:<5} → RMSE = {rmse}")

# Lowest rounded RMSE wins; ties are broken in favour of the smallest r.
best_r = min(rmse_scores, key=lambda x: (rmse_scores[x], x))
print(f"\n Best r = {best_r} with RMSE = {rmse_scores[best_r]}")


r = 0     → RMSE = 0.52
r = 0.01  → RMSE = 0.52
r = 0.1   → RMSE = 0.52
r = 1     → RMSE = 0.52
r = 5     → RMSE = 0.52
r = 10    → RMSE = 0.52
r = 100   → RMSE = 0.52

 Best r = 0 with RMSE = 0.52


In [18]:
seeds = list(range(10))
rmse_scores = []

for seed in seeds:
    # Split data 60% train, 20% val, 20% test.
    # Loop-local names are used so the seed-42 train_df / val_df / test_df
    # created earlier are not overwritten by this experiment.
    seed_train, seed_temp = train_test_split(df_new, test_size=0.4, random_state=seed, shuffle=True)
    seed_val, seed_test = train_test_split(seed_temp, test_size=0.5, random_state=seed, shuffle=True)

    # Impute missing horsepower with 0 without mutating the split frames.
    seed_train = seed_train.assign(horsepower=seed_train['horsepower'].fillna(0))
    seed_val = seed_val.assign(horsepower=seed_val['horsepower'].fillna(0))

    # Train linear regression (no regularization)
    model = LinearRegression()
    model.fit(seed_train[features], seed_train[target])

    # Validation set RMSE (kept unrounded so the spread is not distorted)
    preds = model.predict(seed_val[features])
    rmse = sqrt(mean_squared_error(seed_val[target], preds))
    rmse_scores.append(rmse)

# Standard deviation of RMSEs across seeds (np.std default, i.e. population std)
std_rmse = np.std(rmse_scores)
std_rmse = round(std_rmse, 3)

print("RMSE scores per seed:", np.round(rmse_scores, 3))
print("Standard deviation of RMSE:", std_rmse)

RMSE scores per seed: [0.518 0.517 0.52  0.522 0.511 0.515 0.515 0.536 0.519 0.52 ]
Standard deviation of RMSE: 0.006


In [19]:
# Split dataset using seed 9. The split is taken from df_new (the modelling
# frame used by every other cell) rather than the raw df; the fitted model is
# the same because only the `features` and `target` columns are read below.
train_df, temp_df = train_test_split(df_new, test_size=0.4, random_state=9, shuffle=True)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=9, shuffle=True)

# Combine train + validation into the final training set
train_val_df = pd.concat([train_df, val_df], ignore_index=True)

# Filling missing 'horsepower' with 0 (non-mutating: assign returns new frames)
train_val_df = train_val_df.assign(horsepower=train_val_df['horsepower'].fillna(0))
test_df = test_df.assign(horsepower=test_df['horsepower'].fillna(0))

# Train Ridge regression with r=0.001
model = Ridge(alpha=0.001)
model.fit(train_val_df[features], train_val_df[target])

# Predict on the held-out test set and compute RMSE
preds_test = model.predict(test_df[features])
rmse_test = sqrt(mean_squared_error(test_df[target], preds_test))
rmse_test = round(rmse_test, 2)

print("RMSE on test dataset:", rmse_test)

RMSE on test dataset: 0.52
