# Homework 2

In [1]:
# Homework 2: simple setup
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error

# Read the raw CSV; the file is expected to sit next to the notebook.
# Only the four model features plus the target are kept for the homework.
cols = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]
df = pd.read_csv("car_fuel_efficiency.csv").loc[:, cols].copy()

# Quick sanity check of what was loaded.
print("Loaded dataset. Shape:", df.shape)
print("Columns:", list(df.columns))

Loaded dataset. Shape: (9704, 5)
Columns: ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']


## Q1 — Which column has missing values?

In [2]:
# Count NaN entries per column, then keep only the column names whose count is non-zero.
na_counts = df.isnull().sum()
print("Missing values per column:\n", na_counts)
print(
    "\nColumns that have missing values:",
    [column for column, n_missing in na_counts.items() if n_missing > 0],
)

Missing values per column:
 engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

Columns that have missing values: ['horsepower']


## Q2 — Median of horsepower

In [3]:
# Median of the horsepower column, computed on the full (unsplit) dataset.
median_hp = df.loc[:, 'horsepower'].median()
print("Median horsepower:", median_hp)

Median horsepower: 149.0


## Split & prepare the dataset

In [6]:
def split_data(df, seed=42, train_frac=0.6, val_frac=0.2):
    """Shuffle ``df`` reproducibly and split it into train/validation/test parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. It is not modified.
    seed : int, default 42
        Passed as ``random_state`` to ``DataFrame.sample`` so the shuffle is
        repeatable.
    train_frac : float, default 0.6
        Fraction of rows assigned to the training part.
    val_frac : float, default 0.2
        Fraction of rows assigned to the validation part.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)``, each re-indexed from 0. The test part receives
        every row not taken by train/val, so no row is dropped when the
        truncated sizes do not add up to ``len(df)``.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    # Small tolerance so that pairs such as 0.7 + 0.3 are not rejected because
    # of floating-point rounding.
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1 + 1e-9:
        raise ValueError(
            "train_frac and val_frac must be non-negative and sum to at most 1; "
            f"got train_frac={train_frac}, val_frac={val_frac}"
        )

    df_shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    n = len(df_shuffled)

    # Sizes are truncated (not rounded), matching the original 60/20/20 split.
    n_train = int(train_frac * n)
    n_val = int(val_frac * n)
    val_end = n_train + n_val

    train = df_shuffled.iloc[:n_train].reset_index(drop=True)
    val = df_shuffled.iloc[n_train:val_end].reset_index(drop=True)
    test = df_shuffled.iloc[val_end:].reset_index(drop=True)
    return train, val, test

# Prepare X and y, handling horsepower NA with chosen strategy
def prepare_X_y(df_in, fill_strategy='zero', mean_value=None):
    """Build the feature matrix and target vector, imputing missing horsepower.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Must contain the four feature columns and ``fuel_efficiency_mpg``.
        It is not modified.
    fill_strategy : {'zero', 'mean'}, default 'zero'
        ``'zero'`` replaces missing horsepower with 0; ``'mean'`` replaces it
        with ``mean_value``.
    mean_value : float, optional
        Replacement value for the ``'mean'`` strategy. The caller supplies it
        (rather than it being computed here) so the same training-set statistic
        can be reused for validation/test frames.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` where ``X`` has one column per feature in the order
        engine_displacement, horsepower, vehicle_weight, model_year.

    Raises
    ------
    ValueError
        If ``fill_strategy`` is not recognised, or if it is ``'mean'`` and
        ``mean_value`` is ``None``.
    """
    feature_cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']

    if fill_strategy == 'zero':
        fill_value = 0
    elif fill_strategy == 'mean':
        if mean_value is None:
            raise ValueError(
                "fill_strategy='mean' requires mean_value "
                "(pass the horsepower mean of the training set)"
            )
        fill_value = mean_value
    else:
        # Previously an unknown strategy fell through silently and left NaNs in X.
        raise ValueError(
            f"Unknown fill_strategy {fill_strategy!r}; expected 'zero' or 'mean'"
        )

    # Explicit copy so the assignment below never touches the caller's frame.
    features = df_in[feature_cols].copy()
    features['horsepower'] = features['horsepower'].fillna(fill_value)

    X = features.values
    y = df_in['fuel_efficiency_mpg'].values
    return X, y

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Values to compare element by element. Lists, tuples and pandas
        objects are accepted and compared positionally.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y_true - y_pred) ** 2))``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape. Without this check a
        ``(n,)`` vs ``(n, 1)`` pair would broadcast to ``(n, n)`` and return a
        meaningless score instead of failing.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape; "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

## Q3 — Filling missing horsepower with 0 vs with training mean

In [7]:
# Shuffle and split with the reference seed.
train, val, test = split_data(df, seed=42)

# The imputation statistic comes from the training part only, so nothing from
# the validation rows leaks into the model inputs.
train_hp_mean = train['horsepower'].mean()

# --- Strategy A: missing horsepower -> 0 ---
zero_fill = dict(fill_strategy='zero')
X_train_0, y_train = prepare_X_y(train, **zero_fill)
X_val_0, y_val = prepare_X_y(val, **zero_fill)
model0 = LinearRegression().fit(X_train_0, y_train)
pred_val_0 = model0.predict(X_val_0)
rmse_0 = round(rmse(y_val, pred_val_0), 2)

# --- Strategy B: missing horsepower -> training-set mean ---
mean_fill = dict(fill_strategy='mean', mean_value=train_hp_mean)
X_train_m, y_train = prepare_X_y(train, **mean_fill)
X_val_m, y_val = prepare_X_y(val, **mean_fill)
model1 = LinearRegression().fit(X_train_m, y_train)
pred_val_m = model1.predict(X_val_m)
rmse_m = round(rmse(y_val, pred_val_m), 2)

# Validation RMSE for both strategies, rounded to 2 decimals as the question asks.
print("RMSE fill with 0:", rmse_0)
print("RMSE fill with mean:", rmse_m)

RMSE fill with 0: 0.52
RMSE fill with mean: 0.46


## Q5 — How the seed influences the score (seeds 0..9). Fill with 0, no regularization. Compute the std of the RMSEs.

In [8]:
# Repeat the zero-fill experiment for ten different shuffles and collect the
# validation RMSE of each run.
seeds = [*range(10)]
rmse_seeds = []
for s in seeds:
    tr, va, te = split_data(df, seed=s)
    # Same imputation for both parts; only train and validation are used here.
    (X_tr, y_tr), (X_va, y_va) = (
        prepare_X_y(part, fill_strategy='zero') for part in (tr, va)
    )
    model = LinearRegression().fit(X_tr, y_tr)
    pred_va = model.predict(X_va)
    rmse_s = rmse(y_va, pred_va)
    rmse_seeds.append(rmse_s)

# Spread of the scores across seeds (np.std default, i.e. population std).
std_rmse = round(np.std(rmse_seeds), 3)
print("RMSEs for seeds 0..9:", [round(score, 3) for score in rmse_seeds])
print("Standard deviation (np.std):", std_rmse)

RMSEs for seeds 0..9: [np.float64(0.521), np.float64(0.522), np.float64(0.523), np.float64(0.516), np.float64(0.511), np.float64(0.529), np.float64(0.532), np.float64(0.51), np.float64(0.515), np.float64(0.513)]
Standard deviation (np.std): 0.007


## Q6 — seed 9, combine train+val, fill 0, r=0.001 → calculate RMSE on test dataset

In [9]:
# Seed-9 split; the validation rows are folded into the training set so the
# final model sees train + val and is scored once on the held-out test part.
tr, va, te = split_data(df, seed=9)
train_comb = pd.concat([tr, va], ignore_index=True)

# Zero-fill missing horsepower in both the combined training frame and the test frame.
X_train_c, y_train_c = prepare_X_y(train_comb, fill_strategy='zero')
X_test_c, y_test_c = prepare_X_y(te, fill_strategy='zero')

# Ridge regression with regularization strength r = 0.001.
model_c = Ridge(alpha=0.001).fit(X_train_c, y_train_c)
pred_test = model_c.predict(X_test_c)
rmse_test = round(rmse(y_test_c, pred_test), 3)

print("RMSE on test (seed=9, r=0.001):", rmse_test)

RMSE on test (seed=9, r=0.001): 0.515
