# Data Loading

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Columns kept for the analysis (features plus the fuel_efficiency_mpg target)
columns_to_use = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]

# Fetch the CSV and narrow it to the selected columns in one step
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
df = pd.read_csv(url)[columns_to_use]

In [3]:
# Peek at the first five rows to sanity-check the load
df.head(n=5)

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.87099,2009,12.488369


#### Question 1
There's one column with missing values. What is it?

In [4]:
# Count missing entries per column (sum of the boolean null mask down each column)
missing_values = df.isnull().sum(axis=0)

print("Missing values in each column:", missing_values, sep="\n")

Missing values in each column:
engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64


#### Question 2
What's the median (50th percentile) of the variable 'horsepower'?

In [5]:
# Median of the horsepower column; missing entries are skipped
median_horsepower = df['horsepower'].median(skipna=True)
print(f"Median horsepower: {median_horsepower}")

Median horsepower: 149.0


#### Prepare and split the dataset
- Shuffle the dataset (the filtered one you created above), use seed 42.
- Split your data in train/val/test sets, with 60%/20%/20% distribution.

Use the same code as in the lectures.

#### Question 3
- We need to deal with missing values for the column from Q1.
- We have two options: fill it with 0 or with the mean of this variable.
- Try both options. For each, train a linear regression model without regularization using the code from the lessons.
- For computing the mean, use the training set only!
- Use the validation dataset to evaluate the models and compare the RMSE of each option.
- Round the RMSE scores to 2 decimal digits using round(score, 2)
- Which option gives better RMSE?

In [6]:
def prepare_X(df, fill_method='zero', train_mean=None):
    """Convert a feature DataFrame into a NumPy matrix with missing values filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns. The frame is copied, so the caller's data is not modified.
    fill_method : {'zero', 'mean'}, default 'zero'
        'zero' replaces every missing value with 0.
        'mean' fills missing values with a mean (see ``train_mean``).
    train_mean : float, optional
        Only consulted when ``fill_method='mean'``. When given, missing values
        in the 'horsepower' column are replaced with this value (other columns
        are left untouched). When omitted, every column is filled with its own
        mean computed from ``df`` itself.

    Returns
    -------
    numpy.ndarray
        The filled frame's values.

    Raises
    ------
    ValueError
        If ``fill_method`` is not one of the supported options.
    """
    # Fail loudly on a typo'd method instead of silently returning NaNs,
    # which would otherwise propagate into the regression weights.
    if fill_method not in ('zero', 'mean'):
        raise ValueError(
            f"Unknown fill_method {fill_method!r}; expected 'zero' or 'mean'"
        )

    df_num = df.copy()

    if fill_method == 'zero':
        df_num = df_num.fillna(0)
    elif train_mean is not None:
        # Use the externally supplied (training-set) mean to avoid leakage
        df_num['horsepower'] = df_num['horsepower'].fillna(train_mean)
    else:
        df_num = df_num.fillna(df_num.mean())

    return df_num.values

In [7]:
def train_linear_regression(X, y):
    """Fit linear regression (no regularization) via the normal equation.

    A column of ones is prepended to ``X`` so the first learned weight acts
    as the intercept.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix, one row per sample.
    y : numpy.ndarray
        Target values, one per row of ``X``.

    Returns
    -------
    tuple
        ``(bias, weights)`` where ``bias`` is the intercept term and
        ``weights`` holds one coefficient per feature column.
    """
    n_rows = X.shape[0]
    design = np.column_stack([np.ones(n_rows), X])

    # Normal equation: w = (X^T X)^-1 X^T y
    gram = design.T.dot(design)
    solution = np.linalg.inv(gram).dot(design.T).dot(y)

    bias, weights = solution[0], solution[1:]
    return bias, weights

def train_linear_regression_reg(X, y, r=0.001):
    """Fit linear regression via the normal equation with a ridge term.

    A column of ones is prepended to ``X`` for the intercept, and ``r`` is
    added to every diagonal entry of ``X^T X`` (including the intercept's)
    before inversion.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix, one row per sample.
    y : numpy.ndarray
        Target values, one per row of ``X``.
    r : float, default 0.001
        Amount added to the diagonal of ``X^T X``.

    Returns
    -------
    tuple
        ``(bias, weights)`` where ``bias`` is the intercept term and
        ``weights`` holds one coefficient per feature column.
    """
    n_rows = X.shape[0]
    design = np.column_stack([np.ones(n_rows), X])

    gram = design.T.dot(design)
    # Ridge adjustment: add r along the diagonal
    gram = gram + r * np.eye(gram.shape[0])

    solution = np.linalg.inv(gram).dot(design.T).dot(y)

    bias, weights = solution[0], solution[1:]
    return bias, weights

def rmse(y, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y : numpy.ndarray
        Actual target values.
    y_pred : numpy.ndarray
        Predicted values, same shape as ``y``.

    Returns
    -------
    float
        Square root of the mean of the squared differences.
    """
    residuals = y - y_pred
    return np.sqrt((residuals * residuals).mean())

### Compare filling with 0 vs mean

In [8]:
# Shuffle-and-split into 60% train / 20% validation / 20% test (seed 42 as specified)
np.random.seed(42)

n = len(df)
n_val = n_test = int(n * 0.2)
n_train = n - n_val - n_test

# A shuffled array of row positions decides which rows land in which partition
idx = np.arange(n)
np.random.shuffle(idx)

# Cut the permutation at the train/val and val/test boundaries
df_train, df_val, df_test = (
    df.iloc[part] for part in np.split(idx, [n_train, n_train + n_val])
)

# Pull the target out of each partition so only feature columns remain
y_train = df_train.pop('fuel_efficiency_mpg').values
y_val = df_val.pop('fuel_efficiency_mpg').values
y_test = df_test.pop('fuel_efficiency_mpg').values

In [9]:
# Option 1: Fill with 0
# Build both feature matrices first, then fit on train and score on validation
X_train_zero = prepare_X(df_train, fill_method='zero')
X_val_zero = prepare_X(df_val, fill_method='zero')

w0_zero, w_zero = train_linear_regression(X_train_zero, y_train)

y_pred_zero = X_val_zero.dot(w_zero) + w0_zero
rmse_zero = rmse(y_val, y_pred_zero)

In [10]:
# Option 2: Fill with mean
# The mean comes from the training partition only, and is reused for validation
train_horsepower_mean = df_train['horsepower'].mean()

X_train_mean = prepare_X(df_train, fill_method='mean', train_mean=train_horsepower_mean)
X_val_mean = prepare_X(df_val, fill_method='mean', train_mean=train_horsepower_mean)

w0_mean, w_mean = train_linear_regression(X_train_mean, y_train)

y_pred_mean = X_val_mean.dot(w_mean) + w0_mean
rmse_mean = rmse(y_val, y_pred_mean)

# Report both validation scores side by side and name the winner
print(f"RMSE with 0: {round(rmse_zero, 2)}")
print(f"RMSE with mean: {round(rmse_mean, 2)}")
print(f"Better option: {'With 0' if rmse_zero < rmse_mean else 'With mean'}")

RMSE with 0: 0.52
RMSE with mean: 0.46
Better option: With mean


#### Question 4
- Now let's train a regularized linear regression.
- For this question, fill the NAs with 0.
- Try different values of r from this list: [0, 0.01, 0.1, 1, 5, 10, 100].
- Use RMSE to evaluate the model on the validation dataset.
- Round the RMSE scores to 2 decimal digits.
- Which r gives the best RMSE?

In [11]:
# Fill with 0 and try different r values
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
best_r = None
best_rmse = float('inf')

# The zero-filled feature matrices do not depend on r, so build them once
# instead of re-preparing them on every iteration.
X_train_reg = prepare_X(df_train, 'zero')
X_val_reg = prepare_X(df_val, 'zero')

for r in r_values:
    w0_reg, w_reg = train_linear_regression_reg(X_train_reg, y_train, r=r)

    y_pred_reg = w0_reg + X_val_reg.dot(w_reg)
    rmse_reg = rmse(y_val, y_pred_reg)

    print(f"r={r}: RMSE = {round(rmse_reg, 4)}")

    # Selection uses the unrounded score; strict '<' keeps the earliest
    # (smallest) r when two scores are exactly equal.
    if rmse_reg < best_rmse:
        best_rmse = rmse_reg
        best_r = r

print(f"Best r: {best_r}")

r=0: RMSE = 0.5174
r=0.01: RMSE = 0.5171
r=0.1: RMSE = 0.5188
r=1: RMSE = 0.5222
r=5: RMSE = 0.5229
r=10: RMSE = 0.523
r=100: RMSE = 0.5231
Best r: 0.01


#### Question 5
- We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
- Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
- For each seed, do the train/validation/test split with 60%/20%/20% distribution.
- Fill the missing values with 0 and train a model without regularization.
- For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
- What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
- Round the result to 3 decimal digits (round(std, 3))

In [12]:
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
rmse_scores = []

# Partition sizes depend only on len(df), not on the seed, so compute them
# once up front rather than on every iteration.
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

print("\nQuestion 5 Results:")
for seed in seeds:
    # Re-seed so each iteration produces its own reproducible shuffle
    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)

    # Only train and validation partitions are needed here; test is unused
    df_train_seed = df.iloc[idx[:n_train]]
    df_val_seed = df.iloc[idx[n_train:n_train+n_val]]

    y_train_seed = df_train_seed.fuel_efficiency_mpg.values
    y_val_seed = df_val_seed.fuel_efficiency_mpg.values

    # Remove the target so only feature columns reach prepare_X
    del df_train_seed['fuel_efficiency_mpg']
    del df_val_seed['fuel_efficiency_mpg']

    X_train_seed = prepare_X(df_train_seed, 'zero')
    w0_seed, w_seed = train_linear_regression(X_train_seed, y_train_seed)

    X_val_seed = prepare_X(df_val_seed, 'zero')
    y_pred_seed = w0_seed + X_val_seed.dot(w_seed)
    rmse_seed = rmse(y_val_seed, y_pred_seed)

    rmse_scores.append(rmse_seed)
    print(f"Seed {seed}: RMSE = {round(rmse_seed, 3)}")

# Spread of the validation RMSE across seeds
std_rmse = np.std(rmse_scores)
print(f"\nStandard deviation of RMSE scores: {round(std_rmse, 3)}")


Question 5 Results:
Seed 0: RMSE = 0.521
Seed 1: RMSE = 0.521
Seed 2: RMSE = 0.523
Seed 3: RMSE = 0.516
Seed 4: RMSE = 0.511
Seed 5: RMSE = 0.528
Seed 6: RMSE = 0.531
Seed 7: RMSE = 0.509
Seed 8: RMSE = 0.515
Seed 9: RMSE = 0.513

Standard deviation of RMSE scores: 0.007


#### Question 6
- Split the dataset like previously, use seed 9.
- Combine train and validation datasets.
- Fill the missing values with 0 and train a model with r=0.001.
- What's the RMSE on the test dataset?

In [13]:
# Split with seed 9 (same 60/20/20 scheme as before)
np.random.seed(9)

n = len(df)
n_val = n_test = int(n * 0.2)
n_train = n - n_val - n_test

idx = np.arange(n)
np.random.shuffle(idx)

df_train_final = df.iloc[idx[:n_train]]
df_val_final = df.iloc[idx[n_train:n_train + n_val]]
df_test_final = df.iloc[idx[n_train + n_val:]]

# Merge train and validation into one training frame with a fresh 0..N-1 index
df_train_val = pd.concat([df_train_final, df_val_final], ignore_index=True)

# Extract the target and drop it from the feature frames in one step
y_train_val = df_train_val.pop('fuel_efficiency_mpg').values
y_test_final = df_test_final.pop('fuel_efficiency_mpg').values

# Zero-fill both matrices, fit on train+val with r=0.001, then score on test
X_train_val = prepare_X(df_train_val, fill_method='zero')
X_test_final = prepare_X(df_test_final, fill_method='zero')

w0_final, w_final = train_linear_regression_reg(X_train_val, y_train_val, r=0.001)

y_pred_test = X_test_final.dot(w_final) + w0_final
rmse_test_final = rmse(y_test_final, y_pred_test)

print(f"Test RMSE with r=0.001: {round(rmse_test_final, 2)}")

Test RMSE with r=0.001: 0.52
