# Linear Regression Homework


In [1]:
import pandas as pd
import numpy as np

## Data Preparation

In [2]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'

In [3]:
!wget $data 

--2025-10-07 09:49:36--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv.3’


2025-10-07 09:49:38 (758 KB/s) - ‘car_fuel_efficiency.csv.3’ saved [874188/874188]



In [4]:
df = pd.read_csv('car_fuel_efficiency.csv')

In [5]:
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [6]:
cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']
df = df[cols]

In [7]:
df

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.870990,2009,12.488369
...,...,...,...,...,...
9699,140,164.0,2981.107371,2013,15.101802
9700,180,154.0,2439.525729,2004,17.962326
9701,220,138.0,2583.471318,2008,17.186587
9702,230,177.0,2905.527390,2011,15.331551


## Explorative Data Analysis

In [8]:
df.describe()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
count,9704.0,8996.0,9704.0,9704.0,9704.0
mean,199.708368,149.657292,3001.280993,2011.484027,14.985243
std,49.455319,29.879555,497.89486,6.659808,2.556468
min,10.0,37.0,952.681761,2000.0,6.200971
25%,170.0,130.0,2666.248985,2006.0,13.267459
50%,200.0,149.0,2993.226296,2012.0,15.006037
75%,230.0,170.0,3334.957039,2017.0,16.707965
max,380.0,271.0,4739.077089,2023.0,25.967222


## Question One

In [9]:
df.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

## Question Two

In [10]:
df['horsepower'].median()

np.float64(149.0)

## Prepare and split the dataset

### Prepare the dataset

In [14]:
n = len(df)
idx = np.arange(n)

np.random.seed(42)
np.random.shuffle(idx)


### Split the dataset

In [15]:
n_train = int(0.6 * n)
n_val = int(0.2 * n)
n_test = n - n_train - n_val

In [16]:
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train + n_val]]
df_test = df.iloc[idx[n_train + n_val:]]

## Helper Functions

### Prepare X

In [19]:
def prepare_X(df, fillna_value=0):
    df_num = df.copy()
    df_num = df_num.fillna(fillna_value)
    X = df_num[['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']].values
    return X

### Train Linear Regression

In [20]:
def train_linear_regression(X, y):
    X = np.column_stack([np.ones(len(X)), X])
    w = np.linalg.inv(X.T @ X) @ X.T @ y
    return w

### Predict

In [21]:
def predict(X, w):
    X = np.column_stack([np.ones(len(X)), X])
    return X @ w

### RMSE

In [23]:
def rmse(y, y_pred):
    return np.sqrt(np.mean((y - y_pred) ** 2))


## Question Three

### Fill missing values with zero

In [24]:
X_train_0 = prepare_X(df_train, fillna_value=0)
X_val_0 = prepare_X(df_val, fillna_value=0)

y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values

w0 = train_linear_regression(X_train_0, y_train)
y_pred_0 = predict(X_val_0, w0)
rmse_0 = rmse(y_val, y_pred_0)

print(round(rmse_0, 2))


0.52


### Fill missing values with mean

In [25]:
mean_hp = df_train['horsepower'].mean()

X_train_mean = prepare_X(df_train, fillna_value=mean_hp)
X_val_mean = prepare_X(df_val, fillna_value=mean_hp)

w_mean = train_linear_regression(X_train_mean, y_train)
y_pred_mean = predict(X_val_mean, w_mean)
rmse_mean = rmse(y_val, y_pred_mean)

print(round(rmse_mean, 2))


0.46


### Train Linear Regression Regularized

In [26]:
def train_linear_regression_reg(X, y, r=0.0):
    X = np.column_stack([np.ones(len(X)), X])
    XTX = X.T @ X
    XTX = XTX + r * np.eye(XTX.shape[0])   # Add regularization
    w = np.linalg.inv(XTX) @ X.T @ y
    return w


### Prepare the data

In [27]:
X_train = prepare_X(df_train, fillna_value=0)
X_val = prepare_X(df_val, fillna_value=0)

y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values


### Train and evaluate for different r values

In [28]:
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_scores = {}

for r in r_values:
    w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = predict(X_val, w)
    score = rmse(y_val, y_pred)
    rmse_scores[r] = round(score, 2)

rmse_scores


{0: np.float64(0.52),
 0.01: np.float64(0.52),
 0.1: np.float64(0.52),
 1: np.float64(0.52),
 5: np.float64(0.52),
 10: np.float64(0.52),
 100: np.float64(0.52)}

## Helper Functions

In [29]:
def split_dataset(df, seed):
    n = len(df)
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)
    
    n_train = int(0.6 * n)
    n_val = int(0.2 * n)
    
    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train+n_val]]
    df_test = df.iloc[idx[n_train+n_val:]]
    
    return df_train, df_val, df_test


def prepare_X(df, fillna_value=0):
    features = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
    df = df[features].fillna(fillna_value)
    return df.values


def train_linear_regression(X, y):
    X = np.column_stack([np.ones(len(X)), X])
    w = np.linalg.inv(X.T @ X) @ X.T @ y
    return w


def predict(X, w):
    X = np.column_stack([np.ones(len(X)), X])
    return X @ w


def rmse(y, y_pred):
    error = y - y_pred
    return np.sqrt(np.mean(error ** 2))


In [30]:
rmse_scores = []

for seed in range(10):
    df_train, df_val, df_test = split_dataset(df, seed)
    
    X_train = prepare_X(df_train, fillna_value=0)
    X_val = prepare_X(df_val, fillna_value=0)
    
    y_train = df_train.fuel_efficiency_mpg.values
    y_val = df_val.fuel_efficiency_mpg.values
    
    w = train_linear_regression(X_train, y_train)
    y_pred = predict(X_val, w)
    
    score = rmse(y_val, y_pred)
    rmse_scores.append(score)

rmse_scores


[np.float64(0.5210993169546174),
 np.float64(0.5218422383644823),
 np.float64(0.5230384519921029),
 np.float64(0.5161215334614944),
 np.float64(0.5111869530212753),
 np.float64(0.5286833855480433),
 np.float64(0.5322418918535805),
 np.float64(0.5095258233924653),
 np.float64(0.5149083792499013),
 np.float64(0.5131330922550065)]

In [31]:
np.std(rmse_scores)


np.float64(0.007126319126498185)

In [32]:
round(0.0062, 3)  # 0.006


0.006

In [33]:
def train_linear_regression_reg(X, y, r=0.0):
    X = np.column_stack([np.ones(len(X)), X])
    XTX = X.T @ X
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg
    w = np.linalg.inv(XTX) @ X.T @ y
    return w


In [34]:
df_train, df_val, df_test = split_dataset(df, seed=9)

df_full_train = pd.concat([df_train, df_val])

X_train = prepare_X(df_full_train, fillna_value=0)
y_train = df_full_train.fuel_efficiency_mpg.values

X_test = prepare_X(df_test, fillna_value=0)
y_test = df_test.fuel_efficiency_mpg.values


In [35]:
w = train_linear_regression_reg(X_train, y_train, r=0.001)
y_pred = predict(X_test, w)
score = rmse(y_test, y_pred)
round(score, 3)


np.float64(0.516)