In [4]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
# to make sure the plots can be displayed in the book - for rendering the figures instead of dumping their 'object'/type
%matplotlib inline  


In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

--2025-10-28 13:08:54--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv.1’


2025-10-28 13:08:54 (89.6 MB/s) - ‘car_fuel_efficiency.csv.1’ saved [874188/874188]



In [5]:
df=pd.read_csv('car_fuel_efficiency.csv')
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [6]:
df.shape


(9704, 11)

In [7]:
df.tail()
list(df.columns)

['engine_displacement',
 'num_cylinders',
 'horsepower',
 'vehicle_weight',
 'acceleration',
 'model_year',
 'origin',
 'fuel_type',
 'drivetrain',
 'num_doors',
 'fuel_efficiency_mpg']

In [8]:
df['fuel_efficiency_mpg'].round(3)

0       13.232
1       13.688
2       14.246
3       16.913
4       12.488
         ...  
9699    15.102
9700    17.962
9701    17.187
9702    15.332
9703    14.884
Name: fuel_efficiency_mpg, Length: 9704, dtype: float64

## Q1

In [9]:
df[['engine_displacement','horsepower','vehicle_weight','model_year']].isna().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
dtype: int64

## Q2

In [10]:
df['horsepower'].median()

np.float64(149.0)

## Q3

In [11]:
base = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']

In [12]:
df_new = df[base]

In [13]:
n = len(df_new)
n_val = n_test = int(n*0.2)
n_train = n - (n_val + n_test)

df_train = df_new.iloc[0:n-(n_test+n_val)]
df_val = df_new.iloc[n_train:n_train+n_val]
df_test = df_new.iloc[n_train+n_val:]

df_train.shape, df_test.shape, df_val.shape

((5824, 5), (1940, 5), (1940, 5))

In [14]:
idx = np.arange(n)
np.random.seed(42) 
np.random.shuffle(idx)

In [15]:
df_train = df_new.iloc[idx[:n_train]]
df_val = df_new.iloc[idx[n_train:n_train+n_val]]
df_test = df_new.iloc[idx[n_train+n_val:]]

In [16]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [17]:
y_train=np.log1p(df_train.fuel_efficiency_mpg.values)
y_test=np.log1p(df_test.fuel_efficiency_mpg.values)
y_val=np.log1p(df_val.fuel_efficiency_mpg.values)

In [18]:
# TRAIN 1 - NULLS REPLACED WITH MEAN 
# TRAIN 2 - NULLS REPLACED WITH 0

# Train1: fill NaNs with mean
df_train1 = df_train.copy()
df_val1   = df_val.copy()
df_test1  = df_test.copy()

# Train2: fill NaNs with 0
df_train2 = df_train.copy()
df_val2   = df_val.copy()
df_test2  = df_test.copy()


In [19]:
#fill NaNs using training mean
mean_hp = df_train1['horsepower'].mean()
df_train1['horsepower'] = df_train1['horsepower'].fillna(mean_hp)
df_val1['horsepower']   = df_val1['horsepower'].fillna(mean_hp)
df_test1['horsepower']  = df_test1['horsepower'].fillna(mean_hp)

# fill NaNs with 0
df_train2['horsepower'] = df_train2['horsepower'].fillna(0)
df_val2['horsepower']   = df_val2['horsepower'].fillna(0)
df_test2['horsepower']  = df_test2['horsepower'].fillna(0)

In [20]:
# Remove target column from feature sets

target = 'fuel_efficiency_mpg'
features = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']

df_train1 = df_train1.drop(columns=[target])
df_val1   = df_val1.drop(columns=[target])
df_test1  = df_test1.drop(columns=[target])

df_train2 = df_train2.drop(columns=[target])
df_val2   = df_val2.drop(columns=[target])
df_test2  = df_test2.drop(columns=[target])

In [21]:
# Prepare Feature Matrices

X_train1 = df_train1[features].values
X_val1   = df_val1[features].values
X_test1  = df_test1[features].values

X_train2 = df_train2[features].values
X_val2   = df_val2[features].values
X_test2  = df_test2[features].values



In [22]:
# applying model on 'mean' dataset - df_train1

# function to train
def train_linear_regression(X,y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones,X])
    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

# Function to predict
def predict(X, w0, w):
    return w0 + X.dot(w)

# Function to calculate RMSE
def rmse(y_true, y_pred):
    se = (y_true - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [23]:
# Train model on Train1 dataset (mean imputation)

w0, w = train_linear_regression(X_train1, y_train)


# Predict on validation
y_val_pred = predict(X_val1, w0, w)
val_rmse = rmse(y_val, y_val_pred)
print("Validation RMSE (Train1 - mean imputation):", val_rmse)


Validation RMSE (Train1 - mean imputation): 0.037327717015389235


In [24]:
# Train2 - Zero imputation
w0_2, w2 = train_linear_regression(X_train2, y_train)


y_val_pred2 = predict(X_val2, w0_2, w2)
val_rmse2 = rmse(y_val, y_val_pred2)
print("Validation RMSE (Train2 - zero imputation):", val_rmse2)


Validation RMSE (Train2 - zero imputation): 0.039979257825777034


#### Answer - RMSE IS LESSER IN MEAN IMPUTATION CASE 

## Q4

##### we're using train2, test2, and val2 for this - req imputation with 0 


In [65]:
def train_linear_regression_reg(X,y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones,X])
    XTX = X.T.dot(X)
    
    XTX = XTX + r * np.eye(XTX.shape[0])
    
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]


for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    w0,w=train_linear_regression_reg(X_train2,y_train,r=r)
    y_pred = w0 + X_val2.dot(w)
    
    final_rmse=rmse(y_val,y_pred).round(2)
    print(f'{r}  -  {w0}  -  {final_rmse}')

0  -  3.664666247383519  -  0.04
0.01  -  3.168813446992993  -  0.04
0.1  -  1.4288369989813987  -  0.04
1  -  0.22012861947491938  -  0.04
5  -  0.04624885656830112  -  0.04
10  -  0.023271722731612262  -  0.04
100  -  0.0023413689852454602  -  0.04


## Q5

In [35]:
score = []
def q5(r=0):
    idx = np.arange(n)
    np.random.seed(r) 
    np.random.shuffle(idx)
    df_train = df_new.iloc[idx[:n_train]]
    df_val = df_new.iloc[idx[n_train:n_train+n_val]]
    df_test = df_new.iloc[idx[n_train+n_val:]]
    
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    y_train=np.log1p(df_train.fuel_efficiency_mpg.values)
    y_test=np.log1p(df_test.fuel_efficiency_mpg.values)
    y_val=np.log1p(df_val.fuel_efficiency_mpg.values)

    df_train['horsepower'] = df_train['horsepower'].fillna(0)
    df_val['horsepower']   = df_val['horsepower'].fillna(0)
    df_test['horsepower']  = df_test['horsepower'].fillna(0)

    X_train = df_train[features].values
    X_val   = df_val[features].values
    X_test  = df_test[features].values

    w0, w = train_linear_regression(X_train, y_train)
    y_val_pred = predict(X_val, w0, w)
    val_rmse = rmse(y_val, y_val_pred)
    score.append(val_rmse)
    print(f"Validation RMSE for seed {r}: {val_rmse}")


for r in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    q5(r)

std_rmse = np.std(score).round(3)
print("Standard deviation of all RMSEs:", std_rmse)


Validation RMSE for seed 0: 0.03801775537080148
Validation RMSE for seed 1: 0.03927885333921255
Validation RMSE for seed 2: 0.03944653052537959
Validation RMSE for seed 3: 0.038727637037731855
Validation RMSE for seed 4: 0.03727535850054727
Validation RMSE for seed 5: 0.03938438834004637
Validation RMSE for seed 6: 0.03890763931360936
Validation RMSE for seed 7: 0.03837971626977783
Validation RMSE for seed 8: 0.04018986975230982
Validation RMSE for seed 9: 0.03860764644229973
Standard deviation of all RMSEs: 0.001


## Q6

In [63]:
def q5(r=0):
    idx = np.arange(n)
    np.random.seed(r) 
    np.random.shuffle(idx)
    df_train = df_new.iloc[idx[:n_train]]
    df_val = df_new.iloc[idx[n_train:n_train+n_val]]

    df_full = pd.concat([df_train,df_val])
          
    y_full= np.concatenate([y_train,y_val])
    
    df_full = df_full.reset_index(drop=True)

    df_full['horsepower'] = df_full['horsepower'].fillna(0)
    
    X_full = df_full[features].values


    w0, w = train_linear_regression_reg(X_full, y_full, r=0.001)

    #print(w0,w)

    X_test = df_test[features].fillna(0).values

    #print(X_test)
    
    y_test_pred = predict(X_test, w0, w)

    print(rmse(y_test, y_test_pred))


In [64]:
q5(9)

0.1562925322262958
