In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
# to make sure the plots can be displayed in the book - for rendering the figures instead of dumping their 'object'/type
%matplotlib inline  


In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

--2025-10-28 13:08:54--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv.1’


2025-10-28 13:08:54 (89.6 MB/s) - ‘car_fuel_efficiency.csv.1’ saved [874188/874188]



In [80]:
df=pd.read_csv('car_fuel_efficiency.csv')
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [81]:
df.shape


(9704, 11)

In [82]:
df.tail()
list(df.columns)

['engine_displacement',
 'num_cylinders',
 'horsepower',
 'vehicle_weight',
 'acceleration',
 'model_year',
 'origin',
 'fuel_type',
 'drivetrain',
 'num_doors',
 'fuel_efficiency_mpg']

In [83]:
df['fuel_efficiency_mpg'].round(3)

0       13.232
1       13.688
2       14.246
3       16.913
4       12.488
         ...  
9699    15.102
9700    17.962
9701    17.187
9702    15.332
9703    14.884
Name: fuel_efficiency_mpg, Length: 9704, dtype: float64

## Q1

In [84]:
df[['engine_displacement','horsepower','vehicle_weight','model_year']].isna().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
dtype: int64

## Q2

In [85]:
df['horsepower'].median()

np.float64(149.0)

## Q3

In [111]:
base = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']

In [112]:
df_new = df[base]

In [113]:
n = len(df_new)
n_val = n_test = int(n*0.2)
n_train = n - (n_val + n_test)

df_train = df_new.iloc[0:n-(n_test+n_val)]
df_val = df_new.iloc[n_train:n_train+n_val]
df_test = df_new.iloc[n_train+n_val:]

df_train.shape, df_test.shape, df_val.shape

((5824, 5), (1940, 5), (1940, 5))

In [114]:
idx = np.arange(n)
np.random.seed(42) 
np.random.shuffle(idx)

In [115]:
df_train = df_new.iloc[idx[:n_train]]
df_val = df_new.iloc[idx[n_train:n_train+n_val]]
df_test = df_new.iloc[idx[n_train+n_val:]]

In [116]:
df_train.reset_index(drop=True)
df_test.reset_index(drop=True)
df_val.reset_index(drop=True)

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,180,112.0,2772.134410,2020,15.366341
1,260,138.0,3014.061530,2022,14.601182
2,140,181.0,3652.186151,2016,12.247032
3,270,105.0,2753.040847,2023,15.136400
4,170,224.0,4163.375137,2003,10.340419
...,...,...,...,...,...
1935,290,205.0,3403.401496,2019,13.868607
1936,200,159.0,2752.516039,2009,15.665403
1937,230,202.0,3303.000688,2008,13.613127
1938,220,169.0,3293.420251,2008,13.733343


In [117]:
y_train=np.log1p(df_train.fuel_efficiency_mpg.values)
y_test=np.log1p(df_test.fuel_efficiency_mpg.values)
y_val=np.log1p(df_val.fuel_efficiency_mpg.values)

In [118]:
# TRAIN 1 - NULLS REPLACED WITH MEAN 
# TRAIN 2 - NULLS REPLACED WITH 0

df_train1 = df_train.copy()
df_train2 = df_train.copy()


df_test1 = df_test.copy()
df_test2 = df_test.copy()


df_val1 = df_test.copy()
df_val2 = df_test.copy()

In [77]:
del df_train['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg'] 
del df_val['fuel_efficiency_mpg'] 

In [119]:
df_train1['horsepower']=df_train1['horsepower'].fillna(df_train1['horsepower'].mean())
df_val1['horsepower']=df_val1['horsepower'].fillna(df_val1['horsepower'].mean())
df_test1['horsepower']=df_test1['horsepower'].fillna(df_test1['horsepower'].mean())

In [120]:
# applying model on 'mean' dataset - df_train1

# function to train
def train_linear_regression(X,y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones,X])
    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

# Function to predict
def predict(X, w0, w):
    return w0 + X.dot(w)

# Function to calculate RMSE
def rmse(y_true, y_pred):
    se = (y_true - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [124]:
# Prepare data

features = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year'] 
target = 'fuel_efficiency_mpg'

X_train1 = df_train1[features].values
y_train1 = df_train1[target].values
X_val1 = df_val1[features].values
y_val1 = df_val1[target].values

df_train1

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
483,220,144.0,2535.887591,2009,16.642943
7506,160,141.0,2741.170484,2019,16.298377
8795,230,155.0,2471.880237,2017,18.591822
1688,150,206.0,3748.164469,2015,11.818843
6217,300,111.0,2135.716359,2006,19.402209
...,...,...,...,...,...
1696,260,139.0,2606.972984,2009,16.964054
5685,280,132.0,4004.214323,2014,10.059094
6735,210,152.0,2500.175687,2020,17.593237
2052,250,154.0,2254.313245,2002,18.925748


In [58]:
X_train2 = df_train2.to_numpy()
X_train2
train_linear_regression(X_train2,y_train)

(np.float64(nan), array([nan, nan, nan, nan]))