In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Show the installed pandas version so the outputs below can be tied to a known environment.
pd.__version__

'2.3.1'

In [3]:
#This will save the file locally as car_fuel_efficiency.csv.
#!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv -O car_fuel_efficiency.csv
#df = pd.read_csv("car_fuel_efficiency.csv")
#df.head()


In [4]:
# Location of the raw CSV on GitHub; pandas can read it directly, so no local copy is needed.
url = (
    "https://raw.githubusercontent.com/"
    "alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
)

In [5]:
# Read the remote CSV into a DataFrame (requires network access).
df = pd.read_csv(filepath_or_buffer=url)

In [6]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [7]:
# df.shape is (rows, columns); unpack it once and report both numbers.
n_rows, n_cols = df.shape
print("Number of records(rows):", n_rows)
print("Number of columns:", n_cols)

Number of records(rows): 9704
Number of columns: 11


In [8]:
# How many fuel types are present in the dataset?
# Start by listing the column names to find the relevant column.
df.columns

Index(['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight',
       'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain',
       'num_doors', 'fuel_efficiency_mpg'],
      dtype='object')

In [9]:
# Distinct values that appear in the fuel_type column
pd.unique(df['fuel_type'])

array(['Gasoline', 'Diesel'], dtype=object)

In [11]:
# Number of distinct fuel types (missing values are not counted)
df['fuel_type'].nunique(dropna=True)

2

In [14]:
# Number of missing (NaN) entries in every column
missing_per_column = df.isna().sum()
missing_per_column

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [15]:
# Keep only the columns that actually contain missing values
missing_columns = missing_per_column.loc[missing_per_column > 0]

print("Number of columns with missing values:", len(missing_columns))
print("\nColumns with missing values:", missing_columns, sep="\n")

Number of columns with missing values: 4

Columns with missing values:
num_cylinders    482
horsepower       708
acceleration     930
num_doors        502
dtype: int64


In [16]:
# What's the maximum fuel efficiency of cars from Asia?
# Select the Asian rows and the mpg column in a single .loc call, then take the max.
max_asia_mpg = df.loc[df['origin'] == 'Asia', 'fuel_efficiency_mpg'].max()
print("Maximum fuel efficiency for cars from Asia:", max_asia_mpg)

Maximum fuel efficiency for cars from Asia: 23.759122836520497


In [17]:
# Median of the horsepower column (computed before any missing values are filled)
median_hp = df.loc[:, 'horsepower'].median()
print("Median horsepower:", median_hp)

Median horsepower: 149.0


In [18]:
# Most frequent value of the horsepower column.
# Series.mode() returns a Series holding every value tied for most frequent;
# label 0 is the first of them.
hp_modes = df['horsepower'].mode()
most_frequent_hp = hp_modes[0]
print("Most frequent horsepower value:", most_frequent_hp)

Most frequent horsepower value: 152.0


In [19]:
# Replace every missing horsepower value with the most frequent value found
# in the previous step. fillna returns a new Series, which is assigned back
# to the column so that df itself is updated.
df['horsepower'] = df['horsepower'].fillna(value=most_frequent_hp)

# Sanity check: the column should now contain zero missing values.
print(df['horsepower'].isna().sum())

0


In [20]:
# Recompute the horsepower median now that the missing values have been filled.
median_hp = df.loc[:, 'horsepower'].median()
print("Median horsepower after filling missing values:", median_hp)

Median horsepower after filling missing values: 152.0


In [21]:
# Conclusion for the exercise: the median printed before the fill was 149.0
# and the one printed after the fill was 152.0.
# NOTE(review): this message is hard-coded; it is not derived from the computed medians.
print ("The Median horsepower increased. Yes, it increased.")

The Median horsepower increased. Yes, it increased.


In [23]:
# Keep only the rows whose origin is Asia, then preview the first few.
asian_cars = df.loc[df['origin'].eq('Asia')]
asian_cars.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
8,250,1.0,174.0,2714.21931,10.3,2016,Asia,Diesel,Front-wheel drive,-1.0,16.823554
12,320,5.0,145.0,2783.868974,15.1,2010,Asia,Diesel,All-wheel drive,1.0,16.17582
14,200,6.0,160.0,3582.687368,14.9,2007,Asia,Diesel,All-wheel drive,0.0,11.871091
20,150,3.0,197.0,2231.808142,18.7,2011,Asia,Gasoline,Front-wheel drive,1.0,18.889083
21,160,4.0,133.0,2659.431451,,2016,Asia,Gasoline,Front-wheel drive,-1.0,16.07773


In [24]:
# See how many Asian cars we have
asian_cars.shape[0]

3247

In [29]:
# Narrow asian_cars down to the vehicle_weight and model_year columns
# and show the first 7 rows.
asian_cars = asian_cars.loc[:, ['vehicle_weight', 'model_year']]
asian_cars.head(n=7)

Unnamed: 0,vehicle_weight,model_year
8,2714.21931,2016
12,2783.868974,2010
14,3582.687368,2007
20,2231.808142,2011
21,2659.431451,2016
34,2844.227534,2014
38,3761.994038,2019


In [30]:
# Store the first 7 rows of the two-column frame under their own name
asian_cars_first7 = asian_cars.iloc[:7]

# Display the result
asian_cars_first7

Unnamed: 0,vehicle_weight,model_year
8,2714.21931,2016
12,2783.868974,2010
14,3582.687368,2007
20,2231.808142,2011
21,2659.431451,2016
34,2844.227534,2014
38,3761.994038,2019


In [31]:
# Get the underlying NumPy array and call it X.
# DataFrame.to_numpy() is the accessor pandas recommends in place of the
# legacy .values attribute.
X = asian_cars_first7.to_numpy()

# Check the shape and the rows
print("Shape of X:", X.shape)
print(X[:7])  # show first 7 rows

Shape of X: (7, 2)
[[2714.21930965 2016.        ]
 [2783.86897424 2010.        ]
 [3582.68736772 2007.        ]
 [2231.8081416  2011.        ]
 [2659.43145076 2016.        ]
 [2844.22753389 2014.        ]
 [3761.99403819 2019.        ]]


In [32]:
# First step towards XTX (the product of X transposed with X): build the transpose.
# Transposing swaps the rows and columns of X.
XT = X.transpose()

In [33]:
# Matrix-matrix product of the transpose with X itself
XTX = np.dot(XT, X)

In [35]:
# Print the heading, the matrix and the explanatory note, one per line.
print(
    "XTX matrix:",
    XTX,
    "Now XTX contains the dot product of X transpose and X, which is often used in linear regression and other ML calculations.",
    sep="\n",
)

XTX matrix:
[[62248334.33150762 41431216.5073268 ]
 [41431216.5073268  28373339.        ]]
Now XTX contains the dot product of X transpose and X, which is often used in linear regression and other ML calculations.


In [36]:
# Compute the inverse of XTX
XTX_inv = np.linalg.inv(XTX)

# Show the heading and the inverse on separate lines
print("Inverse of XTX:", XTX_inv, sep="\n")

Inverse of XTX:
[[ 5.71497081e-07 -8.34509443e-07]
 [-8.34509443e-07  1.25380877e-06]]


In [37]:
# XTX must be square and non-singular to be invertible.
# Here XTX is (2, 2) because X has 2 columns (vehicle_weight and model_year),
# so inversion is possible as long as those columns are linearly independent.
#
# Mathematically, non-singular means determinant != 0. Testing a floating-point
# determinant against exactly 0 is unreliable, though: rounding can turn the
# determinant of a truly singular matrix into a tiny non-zero number, and
# np.linalg.inv would then return meaningless values. The determinant is
# still printed for reference, but the decision is based on the numerical
# rank, which applies a tolerance scaled to the matrix.

det = np.linalg.det(XTX)
print("Determinant of XTX:", det)

# Full rank (rank == number of rows) means the matrix is invertible.
if np.linalg.matrix_rank(XTX) == XTX.shape[0]:
    XTX_inv = np.linalg.inv(XTX)
    print("Inverse computed successfully!")
else:
    print("XTX is singular and cannot be inverted.")

Determinant of XTX: 49647390896215.67
Inverse computed successfully!


In [38]:
# Target vector y: one value per row of X
y_values = (1100, 1300, 800, 900, 1000, 1100, 1200)
y = np.array(y_values)

In [39]:
# Compute w = (X^T X)^{-1} X^T y in two steps:
# multiply the inverse of XTX by the transpose of X, then multiply that by y.

# Step 1: (X^T X)^{-1} X^T
temp = np.dot(XTX_inv, X.T)

# Step 2: apply the result to y
w = np.dot(temp, y)

# Show the result
print("w:", w)

w: [0.01386421 0.5049067 ]


In [40]:
# Sum of all the elements of w
sum_w = w.sum()
print("Sum of elements in w:", sum_w)

Sum of elements in w: 0.5187709081074016
