# Homework 1: Introduction to Machine Learning for Machine Learning Zoomcamp 2025

### Q1. Pandas version
What's the version of Pandas that you installed?

You can get the version information using the `__version__` attribute:

In [25]:
import pandas as pd
import numpy as np

In [3]:
# Show the version string of the installed pandas package (answer to Q1).
pd.__version__

'2.2.3'

In [4]:
# Read the homework dataset from the working directory and preview its first five rows.
df = pd.read_csv("car_fuel_efficiency.csv")
df.head(n=5)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


### Q2. Records count
How many records are in the dataset?

In [5]:
# (rows, columns) of the loaded frame; the first element is the record count.
df.shape

(9704, 11)

### Q3. Fuel types
How many fuel types are present in the dataset?

In [6]:
# Distinct fuel types; the number of entries in the array answers Q3.
df["fuel_type"].unique()

array(['Gasoline', 'Diesel'], dtype=object)

### Q4. Missing values
How many columns in the dataset have missing values?

In [8]:
# Count NaNs per column and list the largest first; columns with a
# non-zero count are the ones that have missing values.
nan_counts = df.isna().sum()
nan_counts.sort_values(ascending=False)

acceleration           930
horsepower             708
num_doors              502
num_cylinders          482
engine_displacement      0
model_year               0
vehicle_weight           0
origin                   0
fuel_type                0
drivetrain               0
fuel_efficiency_mpg      0
dtype: int64

### Q5. Max fuel efficiency
What's the maximum fuel efficiency of cars from Asia?

In [10]:
# Keep only the rows whose origin is Asia.
asian_cars = df.loc[df["origin"].eq("Asia")]

In [12]:
# Highest fuel efficiency (mpg) among the Asian cars.
asian_cars["fuel_efficiency_mpg"].max()

23.759122836520497

### Q6. Median value of horsepower
- Find the median value of horsepower column in the dataset.
- Next, calculate the most frequent value of the same horsepower column.
- Use fillna method to fill the missing values in horsepower column with the most frequent value from the previous step.
- Now, calculate the median value of horsepower once again.

Has it changed?
- Yes, it increased
- Yes, it decreased
- No

In [13]:
# Step 1: Median of horsepower before imputation (missing values are skipped)
initial_median = df["horsepower"].median(skipna=True)
print(f"1. Initial median of horsepower: {initial_median}")

1. Initial median of horsepower: 149.0


In [14]:
# Step 2: Most frequent horsepower value (mode)
# mode() returns a Series because several values can tie for most frequent.
mode_values = df['horsepower'].mode()
# Select the first entry by position; `.iloc[0]` does not depend on the
# Series' index labels the way `mode_values[0]` does.
most_frequent = mode_values.iloc[0]
print(f"2. Most frequent value (mode) of horsepower: {most_frequent}")

2. Most frequent value (mode) of horsepower: 152.0


In [15]:
# Step 3: Build an imputed copy of the frame in which missing horsepower
# values are replaced by the mode; the original `df` is left untouched.
df_filled = df.assign(horsepower=df["horsepower"].fillna(most_frequent))

In [16]:
# Step 4: Median of horsepower after imputation, for comparison with step 1
new_median = df_filled["horsepower"].median(skipna=True)
print(f"3. New median of horsepower after filling: {new_median}")

3. New median of horsepower after filling: 152.0


### Q7. Sum of weights
- Select all the cars from Asia
- Select only columns vehicle_weight and model_year
- Select the first 7 values
- Get the underlying NumPy array. Let's call it X.
- Compute matrix-matrix multiplication between the transpose of X and X. To get the transpose, use X.T. Let's call the result XTX.
- Invert XTX.
- Create an array y with values [1100, 1300, 800, 900, 1000, 1100, 1200].
- Multiply the inverse of XTX with the transpose of X, and then multiply the result by y. Call the result w.
- What's the sum of all the elements of the result?

In [18]:
# Step 1: Restrict the dataset to cars of Asian origin and show the two
# columns used in the rest of this question.
asian_cars = df.loc[df["origin"] == "Asia"]
print("Asian cars:", asian_cars[["vehicle_weight", "model_year"]], "", sep="\n")

Asian cars:
      vehicle_weight  model_year
8        2714.219310        2016
12       2783.868974        2010
14       3582.687368        2007
20       2231.808142        2011
21       2659.431451        2016
...              ...         ...
9688     3948.404625        2018
9692     3680.341381        2016
9693     2545.070139        2012
9698     3107.427820        2005
9703     2908.043477        2005

[3247 rows x 2 columns]



In [19]:
# Step 2: Keep only the vehicle_weight and model_year columns
selected_data = asian_cars.loc[:, ["vehicle_weight", "model_year"]]

In [20]:
# Step 3: Take the first 7 rows of the two selected columns
first_7 = selected_data.iloc[:7]

print("First 7 values:", first_7, "", sep="\n")

First 7 values:
    vehicle_weight  model_year
8      2714.219310        2016
12     2783.868974        2010
14     3582.687368        2007
20     2231.808142        2011
21     2659.431451        2016
34     2844.227534        2014
38     3761.994038        2019



In [21]:
# Step 4: Get the underlying NumPy array X
# DataFrame.to_numpy() is the accessor the pandas documentation recommends
# over the older `.values` attribute; the resulting array is the same here.
X = first_7.to_numpy()
print("Matrix X:")
print(X)
print(f"Shape: {X.shape}")
print()

Matrix X:
[[2714.21930965 2016.        ]
 [2783.86897424 2010.        ]
 [3582.68736772 2007.        ]
 [2231.8081416  2011.        ]
 [2659.43145076 2016.        ]
 [2844.22753389 2014.        ]
 [3761.99403819 2019.        ]]
Shape: (7, 2)



In [22]:
# Step 5: Compute XTX = X.T @ X (a matrix product, not an element-wise one)
XTX = np.matmul(X.T, X)
print("Matrix XTX:", XTX, f"Shape: {XTX.shape}", "", sep="\n")

Matrix XTX:
[[62248334.33150762 41431216.5073268 ]
 [41431216.5073268  28373339.        ]]
Shape: (2, 2)



In [26]:
# Step 6: Invert XTX
try:
    XTX_inv = np.linalg.inv(XTX)
except np.linalg.LinAlgError:
    # inv() raised: report it and fall back to the pseudo-inverse so the
    # later cells still have an XTX_inv to work with.
    print("Matrix is singular, cannot invert")
    XTX_inv = np.linalg.pinv(XTX)  # Use pseudo-inverse if regular inverse fails
    print("Using pseudo-inverse:")
    print(XTX_inv)
else:
    # Regular inverse succeeded: show it together with its shape.
    print("Inverse of XTX:", XTX_inv, f"Shape: {XTX_inv.shape}", sep="\n")
print()

Inverse of XTX:
[[ 5.71497081e-07 -8.34509443e-07]
 [-8.34509443e-07  1.25380877e-06]]
Shape: (2, 2)



In [27]:
# Step 7: Build the target vector y given in the task statement
y = np.array((1100, 1300, 800, 900, 1000, 1100, 1200))
print("Array y:", y, f"Shape: {y.shape}", "", sep="\n")

Array y:
[1100 1300  800  900 1000 1100 1200]
Shape: (7,)



In [28]:
# Step 8: Compute w = (XTX)^(-1) @ X.T @ y, i.e. XTX_inv @ X.T @ y.
# Start with the transpose of X.
XT = X.transpose()
print("Matrix X.T:", XT, f"Shape: {XT.shape}", "", sep="\n")

Matrix X.T:
[[2714.21930965 2783.86897424 3582.68736772 2231.8081416  2659.43145076
  2844.22753389 3761.99403819]
 [2016.         2010.         2007.         2011.         2016.
  2014.         2019.        ]]
Shape: (2, 7)



In [29]:
# First product of the chain: (XTX)^(-1) @ X.T
step1 = np.matmul(XTX_inv, XT)
print("XTX_inv @ X.T:", step1, f"Shape: {step1.shape}", "", sep="\n")

XTX_inv @ X.T:
[[-1.31202622e-04 -8.63909858e-05  3.72634923e-04 -4.02726650e-04
  -1.62513724e-04 -5.52342829e-05  4.65094049e-04]
 [ 2.62636846e-04  1.96990690e-04 -4.73392228e-04  6.58944477e-04
   3.08357831e-04  1.51636137e-04 -6.07979633e-04]]
Shape: (2, 7)



In [30]:
# Second product of the chain: apply the intermediate matrix to y to get w.
w = np.matmul(step1, y)
print("Result w:", w, f"Shape: {w.shape}", "", sep="\n")

Result w:
[0.01386421 0.5049067 ]
Shape: (2,)



In [31]:
# Step 9: Add up the components of w (the answer to Q7)
sum_w = w.sum()
print(f"Sum of all elements of w: {sum_w}")

Sum of all elements of w: 0.5187709081074007
