In [23]:
import pandas as pd
import numpy as np

In [5]:
# Read the car fuel-efficiency CSV directly from its raw GitHub URL.
# Requires network access; every later cell works off this frame.
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
df = pd.read_csv(DATA_URL)

In [6]:
# Preview the first five rows (the default for head) to sanity-check the load.
df.head(n=5)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [7]:
# Report the installed pandas release so the results can be tied to a version.
pandas_version = pd.__version__
print("Pandas version:", pandas_version)

Pandas version: 2.3.1


In [8]:
# Number of rows in the loaded frame (first entry of its shape).
n_records = df.shape[0]
print("Number of records:", n_records)

Number of records: 9704


In [10]:
# Inspect the fuel_type column: how many distinct values, and which ones.
fuel_type_col = df['fuel_type']
print("Unique fuel types:", fuel_type_col.nunique())
print("Fuel types list:", fuel_type_col.unique())

Unique fuel types: 2
Fuel types list: ['Gasoline' 'Diesel']


In [12]:
# Count nulls per column once and reuse the result for all three summaries.
null_counts = df.isna().sum()
print("Missing values per column:\n", null_counts)
print("Total missing values:", null_counts.sum())

# A column "has missing values" when its null count is strictly positive.
missing_cols = null_counts.gt(0).sum()
print("Columns with missing values:", missing_cols)

Missing values per column:
 engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64
Total missing values: 2622
Columns with missing values: 4


In [14]:
# Restrict to rows whose origin is 'Asia', then take the best fuel efficiency.
from_asia = df['origin'] == 'Asia'
asia_cars = df.loc[from_asia]
max_eff = asia_cars['fuel_efficiency_mpg'].max()
print("Max fuel efficiency of cars from Asia:", max_eff)

Max fuel efficiency of cars from Asia: 23.759122836520497


* Median value of horsepower
1. Find the median value of the horsepower column in the dataset.
2. Next, calculate the most frequent value of the same horsepower column.
3. Use the fillna method to fill the missing values in the horsepower column with the most frequent value from the previous step.
4. Calculate the median value of horsepower once again.

In [15]:
# Baseline median of horsepower, taken before any imputation
# (median skips missing values by default).
horsepower_raw = df['horsepower']
median_before = horsepower_raw.median()
print("Median horsepower before filling:", median_before)

Median horsepower before filling: 149.0


In [16]:
# mode() returns a Series of the most common value(s) in sorted order;
# take the first entry as the fill value.
horsepower_modes = df['horsepower'].mode()
most_frequent_hp = horsepower_modes.iloc[0]
print("Most frequent horsepower:", most_frequent_hp)

Most frequent horsepower: 152.0


In [17]:
# Impute missing horsepower with the most frequent value. The result goes into
# a new column so the original 'horsepower' column stays untouched.
df['horsepower_filled'] = df['horsepower'].fillna(value=most_frequent_hp)

In [18]:
# Median of the imputed column, for comparison with the pre-fill median.
horsepower_imputed = df['horsepower_filled']
median_after = horsepower_imputed.median()
print("Median horsepower after filling:", median_after)

Median horsepower after filling: 152.0


In [19]:
# Decide how the median moved after imputation and print a single verdict.
# Branch order is kept so that any non-comparable result falls through to "No".
if median_after > median_before:
    verdict = "Yes, it increased"
elif median_after < median_before:
    verdict = "Yes, it decreased"
else:
    verdict = "No"
print(verdict)

Yes, it increased


* Sum of weights
1. Select all the cars from Asia
2. Select only columns vehicle_weight and model_year
3. Select the first 7 values
4. Get the underlying NumPy array. Let's call it X.
5. Compute matrix-matrix multiplication between the transpose of X and X. To get the transpose, use X.T. Let's call the result XTX.
6. Invert XTX.
7. Create an array y with values [1100, 1300, 800, 900, 1000, 1100, 1200].
8. Multiply the inverse of XTX with the transpose of X, and then multiply the result by y. Call the result w.
9. What's the sum of all the elements of the result?

In [20]:
# Build the matrix X: the first 7 Asian cars, restricted to the two
# columns vehicle_weight and model_year, as a NumPy array.
asia_mask = df['origin'] == 'Asia'
feature_cols = ['vehicle_weight', 'model_year']
X = df.loc[asia_mask, feature_cols].iloc[:7].to_numpy()
print("X:\n", X)

X:
 [[2714.21930965 2016.        ]
 [2783.86897424 2010.        ]
 [3582.68736772 2007.        ]
 [2231.8081416  2011.        ]
 [2659.43145076 2016.        ]
 [2844.22753389 2014.        ]
 [3761.99403819 2019.        ]]


In [21]:
# Gram matrix of X: transpose(X) times X.
X_transposed = X.transpose()
XTX = X_transposed @ X
print("XTX:\n", XTX)

XTX:
 [[62248334.33150762 41431216.5073268 ]
 [41431216.5073268  28373339.        ]]


In [24]:
# Invert the 2x2 matrix XTX; np.linalg.inv raises LinAlgError if XTX is singular.
XTX_inv = np.linalg.inv(XTX)

In [25]:
# Target vector given by the task: one value per row of X (7 entries).
y = np.array(
    [1100, 1300, 800, 900, 1000, 1100, 1200]
)

In [26]:
# w = (XTX^-1 · X^T) · y, evaluated left to right as the task describes:
# first the inverse times the transpose of X, then that product times y.
inv_times_Xt = XTX_inv @ X.T
w = inv_times_Xt @ y
print("w:", w)

w: [0.01386421 0.5049067 ]


In [27]:
# Final answer: add up all elements of the weight vector w.
weights_total = w.sum()
print("Sum of weights:", weights_total)

Sum of weights: 0.5187709081074016
