In [1]:
import pandas as pd
import numpy as np

In [2]:
# Show the installed pandas version (displayed as the cell's output)
pd.__version__

'2.2.2'

In [3]:
# Load the car fuel-efficiency CSV straight from GitHub into a DataFrame.
url = (
    "https://raw.githubusercontent.com/alexeygrigorev/datasets/"
    "master/car_fuel_efficiency.csv"
)
df = pd.read_csv(url)

In [4]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9704 entries, 0 to 9703
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   engine_displacement  9704 non-null   int64  
 1   num_cylinders        9222 non-null   float64
 2   horsepower           8996 non-null   float64
 3   vehicle_weight       9704 non-null   float64
 4   acceleration         8774 non-null   float64
 5   model_year           9704 non-null   int64  
 6   origin               9704 non-null   object 
 7   fuel_type            9704 non-null   object 
 8   drivetrain           9704 non-null   object 
 9   num_doors            9202 non-null   float64
 10  fuel_efficiency_mpg  9704 non-null   float64
dtypes: float64(6), int64(2), object(3)
memory usage: 834.1+ KB


In [5]:
# Number of rows (records) in the dataset
print(len(df))

9704


In [6]:
# Distinct values found in the fuel_type column
print("Distinct fuel types:", pd.unique(df["fuel_type"]))

Distinct fuel types: ['Gasoline' 'Diesel']


In [7]:
# Count the columns that contain at least one missing value
missing_counts = df.isna().sum()  # per-column number of missing entries
num_cols_with_missing = missing_counts.gt(0).sum()
print("Number of columns with missing values:", num_cols_with_missing)

Number of columns with missing values: 4


In [8]:
# Highest fuel efficiency among cars whose origin is Asia
asia_cars = df.loc[df["origin"].eq("Asia")]
max_val_asia = asia_cars["fuel_efficiency_mpg"].max()
print("Maximum fuel efficiency for Asia:", max_val_asia)

Maximum fuel efficiency for Asia: 23.759122836520497


In [9]:
# Horsepower: compare the median before and after imputing missing values
median_before = df['horsepower'].median()
print("Median horsepower (before filling):", median_before)

# mode() returns a Series (there can be ties); take its first entry
most_frequent = df['horsepower'].mode()[0]
print("Most frequent horsepower value:", most_frequent)

# Fill missing values with the most frequent value.
# The imputed column is kept in its own Series rather than written back into
# df, so this cell is idempotent: re-running it always starts from the raw
# column and prints the same "before" and "after" medians. Use
# `horsepower_filled` wherever the imputed values are needed.
horsepower_filled = df['horsepower'].fillna(most_frequent)

median_after = horsepower_filled.median()
print("Median horsepower (after filling):", median_after)
print("Has the median changed?", "Yes" if median_before != median_after else "No")

Median horsepower (before filling): 149.0
Most frequent horsepower value: 152.0
Median horsepower (after filling): 152.0
Has the median changed? Yes


In [42]:
# Normal-equation weights on a small sample: w = (X^T X)^-1 X^T y

# Feature matrix X: vehicle_weight and model_year of the first 7 cars
# whose origin is Asia
asia_cars = df.loc[df["origin"].eq("Asia")]
X = asia_cars.loc[:, ["vehicle_weight", "model_year"]].iloc[:7].to_numpy()

# Gram matrix X^T X and its inverse
XTX = np.matmul(X.T, X)
XTX_inv = np.linalg.inv(XTX)

# Vector y: 7 hard-coded values, one per row of X
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])

# Multiply left to right, i.e. (XTX_inv @ X.T) @ y
w = np.matmul(np.matmul(XTX_inv, X.T), y)

# Collapse the weight vector into a single number
sum_w = w.sum()

print("Result w:", w)
print("Sum of all elements of w:", sum_w)

Result w: [0.01386421 0.5049067 ]
Sum of all elements of w: 0.5187709081074016
