# Homework 1: Car Fuel Efficiency

In [2]:
#load libraries
import pandas as pd
import numpy as np

In [3]:
# Read the car fuel-efficiency CSV (relative path) into a DataFrame, then
# report the table's dimensions and its column names.
DATA_FILE = "car_fuel_efficiency.csv"
df = pd.read_csv(DATA_FILE)
print("Loaded dataset. Shape:", df.shape)
print("Columns:", list(df.columns))

Loaded dataset. Shape: (9704, 11)
Columns: ['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight', 'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain', 'num_doors', 'fuel_efficiency_mpg']


## Q1. Pandas version

### Question: What version of Pandas did you install?

In [4]:
# Report the version string of the pandas package imported as `pd`.
pandas_version = pd.__version__
print('Pandas version:', pandas_version)

Pandas version: 2.2.3


## Q2. Records count

### Question: How many records are in the dataset?

In [5]:
# The number of records is the size of the frame's first (row) axis.
n_rows = df.shape[0]
print('Number of records:', n_rows)

Number of records: 9704


## Q3. Fuel types

### Question: How many fuel types are present in the dataset?

In [8]:
fuel_col = 'fuel_type'
fuel_series = df[fuel_col]

# Frequency table first; dropna=False keeps missing values as their own row.
print('Unique fuel types and counts:')
print(fuel_series.value_counts(dropna=False))

# Then the number of distinct labels; nunique() skips NaN by default.
print('Number of unique fuel types (excluding NaN):', fuel_series.nunique())

Unique fuel types and counts:
fuel_type
Gasoline    4898
Diesel      4806
Name: count, dtype: int64
Number of unique fuel types (excluding NaN): 2


## Q4. Missing values

### Question: How many columns in the dataset have missing values?

In [9]:
# Count missing cells per column, then keep only the columns that have any.
na_counts = df.isnull().sum()
cols_with_na = na_counts.loc[na_counts.gt(0)]
print('Columns with missing values:', cols_with_na.size)
print(cols_with_na)

Columns with missing values: 4
num_cylinders    482
horsepower       708
acceleration     930
num_doors        502
dtype: int64


## Q5. Max fuel efficiency for Asia

### Question: What's the maximum fuel efficiency of cars from Asia?

In [11]:
origin_col = 'origin'           # name of the origin column
fe_col = 'fuel_efficiency_mpg'  # name of the fuel-efficiency column


# Boolean mask of rows whose origin is exactly "Asia", ignoring case and
# surrounding whitespace. An exact comparison is used rather than
# str.contains, which performs a regex substring search and would also select
# labels that merely contain the text (e.g. "Eurasia").
# astype(str) turns a missing origin into the string "nan", which never matches.
asia = df[origin_col].astype(str).str.strip().str.lower().eq('asia')

# Largest fuel-efficiency value among the selected rows (NaN if none match).
max_fe_asia = df.loc[asia, fe_col].max()
print('Maximum fuel efficiency (Asia):', round(max_fe_asia, 2))

Maximum fuel efficiency (Asia): 23.76


## Q6. Median horsepower and effect of filling missing values with the mode

### Question: Compute the median of the horsepower column. Then compute the mode (most frequent value). Fill missing horsepower values with the mode using fillna, recompute the median, and say whether the median changed.

In [14]:
hp_col = 'horsepower'
hp_series = df[hp_col]


# Statistics on the raw column; both median() and mode() ignore NaN here.
# mode() can return several values, so take the first one it reports.
median_before = hp_series.median()
mode_hp = hp_series.mode().iloc[0]


# Build a separate frame whose horsepower gaps are filled with the mode,
# leaving the original `df` untouched, and recompute the median on it.
df_hp_filled = df.assign(**{hp_col: hp_series.fillna(mode_hp)})
median_after = df_hp_filled[hp_col].median()


print('Median BEFORE filling:', median_before)
print('Mode used to fill:', mode_hp)
print('Median AFTER filling:', median_after)


# Pick the verdict line, then emit it once.
if median_after > median_before:
    change_msg = 'Has it changed? Yes, it increased'
elif median_after < median_before:
    change_msg = 'Has it changed? Yes, it decreased'
else:
    change_msg = 'Has it changed? No'
print(change_msg)

Median BEFORE filling: 149.0
Mode used to fill: 152.0
Median AFTER filling: 152.0
Has it changed? Yes, it increased


## Q7. Linear algebra (normal equation) — simple steps

### Question: Follow these steps exactly:

1. Select all cars from Asia.
2. Select only the columns `vehicle_weight` and `model_year`.
3. Take the first 7 rows.
4. Convert the result to a NumPy array `X`.
5. Compute the matrix product $X^T X$ (`XTX = X.T @ X`).
6. Invert `XTX`.
7. Let `y = [1100, 1300, 800, 900, 1000, 1100, 1200]`.
8. Compute $w = (X^T X)^{-1} X^T y$.
9. What is the sum of the elements of `w`?

In [16]:
weight_col = 'vehicle_weight'
year_col = 'model_year'


# Target vector given in the exercise; its length fixes how many rows X needs.
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200], dtype=float)


# Keep only cars whose origin is exactly "Asia" (ignoring case and surrounding
# whitespace). An exact comparison avoids the substring/regex behaviour of
# str.contains, which would also select labels such as "Eurasia".
is_asia = df['origin'].astype(str).str.strip().str.lower().eq('asia')
asia_rows = df[is_asia]
X_df = asia_rows[[weight_col, year_col]].head(7)
print('First 7 rows (vehicle_weight, model_year):')
print(X_df)


# Convert to a float NumPy matrix (rows = cars, columns = the two features).
X = X_df.to_numpy(dtype=float)

# Fail early with a readable message instead of an opaque matmul shape error
# (too few Asia rows) or a silently NaN result (missing feature values).
assert X.shape == (len(y), 2), f'expected a {len(y)}x2 feature matrix, got {X.shape}'
assert np.isfinite(X).all(), 'feature matrix contains missing or non-finite values'


# Normal equation, following the exercise steps: w = (X^T X)^{-1} X^T y.
# The explicit inverse is kept because the exercise asks for it.
XTX = X.T @ X
XTX_inv = np.linalg.inv(XTX)


w = XTX_inv @ X.T @ y
print('w:', w)
print('Sum of elements of w:', round(w.sum(), 4))

First 7 rows (vehicle_weight, model_year):
    vehicle_weight  model_year
8      2714.219310        2016
12     2783.868974        2010
14     3582.687368        2007
20     2231.808142        2011
21     2659.431451        2016
34     2844.227534        2014
38     3761.994038        2019
w: [0.01386421 0.5049067 ]
Sum of elements of w: 0.5188
