In [3]:
import pandas as pd
import numpy as np
from numpy.linalg import inv

# Load the dataset from the remote CSV (requires network access).
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
df = pd.read_csv(url)

print("Columns:", df.columns.tolist())
print("Shape:", df.shape)
print()


def _find_column(frame, matches, description):
    """Return the first column of ``frame`` whose lower-cased name satisfies ``matches``.

    Args:
        frame: DataFrame whose columns are searched, in column order.
        matches: Callable taking the lower-cased column name and returning a
            truthy value when the column is the one wanted.
        description: Human-readable name of the wanted column, used only in
            the error message.

    Returns:
        The matching column name exactly as it appears in ``frame.columns``.

    Raises:
        KeyError: If no column matches. Raised here, with the list of
            available columns, instead of letting a ``None`` column name fail
            later as an uninformative ``KeyError: None``.
    """
    for name in frame.columns:
        if matches(name.lower()):
            return name
    raise KeyError(
        f"no {description} column found; available columns: {frame.columns.tolist()}"
    )


# Q2. Records count
q2 = len(df)
print("Q2 (records count):", q2)

# Q3. Fuel types (unique count)
fuel_col = _find_column(df, lambda name: name in ("fuel", "fuel_type"), "fuel type")
q3 = int(df[fuel_col].nunique())
print("Q3 (fuel types):", q3, "→ unique:", sorted(df[fuel_col].dropna().unique().tolist()))

# Q4. How many columns have missing values
q4 = int(df.isna().any().sum())  # number of columns containing at least one NaN
total_na = int(df.isna().sum().sum())  # number of NaN cells over the whole frame
print("Q4 (columns with missing):", q4, f"(total missing cells: {total_na})")

# Q5. Max fuel efficiency of cars from Asia
origin_col = _find_column(df, lambda name: name == "origin", "origin")
fe_col = _find_column(
    df,
    lambda name: any(key in name for key in ("mpg", "fuel_eff", "efficiency")),
    "fuel efficiency",
)

asia = df[df[origin_col] == "Asia"]
# Non-numeric entries become NaN, which max() then skips.
asia_fe = pd.to_numeric(asia[fe_col], errors="coerce")
q5_val = float(asia_fe.max())
print(f"Q5 (max fuel efficiency, Asia): {q5_val:.2f}")

# Q6. Median horsepower, then fillna with mode, median again
hp_col = _find_column(df, lambda name: "horse" in name or name == "hp", "horsepower")
hp = pd.to_numeric(df[hp_col], errors="coerce")
med1 = float(hp.median())
# mode() can return several values; the first one is used as the fill value.
mode_hp = float(hp.mode(dropna=True).iloc[0])
med2 = float(hp.fillna(mode_hp).median())
if med2 > med1:
    q6 = "Yes, it increased"
elif med2 < med1:
    q6 = "Yes, it decreased"
else:
    q6 = "No"
print(f"Q6 (median HP): before={med1:.2f}, after={med2:.2f} → {q6}")

# Q7. Linear algebra mini-regression on Asia weights/year
# X holds the first 7 Asia rows of (vehicle_weight, model_year); y is a fixed
# 7-element target vector, so the two must stay the same length.
X = asia[["vehicle_weight", "model_year"]].head(7).to_numpy(dtype=float)
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200], dtype=float)
# Normal equation: w = (X^T X)^-1 X^T y
w = inv(X.T @ X) @ X.T @ y
q7_sum = float(w.sum())
print(f"Q7 (sum of w elements): {q7_sum:.6f}")

# Map to choices for convenience
def nearest(val, choices):
    """Return the entry of ``choices`` that lies closest to ``val``.

    Args:
        val: Number to match against the candidates.
        choices: Non-empty sequence of numeric candidates.

    Returns:
        The candidate with the smallest absolute difference from ``val``,
        as a Python ``float``. When several candidates are equally close,
        the one appearing first in ``choices`` is returned.
    """
    candidates = np.asarray(choices, dtype=float)
    distances = np.abs(candidates - val)
    return float(candidates[distances.argmin()])

# Report each answer next to its label. Numeric answers are snapped to the
# closest listed option via nearest(); Q6 is already a text answer and is
# printed as-is (marked by ``None`` options).
for label, answer, options in (
    ("Q2:", q2, [4704, 8704, 9704, 17704]),
    ("Q3:", q3, [1, 2, 3, 4]),
    ("Q4:", q4, [0, 1, 2, 3, 4]),
    ("Q5:", q5_val, [13.75, 23.75, 33.75, 43.75]),
    ("Q6:", q6, None),
    ("Q7:", q7_sum, [0.051, 0.51, 5.1, 51]),
):
    print(label, answer if options is None else nearest(answer, options))


Columns: ['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight', 'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain', 'num_doors', 'fuel_efficiency_mpg']
Shape: (9704, 11)

Q2 (records count): 9704
Q3 (fuel types): 2 → unique: ['Diesel', 'Gasoline']
Q4 (columns with missing): 4 (total missing cells: 2622)
Q5 (max fuel efficiency, Asia): 23.76
Q6 (median HP): before=149.00, after=152.00 → Yes, it increased
Q7 (sum of w elements): 0.518771
Q2: 9704.0
Q3: 2.0
Q4: 4.0
Q5: 23.75
Q6: Yes, it increased
Q7: 0.51
