In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl

from ml_zoomcamp.utils import clean_column_names, load_data

# Project root is taken to be the parent of the current working directory,
# so paths resolve relative to wherever the kernel was started.
ROOT_DIR = Path.cwd().parent
# Directory handed to the data-loading helper below.
DATA_DIR = ROOT_DIR / "data"

## Data Preparation


In [2]:
# Source CSV for the car-price dataset (chapter 02 of mlbookcamp-code).
csv_uri = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/refs/heads/master/chapter-02-car-price/data.csv"

# Load through the project helper and clean the column names in one step.
# NOTE(review): load_data is given DATA_DIR, presumably as a local
# download/cache location — confirm in ml_zoomcamp.utils.
df = clean_column_names(load_data(csv_uri, DATA_DIR))

In [3]:
# Show every column name together with its dtype.
df.schema

Schema([('make', String),
        ('model', String),
        ('year', Int64),
        ('engine_fuel_type', String),
        ('engine_hp', Int64),
        ('engine_cylinders', Int64),
        ('transmission_type', String),
        ('driven_wheels', String),
        ('number_of_doors', Int64),
        ('market_category', String),
        ('vehicle_size', String),
        ('vehicle_style', String),
        ('highway_mpg', Int64),
        ('city_mpg', Int64),
        ('popularity', Int64),
        ('msrp', Int64)])

In [4]:
# Compact preview: row/column counts, then one line per column with its
# dtype and the first few values.
df.glimpse()

Rows: 11914
Columns: 16
$ make              <str> 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW'
$ model             <str> '1 Series M', '1 Series', '1 Series', '1 Series', '1 Series', '1 Series', '1 Series', '1 Series', '1 Series', '1 Series'
$ year              <i64> 2011, 2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013
$ engine_fuel_type  <str> 'premium unleaded (required)', 'premium unleaded (required)', 'premium unleaded (required)', 'premium unleaded (required)', 'premium unleaded (required)', 'premium unleaded (required)', 'premium unleaded (required)', 'premium unleaded (required)', 'premium unleaded (required)', 'premium unleaded (required)'
$ engine_hp         <i64> 335, 300, 300, 230, 230, 230, 300, 300, 230, 230
$ engine_cylinders  <i64> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6
$ transmission_type <str> 'MANUAL', 'MANUAL', 'MANUAL', 'MANUAL', 'MANUAL', 'MANUAL', 'MANUAL', 'MANUAL', 'MANUAL', 'MANUAL'
$ driven_wheels     <str> 'rear wheel drive', 'rear wheel d

In [5]:
# Per-column summary statistics (count, null_count, mean, std, min,
# quartiles, max) to spot missing values and extreme ranges.
df.describe()

statistic,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
str,str,str,f64,str,f64,f64,str,str,f64,str,str,str,f64,f64,f64,f64
"""count""","""11914""","""11914""",11914.0,"""11911""",11845.0,11884.0,"""11914""","""11914""",11908.0,"""11914""","""11914""","""11914""",11914.0,11914.0,11914.0,11914.0
"""null_count""","""0""","""0""",0.0,"""3""",69.0,30.0,"""0""","""0""",6.0,"""0""","""0""","""0""",0.0,0.0,0.0,0.0
"""mean""",,,2010.384338,,249.38607,5.628829,,,3.436093,,,,26.637485,19.733255,1554.911197,40594.737032
"""std""",,,7.57974,,109.19187,1.780559,,,0.881315,,,,8.863001,8.987798,1441.855347,60109.103604
"""min""","""Acura""","""1 Series""",1990.0,"""diesel""",55.0,0.0,"""AUTOMATED_MANUAL""","""all wheel drive""",2.0,"""Crossover""","""Compact""","""2dr Hatchback""",12.0,7.0,2.0,2000.0
"""25%""",,,2007.0,,170.0,4.0,,,2.0,,,,22.0,16.0,549.0,21000.0
"""50%""",,,2015.0,,227.0,6.0,,,4.0,,,,26.0,18.0,1385.0,29995.0
"""75%""",,,2016.0,,300.0,6.0,,,4.0,,,,30.0,22.0,2009.0,42235.0
"""max""","""Volvo""","""xD""",2017.0,"""regular unleaded""",1001.0,16.0,"""UNKNOWN""","""rear wheel drive""",4.0,"""Performance,Hybrid""","""Midsize""","""Wagon""",354.0,137.0,5657.0,2065902.0


In [6]:
# Look at the first rows of the cleaned frame.
df.head()

make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
str,str,i64,str,i64,i64,str,str,i64,str,str,str,i64,i64,i64,i64
"""BMW""","""1 Series M""",2011,"""premium unleaded (required)""",335,6,"""MANUAL""","""rear wheel drive""",2,"""Factory Tuner,Luxury,High-Perf…","""Compact""","""Coupe""",26,19,3916,46135
"""BMW""","""1 Series""",2011,"""premium unleaded (required)""",300,6,"""MANUAL""","""rear wheel drive""",2,"""Luxury,Performance""","""Compact""","""Convertible""",28,19,3916,40650
"""BMW""","""1 Series""",2011,"""premium unleaded (required)""",300,6,"""MANUAL""","""rear wheel drive""",2,"""Luxury,High-Performance""","""Compact""","""Coupe""",28,20,3916,36350
"""BMW""","""1 Series""",2011,"""premium unleaded (required)""",230,6,"""MANUAL""","""rear wheel drive""",2,"""Luxury,Performance""","""Compact""","""Coupe""",28,18,3916,29450
"""BMW""","""1 Series""",2011,"""premium unleaded (required)""",230,6,"""MANUAL""","""rear wheel drive""",2,"""Luxury""","""Compact""","""Convertible""",28,18,3916,34500


In [7]:
# List only the columns that contain nulls, most affected first.
(
    df.null_count()
    # One row per original column; the original names go into a header column.
    .transpose(include_header=True, column_names=["null_count"])
    # Counts are unsigned, so "> 0" keeps exactly the non-zero entries.
    .filter(pl.col("null_count") > 0)
    .sort("null_count", descending=True)
)

column,null_count
str,u32
"""engine_hp""",69
"""engine_cylinders""",30
"""number_of_doors""",6
"""engine_fuel_type""",3
