## Dataset links

https://www.kaggle.com/datasets/austinreese/craigslist-carstrucks-data

https://www.kaggle.com/datasets/sukritchatterjee/used-cars-dataset-cardekho

In [8]:
import pandas as pd
from pathlib import Path

# Compact Parquet cache: loaded directly when the file exists, otherwise
# rebuilt from RAW and written to this path.
CACHE = "dataset/autovalu_mvp.parquet"
# Raw CSV that the compact cache is derived from.
RAW = "dataset/vehicles.csv"


In [9]:
# Step 2 — Load dataset with caching (final cell)
#
# Fast path: read the compact Parquet cache when it already exists.
# Slow path: read the raw CSV, drop unused columns, shrink dtypes, write the
# cache, and continue with the reduced frame.
# Either way the resulting DataFrame is bound to `df`.

if Path(CACHE).exists():
    # Fast path: load compact dataset
    df = pd.read_parquet(CACHE, engine="pyarrow")
    print("Loaded compact dataset:", df.shape)
else:
    # Slow path: load raw CSV and reduce it, then save.
    # NOTE: on_bad_lines="skip" discards malformed rows without reporting them
    # (deliberate best-effort load).
    df_raw = pd.read_csv(RAW, low_memory=False, on_bad_lines="skip")

    # Drop columns that are not kept in the compact dataset.
    # errors="ignore" already tolerates names missing from the frame, so no
    # manual pre-filtering of the list is needed.
    cols_to_drop = ["id","url","region_url","VIN","image_url","description",
                    "county","lat","long","posting_date"]
    df_reduced = df_raw.drop(columns=cols_to_drop, errors="ignore").copy()

    # Convert text columns to categories.
    # The dtype predicates accept both classic object columns and pandas'
    # dedicated string dtype; a plain `dtype == "object"` comparison is False
    # for the latter and would silently skip the conversion.
    cat_cols = ["manufacturer","model","condition","cylinders","fuel","title_status",
                "transmission","drive","size","type","paint_color","state","region"]
    for c in cat_cols:
        if c not in df_reduced.columns:
            continue
        col_dtype = df_reduced[c].dtype
        if pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
            df_reduced[c] = df_reduced[c].astype("category")

    # Numeric compression: unparseable values become missing (errors="coerce").
    if "year" in df_reduced.columns:
        # Nullable Int16 keeps missing years as <NA> instead of forcing floats.
        df_reduced["year"] = pd.to_numeric(df_reduced["year"], errors="coerce").astype("Int16")
    if "price" in df_reduced.columns:
        # downcast picks the smallest integer dtype when the values allow it.
        df_reduced["price"] = pd.to_numeric(df_reduced["price"], errors="coerce", downcast="integer")
    if "odometer" in df_reduced.columns:
        df_reduced["odometer"] = pd.to_numeric(df_reduced["odometer"], errors="coerce", downcast="float")

    # Save compact version; make sure the target directory exists first so the
    # write cannot fail after all the expensive work above.
    Path(CACHE).parent.mkdir(parents=True, exist_ok=True)
    df_reduced.to_parquet(CACHE, engine="pyarrow", compression="zstd", index=False)
    df = df_reduced
    print("Built and cached compact dataset:", df.shape)


Loaded compact dataset: (426880, 16)
