# DVM front-view


In [None]:
from pathlib import Path
from tqdm.auto import tqdm
import torch
import webdataset as wds
import PIL.Image as Image
import lightning as L
import polars as pl
from sklearn.model_selection import train_test_split

# Seed handed to Lightning's `seed_everything` below and reused by later cells.
SEED = 1337
L.seed_everything(SEED)

# Inputs: the DVM download directory with its table and image sub-directories.
BASE = Path("~/Downloads/dvm").expanduser()
TABLES = BASE.joinpath("tables_V2.0")
IMAGES = BASE.joinpath("Confirmed_fronts")

# Output directory (relative to the working directory); created if absent.
OUTPUT = Path("data/dvm")
OUTPUT.mkdir(parents=True, exist_ok=True)

## Load tables

In [None]:
# Load the advert table. `infer_schema_length=None` lets polars look at the
# whole file before it settles on column dtypes.
ad_data = pl.read_csv(
    TABLES / "Ad_table (extra).csv",
    infer_schema_length=None,
).with_columns(
    # Mileage: drop every non-digit character (some values include a unit).
    pl.col("Runned_Miles").str.replace_all("[^0-9]", "").cast(pl.Float64),
    # Engine size: drop the unit but keep the decimal point; stripping it
    # too would turn a value like "1.6L" into 16 instead of 1.6.
    pl.col("Engin_size").str.replace_all("[^0-9.]", "").cast(pl.Float64),
)

# Strip surrounding whitespace from every column name.
ad_data.columns = [col.strip() for col in ad_data.columns]

ad_data


In [None]:
# Load the image table; later cells read its Quality_check,
# Predicted_viewpoint, Image_ID and Image_name columns.
image_data = pl.read_csv(TABLES / "Image_table.csv")
image_data

In [None]:
# Load the price table; later cells read its Genmodel_ID, Year and
# Entry_price columns.
price_data = pl.read_csv(TABLES / "Price_table.csv")
price_data

## Filter to front-view

In [None]:
# Keep images whose Quality_check is "P" and whose Predicted_viewpoint is 0
# (presumably "passed" and "front view" - confirm against the dataset docs),
# then reduce to a single image per advert.
front_images = (
    image_data.filter(
        (pl.col("Quality_check") == "P"),
        (pl.col("Predicted_viewpoint") == 0),
    )
    .with_columns(
        # Adv_ID is the first two "$$"-separated fields of Image_ID.
        pl.col("Image_ID")
        .str.split("$$")
        .list.slice(0, 2)
        .list.join("$$")
        .alias("Adv_ID")
    )
    # The default keep="any" may pick a different image per advert on every
    # run; keeping the first occurrence in table order makes the selection
    # (and everything derived from it) reproducible.
    .unique("Adv_ID", keep="first", maintain_order=True)
)

front_images

## Merge tables

In [None]:
# Two inner joins (polars' default): first the entry price for the advert's
# Genmodel_ID / registration year, then the front image chosen for the advert.
# Adverts without a match in either lookup are dropped.
_price_lookup = price_data.select("Genmodel_ID", "Entry_price", "Year")
_image_lookup = front_images.select("Adv_ID", "Image_name")

_priced_ads = ad_data.join(
    _price_lookup,
    left_on=["Genmodel_ID", "Reg_year"],
    right_on=["Genmodel_ID", "Year"],
)
df = _priced_ads.join(_image_lookup, on="Adv_ID")

# The joins must not have duplicated any advert.
assert len(df) == df["Adv_ID"].n_unique()
df


## Verify images exist

In [None]:
def image_path(image_name: str) -> Path:
    """Return the location of ``image_name`` below ``IMAGES``.

    The name is split on ``"$$"``; its first and third fields name the two
    directory levels that hold the file.

    Raises:
        IndexError: if the name has fewer than three ``"$$"``-separated fields.
    """
    fields = image_name.split("$$")
    top_dir, sub_dir = fields[0], fields[2]
    return IMAGES.joinpath(top_dir, sub_dir, image_name)


def image_exists(image_name: str) -> bool:
    """Report whether the path ``image_path`` gives for ``image_name`` is a file."""
    candidate = image_path(image_name)
    return candidate.is_file()


# Gather the names of all images whose file cannot be found.
missing = set()
for _name in tqdm(df["Image_name"], desc="Checking image paths"):
    if image_exists(_name):
        continue
    missing.add(_name)

print(f"Missing images: {len(missing):,} / {len(df):,}")

# Drop the rows that reference one of those images.
_has_image = pl.col("Image_name").is_in(missing).not_()
df = df.filter(_has_image)

## Process features

In [None]:
# Discard every row that has a null in at least one of these columns.
df = df.drop_nulls(
    subset=[
        "Runned_Miles",
        "Price",
        "Engin_size",
        "Seat_num",
        "Door_num",
        "Wheelbase",
        "Height",
        "Width",
        "Length",
    ]
)
print(f"After dropping NaN: {len(df):,}")

In [None]:
import numpy as np

physical_features = ("Wheelbase", "Height", "Width", "Length")

# Add integer jitter, drawn uniformly from [-50, 50) (upper bound exclusive),
# to each physical dimension; every column gets its own independent draw.
# A dedicated generator seeded with SEED is used rather than NumPy's global
# RNG, so the offsets do not depend on how much global random state earlier
# (or re-executed) cells have consumed.
_jitter_rng = np.random.default_rng(SEED)
_n_rows = len(df)
df = df.with_columns(
    pl.col(col).add(_jitter_rng.integers(-50, 50, size=_n_rows))
    for col in physical_features
)

## Remove infrequent car models

In [None]:
# Smallest number of rows a Genmodel_ID must have to stay in the dataset.
MIN_SAMPLES = 100

# Rows per Genmodel_ID, and the IDs that reach the threshold.
model_counts = df["Genmodel_ID"].value_counts()
_frequent = model_counts.filter(pl.col("count").ge(MIN_SAMPLES))
valid_models = _frequent["Genmodel_ID"].to_list()

_n_kept = len(valid_models)
print(f"Models with >= {MIN_SAMPLES} samples: {_n_kept}")
print(f"Models removed: {len(model_counts) - _n_kept}")


df = df.filter(pl.col("Genmodel_ID").is_in(valid_models))
print(f"Final dataset size: {len(df):,}")

In [None]:
# Display the frame that remains after the infrequent models were removed.
df

## Enumerate model labels

In [None]:
# Readable model name: "<Maker> <Genmodel>". Two Genmodel_IDs are then given
# one fixed name each: 21_1 -> "Citroen DS3" and 7_11 -> "Audi A6".
_joined_name = pl.concat_str("Maker", "Genmodel", separator=" ")
_genmodel_id = pl.col("Genmodel_ID")
_fixed_name = (
    pl.when(_genmodel_id.eq("21_1"))
    .then(pl.lit("Citroen DS3"))
    .when(_genmodel_id.eq("7_11"))
    .then(pl.lit("Audi A6"))
    .otherwise(pl.col("model_name"))
)

df = df.with_columns(_joined_name.alias("model_name"))
df = df.with_columns(_fixed_name.alias("model_name"))
df["model_name"].unique()


In [None]:
# Number the distinct (Genmodel_ID, model_name) pairs in Genmodel_ID order;
# the row index, stored as "label", is the contiguous integer class id.
_id_name_pairs = df.select("Genmodel_ID", "model_name").unique()
model_to_idx = _id_name_pairs.sort("Genmodel_ID").with_row_index("label")
model_to_idx


In [None]:
# Each Genmodel_ID must be paired with exactly one model_name in the lookup
# table; an ID listed with several names would receive several labels.
_names_per_id = model_to_idx.group_by("Genmodel_ID").agg(pl.col("model_name"))
_ambiguous_ids = _names_per_id.filter(pl.col("model_name").list.len() > 1)
assert _ambiguous_ids.is_empty()


In [None]:
# Attach the integer label to each row by looking up its Genmodel_ID.
_labels = model_to_idx.select("Genmodel_ID", "label")
df = df.join(_labels, on="Genmodel_ID")

# The lookup table has one row per class.
n_classes = model_to_idx.height
print(f"Number of classes: {n_classes}")

## Splits

In [None]:
# Both stages are stratified by label and use SEED as the random state.
# Stage 1: set 20% of all adverts aside as the test set.
train_val_ids, test_ids = train_test_split(
    df["Adv_ID"].to_list(),
    stratify=df["label"].to_list(),
    test_size=0.2,
    random_state=SEED,
)

# Stage 2: set 20% of the remaining adverts aside as the validation set.
train_val_df = df.filter(pl.col("Adv_ID").is_in(train_val_ids))
train_ids, val_ids = train_test_split(
    train_val_df["Adv_ID"].to_list(),
    stratify=train_val_df["label"].to_list(),
    test_size=0.2,
    random_state=SEED,
)

_total = len(df)
_n_train, _n_val, _n_test = len(train_ids), len(val_ids), len(test_ids)
print(f"Train: {_n_train:,}, Val: {_n_val:,}, Test: {_n_test:,}")
print(
    f"Ratios: {_n_train / _total:.1%} / {_n_val / _total:.1%} / {_n_test / _total:.1%}"
)

# Materialise the three frames; only the training rows are shuffled.
_train_rows = df.filter(pl.col("Adv_ID").is_in(train_ids))
train_df = _train_rows.sample(
    fraction=1.0,
    with_replacement=False,
    shuffle=True,
    seed=SEED,
)
val_df = df.filter(pl.col("Adv_ID").is_in(val_ids))
test_df = df.filter(pl.col("Adv_ID").is_in(test_ids))

splits = dict(train=train_df, val=val_df, test=test_df)

## Tabular features

In [None]:
# Columns treated as continuous features; they are standardised with
# training-split statistics further down. `physical_features` is unpacked at
# the end, so its columns belong to this tuple as well.
continuous_cols = (
    "Adv_year",
    "Adv_month",
    "Reg_year",
    "Runned_Miles",
    "Price",
    "Seat_num",
    "Door_num",
    "Entry_price",
    "Engin_size",
    *physical_features,
)

# Columns treated as categorical features; they are integer-encoded further
# down using the values seen in the training split.
categorical_cols = (
    "Color",
    "Bodytype",
    "Gearbox",
    "Fuel_type",
)


In [None]:
# Standardise each continuous column with the mean and standard deviation of
# the training split; the result is stored beside the raw column as
# "<col>_norm" in every split.
train_means = train_df.select(continuous_cols).mean()
train_stds = train_df.select(continuous_cols).std()

_standardise = [
    ((pl.col(col) - train_means[col]) / train_stds[col]).alias(f"{col}_norm")
    for col in continuous_cols
]
splits = {name: frame.with_columns(_standardise) for name, frame in splits.items()}

print("Normalised continuous features")

In [None]:
# Integer code per category value: the sorted, non-null unique values that a
# categorical column takes in the training split are numbered from 0.
cat_mappings = {}
for _col in categorical_cols:
    _levels = train_df[_col].drop_nulls().unique().sort()
    cat_mappings[_col] = {value: index for index, value in enumerate(_levels)}

# Number of categories per column (e.g. for sizing embeddings).
_cat_sizes = {col: len(mapping) for col, mapping in cat_mappings.items()}
print(f"Categorical sizes: {_cat_sizes}")

In [None]:
# Add an Int32 "<col>_enc" column per categorical column to every split.
# Values that are not keys of the training mapping are coded as -1.
_encoders = [
    pl.col(col)
    .replace_strict(cat_mappings[col], default=-1, return_dtype=pl.Int32)
    .alias(f"{col}_enc")
    for col in categorical_cols
]
splits = {name: frame.with_columns(_encoders) for name, frame in splits.items()}

## Save webdataset

In [None]:
# Feature columns to save: the normalised continuous columns first, followed
# by the encoded categorical columns. The order here is the order of the
# values inside each saved feature tensor.
norm_cols = [f"{c}_norm" for c in continuous_cols]
enc_cols = [f"{c}_enc" for c in categorical_cols]
feature_cols = [*norm_cols, *enc_cols]

# Shard configuration: ~100MB per shard, similar to wells dataset
SHARD_SIZE = 1e8  # 100 MB

for name, split_df in splits.items():
    # Delete shards left behind by an earlier run. The writer only overwrites
    # the shard numbers it produces, so a run that yields fewer shards would
    # otherwise leave stale, higher-numbered files that still match the glob
    # used to load (and count) this split. The matches are collected into a
    # list first so no file is deleted while the directory is being scanned.
    for stale_shard in list(OUTPUT.glob(f"dvm_fronts_{name}-*.tar")):
        stale_shard.unlink()

    pattern = str(OUTPUT / f"dvm_fronts_{name}-%04d.tar")

    with wds.ShardWriter(pattern, maxsize=SHARD_SIZE) as sink:  # type: ignore
        for row in tqdm(
            split_df.iter_rows(named=True),
            desc=f"Saving {name} split to webdataset",
            total=len(split_df),
        ):
            # One sample per advert: class label, feature vector and the raw
            # bytes of the image file (stored as-is, not re-encoded).
            sink.write(
                {
                    "__key__": str(row["Adv_ID"]),
                    "label.pth": torch.tensor(row["label"]),
                    "features.pth": torch.tensor(
                        [row[col] for col in feature_cols], dtype=torch.float32
                    ),
                    "image.jpg": image_path(row["Image_name"]).read_bytes(),
                }
            )

In [None]:
import json

# Count the shard files found in OUTPUT for each split.
shard_counts = {}
for name in splits:
    shards = list(OUTPUT.glob(f"dvm_fronts_{name}-*.tar"))
    _n_shards = len(shards)
    shard_counts[name] = _n_shards
    print(f"{name}: {_n_shards} shards")

# Lookups from the integer label back to the model name and the Genmodel_ID.
_labels = model_to_idx["label"].to_list()
_label_to_model = dict(zip(_labels, model_to_idx["model_name"].to_list()))
_label_to_id = dict(zip(_labels, model_to_idx["Genmodel_ID"].to_list()))

# Everything needed to interpret the saved shards, written next to them as
# JSON (tuples become lists and integer keys become strings in the file).
metadata = dict(
    n_classes=n_classes,
    n_continuous=len(continuous_cols),
    n_categorical=len(categorical_cols),
    continuous_cols=continuous_cols,
    categorical_cols=categorical_cols,
    cat_sizes={col: len(cat_mappings[col]) for col in categorical_cols},
    train_means={col: train_means[col].item() for col in continuous_cols},
    train_stds={col: train_stds[col].item() for col in continuous_cols},
    feature_cols=feature_cols,
    cat_mappings=cat_mappings,
    label_to_model=_label_to_model,
    label_to_id=_label_to_id,
    split_sizes={name: len(split_df) for name, split_df in splits.items()},
    shard_counts=shard_counts,
)
_metadata_file = OUTPUT / "metadata.json"
_metadata_file.write_text(json.dumps(metadata, indent=4))

print("Saved metadata")

## Summary

In [None]:
# Final report: class count, rows per split, feature columns, output location.
_rule = "=" * 50
print(_rule)
print("DVM Front-View Dataset Created")
print(_rule)
print(f"Classes: {n_classes}")
for _title, _frame in (
    ("Train", train_df),
    ("Val", val_df),
    ("Test", test_df),
    ("Total", df),
):
    print(f"{_title}: {len(_frame):,}")
print(f"\nFeatures: {len(feature_cols)}")
print(f"  Continuous: {continuous_cols}")
print(f"  Categorical: {categorical_cols}")
print(f"\nOutput: {OUTPUT}")