# BBox Check Notebook

Проверка качества `YOLO` bbox и `content`-фильтра на текущем `manifest.parquet`.

Что можно сделать:
- посмотреть случайные примеры с рамками
- отфильтровать по `split` / `class_name`
- быстро проверить подозрительные кейсы


In [None]:
from pathlib import Path
import hashlib
import math
import random

import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image, ImageDraw

# Seed for the stdlib RNG; the sampling cells also pass it as `random_state`.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Start from the current working directory; when it has no `data` entry,
# use its parent instead (presumably the notebook was started from a subfolder).
PROJECT_ROOT = Path.cwd()
if not (PROJECT_ROOT / 'data').exists():
    PROJECT_ROOT = PROJECT_ROOT.parent

MANIFEST_PATH = PROJECT_ROOT / 'data' / 'processed' / 'manifest.parquet'
# Explicit check rather than `assert`: assertions are removed under `python -O`,
# which would let a missing manifest surface later as a less obvious read error.
if not MANIFEST_PATH.exists():
    raise FileNotFoundError(f'Manifest not found: {MANIFEST_PATH}')

raw_df = pd.read_parquet(MANIFEST_PATH)


def _build_image_id(class_id, image_path, image_slot, source_url):
    """Return a 20-character hex id derived from an image's identifying fields.

    The four fields are formatted and joined with ``|`` (a falsy
    ``source_url`` is written as an empty string), encoded as UTF-8 and hashed
    with SHA-1; the first 20 hex digits of the digest are returned.
    """
    url_part = source_url or ''
    fields = (class_id, image_path, image_slot, url_part)
    payload = '|'.join(f'{field}' for field in fields)
    digest = hashlib.sha1(payload.encode('utf-8'))
    return digest.hexdigest()[:20]


def _to_image_level_manifest(manifest: pd.DataFrame) -> pd.DataFrame:
    """Return *manifest* as a table with one row per image.

    Three input shapes are handled:

    * an empty frame is returned as a copy;
    * a frame that already has an ``image_path`` column is copied, and the
      ``image_id`` / ``split`` columns are added when they are missing;
    * any other frame is expanded row by row into up to two image rows, read
      from the ``image_1_*`` and ``image_2_*`` columns; slots whose path is
      ``None`` or NA are skipped.
    """
    if manifest.empty:
        return manifest.copy()

    if 'image_path' in manifest.columns:
        image_level = manifest.copy()
        if 'image_id' not in image_level.columns:

            def _row_image_id(record):
                # A falsy image_slot (missing key, None, 0) is hashed as 0.
                return _build_image_id(
                    class_id=record.get('class_id'),
                    image_path=record.get('image_path'),
                    image_slot=record.get('image_slot') or 0,
                    source_url=record.get('url'),
                )

            image_level['image_id'] = image_level.apply(_row_image_id, axis=1)
        if 'split' not in image_level.columns:
            image_level['split'] = 'unsplit'
        return image_level

    # Columns copied unchanged from the source row, in output order.
    listing_fields = (
        'source',
        'url',
        'scraped_at',
        'make',
        'model',
        'body_type',
        'generation',
        'year',
        'class_name',
        'class_id',
    )
    # (output column, suffix of the ``image_<slot>_<suffix>`` input column).
    file_fields = (
        ('image_phash', 'phash'),
        ('width', 'width'),
        ('height', 'height'),
        ('bytes', 'bytes'),
        ('format', 'format'),
    )
    # Output columns read from ``image_<slot>_<name>`` under the same name.
    detection_fields = (
        'car_detected',
        'car_conf',
        'car_bbox_x1',
        'car_bbox_y1',
        'car_bbox_x2',
        'car_bbox_y2',
        'car_bbox_area_ratio',
        'exterior_score',
        'interior_score',
        'content_label',
    )

    image_rows = []
    for listing in manifest.to_dict(orient='records'):
        for slot in (1, 2):
            prefix = f'image_{slot}_'
            path = listing.get(f'{prefix}path')
            if path is None or pd.isna(path):
                continue

            # Keys are inserted in the order the output columns should appear.
            record = {
                'listing_id': listing.get('listing_id'),
                'image_id': _build_image_id(
                    class_id=listing.get('class_id'),
                    image_path=path,
                    image_slot=slot,
                    source_url=listing.get('url'),
                ),
            }
            record.update((name, listing.get(name)) for name in listing_fields)
            record['image_path'] = path
            record['image_slot'] = slot
            record.update(
                (column, listing.get(prefix + suffix)) for column, suffix in file_fields
            )
            record['split'] = listing.get('split') or 'unsplit'
            record.update(
                (name, listing.get(prefix + name)) for name in detection_fields
            )
            image_rows.append(record)

    return pd.DataFrame(image_rows)


# Normalise the raw manifest to one row per image; stop if nothing is left.
df = _to_image_level_manifest(raw_df)
if df.empty:
    raise ValueError(f'Image-level manifest is empty: {MANIFEST_PATH}')

# Remove the listed content_* columns when they exist, then order the rows by
# (class_id, image_id) with a stable sort and renumber the index from zero.
df = (
    df.drop(columns=['content_keep', 'content_reason', 'content_error'], errors='ignore')
    .sort_values(by=['class_id', 'image_id'], kind='stable')
    .reset_index(drop=True)
)

# Summary of what was loaded.
print('PROJECT_ROOT:', PROJECT_ROOT)
print('Manifest:', MANIFEST_PATH)
print('Input rows:', len(raw_df), 'Input cols:', len(raw_df.columns))
print('Rows:', len(df))
print('Classes:', df['class_name'].nunique())
print('Split counts:', df['split'].value_counts(dropna=False).to_dict())


In [None]:
# Columns expected in the image-level manifest. This cell only reports which
# of them are absent; it never raises.
required_cols = [
    'image_id', 'class_id', 'class_name', 'split', 'image_path',
    'car_bbox_x1', 'car_bbox_y1', 'car_bbox_x2', 'car_bbox_y2',
    'content_label', 'exterior_score',
]
missing_cols = [col for col in required_cols if col not in df.columns]
if not missing_cols:
    print('All expected image-level columns are present.')
else:
    print('Missing expected columns:', missing_cols)


In [None]:
def _safe_float(v):
    """Convert *v* to ``float``; ``None`` and pandas-missing values become ``None``."""
    is_missing = v is None or pd.isna(v)
    return None if is_missing else float(v)


def resolve_image_path(path_value):
    """Turn a manifest path value into a ``Path``.

    The value is stringified and ``~`` is expanded. Absolute paths are
    returned as they are; relative ones are joined onto ``PROJECT_ROOT``.
    """
    candidate = Path(str(path_value)).expanduser()
    if candidate.is_absolute():
        return candidate
    return PROJECT_ROOT / candidate


def draw_image_row(row, ax=None):
    """Render one manifest row: the image, its car bbox (if any) and a title.

    Args:
        row: Mapping-like manifest row (e.g. a ``pandas.Series``).
            ``image_path`` is required; the bbox and metadata fields are read
            with ``.get`` and may be absent.
        ax: Matplotlib axes to draw on. A new figure is created when omitted.

    Returns:
        The axes that were drawn on. When the image file does not exist, the
        axes only carry a ``Missing file`` title.
    """
    image_path = resolve_image_path(row['image_path'])
    if not image_path.exists():
        if ax is None:
            _, ax = plt.subplots(figsize=(5, 3))
        ax.set_title(f'Missing file: {image_path.name}')
        ax.axis('off')
        return ax

    # Release the file handle as soon as the RGB copy exists, instead of
    # leaving it to the garbage collector (large grids open many files).
    with Image.open(image_path) as source:
        img = source.convert('RGB')
    draw = ImageDraw.Draw(img)

    corners = [
        _safe_float(row.get(f'car_bbox_{name}')) for name in ('x1', 'y1', 'x2', 'y2')
    ]
    # The box is drawn only when all four coordinates are present.
    if all(value is not None for value in corners):
        x1, y1, x2, y2 = corners
        draw.rectangle([(x1, y1), (x2, y2)], outline='lime', width=4)

    if ax is None:
        _, ax = plt.subplots(figsize=(6, 4))

    ax.imshow(img)
    ax.axis('off')

    title = (
        f"image_id={row.get('image_id')} | class={row.get('class_name')} | split={row.get('split')}\n"
        f"slot={row.get('image_slot')} label={row.get('content_label')}, "
        f"conf={row.get('car_conf')}, area={row.get('car_bbox_area_ratio')}, exterior={row.get('exterior_score')}"
    )
    ax.set_title(title, fontsize=8)
    return ax


def show_grid(rows_df, ncols=4, figsize_scale=4.2):
    """Draw the rows of *rows_df* on a grid of subplots, ``ncols`` per row.

    The figure size is ``figsize_scale`` times the number of columns / rows.
    At least one grid row is always created; cells that have no matching
    data row are switched off.
    """
    total = len(rows_df)
    nrows = max(1, math.ceil(total / ncols))
    _, axes = plt.subplots(
        nrows,
        ncols,
        figsize=(ncols * figsize_scale, nrows * figsize_scale),
        squeeze=False,  # always a 2-D array, whatever the grid shape
    )
    # `.flat` walks the grid row by row, left to right.
    for position, cell in enumerate(axes.flat):
        if position >= total:
            cell.axis('off')
            continue
        draw_image_row(rows_df.iloc[position], ax=cell)
    plt.tight_layout()


def inspect_image(image_id):
    """Plot the first row of ``df`` whose ``image_id`` matches *image_id*.

    Both the column and the argument are compared as strings.

    Raises:
        IndexError: If no row of ``df`` has the requested ``image_id``.
    """
    matches = df[df['image_id'].astype(str) == str(image_id)]
    if matches.empty:
        # Same exception type as the former bare `.iloc[0]`, but the message
        # now names the id that was not found.
        raise IndexError(f'image_id not found in manifest: {image_id!r}')

    _, ax = plt.subplots(1, 1, figsize=(8, 6))
    draw_image_row(matches.iloc[0], ax=ax)
    plt.tight_layout()


In [None]:
# Quick random preview: up to `sample_n` rows drawn with the fixed seed.
sample_n = 100
sample = df.sample(n=min(sample_n, len(df)), random_state=RANDOM_SEED)
sample = sample.reset_index(drop=True)
show_grid(sample, ncols=4)


In [None]:
# Filters for manual review
split = 'train'   # train / val / test / unsplit / None
class_name = None # example: 'audi_a7_generation1_liftback'
only_not_exterior = False

# Each enabled filter narrows `q` further; None (or False) switches it off.
q = df.copy()
if split is not None:
    q = q.loc[q['split'].eq(split)]
if class_name is not None:
    q = q.loc[q['class_name'].eq(class_name)]
if only_not_exterior:
    q = q.loc[q['content_label'].ne('exterior')]
q = q.reset_index(drop=True)

print('Filtered rows:', len(q))
# Columns shown in the preview table; those missing from `q` are skipped.
display_cols = [
    'image_id', 'image_slot', 'class_name', 'split',
    'content_label', 'exterior_score', 'car_conf', 'car_bbox_area_ratio',
]
q.loc[:, [name for name in display_cols if name in q.columns]].head(20)


In [None]:
# Visualise the filtered selection: up to `preview_n` rows of `q`.
preview_n = 8
preview = q.sample(n=min(preview_n, len(q)), random_state=RANDOM_SEED)
preview = preview.reset_index(drop=True)
show_grid(preview, ncols=4)


In [None]:
# Spot check of one specific image_id (here: the one in the first row of `df`).
image_id = str(df.iloc[0].loc['image_id'])
inspect_image(image_id)


In [None]:
# Quick bbox aggregates. The optional entries start at their fallback value
# and are overwritten only when the corresponding column exists.
agg = {
    'rows': len(df),
    'classes': int(df['class_name'].nunique()),
    'bbox_non_null': 0,
    'exterior_mean': None,
    'content_label_counts': {},
}
if 'car_bbox_x1' in df.columns:
    agg['bbox_non_null'] = int(df['car_bbox_x1'].notna().sum())
if 'exterior_score' in df.columns:
    agg['exterior_mean'] = float(df['exterior_score'].dropna().mean())
if 'content_label' in df.columns:
    # Ten most frequent labels; missing values are counted as well.
    agg['content_label_counts'] = (
        df['content_label'].value_counts(dropna=False).head(10).to_dict()
    )
agg


In [None]:
# Show the column labels of the image-level manifest.
df.columns


In [None]:
# Preview the first rows of the image-level manifest.
df.head()