# BBox Check Notebook

Проверка качества `YOLO` bbox и `content`-фильтра на текущем `manifest.parquet`.

Что можно сделать:
- посмотреть случайные примеры с рамками
- отфильтровать по `split` / `class_name`
- быстро проверить подозрительные кейсы


In [None]:
from pathlib import Path
import math
import random

import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image, ImageDraw

# Fixed seed so that draws made with the `random` module are reproducible.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Treat the current working directory as the project root when it contains a
# 'data' directory; otherwise fall back to its parent directory.
PROJECT_ROOT = Path.cwd()
if not (PROJECT_ROOT / 'data').exists():
    PROJECT_ROOT = PROJECT_ROOT.parent

MANIFEST_PATH = PROJECT_ROOT / 'data' / 'processed' / 'manifest.parquet'
# Explicit raise instead of `assert`: assertions are removed under `python -O`,
# which would let a missing file surface only later, in `pd.read_parquet`.
if not MANIFEST_PATH.exists():
    raise FileNotFoundError(f'Manifest not found: {MANIFEST_PATH}')

# Load the manifest and print a short summary of what was read.
df = pd.read_parquet(MANIFEST_PATH)
print('PROJECT_ROOT:', PROJECT_ROOT)
print('Manifest:', MANIFEST_PATH)
print('Rows:', len(df))
print('Classes:', df['class_name'].nunique())
print('Split counts:', df['split'].value_counts(dropna=False).to_dict())


In [None]:
# Bounding-box coordinate columns expected for both image slots.
bbox_cols_manifest = [
    'image_1_car_bbox_x1',
    'image_1_car_bbox_y1',
    'image_1_car_bbox_x2',
    'image_1_car_bbox_y2',
    'image_2_car_bbox_x1',
    'image_2_car_bbox_y1',
    'image_2_car_bbox_x2',
    'image_2_car_bbox_y2',
]
missing_bbox_cols = [name for name in bbox_cols_manifest if name not in df.columns]
if not missing_bbox_cols:
    print('All bbox columns are present.')
else:
    print('Missing bbox columns:', missing_bbox_cols)

# Content-filter columns expected for both image slots.
content_cols = [
    'image_1_content_label',
    'image_2_content_label',
    'image_1_content_keep',
    'image_2_content_keep',
    'image_1_exterior_score',
    'image_2_exterior_score',
]
missing_content_cols = [name for name in content_cols if name not in df.columns]
if not missing_content_cols:
    print('All content columns are present.')
else:
    print('Missing content columns:', missing_content_cols)


In [None]:
def _safe_float(v):
    """Return *v* as a ``float``, or ``None`` when ``pd.isna(v)`` is true."""
    return None if pd.isna(v) else float(v)


def resolve_image_path(path_value):
    """Turn a manifest path value into a ``Path``.

    The value is stringified first. Absolute paths are returned unchanged;
    relative ones are joined onto ``PROJECT_ROOT``.
    """
    candidate = Path(str(path_value))
    return candidate if candidate.is_absolute() else PROJECT_ROOT / candidate


def draw_row_image(row, slot=1, ax=None):
    """Draw one manifest image, with its car bounding box, onto a Matplotlib axis.

    Args:
        row: Manifest row supporting ``row[...]`` and ``row.get(...)``; the
            callers in this notebook pass a row taken with ``DataFrame.iloc``.
        slot: Image slot number used to build the ``image_{slot}_*`` column
            names that are read from ``row``.
        ax: Axis to draw on. When ``None`` a new figure/axis is created.

    Returns:
        The axis that was drawn on. If the image file does not exist, the axis
        only carries a "Missing file" title and nothing else is drawn.
    """
    image_path_col = f'image_{slot}_path'
    x1_col = f'image_{slot}_car_bbox_x1'
    y1_col = f'image_{slot}_car_bbox_y1'
    x2_col = f'image_{slot}_car_bbox_x2'
    y2_col = f'image_{slot}_car_bbox_y2'

    image_path = resolve_image_path(row[image_path_col])
    if not image_path.exists():
        # Keep the grid intact: show a placeholder instead of raising.
        if ax is None:
            _, ax = plt.subplots(figsize=(5, 3))
        ax.set_title(f'Missing file: {image_path.name}')
        ax.axis('off')
        return ax

    # Image.open leaves the underlying file open; the context manager releases
    # it as soon as the independent RGB copy exists, even if conversion raises.
    with Image.open(image_path) as source:
        img = source.convert('RGB')

    x1 = _safe_float(row.get(x1_col))
    y1 = _safe_float(row.get(y1_col))
    x2 = _safe_float(row.get(x2_col))
    y2 = _safe_float(row.get(y2_col))

    # Draw the rectangle only when all four coordinates are available.
    has_bbox = all(v is not None for v in [x1, y1, x2, y2])
    if has_bbox:
        draw = ImageDraw.Draw(img)
        draw.rectangle([(x1, y1), (x2, y2)], outline='lime', width=4)

    if ax is None:
        _, ax = plt.subplots(figsize=(6, 4))

    ax.imshow(img)
    ax.axis('off')

    # Optional per-slot metadata shown in the title; `.get` yields None for
    # columns that are absent from the row.
    conf = row.get(f'image_{slot}_car_conf')
    area = row.get(f'image_{slot}_car_bbox_area_ratio')
    label = row.get(f'image_{slot}_content_label')
    keep = row.get(f'image_{slot}_content_keep')
    exterior = row.get(f'image_{slot}_exterior_score')

    title = (
        f"slot={slot} | listing={row['listing_id']}\n"
        f"label={label}, keep={keep}, conf={conf}, area={area}, exterior={exterior}"
    )
    ax.set_title(title, fontsize=9)
    return ax


def show_grid(rows_df, slot=1, ncols=4, figsize_scale=4.2):
    """Plot the rows of ``rows_df`` as a grid of images with bounding boxes.

    Args:
        rows_df: DataFrame whose rows are passed one by one (via ``iloc``) to
            ``draw_row_image``.
        slot: Image slot forwarded to ``draw_row_image``.
        ncols: Number of grid columns.
        figsize_scale: Size of one grid cell; the figure size is
            ``(ncols * figsize_scale, nrows * figsize_scale)``.
    """
    n = len(rows_df)
    # Always allocate at least one row so an empty frame still yields a figure.
    nrows = max(1, math.ceil(n / ncols))
    # squeeze=False makes `axes` a 2-D array for every nrows/ncols combination,
    # so no special-casing of single-row/single-column layouts is needed.
    _, axes = plt.subplots(
        nrows,
        ncols,
        figsize=(ncols * figsize_scale, nrows * figsize_scale),
        squeeze=False,
    )

    # `axes.flat` walks the grid row by row.
    for idx, ax in enumerate(axes.flat):
        if idx < n:
            draw_row_image(rows_df.iloc[idx], slot=slot, ax=ax)
        else:
            # Hide the unused trailing cells of the last row.
            ax.axis('off')
    plt.tight_layout()


def inspect_listing(listing_id):
    """Show both image slots of one listing side by side.

    Args:
        listing_id: Identifier to look up; compared as a string against the
            stringified ``listing_id`` column of ``df``. The first matching row
            is used.

    Raises:
        IndexError: If no row of ``df`` has this ``listing_id``.
    """
    matches = df[df['listing_id'].astype(str) == str(listing_id)]
    if matches.empty:
        # Same exception type that `.iloc[0]` on an empty frame would raise,
        # but with a message that names the missing id.
        raise IndexError(f'listing_id not found in manifest: {listing_id!r}')
    row = matches.iloc[0]

    _, axes = plt.subplots(1, 2, figsize=(14, 5))
    draw_row_image(row, slot=1, ax=axes[0])
    draw_row_image(row, slot=2, ax=axes[1])
    plt.tight_layout()


In [None]:
# Quick random preview: draw up to `sample_n` rows with a fixed random state.
sample_n = 12
sample = (
    df.sample(n=min(sample_n, len(df)), random_state=RANDOM_SEED)
    .reset_index(drop=True)
)
show_grid(sample, slot=1, ncols=4)


In [None]:
# Filters for manual inspection
split = 'train'   # train / val / test / None
class_name = None # example: 'audi_a7_generation1_liftback'
only_not_exterior = False

# Apply each enabled filter in turn; a filter set to None / False is skipped.
q = df.copy()
if split is not None:
    q = q.loc[q['split'].eq(split)]
if class_name is not None:
    q = q.loc[q['class_name'].eq(class_name)]
if only_not_exterior:
    # Keep rows where at least one of the two slots is not labelled 'exterior'.
    slot_1_other = q['image_1_content_label'].ne('exterior')
    slot_2_other = q['image_2_content_label'].ne('exterior')
    q = q.loc[slot_1_other | slot_2_other]
q = q.reset_index(drop=True)

print('Filtered rows:', len(q))
display_cols = [
    'listing_id',
    'class_name',
    'split',
    'image_1_content_label',
    'image_2_content_label',
    'image_1_content_keep',
    'image_2_content_keep',
]
q[display_cols].head(20)


In [None]:
# Visualize a random subset (up to `preview_n` rows) of the filtered frame `q`.
preview_n = 8
preview = (
    q.sample(n=min(preview_n, len(q)), random_state=RANDOM_SEED)
    .reset_index(drop=True)
)
show_grid(preview, slot=1, ncols=4)


In [None]:
# Spot-check one specific listing_id; here the first manifest row is used.
first_row = df.iloc[0]
listing_id = str(first_row['listing_id'])
inspect_listing(listing_id)


In [None]:
# Quick bbox aggregates over the whole manifest
bbox_x1_slot_1 = df['image_1_car_bbox_x1']
bbox_x1_slot_2 = df['image_2_car_bbox_x1']
exterior_slot_1 = df['image_1_exterior_score'].dropna()
exterior_slot_2 = df['image_2_exterior_score'].dropna()

agg = {
    'rows': len(df),
    'classes': int(df['class_name'].nunique()),
    # A row counts as having a bbox when its x1 coordinate is not null.
    'image_1_bbox_non_null': int(bbox_x1_slot_1.notna().sum()),
    'image_2_bbox_non_null': int(bbox_x1_slot_2.notna().sum()),
    'image_1_exterior_mean': float(exterior_slot_1.mean()),
    'image_2_exterior_mean': float(exterior_slot_2.mean()),
}
agg
