In [1]:
# Go to the root directory of the project

import os

while os.getcwd().split(os.sep)[-1] != 'archipelago-2025-cv-hack':
    os.chdir('..')

!ls

best.pt  notebooks	 README.md  scripts    uv.lock
data	 pyproject.toml  runs	    solutions


In [2]:
# Read the ground-truth tables for the validation and training splits.
# Both CSVs are produced by `solutions/grisha/yolo11_sliced/yolo_to_gt_csv.py`.

import pandas as pd

val_df = pd.read_csv('data/merged_sliced/val_gt.csv')
train_df = pd.read_csv('data/merged_sliced/train_gt.csv')

# Peek at the first row of each table (validation first, then train).
for _preview in (val_df, train_df):
    print(_preview.head(1))

                                      image_id  label        xc        yc  \
0  pub_02_1_000899_581_2458_2112_3994_3648.jpg      0  0.104492  0.941406   

          w         h  w_img  h_img  
0  0.030599  0.033854   1536   1536  
                                            image_id  label        xc  \
0  01_train-s1_DataSet_Human_Rescue_1_001578_1061...      0  0.110352   

         yc         w         h  w_img  h_img  
0  0.367839  0.059245  0.050781   1536   1536  


In [4]:
import numpy as np
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
from pathlib import Path


# Goal of this cell: sub-sample the training set so that it resembles the validation set.
#   * Annotated boxes are sampled so that the joint distribution of `short_side_rel`
#     (box short side relative to the longer image side) and `log_ar` (log aspect ratio)
#     follows the one measured on validation.
#   * Background (un-annotated) images are then added so that the background fraction
#     approaches the validation background fraction.
# The resulting image list is written to `data/merged_sliced/selected_train_images.txt`.


# Folders that are listed to find every image of a split (annotated or not).
val_images_dir   = Path("data/merged_sliced/val")
train_images_dir = Path("data/merged_sliced/train")


def add_scale_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with box-scale features appended.

    Reads the columns ``w``, ``h``, ``w_img`` and ``h_img`` and adds:

    * ``short_side_rel`` - the smaller of the two box sides after both are
      rescaled relative to the longer image side;
    * ``long_side_rel``  - the larger of those two rescaled sides;
    * ``log_ar``         - ``log(long_side_rel / short_side_rel)``, with a
      ``1e-9`` offset on both terms to avoid division by zero.

    The input frame is not modified.
    Assumes ``w`` / ``h`` are fractions of the image width / height (as the
    sample rows suggest) - TODO confirm against the CSV generator.
    """
    longest_img_side = df[['w_img', 'h_img']].max(axis=1)

    # Both box sides expressed relative to the same (longer) image side.
    rel_sides = pd.concat(
        [
            df['w'] * (df['w_img'] / longest_img_side),
            df['h'] * (df['h_img'] / longest_img_side),
        ],
        axis=1,
    )
    shorter = rel_sides.min(axis=1)
    longer = rel_sides.max(axis=1)

    return df.assign(
        short_side_rel=shorter,
        long_side_rel=longer,
        log_ar=np.log((longer + 1e-9) / (shorter + 1e-9)),
    )


def plot_hist_shared(val_df, train_df, col, title, clip_percentile=99):
    """Show stacked VAL (top) and TRAIN (bottom) histograms of ``col``.

    Args:
        val_df: frame plotted in the top panel.
        train_df: frame plotted in the bottom panel.
        col: name of the column to histogram (must exist in both frames).
        title: figure title.
        clip_percentile: percentile of the pooled VAL+TRAIN values used as the
            upper cut-off; rows above it are dropped before plotting and the
            x-axis range is set to ``[0, cut-off]``.

    Each panel is normalised to fractions (``histnorm='probability'``) after
    clipping. The figure is displayed with ``fig.show()``; nothing is returned.
    """
    # Upper cut-off computed on both sets together so the panels are clipped alike.
    pooled = pd.concat([val_df[col], train_df[col]])
    xmax = np.percentile(pooled, clip_percentile)

    fig = make_subplots(
        rows=2,
        cols=1,
        shared_xaxes=True,
        subplot_titles=(f"VAL {col}", f"TRAIN {col}"),
    )

    # Row 1 = VAL, row 2 = TRAIN.
    # NOTE(review): each panel is binned by plotly on its own, so bin edges may differ - confirm.
    for row, (label, frame) in enumerate((("VAL", val_df), ("TRAIN", train_df)), start=1):
        clipped = frame[frame[col] <= xmax]
        trace = px.histogram(
            clipped, x=col, nbins=60, histnorm='probability', opacity=0.7
        ).data[0]
        trace.name = label
        fig.add_trace(trace, row=row, col=1)

    fig.update_layout(height=600, title_text=title)
    for row in (1, 2):
        fig.update_yaxes(title_text="Fraction", row=row, col=1)
    fig.update_xaxes(title_text=col, row=2, col=1, range=[0, xmax])
    fig.show()


def plot_2d_density_shared(val_df, train_df, x, y, title):
    """Show stacked 2D density heatmaps of ``x`` vs ``y``: VAL on top, TRAIN below.

    Args:
        val_df: frame drawn in the top panel (titled "VAL ...").
        train_df: frame drawn in the bottom panel (titled "TRAIN ...").
        x: column used for the horizontal axis.
        y: column used for the vertical axis.
        title: figure title.

    Both panels share their axes and are normalised with
    ``histnorm='probability'``. The figure is displayed with ``fig.show()``;
    nothing is returned.
    """
    fig = make_subplots(
        rows=2,
        cols=1,
        shared_xaxes=True,
        shared_yaxes=True,
        subplot_titles=(f"VAL {x} vs {y}", f"TRAIN {x} vs {y}"),
    )

    # Argument order decides the panel: first frame -> row 1, second -> row 2.
    for row, frame in enumerate((val_df, train_df), start=1):
        heatmap = px.density_heatmap(
            frame, x=x, y=y, nbinsx=40, nbinsy=30, histnorm='probability'
        ).data[0]
        fig.add_trace(heatmap, row=row, col=1)

    fig.update_layout(height=800, title_text=title)
    fig.update_xaxes(title_text=x, row=2, col=1)
    for row in (1, 2):
        fig.update_yaxes(title_text=y, row=row, col=1)
    fig.show()


# 1) Add features (short_side_rel, long_side_rel, log_ar) to both splits
val_f   = add_scale_features(val_df)
train_f = add_scale_features(train_df)

# 2) Define 2D bins from val: 17 bins on short_side_rel x 11 bins on log_ar,
#    spanning exactly the range observed on validation.
short_bins = np.linspace(val_f['short_side_rel'].min(), val_f['short_side_rel'].max(), 18)
logar_bins = np.linspace(val_f['log_ar'].min(), val_f['log_ar'].max(), 12)

# 3) Compute val histogram fractions (the epsilon guards against an empty val set)
H_val, _, _ = np.histogram2d(val_f['short_side_rel'], val_f['log_ar'],
                             bins=[short_bins, logar_bins])
val_frac = H_val / (H_val.sum() + 1e-12)


def _bin_mask(values, edges, k):
    """Boolean mask of `values` falling in bin `k` of `edges`.

    Follows the np.histogram2d convention used for `H_val`: every bin is
    half-open [lo, hi) except the last one, which is closed [lo, hi]. Without
    this, train boxes sitting exactly on val's maximum would be counted in
    `val_frac` but could never be selected.
    """
    lo, hi = edges[k], edges[k + 1]
    is_last_bin = k == len(edges) - 2
    upper_ok = (values <= hi) if is_last_bin else (values < hi)
    return (values >= lo) & upper_ok


# 4) Match train annotated bboxes to val distribution.
#    Each bin may contribute at most val_frac * N_target boxes; bins where train
#    has fewer boxes than that keep everything they have. Train boxes outside
#    the val range fall in no bin and are dropped.
N_target = len(train_f)
parts = []
for i in range(len(short_bins) - 1):
    # The short-side mask does not depend on j, so compute it once per row of bins.
    in_short_bin = _bin_mask(train_f['short_side_rel'], short_bins, i)
    for j in range(len(logar_bins) - 1):
        target_n = int(val_frac[i, j] * N_target)
        if target_n <= 0:
            continue
        bin_items = train_f[in_short_bin & _bin_mask(train_f['log_ar'], logar_bins, j)]
        if bin_items.empty:
            continue
        if len(bin_items) > target_n:
            parts.append(bin_items.sample(target_n, random_state=42))
        else:
            parts.append(bin_items)

train_matched = pd.concat(parts, ignore_index=True) if parts else train_f.iloc[0:0]
# NOTE(review): sampling is per box but the selection below is per image, so a
# selected image presumably brings all of its boxes into training, including ones
# that were not sampled - the stats of `train_matched` may differ from the final set.
annotated_selected_imgs = set(train_matched['image_id'].astype(str).unique())

# --- Helper to compute annotated/bg image ratio from folder ---
def get_annotated_and_bg_images(gt_df: pd.DataFrame, images_dir: Path):
    all_imgs = {p.name if not p.name.endswith(".txt") else None for p in images_dir.iterdir() if p.is_file()}
    all_imgs.discard(None)  # Remove None if any .txt files are present
    ann_imgs = set(gt_df['image_id'].unique())
    bg_imgs  = all_imgs - ann_imgs
    return ann_imgs, bg_imgs

# 5) Compute background fractions (bg images / all images) for both splits
val_ann_imgs, val_bg_imgs = get_annotated_and_bg_images(val_df, val_images_dir)
p_bg_val = len(val_bg_imgs) / max(1, len(val_ann_imgs) + len(val_bg_imgs))

train_ann_imgs, train_bg_imgs = get_annotated_and_bg_images(train_df, train_images_dir)
p_bg_train = len(train_bg_imgs) / max(1, len(train_ann_imgs) + len(train_bg_imgs))

print(f"[VAL] images={len(val_ann_imgs)+len(val_bg_imgs)} | ann={len(val_ann_imgs)} | bg={len(val_bg_imgs)} | bg_frac={p_bg_val:.3f}")
print(f"[TRAIN BEFORE] images={len(train_ann_imgs)+len(train_bg_imgs)} | ann={len(train_ann_imgs)} | bg={len(train_bg_imgs)} | bg_frac={p_bg_train:.3f}")

# 6) Add background images to match val's bg ratio.
#    With A annotated images selected, B backgrounds give B / (A + B) = p_bg_val
#    when B = p_bg_val * A / (1 - p_bg_val); the max() guards p_bg_val == 1.
B_sel_target = int(round((p_bg_val * len(annotated_selected_imgs)) / max(1e-12, (1 - p_bg_val))))
# Sort before sampling: the iteration order of a set of strings changes between
# kernel restarts (hash randomisation), which would make the seeded draw below
# non-reproducible.
bg_candidates = sorted(train_bg_imgs)

rng = np.random.default_rng(42)
if len(bg_candidates) < B_sel_target:
    print(f"[WARN] Not enough backgrounds ({len(bg_candidates)}) to reach target {B_sel_target}, taking all.")
    selected_bg_imgs = set(bg_candidates)
else:
    # .tolist() turns the numpy strings back into plain Python str.
    selected_bg_imgs = set(rng.choice(bg_candidates, size=B_sel_target, replace=False).tolist())

# Final selection: annotated images picked by the bbox matching plus sampled backgrounds
final_selected_imgs = sorted(annotated_selected_imgs.union(selected_bg_imgs))
A_after = len(annotated_selected_imgs)
B_after = len(selected_bg_imgs)
p_bg_after = B_after / max(1, (A_after + B_after))

print(f"[TRAIN AFTER] images={len(final_selected_imgs)} | ann={A_after} | bg={B_after} | bg_frac={p_bg_after:.3f} (target from VAL={p_bg_val:.3f})")

# Save list: one image file name per line, sorted
with open("data/merged_sliced/selected_train_images.txt", "w") as f:
    for im in final_selected_imgs:
        f.write(im + "\n")
print("[SAVE] selected_train_images.txt written.")

def print_distribution_stats(df, name, cols=('short_side_rel', 'log_ar')):
    """Print a one-line numeric summary for each requested column of ``df``.

    Args:
        df: frame holding the columns listed in ``cols``.
        name: label printed in the header line.
        cols: column names to summarise.

    For every column the line contains the mean, the sample standard deviation
    (``ddof=1``) and the 1st, 5th, 25th, 50th, 75th, 95th and 99th percentiles,
    each formatted with four decimals. Nothing is returned.
    """
    percentile_levels = (1, 5, 25, 50, 75, 95, 99)

    print(f"\n{name} distribution stats:")
    for col in cols:
        values = df[col].to_numpy()
        # Keep the printed order: mean, std, then percentiles in ascending order.
        summary = [
            ('mean', np.mean(values)),
            ('std', np.std(values, ddof=1)),
        ]
        summary.extend((f"p{q}", np.percentile(values, q)) for q in percentile_levels)
        rendered = ", ".join(f"{label}={value:.4f}" for label, value in summary)
        print(f"  {col}: " + rendered)

# Print distribution stats for val, the full train set, and the matched train boxes
print_distribution_stats(val_f, "VAL BEFORE")
print_distribution_stats(train_f, "TRAIN BEFORE")
print_distribution_stats(train_matched, "TRAIN AFTER (annotated)")

# --- Visualisations ---
# Both plotting helpers take (val_df, train_df) in that order: the first frame is
# drawn in the panel titled "VAL", the second in the panel titled "TRAIN". The
# 2D-density calls previously passed the frames the other way round, which put
# train data under the "VAL" title and vice versa.
plot_hist_shared(val_f, train_f, 'short_side_rel', 'short_side_rel BEFORE')
plot_hist_shared(val_f, train_f, 'log_ar', 'log_ar BEFORE')
plot_2d_density_shared(val_f, train_f, 'short_side_rel', 'log_ar', 'TRAIN annotated BEFORE')

plot_hist_shared(val_f, train_matched, 'short_side_rel', 'short_side_rel AFTER')
plot_hist_shared(val_f, train_matched, 'log_ar', 'log_ar AFTER')
plot_2d_density_shared(val_f, train_matched, 'short_side_rel', 'log_ar', 'TRAIN annotated AFTER')

[VAL] images=26980 | ann=2592 | bg=24388 | bg_frac=0.904
[TRAIN BEFORE] images=549234 | ann=76547 | bg=472687 | bg_frac=0.861
[WARN] Not enough backgrounds (472687) to reach target 627003, taking all.
[TRAIN AFTER] images=539326 | ann=66639 | bg=472687 | bg_frac=0.876 (target from VAL=0.904)
[SAVE] selected_train_images.txt written.

VAL BEFORE distribution stats:
  short_side_rel: mean=0.0478, std=0.0160, p1=0.0098, p5=0.0215, p25=0.0384, p50=0.0475, p75=0.0566, p95=0.0755, p99=0.0996
  log_ar: mean=0.4629, std=0.3869, p1=0.0010, p5=0.0307, p25=0.1796, p50=0.3830, p75=0.6572, p95=1.1130, p99=1.9175

TRAIN BEFORE distribution stats:
  short_side_rel: mean=0.0436, std=0.0175, p1=0.0098, p5=0.0150, p25=0.0332, p50=0.0430, p75=0.0534, p95=0.0742, p99=0.0872
  log_ar: mean=0.3978, std=0.3462, p1=0.0000, p5=0.0278, p25=0.1398, p50=0.3159, p75=0.5688, p95=1.0093, p99=1.6818

TRAIN AFTER (annotated) distribution stats:
  short_side_rel: mean=0.0463, std=0.0150, p1=0.0104, p5=0.0221, p25=0.037