Import

In [None]:
import json
from pathlib import Path
import pandas as pd
import numpy as np
import os, glob, gzip, json, math
from collections import defaultdict, Counter
import matplotlib.pyplot as plt

Konstanten

In [None]:
# Glob pattern matching the gzipped tracking files to analyse.
# Set the TRACKING_INPUT_GLOB environment variable to point the notebook at
# another location without editing this cell; an unset or empty variable
# falls back to the original default path.
INPUT_GLOB = os.environ.get("TRACKING_INPUT_GLOB") or (
    "/Users/tunahansari/football_ra/data/tracking/SB_tracking_*.json.gz"
)

In [None]:
# Field dimensions and thresholds
# A point counts as in bounds when 0 <= x <= FIELD_LEN and 0 <= y <= FIELD_WID
# (the scatter plots further down label both axes in yards).
FIELD_LEN = 120.0
FIELD_WID = 53.33
# scan_file marks a file FAIL when the share of steps with numeric x/y is below this value
THRESH_FAIL_XY_NUMERIC = 0.80
# scan_file marks a file WARN when the share of steps with numeric x/y is below this value
THRESH_WARN_XY_NUMERIC = 0.99
# scan_file marks a file WARN when the out-of-bounds share of numeric points exceeds this value
THRESH_WARN_OOB = 0.005
# scan_file marks a file WARN when the share of tracks carrying a player_id is below this value
THRESH_WARN_PLAYER_ID = 0.95

In [None]:
TOP_FILES = 3            # how many files with the highest OOB share to inspect
TOP_PLAYS_PER_FILE = 2   # how many plays (with the highest OOB share) to inspect per file

Kompatibilitätsprüfung

In [None]:
def _to_float(v):
    try:
        return float(v)
    except:
        return None

def _first(*vals):
    for v in vals:
        if v is not None:
            return v
    return None

def _pct(n, d):
    return (n / d) if d else 0.0

# Checks one tracking file for the most important fields
def scan_file(file_path):
    """Scan one gzipped tracking JSON file and summarise its data quality.

    Args:
        file_path: Path to a ``.json.gz`` file whose top-level object is
            expected to hold a ``"plays"`` list.

    Returns:
        A dict. If the file cannot be read/parsed or holds no plays, only
        ``file``, ``status`` (``"FAIL"``) and ``reason`` are present.
        Otherwise the dict additionally carries the counts ``n_plays``,
        ``n_tracks``, ``steps_total``, the rates ``r_xy_numeric``, ``r_oob``,
        ``r_player_id``, ``r_play_ltr``, ``r_yardline`` and three
        comma-joined samples of the keys seen on plays, tracks and steps.

    Malformed entries (a play, track or step that is not a JSON object, or a
    steps container that is not a list) are tolerated rather than raising:
    such a step still counts towards ``steps_total`` but never as numeric.
    """
    name = os.path.basename(file_path)
    try:
        with gzip.open(file_path, "rt", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        return {"file": name, "status": "FAIL", "reason": f"json_error:{e}"}

    # A top-level value that is not an object cannot carry a "plays" list.
    plays = data.get("plays", []) if isinstance(data, dict) else None
    if not isinstance(plays, list) or len(plays) == 0:
        return {"file": name, "status": "FAIL", "reason": "no_plays"}

    n_plays = len(plays)
    n_tracks = 0
    steps_total = 0
    xy_numeric = 0
    oob = 0
    tracks_total = 0
    tracks_with_player_id = 0
    plays_with_ltr = 0
    plays_with_yard = 0

    # Collect example keys
    # (plays: only the first 10, steps: the first 10 per track, to keep the output small)
    example_play_keys = set()
    example_track_keys = set()
    example_step_keys = set()
    for p in plays[:10]:
        if isinstance(p, dict):
            example_play_keys |= set(p.keys())

    for play in plays:
        if not isinstance(play, dict):
            # Still part of n_plays, but it has no fields to inspect.
            continue
        if play.get("offense_left_to_right") is not None:
            plays_with_ltr += 1
        if _to_float(play.get("play_yardline")) is not None:
            plays_with_yard += 1

        tracks = play.get("tracks", [])
        if not isinstance(tracks, list):
            continue
        n_tracks += len(tracks)

        for tr in tracks:
            tracks_total += 1
            if not isinstance(tr, dict):
                # Counted as a track without player_id and without steps.
                continue
            example_track_keys |= set(tr.keys())

            player = tr.get("player") or tr.get("track_player") or {}
            if isinstance(player, dict) and player.get("player_id") is not None:
                tracks_with_player_id += 1

            steps = tr.get("steps") or tr.get("track_steps") or []
            if not isinstance(steps, list):
                steps = []

            for idx, s in enumerate(steps):
                steps_total += 1
                if not isinstance(s, dict):
                    # Same guard the key sampling always had; the step stays
                    # in the denominator and lowers r_xy_numeric.
                    continue
                if idx < 10:
                    example_step_keys |= set(s.keys())
                x = _to_float(_first(s.get("x"), s.get("ngs_x")))
                y = _to_float(_first(s.get("y"), s.get("ngs_y")))
                if x is not None and y is not None:
                    xy_numeric += 1
                    if not (0.0 <= x <= FIELD_LEN) or not (0.0 <= y <= FIELD_WID):
                        oob += 1

    r_xy_numeric = _pct(xy_numeric, steps_total)
    r_oob = _pct(oob, max(xy_numeric, 1))
    r_player_id = _pct(tracks_with_player_id, max(tracks_total, 1))
    r_play_ltr = _pct(plays_with_ltr, max(n_plays, 1))
    r_yardline = _pct(plays_with_yard, max(n_plays, 1))

    # Status logic: FAIL wins over WARN; WARN reasons are only recorded while
    # the status is not already FAIL.
    status = "PASS"
    reasons = []
    if steps_total == 0:
        status = "FAIL"
        reasons.append("no_steps")
    if r_xy_numeric < THRESH_FAIL_XY_NUMERIC:
        status = "FAIL"
        reasons.append(f"xy_numeric<{int(THRESH_FAIL_XY_NUMERIC*100)}%")
    elif r_xy_numeric < THRESH_WARN_XY_NUMERIC:
        if status != "FAIL":
            status = "WARN"
            reasons.append(f"xy_numeric<{int(THRESH_WARN_XY_NUMERIC*100)}%")
    if r_oob > THRESH_WARN_OOB:
        if status != "FAIL":
            status = "WARN"
            reasons.append(f"oob>{THRESH_WARN_OOB*100:.1f}%")
    if r_player_id < THRESH_WARN_PLAYER_ID:
        if status != "FAIL":
            status = "WARN"
            reasons.append(f"player_id<{int(THRESH_WARN_PLAYER_ID*100)}%")

    return {
        "file": name,
        "status": status,
        "reason": ";".join(reasons),
        "n_plays": n_plays,
        "n_tracks": n_tracks,
        "steps_total": steps_total,
        "r_xy_numeric": r_xy_numeric,
        "r_oob": r_oob,
        "r_player_id": r_player_id,
        "r_play_ltr": r_play_ltr,
        "r_yardline": r_yardline,
        "example_play_keys": ", ".join(sorted(example_play_keys)[:30]),
        "example_track_keys": ", ".join(sorted(example_track_keys)[:30]),
        "example_step_keys": ", ".join(sorted(example_step_keys)[:30]),
    }

# Run the scan over every file matching INPUT_GLOB
files = sorted(glob.glob(INPUT_GLOB))
if not files:
    raise FileNotFoundError(f"Keine Dateien für Muster: {INPUT_GLOB}")

print(f"Tracking-Scan: {len(files)} Dateien gefunden")
results = []
for i, fp in enumerate(files, 1):
    name = os.path.basename(fp)
    # flush so the file name is visible while the (slow) scan is running
    print(f"[{i}/{len(files)}] {name} ... ", end="", flush=True)
    res = scan_file(fp)
    print(f"{res['status']} {('('+res['reason']+')') if res['reason'] else ''}")
    results.append(res)

df_scan = pd.DataFrame(results)
print("\nStatus-Übersicht:")
print(df_scan["status"].value_counts())

# Show the DataFrame: rich display when IPython is available, plain text
# otherwise. Only a missing IPython triggers the fallback; genuine errors
# while sorting or displaying are no longer hidden.
try:
    from IPython.display import display
except ImportError:
    print(df_scan.head(20).to_string(index=False))
else:
    display(df_scan.sort_values(["status", "file"]))


Datenqualitätsprüfung

In [None]:
# Frequency of the individual WARN/FAIL reasons across all scanned files
from collections import Counter

# The reason column holds ';'-joined reason strings; empty parts are skipped.
reason_counts = Counter(
    part
    for reason in df_scan['reason'].fillna('')
    for part in reason.split(';')
    if part
)
print("WARN/FAIL-Gründe (Häufigkeit):")
for k, v in reason_counts.most_common():
    print(f"  {k}: {v}")

cols = ["file","status","r_oob","r_xy_numeric","r_player_id"]
rate_cols = ["r_play_ltr", "r_yardline"]

# Files that failed before any rate was computed only carry
# file/status/reason; if every file failed, the metric columns do not exist.
missing_cols = [c for c in cols + rate_cols if c not in df_scan.columns]
if missing_cols:
    print(f"\nKennzahlen nicht verfügbar (fehlende Spalten: {', '.join(missing_cols)})")
else:
    # Top 10 files by OOB share
    print("\nTop-10 OOB:")
    display(df_scan.sort_values("r_oob", ascending=False)[cols].head(10))

    # Files below the WARN threshold for numeric xy (same constant scan_file uses)
    print(f"\nxy_numeric<{int(THRESH_WARN_XY_NUMERIC*100)}%:")
    display(
        df_scan[df_scan["r_xy_numeric"] < THRESH_WARN_XY_NUMERIC][cols]
        .sort_values("r_xy_numeric")
        .head(20)
    )

    # Presence rates of the play-level fields
    print("\nDurchschnittliche Präsenz (über Dateien):")
    print("offense_left_to_right  (mean r_play_ltr):", round(float(df_scan["r_play_ltr"].mean()), 3))
    print("play_yardline          (mean r_yardline):", round(float(df_scan["r_yardline"].mean()), 3))


In [None]:
def _to_float(v):
    try:
        return float(v)
    except Exception:
        return None

def _first(*vals):
    for v in vals:
        if v is not None:
            return v
    return None
#
def _oob_overshoot_mag(x, y):
    """Außerhalb des Rechtecks.
       0 wenn in bounds, sonst Distanz zum nächstliegenden Rand."""
    ox = 0.0
    oy = 0.0
    if x is not None and y is not None:
        if x < 0: ox = 0 - x
        elif x > FIELD_LEN: ox = x - FIELD_LEN
        if y < 0: oy = 0 - y
        elif y > FIELD_WID: oy = y - FIELD_WID
    return math.hypot(ox, oy)

def _bin_overshoot(m):
    """Bins für OOB-Schweregrad in yards."""
    if m <= 0:          return "in_bounds"
    elif m <= 0.5:      return "<=0.5y"
    elif m <= 1.0:      return "0.5–1y"
    elif m <= 2.0:      return "1–2y"
    else:               return ">2y"

# Main analysis
files = sorted(glob.glob(INPUT_GLOB))
if not files:
    raise FileNotFoundError(f"Keine Dateien für Muster: {INPUT_GLOB}")

print(f"Zusatz-QA: {len(files)} Dateien")

# One summary row per file
per_file = []

# Play quality (xy share) per play, keyed by (file name, play_uuid)
play_quality = {}
# One value per play: the largest non-negative time_since_snap seen
play_lengths = []

# Position statistics accumulated over all files
position_counts_global = Counter()
tracks_with_pos = 0
tracks_total    = 0


def _share(key):
    """Share of one OOB bin among the valid xy points of the current file.

    Reads the module-level ``oob_bins`` and ``valid_xy`` at call time, so it
    must be called after both were set for the file being summarised.
    """
    denom = max(valid_xy, 1)
    return oob_bins.get(key, 0) / denom


for i, fp in enumerate(files, 1):
    name = os.path.basename(fp)
    print(f"[{i:03d}/{len(files)}] Analysiere {name} ...", flush=True)

    # Counters for this file
    steps_total = 0
    steps_xy_ok = 0
    oob_bins = Counter()   # in_bounds, (<=0.5y, 0.5–1y, 1–2y, >2y)
    plays_in_file = 0
    plays_with_tss = 0
    positions_in_file = Counter()
    tracks_with_pos_file = 0
    tracks_total_file = 0

    # For play lengths: per play the max of all time_since_snap values >= 0
    play_max_tss = {}

    # For xy quality per play
    play_xy_steps = defaultdict(lambda: {"steps_total":0, "xy_numeric":0})

    with gzip.open(fp, "rt", encoding="utf-8") as f:
        data = json.load(f)

    # `or []` also covers an explicit null under "plays"
    plays = data.get("plays", []) or []
    plays_in_file = len(plays)

    for play in plays:
        if not isinstance(play, dict):
            continue
        play_uuid = play.get("play_uuid")

        tracks = play.get("tracks", []) or []
        for tr in tracks:
            tracks_total += 1
            tracks_total_file += 1
            if not isinstance(tr, dict):
                continue

            # Position field (statistics only). The `or` chain falls back to
            # the alternative key even when the first key exists but holds a
            # null/empty value — the same lookup scan_file uses.
            player = tr.get("player") or tr.get("track_player") or {}
            pos = player.get("position_code") if isinstance(player, dict) else None
            if pos:
                positions_in_file[pos] += 1
                position_counts_global[pos] += 1
                tracks_with_pos      += 1
                tracks_with_pos_file += 1

            steps = tr.get("steps") or tr.get("track_steps") or []
            for s in steps:
                steps_total += 1
                play_xy_steps[(name, play_uuid)]["steps_total"] += 1
                if not isinstance(s, dict):
                    # Malformed step: stays in the totals, has no xy and no time.
                    continue

                x = _to_float(_first(s.get("x"), s.get("ngs_x")))
                y = _to_float(_first(s.get("y"), s.get("ngs_y")))
                if x is not None and y is not None:
                    steps_xy_ok += 1
                    play_xy_steps[(name, play_uuid)]["xy_numeric"] += 1
                    # OOB severity
                    m = _oob_overshoot_mag(x, y)
                    oob_bins[_bin_overshoot(m)] += 1
                # Steps without xy only count towards the totals above.

                # Play length (analysis only, no normalisation)
                tss = _to_float(s.get("time_since_snap"))
                if tss is not None and tss >= 0:
                    prev = play_max_tss.get(play_uuid)
                    if prev is None or tss > prev:
                        play_max_tss[play_uuid] = tss

    # File result
    valid_xy = steps_xy_ok
    in_bounds = oob_bins.get("in_bounds", 0)
    oob_count = (valid_xy - in_bounds)

    # All shares below are relative to the valid xy points of the file
    per_file.append({
        "file": name,
        "steps_total": steps_total,
        "xy_valid": valid_xy,
        "xy_valid_share": (valid_xy / max(steps_total,1)),
        "oob_share_total": (oob_count / max(valid_xy,1)),
        "oob_<=0.5y": _share("<=0.5y"),
        "oob_0.5–1y": _share("0.5–1y"),
        "oob_1–2y": _share("1–2y"),
        "oob_>2y": _share(">2y"),
        "tracks_with_pos_share": tracks_with_pos_file / max(tracks_total_file,1),
        "plays_in_file": plays_in_file,
    })

    # Collect play lengths
    play_lengths.extend(play_max_tss.values())

    # Collect play quality (xy share)
    for key, d in play_xy_steps.items():
        play_quality[key] = {
            "file": key[0],
            "play_uuid": key[1],
            "steps_total": d["steps_total"],
            "xy_valid": d["xy_numeric"],
            "xy_valid_share": d["xy_numeric"] / max(d["steps_total"],1)
        }

# Turn the collected results into DataFrames
df_files = pd.DataFrame(per_file)
df_files = df_files.sort_values("oob_share_total", ascending=False)
df_plays = pd.DataFrame(list(play_quality.values()))
df_plays = df_plays.sort_values("xy_valid_share")

print("\n Zusatz-QA fertig.\n")

# 1) OOB severity
print("1) OOB-Schweregrad (global, Anteil an gültigen Punkten) – gemittelt über Dateien:")
cols_oob = ["oob_share_total","oob_<=0.5y","oob_0.5–1y","oob_1–2y","oob_>2y"]
oob_means = df_files[cols_oob].mean().round(4)
print(oob_means.to_string())

print("\nTop-10 Dateien nach OOB-Gesamtanteil:")
display(df_files[["file", *cols_oob, "xy_valid_share"]].head(10))

# 2) xy gaps – worst plays
print("\n2) Schlechteste 15 Plays nach xy_valid_share:")
display(df_plays[["file","play_uuid","steps_total","xy_valid","xy_valid_share"]].head(15))

# 3) Position coverage
print("\n3) Positionsabdeckung:")
pos_total = sum(position_counts_global.values())
pos_df = pd.Series(position_counts_global, name="count")
pos_df = pos_df.sort_values(ascending=False).to_frame()
pos_df["share"] = pos_df["count"] / max(pos_total,1)
display(pos_df)

print("\nAnteil Tracks mit position_code – pro Datei (Top 10 niedrigste):")
lowest_pos_share = df_files[["file","tracks_with_pos_share"]].sort_values("tracks_with_pos_share")
display(lowest_pos_share.head(10))

# 4) Play lengths (seconds since snap)
if not play_lengths:
    print("\n4) Play-Längen: Keine time_since_snap gefunden – Länge nicht auswertbar.")
else:
    arr = np.array(play_lengths)
    summary = {
        "count": int(arr.size),
        "min": round(float(arr.min()), 3),
    }
    # Percentile entries, inserted in the order they are printed
    for label, q in (("p25", 25), ("median", 50), ("p75", 75), ("p90", 90), ("p95", 95)):
        summary[label] = round(float(np.percentile(arr, q)), 3)
    summary["max"] = round(float(arr.max()), 3)
    # Number of plays lasting at least 4, 5 and 6 seconds
    for secs in (4, 5, 6):
        summary[f">={secs}s"] = int((arr >= float(secs)).sum())

    print("\n4) Play-Längen (Sekunden, nur wenn time_since_snap vorhanden):")
    for label, value in summary.items():
        print(f"  {label}: {value}")


Vollständigkeit, Konsistenz und Plausibilitätsprüfung

In [None]:
# Plotting of the OOB shares per play
MAX_STEPS_PLOT = None     # None = plot every point; an int caps the in-bounds and OOB point lists separately (see plot_play_scatter)

import os, glob, gzip, json, math
from collections import defaultdict, Counter
import numpy as np
import matplotlib.pyplot as plt

# Field bounds used by the OOB checks in this cell; re-assigns the same values
# as the constants cell near the top of the notebook.
FIELD_LEN, FIELD_WID = 120.0, 53.33

def _to_float(v):
    try:
        return float(v)
    except Exception:
        return None

def _first(*vals):
    for v in vals:
        if v is not None:
            return v
    return None

def file_oob_ratio(file_path):
    """Rough out-of-bounds share for one file.

    Only steps with numeric x and y form the denominator. A step is out of
    bounds when x is outside ``[0, FIELD_LEN]`` or y is outside
    ``[0, FIELD_WID]``.

    Args:
        file_path: Path to a gzipped tracking JSON file.

    Returns:
        ``oob / valid`` as a float, or ``0.0`` when the file holds no step
        with numeric coordinates.
    """
    steps_valid = 0
    oob = 0
    with gzip.open(file_path, "rt", encoding="utf-8") as f:
        data = json.load(f)
    # `or []` also covers keys that exist but hold null.
    for play in data.get("plays") or []:
        for tr in play.get("tracks") or []:
            # Same lookup as scan_file: fall back to "track_steps" whenever
            # "steps" is missing OR null/empty, so both cells count the same
            # steps. dict.get's default alone only covers a missing key.
            steps = tr.get("steps") or tr.get("track_steps") or []
            for s in steps:
                if not isinstance(s, dict):
                    continue  # malformed step carries no coordinates
                x = _to_float(_first(s.get("x"), s.get("ngs_x")))
                y = _to_float(_first(s.get("y"), s.get("ngs_y")))
                if x is None or y is None:
                    continue
                steps_valid += 1
                if not (0.0 <= x <= FIELD_LEN) or not (0.0 <= y <= FIELD_WID):
                    oob += 1
    return (oob / steps_valid) if steps_valid else 0.0

def plays_oob_stats(file_path):
    """Per-play OOB share plus the raw points for later selection/plotting.

    Args:
        file_path: Path to a gzipped tracking JSON file.

    Returns:
        A tuple ``(rows, raw_points)``:

        * ``rows`` — list of ``(play_uuid, valid, oob, share)`` tuples sorted
          by ``share`` descending. ``valid`` counts steps with numeric x/y
          and ``share`` is ``oob / valid``.
        * ``raw_points`` — defaultdict mapping ``play_uuid`` to
          ``{"in": [(x, y), ...], "out": [(x, y), ...]}``. All points are
          collected eagerly while the file is read.
    """
    with gzip.open(file_path, "rt", encoding="utf-8") as f:
        data = json.load(f)
    by_play = defaultdict(lambda: {"valid":0, "oob":0})

    # For plotting, keep only the (x, y) points per play, split into in/out
    raw_points = defaultdict(lambda: {"in": [], "out": []})

    for play in data.get("plays") or []:
        puid = play.get("play_uuid")
        for tr in play.get("tracks") or []:
            # Same lookup as scan_file: fall back to "track_steps" whenever
            # "steps" is missing OR null/empty.
            steps = tr.get("steps") or tr.get("track_steps") or []
            for s in steps:
                if not isinstance(s, dict):
                    continue  # malformed step carries no coordinates
                x = _to_float(_first(s.get("x"), s.get("ngs_x")))
                y = _to_float(_first(s.get("y"), s.get("ngs_y")))
                if x is None or y is None:
                    continue
                by_play[puid]["valid"] += 1
                in_bounds = (0.0 <= x <= FIELD_LEN) and (0.0 <= y <= FIELD_WID)
                if in_bounds:
                    raw_points[puid]["in"].append((x,y))
                else:
                    by_play[puid]["oob"] += 1
                    raw_points[puid]["out"].append((x,y))

    rows = []
    for puid, d in by_play.items():
        valid = d["valid"]
        oob = d["oob"]
        share = (oob / valid) if valid else 0.0
        rows.append((puid, valid, oob, share))
    rows.sort(key=lambda t: t[3], reverse=True)  # by OOB share
    return rows, raw_points

def plot_play_scatter(file_name, play_uuid, raw_points):
    """Draw one scatter plot for a play.

    In-bounds points are drawn as '.' markers and out-of-bounds points as 'x'
    markers; no colours are set explicitly. The field outline is drawn as two
    vertical and two horizontal lines.

    Args:
        file_name: File label shown in the title.
        play_uuid: Key of the play inside ``raw_points``.
        raw_points: Mapping ``play_uuid -> {"in": [...], "out": [...]}`` of
            ``(x, y)`` tuples.
    """
    play_points = raw_points[play_uuid]
    inside = play_points["in"]
    outside = play_points["out"]

    # Optional cap on the number of points (only useful for very large plays)
    if MAX_STEPS_PLOT is not None:
        inside = inside[:MAX_STEPS_PLOT]
        outside = outside[:MAX_STEPS_PLOT]

    fig, ax = plt.subplots(figsize=(7.0, 3.6))

    # In-bounds as dots
    if inside:
        ax.plot([p[0] for p in inside], [p[1] for p in inside],
                '.', markersize=2, label="in-bounds")

    # OOB as x markers
    if outside:
        ax.plot([p[0] for p in outside], [p[1] for p in outside],
                'x', markersize=3, label="OOB")

    # Field outline
    for x_edge in (0, FIELD_LEN):
        ax.axvline(x_edge)
    for y_edge in (0, FIELD_WID):
        ax.axhline(y_edge)

    ax.set_xlim(-2, FIELD_LEN+2)
    ax.set_ylim(-2, FIELD_WID+2)
    ax.set_xlabel("x (yards)")
    ax.set_ylabel("y (yards)")
    ax.set_title(f"{file_name} | play={play_uuid}")
    ax.legend()
    plt.show()

# Rank the files by OOB share and keep the top ones
files = sorted(glob.glob(INPUT_GLOB))
if len(files) == 0:
    raise FileNotFoundError(f"Keine Dateien für Muster: {INPUT_GLOB}")
print(f" Wähle Top-{TOP_FILES} Dateien mit höchstem OOB-Anteil …")

n_files = len(files)
file_scores = []
for position, path in enumerate(files, start=1):
    base = os.path.basename(path)
    print(f"  [{position:03d}/{n_files}] Scanne {base} …", end="", flush=True)
    ratio = file_oob_ratio(path)
    file_scores.append((base, path, ratio))
    print(f" OOB={ratio:.4%}")

# Highest OOB share first (stable, so ties keep file-name order)
file_scores = sorted(file_scores, key=lambda entry: entry[2], reverse=True)
top_files = file_scores[:TOP_FILES]
print("\n Top-Dateien:")
for base, _, ratio in top_files:
    print(f"   {base}: OOB≈{ratio:.2%}")

# For every top file, pick the worst plays and plot them
for base, path, ratio in top_files:
    print(f"\n Datei: {base} (OOB≈{ratio:.2%}) → ermittle Top-{TOP_PLAYS_PER_FILE} Plays …")
    rows, raw_points = plays_oob_stats(path)
    picks = rows[:TOP_PLAYS_PER_FILE]
    for pick in picks:
        play_id, n_valid, n_oob, oob_share = pick
        print(f"  • Play {play_id}: valid={n_valid}, oob={n_oob}, OOB-Anteil={oob_share:.2%} → plot")
        plot_play_scatter(base, play_id, raw_points)


Ursachen und Ausmaß von Out-of-Bounds-Problemen

In [None]:
def _to_float(v):
    try: return float(v)
    except Exception: return None

def _first(*vals):
    for v in vals:
        if v is not None:
            return v
    return None

def _oob(x, y):
    return not (0.0 <= x <= FIELD_LEN) or not (0.0 <= y <= FIELD_WID)

def _overshoot_mag(x, y):
    ox = (0 - x) if x < 0 else (x - FIELD_LEN) if x > FIELD_LEN else 0.0
    oy = (0 - y) if y < 0 else (y - FIELD_WID) if y > FIELD_WID else 0.0
    return math.hypot(ox, oy)

def _bin_overshoot(m):
    if m <= 0:   return "in_bounds"
    if m <= 0.5: return "<=0.5y"
    if m <= 1.0: return "0.5–1y"
    if m <= 2.0: return "1–2y"
    return ">2y"

def file_oob_ratio(fp):
    """Out-of-bounds share of one gzipped tracking file.

    Only steps with numeric x and y are counted; the result is
    ``oob / valid``, or ``0.0`` when no such step exists.
    """
    steps_valid = 0
    oob = 0
    with gzip.open(fp, "rt", encoding="utf-8") as f:
        data = json.load(f)
    # `or []` also covers keys that exist but hold null.
    for play in data.get("plays") or []:
        for tr in play.get("tracks") or []:
            # Same lookup as scan_file: fall back to "track_steps" whenever
            # "steps" is missing OR null/empty. dict.get's default alone only
            # covers a missing key.
            steps = tr.get("steps") or tr.get("track_steps") or []
            for s in steps:
                if not isinstance(s, dict):
                    continue  # malformed step carries no coordinates
                x = _to_float(_first(s.get("x"), s.get("ngs_x")))
                y = _to_float(_first(s.get("y"), s.get("ngs_y")))
                if x is None or y is None:
                    continue
                steps_valid += 1
                if _oob(x, y):
                    oob += 1
    return (oob/steps_valid) if steps_valid else 0.0

def plays_oob_with_cal(fp):
    """Per-play OOB statistics split by the calibration-fault flag.

    Args:
        fp: Path to a gzipped tracking JSON file.

    Returns:
        A list of dicts, one per play, sorted by ``oob_share`` descending.
        Each dict holds ``play_uuid``, ``valid`` (steps with numeric x/y),
        ``oob``, ``oob_share`` (``oob / valid``), the counts and shares of
        OOB steps whose ``calibration_fault`` / ``step_calibration_fault``
        value is truthy (``oob_cal_true*``) or falsy/missing
        (``oob_cal_false*``), and the severity-bin shares ``oob_<=0.5y`` …
        ``oob_>2y``. The calibration and bin shares are relative to the
        play's OOB count, not to ``valid``.
    """
    with gzip.open(fp, "rt", encoding="utf-8") as f:
        data = json.load(f)
    per = {}
    bins = {}
    # `or []` also covers keys that exist but hold null.
    for play in data.get("plays") or []:
        puid = play.get("play_uuid")
        if puid not in per:
            per[puid] = {"play_uuid": puid, "valid":0, "oob":0, "oob_cal_true":0, "oob_cal_false":0}
            bins[puid] = Counter()
        stats = per[puid]
        for tr in play.get("tracks") or []:
            # Same lookup as scan_file: fall back to "track_steps" whenever
            # "steps" is missing OR null/empty.
            steps = tr.get("steps") or tr.get("track_steps") or []
            for s in steps:
                if not isinstance(s, dict):
                    continue  # malformed step carries no coordinates
                x = _to_float(_first(s.get("x"), s.get("ngs_x")))
                y = _to_float(_first(s.get("y"), s.get("ngs_y")))
                if x is None or y is None:
                    continue
                stats["valid"] += 1
                if not _oob(x, y):
                    continue
                stats["oob"] += 1
                # Prefer "calibration_fault"; use the step_-prefixed key only
                # when the first one is missing or null.
                cal = _first(s.get("calibration_fault"), s.get("step_calibration_fault"))
                if bool(cal):
                    stats["oob_cal_true"] += 1
                else:
                    stats["oob_cal_false"] += 1
                bins[puid][_bin_overshoot(_overshoot_mag(x, y))] += 1
    rows = []
    for puid, d in per.items():
        valid = d["valid"]
        oob = d["oob"]
        share = (oob/valid) if valid else 0.0
        oob_denom = max(oob, 1)  # avoids division by zero for plays without OOB
        row = {
            "play_uuid": puid,
            "valid": valid,
            "oob": oob,
            "oob_share": share,
            "oob_cal_true": d["oob_cal_true"],
            "oob_cal_true_share": (d["oob_cal_true"]/oob) if oob else 0.0,
            "oob_cal_false": d["oob_cal_false"],
            "oob_cal_false_share": (d["oob_cal_false"]/oob) if oob else 0.0,
            "oob_<=0.5y": bins[puid]["<=0.5y"] / oob_denom,
            "oob_0.5–1y": bins[puid]["0.5–1y"] / oob_denom,
            "oob_1–2y":   bins[puid]["1–2y"]   / oob_denom,
            "oob_>2y":    bins[puid][">2y"]    / oob_denom,
        }
        rows.append(row)
    rows.sort(key=lambda r: r["oob_share"], reverse=True)
    return rows

# Select the top files by OOB share
files = sorted(glob.glob(INPUT_GLOB))
if len(files) == 0:
    raise FileNotFoundError("Keine Dateien gefunden.")

scores = []
for path in files:
    scores.append((os.path.basename(path), path, file_oob_ratio(path)))
# Highest OOB share first (stable, so ties keep file-name order)
scores = sorted(scores, key=lambda entry: entry[2], reverse=True)
top = scores[:TOP_FILES]

print("Top-Dateien (höchster OOB-Anteil):")
for file_label, _, ratio in top:
    print(f"  {file_label}: OOB≈{ratio:.2%}")

# Inspect the worst plays of each top file
all_rows = []
for file_label, path, ratio in top:
    print(f"\n {file_label} — prüfe Plays …")
    worst_plays = plays_oob_with_cal(path)[:TOP_PLAYS_PER_FILE]
    for row in worst_plays:
        row["file"] = file_label
        all_rows.append(row)
        oob_part = f"OOB={row['oob']}/{row['valid']} ({row['oob_share']:.2%})"
        cal_part = f"cal_true={row['oob_cal_true']}/{row['oob']} ({row['oob_cal_true_share']:.2%})"
        print(f"  • play={row['play_uuid']} | {oob_part}, {cal_part}")

check_columns = [
    "file","play_uuid","valid","oob","oob_share",
    "oob_cal_true","oob_cal_true_share","oob_cal_false","oob_cal_false_share",
    "oob_<=0.5y","oob_0.5–1y","oob_1–2y","oob_>2y"
]
df_check = pd.DataFrame(all_rows, columns=check_columns)
df_check = df_check.sort_values(["file","oob_share"], ascending=[True, False])

display(df_check)

# Aggregated statement about the sample
if len(df_check) > 0:
    agg = {
        "plays_geprueft": len(df_check),
        "median_oob_share": df_check["oob_share"].median(),
        "median_cal_true_share": df_check["oob_cal_true_share"].median(),
        "mean_cal_true_share": df_check["oob_cal_true_share"].mean(),
        "mean_oob_gt2y_share": df_check["oob_>2y"].mean(),
    }
    print("\nZusammenfassung (Stichprobe):")
    for label, value in agg.items():
        # Floats get three decimals, everything else is printed as is
        if isinstance(value, float):
            print(f"  {label}: {value:.3f}")
        else:
            print(f"  {label}: {value}")
