Import

In [None]:
# Quick tracking scan for StatsBomb files.
# No time logic, no merge — only a coarse compatibility check.
# Adjust the path if necessary:
INPUT_GLOB = "/Users/tunahansari/football_ra/data/tracking/SB_tracking_*.json.gz"

import os
import glob
import gzip
import json
from pathlib import Path
import pandas as pd
import numpy as np

# Field dimensions and thresholds
# NOTE(review): the dimensions are presumably in yards — confirm against the data source.
FIELD_LEN = 120.0
FIELD_WID = 53.33
# scan_file marks a file FAIL when the share of steps with numeric x/y is below this value ...
THRESH_FAIL_XY_NUMERIC = 0.80
# ... and WARN when it is below this value.
THRESH_WARN_XY_NUMERIC = 0.99
# WARN when the out-of-bounds share (among numeric x/y steps) exceeds this value.
THRESH_WARN_OOB = 0.005
# WARN when the share of tracks carrying a player_id is below this value.
THRESH_WARN_PLAYER_ID = 0.95

def _to_float(v):
    try:
        return float(v)
    except:
        return None

def _first(*vals):
    for v in vals:
        if v is not None:
            return v
    return None

def _pct(n, d):
    return (n / d) if d else 0.0

def scan_file(file_path):
    """Scan one gzipped tracking JSON file and summarise its basic data quality.

    The file is expected to hold an object with a ``plays`` list; each play
    may carry ``tracks`` and each track ``steps`` (or ``track_steps``).
    Structures of an unexpected type (non-dict play/track/step, non-list
    tracks/steps) are skipped instead of raising, so one malformed file cannot
    abort a whole scan run.

    Args:
        file_path: Path to a ``.json.gz`` tracking file.

    Returns:
        A dict with ``file``, ``status`` (``"PASS"``, ``"WARN"`` or ``"FAIL"``)
        and ``reason``.  Unless the file could not be read or has no plays,
        the dict also contains the counters ``n_plays``, ``n_tracks``,
        ``steps_total``, the ratios ``r_xy_numeric``, ``r_oob``,
        ``r_player_id``, ``r_play_ltr``, ``r_yardline`` and up to 30 example
        keys per level (``example_*_keys``).
    """
    name = os.path.basename(file_path)
    try:
        with gzip.open(file_path, "rt", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        # Any read/parse problem becomes a FAIL row instead of aborting the scan.
        return {"file": name, "status": "FAIL", "reason": f"json_error:{e}"}

    # A top-level JSON value that is not an object cannot carry "plays".
    plays = data.get("plays", []) if isinstance(data, dict) else []
    if not isinstance(plays, list) or len(plays) == 0:
        return {"file": name, "status": "FAIL", "reason": "no_plays"}

    n_plays = len(plays)
    n_tracks = 0
    steps_total = 0
    xy_numeric = 0
    oob = 0
    tracks_with_player_id = 0
    plays_with_ltr = 0
    plays_with_yard = 0

    # Example keys for the overview columns
    example_play_keys = set()
    example_track_keys = set()
    example_step_keys = set()
    for p in plays[:10]:
        if isinstance(p, dict):
            example_play_keys.update(p)

    for play in plays:
        if not isinstance(play, dict):
            continue
        if play.get("offense_left_to_right") is not None:
            plays_with_ltr += 1
        if _to_float(play.get("play_yardline")) is not None:
            plays_with_yard += 1

        tracks = play.get("tracks", [])
        if not isinstance(tracks, list):
            continue
        # Every list entry counts as a track, even if it is skipped below.
        n_tracks += len(tracks)

        for tr in tracks:
            if not isinstance(tr, dict):
                continue
            example_track_keys.update(tr)

            player = tr.get("player") or tr.get("track_player") or {}
            if isinstance(player, dict) and player.get("player_id") is not None:
                tracks_with_player_id += 1

            steps = tr.get("steps") or tr.get("track_steps") or []
            if not isinstance(steps, list):
                steps = []

            for idx, s in enumerate(steps):
                steps_total += 1
                if not isinstance(s, dict):
                    # Counted as a step, but it cannot provide numeric x/y.
                    continue
                if idx < 10:
                    example_step_keys.update(s)
                x = _to_float(_first(s.get("x"), s.get("ngs_x")))
                y = _to_float(_first(s.get("y"), s.get("ngs_y")))
                if x is not None and y is not None:
                    xy_numeric += 1
                    if not (0.0 <= x <= FIELD_LEN) or not (0.0 <= y <= FIELD_WID):
                        oob += 1

    # _pct already returns 0.0 for a zero denominator.
    r_xy_numeric = _pct(xy_numeric, steps_total)
    r_oob = _pct(oob, xy_numeric)
    r_player_id = _pct(tracks_with_player_id, n_tracks)
    r_play_ltr = _pct(plays_with_ltr, n_plays)
    r_yardline = _pct(plays_with_yard, n_plays)

    # Status logic: FAIL wins over WARN; WARN reasons are only recorded
    # while the file has not already failed.
    status = "PASS"
    reasons = []
    if steps_total == 0:
        status = "FAIL"
        reasons.append("no_steps")
    if r_xy_numeric < THRESH_FAIL_XY_NUMERIC:
        status = "FAIL"
        reasons.append(f"xy_numeric<{int(THRESH_FAIL_XY_NUMERIC*100)}%")
    elif r_xy_numeric < THRESH_WARN_XY_NUMERIC:
        if status != "FAIL":
            status = "WARN"
            reasons.append(f"xy_numeric<{int(THRESH_WARN_XY_NUMERIC*100)}%")
    if r_oob > THRESH_WARN_OOB:
        if status != "FAIL":
            status = "WARN"
            reasons.append(f"oob>{THRESH_WARN_OOB*100:.1f}%")
    if r_player_id < THRESH_WARN_PLAYER_ID:
        if status != "FAIL":
            status = "WARN"
            reasons.append(f"player_id<{int(THRESH_WARN_PLAYER_ID*100)}%")

    return {
        "file": name,
        "status": status,
        "reason": ";".join(reasons),
        "n_plays": n_plays,
        "n_tracks": n_tracks,
        "steps_total": steps_total,
        "r_xy_numeric": r_xy_numeric,
        "r_oob": r_oob,
        "r_player_id": r_player_id,
        "r_play_ltr": r_play_ltr,
        "r_yardline": r_yardline,
        "example_play_keys": ", ".join(sorted(example_play_keys)[:30]),
        "example_track_keys": ", ".join(sorted(example_track_keys)[:30]),
        "example_step_keys": ", ".join(sorted(example_step_keys)[:30]),
    }

# Run the scan over every file matching INPUT_GLOB
files = sorted(glob.glob(INPUT_GLOB))
if not files:
    raise FileNotFoundError(f"Keine Dateien für Muster: {INPUT_GLOB}")

print(f"Tracking-Scan: {len(files)} Dateien gefunden")
results = []
for i, fp in enumerate(files, 1):
    name = os.path.basename(fp)
    print(f"[{i}/{len(files)}] {name} ... ", end="")
    res = scan_file(fp)
    print(f"{res['status']} {('('+res['reason']+')') if res['reason'] else ''}")
    results.append(res)

df_scan = pd.DataFrame(results)
print("\nStatus-Übersicht:")
print(df_scan["status"].value_counts())

# Show the DataFrame; fall back to plain text when rich display is unavailable
# or fails.  `except Exception` keeps the best-effort fallback without also
# swallowing KeyboardInterrupt/SystemExit as the former bare `except` did.
try:
    from IPython.display import display
    display(df_scan.sort_values(["status", "file"]))
except Exception:
    print(df_scan.head(20).to_string(index=False))


In [None]:
# Frequency of the individual WARN/FAIL reasons
from collections import Counter

reason_counts = Counter(
    token
    for reason_text in df_scan['reason'].fillna('')
    for token in reason_text.split(';')
    if token
)
print("WARN/FAIL-Gründe (Häufigkeit):")
for reason, count in reason_counts.most_common():
    print(f"  {reason}: {count}")

# Ten files with the highest OOB share
cols = ["file","status","r_oob","r_xy_numeric","r_player_id"]
print("\nTop-10 OOB:")
top_oob = df_scan.sort_values("r_oob", ascending=False)[cols].head(10)
display(top_oob)

# Files whose numeric-xy share is below 99%
print("\nxy_numeric<99%:")
low_xy_mask = df_scan["r_xy_numeric"] < 0.99
low_xy = df_scan[low_xy_mask][cols].sort_values("r_xy_numeric").head(20)
display(low_xy)

# Presence rates of the play-level fields, averaged over files
mean_ltr = df_scan["r_play_ltr"].mean().round(3)
mean_yardline = df_scan["r_yardline"].mean().round(3)
print("\nDurchschnittliche Präsenz (über Dateien):")
print("offense_left_to_right  (mean r_play_ltr):", mean_ltr)
print("play_yardline          (mean r_yardline):", mean_yardline)


In [None]:
# Analysis of the tracking data

# Glob pattern selecting the gzipped tracking files analysed in this cell.
INPUT_GLOB = "/Users/tunahansari/football_ra/data/tracking/SB_tracking_*.json.gz"

import os, glob, gzip, json, math
from collections import defaultdict, Counter
import numpy as np
import pandas as pd

# Field rectangle used for the out-of-bounds checks (the OOB bins below are labelled in yards).
FIELD_LEN = 120.0
FIELD_WID  = 53.33

def _to_float(v):
    try:
        return float(v)
    except Exception:
        return None

def _first(*vals):
    for v in vals:
        if v is not None:
            return v
    return None
#
def _oob_overshoot_mag(x, y):
    """Außerhalb des Rechtecks.
       0 wenn in bounds, sonst Distanz zum nächstliegenden Rand."""
    ox = 0.0
    oy = 0.0
    if x is not None and y is not None:
        if x < 0: ox = 0 - x
        elif x > FIELD_LEN: ox = x - FIELD_LEN
        if y < 0: oy = 0 - y
        elif y > FIELD_WID: oy = y - FIELD_WID
    return math.hypot(ox, oy)

def _bin_overshoot(m):
    """Bins für OOB-Schweregrad in yards."""
    if m <= 0:          return "in_bounds"
    elif m <= 0.5:      return "<=0.5y"
    elif m <= 1.0:      return "0.5–1y"
    elif m <= 2.0:      return "1–2y"
    else:               return ">2y"

# Main analysis: collect the input files and initialise the cross-file accumulators.
files = sorted(glob.glob(INPUT_GLOB))
if not files:
    raise FileNotFoundError(f"Keine Dateien für Muster: {INPUT_GLOB}")

print(f"🔎 Zusatz-QA: {len(files)} Dateien")

# One result row per file
per_file = []

# Play quality (xy share) per play
play_quality = {}  # key=(file, play_uuid) -> dict: steps_total, xy_numeric
play_lengths = []  # list of all play durations (max time_since_snap per play)

# Position statistics across all files
position_counts_global = Counter()
tracks_with_pos = 0
tracks_total     = 0

# Per-file pass: load each tracking file and accumulate step, OOB, position
# and play-length statistics into the accumulators defined above.
for i, fp in enumerate(files, 1):
    name = os.path.basename(fp)
    print(f"[{i:03d}/{len(files)}] Analysiere {name} ...", flush=True)

    # Counters for this file
    steps_total = 0
    steps_xy_ok = 0
    oob_bins = Counter()   # in_bounds, (<=0.5y, 0.5–1y, 1–2y, >2y)
    plays_in_file = 0
    plays_with_tss = 0  # NOTE(review): never updated anywhere in this loop
    positions_in_file = Counter()
    tracks_with_pos_file = 0
    tracks_total_file = 0

    # For play lengths: per play the maximum time_since_snap (only values >= 0)
    play_max_tss = {}  

    # For xy quality per play
    play_xy_steps = defaultdict(lambda: {"steps_total":0, "xy_numeric":0})

    with gzip.open(fp, "rt", encoding="utf-8") as f:
        data = json.load(f)

    plays = data.get("plays", [])
    plays_in_file = len(plays)

    for play in plays:
        play_uuid = play.get("play_uuid")

        tracks = play.get("tracks", []) or []
        for tr in tracks:
            tracks_total += 1
            tracks_total_file += 1

            # Position field (statistics only)
            # NOTE(review): "track_player" is only consulted when the "player"
            # key is absent, not when it is present but null.
            player = tr.get("player", tr.get("track_player", {})) or {}
            pos = player.get("position_code")
            if pos:
                positions_in_file[pos] += 1
                position_counts_global[pos] += 1
                tracks_with_pos      += 1
                tracks_with_pos_file += 1

            steps = tr.get("steps", tr.get("track_steps", [])) or []
            for s in steps:
                steps_total += 1
                play_xy_steps[(name, play_uuid)]["steps_total"] += 1

                x = _to_float(_first(s.get("x"), s.get("ngs_x")))
                y = _to_float(_first(s.get("y"), s.get("ngs_y")))
                if x is not None and y is not None:
                    steps_xy_ok += 1
                    play_xy_steps[(name, play_uuid)]["xy_numeric"] += 1
                    # OOB severity
                    m = _oob_overshoot_mag(x, y)
                    oob_bins[_bin_overshoot(m)] += 1
                else:
                    # no xy -> only counted in steps_total / play_xy_steps.steps_total
                    pass

                # Play length (analysis only, no normalisation)
                tss = _to_float(s.get("time_since_snap"))
                if tss is not None and tss >= 0:
                    prev = play_max_tss.get(play_uuid)
                    play_max_tss[play_uuid] = tss if (prev is None or tss > prev) else prev

    # Per-file result
    valid_xy = steps_xy_ok
    in_bounds = oob_bins.get("in_bounds", 0)
    oob_count = (valid_xy - in_bounds)

    # Shares relative to the number of valid xy points
    def _share(key):
        denom = max(valid_xy, 1)
        return oob_bins.get(key, 0) / denom

    per_file.append({
        "file": name,
        "steps_total": steps_total,
        "xy_valid": valid_xy,
        "xy_valid_share": (valid_xy / max(steps_total,1)),
        "oob_share_total": (oob_count / max(valid_xy,1)),
        "oob_<=0.5y": _share("<=0.5y"),
        "oob_0.5–1y": _share("0.5–1y"),
        "oob_1–2y": _share("1–2y"),
        "oob_>2y": _share(">2y"),
        "tracks_with_pos_share": tracks_with_pos_file / max(tracks_total_file,1),
        "plays_in_file": plays_in_file,
    })

    # Collect play lengths
    for puid, tmax in play_max_tss.items():
        play_lengths.append(tmax)

    # Collect play quality (xy share)
    for key, d in play_xy_steps.items():
        play_quality[key] = {
            "file": key[0],
            "play_uuid": key[1],
            "steps_total": d["steps_total"],
            "xy_valid": d["xy_numeric"],
            "xy_valid_share": d["xy_numeric"] / max(d["steps_total"],1)
        }

# Results as DataFrames: files sorted by OOB share (worst first),
# plays sorted by valid-xy share (worst first).
df_files = pd.DataFrame(per_file).sort_values("oob_share_total", ascending=False)
df_plays = pd.DataFrame(play_quality.values()).sort_values("xy_valid_share")

print("\n Zusatz-QA fertig.\n")

# 1) OOB severity
print("1) OOB-Schweregrad (global, Anteil an gültigen Punkten) – gemittelt über Dateien:")
cols_oob = ["oob_share_total","oob_<=0.5y","oob_0.5–1y","oob_1–2y","oob_>2y"]
print(df_files[cols_oob].mean().round(4).to_string())

print("\nTop-10 Dateien nach OOB-Gesamtanteil:")
display(df_files[["file","oob_share_total","oob_<=0.5y","oob_0.5–1y","oob_1–2y","oob_>2y","xy_valid_share"]].head(10))

# 2) xy gaps – worst plays
print("\n2) Schlechteste 15 Plays nach xy_valid_share:")
display(df_plays[["file","play_uuid","steps_total","xy_valid","xy_valid_share"]].head(15))

# 3) Position coverage
print("\n3) Positionsabdeckung:")
pos_total = sum(position_counts_global.values())
pos_df = (pd.Series(position_counts_global, name="count")
            .sort_values(ascending=False)
            .to_frame())
pos_df["share"] = pos_df["count"] / max(pos_total,1)
display(pos_df)

print("\nAnteil Tracks mit position_code – pro Datei (Top 10 niedrigste):")
display(df_files[["file","tracks_with_pos_share"]].sort_values("tracks_with_pos_share").head(10))

# 4) Play lengths (seconds since snap)
if play_lengths:
    arr = np.array(play_lengths)
    summary = {
        "count": int(arr.size),
        "min": round(float(arr.min()), 3),
        "p25": round(float(np.percentile(arr, 25)), 3),
        "median": round(float(np.percentile(arr, 50)), 3),
        "p75": round(float(np.percentile(arr, 75)), 3),
        "p90": round(float(np.percentile(arr, 90)), 3),
        "p95": round(float(np.percentile(arr, 95)), 3),
        "max": round(float(arr.max()), 3),
        # Number of plays whose maximum time_since_snap reaches the given length
        ">=4s": int((arr >= 4.0).sum()),
        ">=5s": int((arr >= 5.0).sum()),
        ">=6s": int((arr >= 6.0).sum()),
    }
    print("\n4) Play-Längen (Sekunden, nur wenn time_since_snap vorhanden):")
    for k,v in summary.items():
        print(f"  {k}: {v}")
else:
    print("\n4) Play-Längen: Keine time_since_snap gefunden – Länge nicht auswertbar.")


In [None]:
# Plotting of the OOB shares per play

INPUT_GLOB = "/Users/tunahansari/football_ra/data/tracking/SB_tracking_*.json.gz"
TOP_FILES = 3              # how many files with the highest OOB share to inspect
TOP_PLAYS_PER_FILE = 2     # how many plays per file (highest OOB share) to plot
# Optional cap on the number of in-bounds and of OOB points plotted per play; None = no cap.
MAX_STEPS_PLOT = None     

import os, glob, gzip, json, math
from collections import defaultdict, Counter
import numpy as np
import matplotlib.pyplot as plt

# Field rectangle for the OOB checks and the plotted field frame (axes are labelled in yards).
FIELD_LEN, FIELD_WID = 120.0, 53.33

def _to_float(v):
    try:
        return float(v)
    except Exception:
        return None

def _first(*vals):
    for v in vals:
        if v is not None:
            return v
    return None

def file_oob_ratio(file_path):
    """Return the coarse out-of-bounds share of one gzipped tracking file.

    Only steps with numeric x and y count towards the denominator.

    Args:
        file_path: Path to a ``.json.gz`` tracking file.

    Returns:
        ``oob / valid`` as a float, or ``0.0`` when the file has no step
        with numeric coordinates.
    """
    with gzip.open(file_path, "rt", encoding="utf-8") as f:
        data = json.load(f)

    steps_valid = 0
    oob = 0
    # `or []` also covers an explicit null "plays", as already done for tracks/steps.
    for play in data.get("plays") or []:
        for tr in play.get("tracks", []) or []:
            for s in tr.get("steps", tr.get("track_steps", [])) or []:
                x = _to_float(_first(s.get("x"), s.get("ngs_x")))
                y = _to_float(_first(s.get("y"), s.get("ngs_y")))
                if x is None or y is None:
                    continue
                steps_valid += 1
                if not (0.0 <= x <= FIELD_LEN) or not (0.0 <= y <= FIELD_WID):
                    oob += 1
    return (oob / steps_valid) if steps_valid else 0.0

def plays_oob_stats(file_path):
    """Compute the per-play OOB share of one file and collect its raw points.

    Args:
        file_path: Path to a ``.json.gz`` tracking file.

    Returns:
        A tuple ``(rows, raw_points)``:

        * ``rows`` – list of ``(play_uuid, valid, oob, oob_share)`` tuples,
          sorted by ``oob_share`` descending.  Only plays with at least one
          step carrying numeric x/y appear.
        * ``raw_points`` – mapping ``play_uuid -> {"in": [(x, y), ...],
          "out": [(x, y), ...]}`` with all points collected eagerly for
          later plotting.
    """
    with gzip.open(file_path, "rt", encoding="utf-8") as f:
        data = json.load(f)
    by_play = defaultdict(lambda: {"valid": 0, "oob": 0})
    # For plotting, keep per play only the points (x, y) as two lists (in/out).
    raw_points = defaultdict(lambda: {"in": [], "out": []})

    # `or []` also covers an explicit null "plays", as already done for tracks/steps.
    for play in data.get("plays") or []:
        puid = play.get("play_uuid")
        for tr in play.get("tracks", []) or []:
            for s in tr.get("steps", tr.get("track_steps", [])) or []:
                x = _to_float(_first(s.get("x"), s.get("ngs_x")))
                y = _to_float(_first(s.get("y"), s.get("ngs_y")))
                if x is None or y is None:
                    continue
                by_play[puid]["valid"] += 1
                in_bounds = (0.0 <= x <= FIELD_LEN) and (0.0 <= y <= FIELD_WID)
                if in_bounds:
                    raw_points[puid]["in"].append((x, y))
                else:
                    by_play[puid]["oob"] += 1
                    raw_points[puid]["out"].append((x, y))

    rows = [
        (puid, d["valid"], d["oob"], (d["oob"] / d["valid"]) if d["valid"] else 0.0)
        for puid, d in by_play.items()
    ]
    rows.sort(key=lambda t: t[3], reverse=True)  # by OOB share
    return rows, raw_points

def plot_play_scatter(file_name, play_uuid, raw_points):
    """Draw one figure for a play: in-bounds points as '.', OOB points as 'x'.

    No colours are set explicitly.  ``raw_points`` maps a play id to a dict
    with the point lists ``"in"`` and ``"out"``.
    """
    inside = raw_points[play_uuid]["in"]
    outside = raw_points[play_uuid]["out"]

    # Limit the number of points (only relevant for very large plays)
    if MAX_STEPS_PLOT is not None:
        inside = inside[:MAX_STEPS_PLOT]
        outside = outside[:MAX_STEPS_PLOT]

    plt.figure(figsize=(7.0, 3.6))

    # In-bounds points first as dots, then OOB points as x markers
    series = (
        (inside, '.', 2, "in-bounds"),
        (outside, 'x', 3, "OOB"),
    )
    for points, marker, size, legend_label in series:
        if points:
            xs, ys = zip(*points)
            plt.plot(xs, ys, marker, markersize=size, label=legend_label)

    # Field frame
    for x_edge in (0, FIELD_LEN):
        plt.axvline(x_edge)
    for y_edge in (0, FIELD_WID):
        plt.axhline(y_edge)
    plt.xlim(-2, FIELD_LEN+2)
    plt.ylim(-2, FIELD_WID+2)
    plt.xlabel("x (yards)")
    plt.ylabel("y (yards)")
    plt.title(f"{file_name} | play={play_uuid}")
    plt.legend()
    plt.show()

# Score every file by its OOB share and select the top TOP_FILES files
files = sorted(glob.glob(INPUT_GLOB))
if not files:
    raise FileNotFoundError(f"Keine Dateien für Muster: {INPUT_GLOB}")
print(f" Wähle Top-{TOP_FILES} Dateien mit höchstem OOB-Anteil …")
file_scores = []
for i, fp in enumerate(files, 1):
    name = os.path.basename(fp)
    print(f"  [{i:03d}/{len(files)}] Scanne {name} …", end="", flush=True)
    score = file_oob_ratio(fp)
    file_scores.append((name, fp, score))
    print(f" OOB={score:.4%}")
file_scores.sort(key=lambda t: t[2], reverse=True)
top_files = file_scores[:TOP_FILES]
print("\n Top-Dateien:")
for name, _, sc in top_files:
    print(f"   {name}: OOB≈{sc:.2%}")

# From each top file, pick the plays with the highest OOB share and plot them
# (each selected file is read a second time by plays_oob_stats).
for name, fp, sc in top_files:
    print(f"\n Datei: {name} (OOB≈{sc:.2%}) → ermittle Top-{TOP_PLAYS_PER_FILE} Plays …")
    rows, raw_points = plays_oob_stats(fp)
    picks = rows[:TOP_PLAYS_PER_FILE]
    for (puid, valid, oob, share) in picks:
        print(f"  • Play {puid}: valid={valid}, oob={oob}, OOB-Anteil={share:.2%} → plot")
        plot_play_scatter(name, puid, raw_points)


In [None]:
# Additional analysis: OOB shares per play

INPUT_GLOB = "/Users/tunahansari/football_ra/data/tracking/SB_tracking_*.json.gz"
TOP_FILES = 3            # how many files with the highest OOB share to check
TOP_PLAYS_PER_FILE = 2   # how many plays per file (with the highest OOB share)

import os, glob, gzip, json, math
from collections import defaultdict, Counter
import numpy as np
import pandas as pd

# Field rectangle used by _oob and _overshoot_mag (the overshoot bins are labelled in yards).
FIELD_LEN, FIELD_WID = 120.0, 53.33

def _to_float(v):
    try: return float(v)
    except Exception: return None

def _first(*vals):
    for v in vals:
        if v is not None:
            return v
    return None

def _oob(x, y):
    return not (0.0 <= x <= FIELD_LEN) or not (0.0 <= y <= FIELD_WID)

def _overshoot_mag(x, y):
    ox = (0 - x) if x < 0 else (x - FIELD_LEN) if x > FIELD_LEN else 0.0
    oy = (0 - y) if y < 0 else (y - FIELD_WID) if y > FIELD_WID else 0.0
    return math.hypot(ox, oy)

def _bin_overshoot(m):
    if m <= 0:   return "in_bounds"
    if m <= 0.5: return "<=0.5y"
    if m <= 1.0: return "0.5–1y"
    if m <= 2.0: return "1–2y"
    return ">2y"

def file_oob_ratio(fp):
    """Return the out-of-bounds share of one gzipped tracking file.

    Only steps with numeric x and y count towards the denominator.

    Args:
        fp: Path to a ``.json.gz`` tracking file.

    Returns:
        ``oob / valid`` as a float, or ``0.0`` when no step has numeric
        coordinates.
    """
    with gzip.open(fp, "rt", encoding="utf-8") as f:
        data = json.load(f)

    steps_valid = 0
    oob = 0
    # `or []` also covers an explicit null "plays", as already done for tracks/steps.
    for play in data.get("plays") or []:
        for tr in play.get("tracks", []) or []:
            for s in tr.get("steps", tr.get("track_steps", [])) or []:
                x = _to_float(_first(s.get("x"), s.get("ngs_x")))
                y = _to_float(_first(s.get("y"), s.get("ngs_y")))
                if x is None or y is None:
                    continue
                steps_valid += 1
                if _oob(x, y):
                    oob += 1
    return (oob / steps_valid) if steps_valid else 0.0

def plays_oob_with_cal(fp):
    """Return per-play OOB statistics, split by calibration-fault flag.

    Args:
        fp: Path to a ``.json.gz`` tracking file.

    Returns:
        A list of dicts, one per play (sorted by ``oob_share`` descending),
        with the keys ``play_uuid``, ``valid``, ``oob``, ``oob_share``,
        ``oob_cal_true``, ``oob_cal_true_share``, ``oob_cal_false``,
        ``oob_cal_false_share`` and the overshoot-bin shares ``oob_<=0.5y``,
        ``oob_0.5–1y``, ``oob_1–2y``, ``oob_>2y``.  The ``*_share`` values of
        the calibration and bin columns are relative to the play's OOB count.
    """
    with gzip.open(fp, "rt", encoding="utf-8") as f:
        data = json.load(f)
    per = {}
    bins = {}
    # `or []` also covers an explicit null "plays", as already done for tracks/steps.
    for play in data.get("plays") or []:
        puid = play.get("play_uuid")
        if puid not in per:
            per[puid] = {"play_uuid": puid, "valid": 0, "oob": 0, "oob_cal_true": 0, "oob_cal_false": 0}
            bins[puid] = Counter()
        for tr in play.get("tracks", []) or []:
            for s in tr.get("steps", tr.get("track_steps", [])) or []:
                x = _to_float(_first(s.get("x"), s.get("ngs_x")))
                y = _to_float(_first(s.get("y"), s.get("ngs_y")))
                if x is None or y is None:
                    continue
                per[puid]["valid"] += 1
                if _oob(x, y):
                    per[puid]["oob"] += 1
                    cal = s.get("calibration_fault")
                    if cal is None:
                        cal = s.get("step_calibration_fault")
                    # The flag is judged by truthiness; a missing flag counts as "false".
                    if bool(cal):
                        per[puid]["oob_cal_true"] += 1
                    else:
                        per[puid]["oob_cal_false"] += 1
                    bins[puid][_bin_overshoot(_overshoot_mag(x, y))] += 1
    rows = []
    for puid, d in per.items():
        valid = d["valid"]
        oob = d["oob"]
        share = (oob / valid) if valid else 0.0
        row = {
            "play_uuid": puid,
            "valid": valid,
            "oob": oob,
            "oob_share": share,
            "oob_cal_true": d["oob_cal_true"],
            "oob_cal_true_share": (d["oob_cal_true"] / oob) if oob else 0.0,
            "oob_cal_false": d["oob_cal_false"],
            "oob_cal_false_share": (d["oob_cal_false"] / oob) if oob else 0.0,
            "oob_<=0.5y": bins[puid]["<=0.5y"] / max(oob, 1),
            "oob_0.5–1y": bins[puid]["0.5–1y"] / max(oob, 1),
            "oob_1–2y":   bins[puid]["1–2y"]   / max(oob, 1),
            "oob_>2y":    bins[puid][">2y"]    / max(oob, 1),
        }
        rows.append(row)
    rows.sort(key=lambda r: r["oob_share"], reverse=True)
    return rows

# Select the top files by OOB share
files = sorted(glob.glob(INPUT_GLOB))
if not files: raise FileNotFoundError("Keine Dateien gefunden.")
scores = [(os.path.basename(fp), fp, file_oob_ratio(fp)) for fp in files]
scores.sort(key=lambda t: t[2], reverse=True)
top = scores[:TOP_FILES]
print("Top-Dateien (höchster OOB-Anteil):")
for name, _, sc in top:
    print(f"  {name}: OOB≈{sc:.2%}")

# Check the worst plays in the top files
all_rows = []
for name, fp, sc in top:
    print(f"\n {name} — prüfe Plays …")
    rows = plays_oob_with_cal(fp)[:TOP_PLAYS_PER_FILE]
    for r in rows:
        r["file"] = name
        all_rows.append(r)
        print(f"  • play={r['play_uuid']} | OOB={r['oob']}/{r['valid']} ({r['oob_share']:.2%}), "
              f"cal_true={r['oob_cal_true']}/{r['oob']} ({r['oob_cal_true_share']:.2%})")

# Explicit column list fixes the column order (and keeps the columns when all_rows is empty).
df_check = pd.DataFrame(all_rows, columns=[
    "file","play_uuid","valid","oob","oob_share",
    "oob_cal_true","oob_cal_true_share","oob_cal_false","oob_cal_false_share",
    "oob_<=0.5y","oob_0.5–1y","oob_1–2y","oob_>2y"
]).sort_values(["file","oob_share"], ascending=[True, False])

display(df_check)

# Aggregated statement about the sample:
if not df_check.empty:
    agg = {
        "plays_geprueft": len(df_check),
        "median_oob_share": df_check["oob_share"].median(),
        "median_cal_true_share": df_check["oob_cal_true_share"].median(),
        "mean_cal_true_share": df_check["oob_cal_true_share"].mean(),
        "mean_oob_gt2y_share": df_check["oob_>2y"].mean(),
    }
    print("\nZusammenfassung (Stichprobe):")
    for k,v in agg.items():
        print(f"  {k}: {v:.3f}" if isinstance(v, float) else f"  {k}: {v}")


Preprocessing

In [None]:
# Tracking-Daten scannen

import os, glob, gzip, json, math
from pathlib import Path
from collections import defaultdict
from datetime import datetime
import numpy as np
import pandas as pd


# Glob pattern selecting the gzipped tracking files to preprocess.
INPUT_GLOB = "/Users/tunahansari/football_ra/data/tracking/SB_tracking_*.json.gz"

# Per-file output directory (created on execution if missing)
OUTPUT_DIR = Path("/Users/tunahansari/football_ra/out_simple")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Master output (merged result); directory is created on execution if missing
MASTER_OUT_DIR = Path("/Users/tunahansari/football_ra/out_1hz_clean")
MASTER_OUT_DIR.mkdir(parents=True, exist_ok=True)
MASTER_BASENAME = "master_1hz_4s_ready"  

# Overwrite existing outputs?
FORCE_OVERWRITE = False

# Build the master?
REBUILD_MASTER = False  # set to True to create the master

# Unit: "yards", "feet" or None (detect automatically)
FORCE_UNITS = None

WINDOW_SECONDS = 4             # time window in seconds
PLAY_MIN_VALID_SHARE = 0.90    # minimum share of valid steps per play
OOB_DROP_YARDS = 2.0           # OOB threshold in yards
FIELD_LEN, FIELD_WID = 120.0, 53.33
ENDZONE = 10.0

print(f"Config: files='{INPUT_GLOB}', out='{OUTPUT_DIR}', master='{MASTER_OUT_DIR}', overwrite={FORCE_OVERWRITE}, rebuild_master={REBUILD_MASTER}")

# ----- Hilfsfunktionen -----
def _to_float(v):
    try:
        x = float(v)
        if math.isnan(x):
            return None
        return x
    except:
        return None

def _overshoot_mag(x, y):
    # Abstand außerhalb des Spielfelds berechnen
    ox = (0 - x) if x < 0 else (x - FIELD_LEN) if x > FIELD_LEN else 0.0
    oy = (0 - y) if y < 0 else (y - FIELD_WID) if y > FIELD_WID else 0.0
    return math.hypot(ox, oy)

def _clip_xy(x, y):
    """Clamp (x, y) onto the field rectangle and return the clamped pair."""
    def _clamp(value, upper):
        # Lower bound first, then upper bound.
        return min(max(value, 0.0), upper)

    clipped_x = _clamp(x, FIELD_LEN)
    clipped_y = _clamp(y, FIELD_WID)
    return (clipped_x, clipped_y)

def _safe_parquet_path(base_dir: Path, stem: str, ts: bool = True) -> Path:
    """Erzeuge sicheren Speicherpfad mit Zeitstempel."""
    if ts:
        tag = datetime.now().strftime("%Y%m%d-%H%M%S")
        return base_dir / f"{stem}_{tag}.parquet"
    return base_dir / f"{stem}.parquet"

def _save_parquet(df: pd.DataFrame, out_path: Path):
    try:
        df.to_parquet(out_path, index=False, engine="pyarrow")
        print(f" gespeichert: {out_path} (Zeilen: {len(df):,})")
    except Exception as e:
        fb = out_path.with_suffix(".pkl")
        df.to_pickle(fb)
        print(f" Parquet fehlgeschlagen ({e}); Fallback: {fb}")

def _pick_xy_keys(first_play):
    # Ermitteln plausible (x,y)-Schlüssel aus ersten Daten
    XY_KEYS = [("x","y"), ("track_x","track_y"), ("ngs_x","ngs_y"), ("px","py"), ("X","Y")]
    tracks = (first_play or {}).get("tracks") or []
    for tr in tracks:
        steps = tr.get("steps") or tr.get("track_steps") or []
        if not steps:
            continue
        s0 = steps[0]
        for kx, ky in XY_KEYS:
            if kx in s0 and ky in s0:
                return kx, ky
    return "x", "y"  # Standard

def _gather_sample_xy(plays, kx, ky, max_n=5000):
    # (x,y)-Werte zur Einheitenerkennung
    out = []
    for play in plays:
        for tr in (play.get("tracks") or []):
            for s in (tr.get("steps") or []):
                if len(out) >= max_n:
                    return out
                x = _to_float(s.get(kx)); y = _to_float(s.get(ky))
                if x is None or y is None:
                    continue
                out.append((x,y))
    return out

def _auto_units(sample_xy):
    """Decide whether the sampled coordinates are in yards or feet.

    ``FORCE_UNITS`` wins when it is ``"yards"`` or ``"feet"``. Otherwise the
    share of in-field points is computed twice over the first 2000 samples:
    once for the raw values (yards hypothesis) and once for the values
    divided by 3 (feet hypothesis). Yards is returned on a tie and for an
    empty sample.
    """
    if FORCE_UNITS in ("yards", "feet"):
        return FORCE_UNITS
    if not sample_xy:
        return "yards"

    def share_in_bounds(pairs):
        # Only the leading 2000 pairs are scored.
        window = pairs[:2000]
        if not window:
            return 0.0
        inside = sum(
            1 for px, py in window
            if 0 <= px <= FIELD_LEN and 0 <= py <= FIELD_WID
        )
        return inside / len(window)

    score_yards = share_in_bounds(sample_xy)
    score_feet = share_in_bounds([(px / 3.0, py / 3.0) for (px, py) in sample_xy])
    # NOTE(review): dividing by 3 can pull points that overshoot the upper
    # field bounds back inside, so yards data with a few such points can
    # score higher under the feet hypothesis - confirm on real files.
    return "feet" if score_feet > score_yards else "yards"

# Discover the raw tracking files; abort early when the glob matches nothing.
files = sorted(glob.glob(INPUT_GLOB))
if not files:
    raise FileNotFoundError(f"Keine Dateien gefunden für das Muster: {INPUT_GLOB}")

print(f"Starte Preprocessing: {len(files)} Dateien")
qc_rows = []           # one QC summary dict per processed file
per_file_outputs = []  # per-file parquet paths (freshly written or already present)

for i, fp in enumerate(files, 1):
    name = os.path.basename(fp)
    out_parquet = OUTPUT_DIR / (name.replace(".json.gz", ".parquet"))
    print(f"\n[{i:03d}/{len(files)}] {name}")

    # Existing outputs are only rebuilt when FORCE_OVERWRITE is set.
    if out_parquet.exists() and not FORCE_OVERWRITE:
        print(f"  ↪ Datei existiert bereits, überspringe (FORCE_OVERWRITE={FORCE_OVERWRITE})")
        per_file_outputs.append(out_parquet)
        continue

    with gzip.open(fp, "rt", encoding="utf-8") as f:
        data = json.load(f)

    plays = data.get("plays") or []
    if not plays:
        print("  Keine Plays gefunden, überspringe")
        continue

    # Detect the (x, y) keys and the coordinate units from a sample
    kx, ky = _pick_xy_keys(plays[0])
    sample_xy_raw = _gather_sample_xy(plays, kx, ky, max_n=4000)
    units = _auto_units(sample_xy_raw)
    print(f" Keys=({kx},{ky})  Einheiten={units}")

    # Per-file counters
    steps_total = steps_numeric_win = steps_kept_win = 0
    drop_cal = drop_oob_gt2 = clip_oob_le2 = 0

    # Per-file play bookkeeping. Created fresh for every file so that the
    # drop statistics below never include plays of previously processed files.
    meta_by_play = {}
    play_seen = defaultdict(int)   # in-window steps encountered per play
    play_kept = defaultdict(int)   # in-window steps that survived all filters

    # Game-level metadata is constant for the file; resolve it once.
    file_gid = data.get("game_id")
    file_home = data.get("home_abbr") or (data.get("home_team", {}) or {}).get("nfl_team_id")
    file_away = data.get("away_abbr") or (data.get("away_team", {}) or {}).get("nfl_team_id")

    # Accumulator per (play, player, second): coordinate sums, count, metadata
    rows_acc = defaultdict(lambda: {
        "sx": 0.0, "sy": 0.0, "c": 0,
        "pname": None, "pos": None, "tid": None,
        "gid": file_gid,
        "home": file_home,
        "away": file_away,
        "off": None, "def": None,
        "q": None, "down": None, "ytg": None, "ptype": None
    })

    for play in plays:
        puid = play.get("play_uuid")
        if not puid:
            continue

        # NOTE(review): an explicit None for offense_left_to_right evaluates
        # to False here (mirrored); only a missing key defaults to True.
        ltr = bool(play.get("offense_left_to_right", True))
        yln = _to_float(play.get("play_yardline"))
        if yln is None or not (0.0 <= yln <= 100.0):
            # Skip plays with an invalid yard line
            continue

        off_id = play.get("play_offense_team_id") or play.get("offense_team_id")
        def_id = play.get("play_defense_team_id") or play.get("defense_team_id")
        quarter = play.get("play_quarter")
        down = play.get("play_down")
        ytg = play.get("play_yards_to_go")
        ptype = play.get("play_type")

        meta_by_play[puid] = dict(
            ltr=ltr, yln=yln, off=off_id, de=def_id,
            q=quarter,
            down=down,
            ytg=ytg,
            ptype=ptype
        )

        for tr in (play.get("tracks") or []):
            player = tr.get("player") or tr.get("track_player") or {}
            pid = player.get("player_id")
            if pid is None:
                continue
            pname = player.get("name")
            ppos = player.get("position_code")
            tid = tr.get("team_id") or tr.get("track_team_id") or tr.get("nfl_team_id")
            steps = tr.get("steps") or tr.get("track_steps") or []
            for s in steps:
                tss = _to_float(s.get("time_since_snap"))
                # Non-finite timestamps would make math.floor raise.
                if tss is None or not math.isfinite(tss) or tss < 0:
                    continue
                t_sec = int(math.floor(tss))
                if t_sec >= WINDOW_SECONDS:
                    continue

                steps_total += 1
                # Denominator of the per-play valid share: every in-window
                # step counts as "seen", whether or not it is kept later.
                play_seen[puid] += 1

                xr = _to_float(s.get(kx))
                yr = _to_float(s.get(ky))
                if xr is None or yr is None:
                    continue
                # NaN/inf coordinates are treated like non-numeric ones;
                # otherwise they would turn the per-second means into NaN.
                if not (math.isfinite(xr) and math.isfinite(yr)):
                    continue

                # Unit conversion (feet -> yards)
                if units == "feet":
                    x_raw, y_raw = xr / 3.0, yr / 3.0
                else:
                    x_raw, y_raw = xr, yr

                steps_numeric_win += 1

                # Drop steps flagged with a calibration fault
                cal = s.get("calibration_fault")
                if cal is None:
                    cal = s.get("step_calibration_fault")
                if bool(cal):
                    drop_cal += 1
                    continue

                # Out-of-bounds check before orientation: far outside -> drop,
                # slightly outside -> clip onto the field.
                m = _overshoot_mag(x_raw, y_raw)
                if m > OOB_DROP_YARDS:
                    drop_oob_gt2 += 1
                    continue
                if m > 0:
                    x_raw, y_raw = _clip_xy(x_raw, y_raw)
                    clip_oob_le2 += 1

                # Orientation: mirror x so the offense always moves left to right
                x = x_raw if ltr else (FIELD_LEN - x_raw)
                y = y_raw

                # Aggregate per (play, player, second)
                key = (puid, pid, t_sec)
                acc = rows_acc[key]
                acc["sx"] += x
                acc["sy"] += y
                acc["c"] += 1
                # Metadata: the first non-None value wins.
                for field, value in (
                    ("pname", pname), ("pos", ppos), ("tid", tid),
                    ("off", off_id), ("def", def_id),
                    ("q", quarter), ("down", down),
                    ("ytg", ytg), ("ptype", ptype),
                ):
                    if acc[field] is None:
                        acc[field] = value

                steps_kept_win += 1
                play_kept[puid] += 1

    # Discard plays with too small a share of valid (kept) steps
    drop_plays = set()
    for puid, seen in play_seen.items():
        kept = play_kept.get(puid, 0)
        share = kept / max(seen, 1)
        if share < PLAY_MIN_VALID_SHARE:
            drop_plays.add(puid)

    print(f"Steps: total={steps_total:,} | numeric={steps_numeric_win:,} | kept={steps_kept_win:,}")
    print(f"    - calibration_fault: {drop_cal:,}")
    print(f"    - OOB >{OOB_DROP_YARDS}yd gedroppt: {drop_oob_gt2:,}")
    print(f"    - OOB ≤{OOB_DROP_YARDS}yd geclippt: {clip_oob_le2:,}")
    print(f"Plays: total={len(plays)} | gedroppt (<{int(PLAY_MIN_VALID_SHARE*100)}% gültig): {len(drop_plays)}")

    # Build the output rows (one per play / player / second)
    rows = []
    for (puid, pid, t_sec), a in rows_acc.items():
        if a["c"] == 0 or puid in drop_plays:
            continue
        meta = meta_by_play.get(puid, {})
        ltr = meta.get("ltr", True)
        yln = meta.get("yln", 0.0)

        # Line of scrimmage in the oriented coordinate system
        L = (ENDZONE + yln) if ltr else (110.0 - yln)
        rows.append({
            "play_uuid": puid,
            "player_id": pid,
            "t_sec": t_sec,
            "x_norm": (a["sx"] / a["c"]) - L,   # mean x relative to the LOS
            "y": a["sy"] / a["c"],              # mean y
            "player_name": a["pname"],
            "position_code": a["pos"],
            "team_id": a["tid"],
            "game_id": a["gid"],
            "home_abbr": a["home"],
            "away_abbr": a["away"],
            "offense_team_id": a["off"],
            "defense_team_id": a["def"],
            "play_quarter": a["q"],
            "play_down": a["down"],
            "play_yards_to_go": a["ytg"],
            "play_type": a["ptype"],
            "play_yardline": yln,
            "ori": "KEEP" if ltr else "MIRROR",
            "units": units,
            "x_key": kx, "y_key": ky,
        })

    if rows:
        df = pd.DataFrame(rows).sort_values(["play_uuid", "player_id", "t_sec"])
        # Write the per-file output. Files that already exist without
        # FORCE_OVERWRITE were skipped at the top of the loop, so no second
        # existence check is needed here.
        _save_parquet(df, out_parquet)
        per_file_outputs.append(out_parquet)
    else:
        print("  Nichts zu speichern (alle Daten verworfen)")

    qc_rows.append({
        "file": name,
        "plays_total": len(plays),
        "plays_dropped": len(drop_plays),
        "steps_total_4s": steps_total,
        "steps_numeric_4s": steps_numeric_win,
        "steps_kept_4s": steps_kept_win,
        "drop_calibration": drop_cal,
        "drop_oob_gt2": drop_oob_gt2,
        "clip_oob_le2": clip_oob_le2,
    })

# Show the overall QC table: first rows plus column sums over all files
df_qc = pd.DataFrame(qc_rows)
print("\n Fertig (pro Datei).")
if not df_qc.empty:
    qc_sum_cols = [
        "steps_total_4s", "steps_numeric_4s", "steps_kept_4s",
        "drop_calibration", "drop_oob_gt2", "clip_oob_le2",
    ]
    try:
        # Rich display when IPython is available ...
        from IPython.display import display
        display(df_qc.head(10))
        display(df_qc[qc_sum_cols].sum())
    except Exception:
        # ... plain-text output otherwise
        print(df_qc.head(10).to_string(index=False))
        sums = df_qc[qc_sum_cols].sum()
        for k, v in sums.items():
            print(f"  {k}: {int(v):,}")

# Build the master output (concatenation of all per-file parquets)
if REBUILD_MASTER:
    import re  # only needed here, to recognise earlier master files

    print("\n Baue Master…")
    # Use every parquet in OUTPUT_DIR except files that follow the naming
    # scheme of _safe_parquet_path for the master itself
    # (<MASTER_BASENAME>[_YYYYmmdd-HHMMSS].parquet). This only matters if
    # MASTER_OUT_DIR points at OUTPUT_DIR (not visible in this cell); in that
    # case an older master would otherwise be concatenated again and
    # duplicate every row.
    master_stem_re = re.compile(re.escape(str(MASTER_BASENAME)) + r"(_\d{8}-\d{6})?")
    parts = sorted(
        p for p in OUTPUT_DIR.glob("*.parquet")
        if not master_stem_re.fullmatch(p.stem)
    )
    if not parts:
        print("  Keine Teile gefunden – Master entfällt.")
    else:
        # Core columns; game_id is included because the sort and the
        # groupby below need it.
        need = {"game_id", "play_uuid", "player_id", "t_sec", "x_norm", "y"}
        dfs = []
        for p in parts:
            try:
                d = pd.read_parquet(p)
            except Exception as e:
                print(f"  {p.name}: Read-Error {e} – Teil überspringen")
                continue
            missing_cols = need - set(d.columns)
            if missing_cols:
                print(f"  {p.name}: fehlende Spalten {missing_cols} – Teil überspringen")
                continue
            dfs.append(d)

        if not dfs:
            print("  Keine verwertbaren Teile – Master entfällt.")
        else:
            master = pd.concat(dfs, ignore_index=True)
            master.sort_values(["game_id", "play_uuid", "player_id", "t_sec"], inplace=True)

            # dx, dy and speed per 1 Hz step; the first row of each
            # (game, play, player) group has no predecessor and gets 0.0.
            grp = ["game_id", "play_uuid", "player_id"]
            master["dx"] = master.groupby(grp, observed=True)["x_norm"].diff().fillna(0.0)
            master["dy"] = master.groupby(grp, observed=True)["y"].diff().fillna(0.0)
            master["speed"] = np.sqrt(master["dx"]**2 + master["dy"]**2)

            # Time-stamped output path
            out_master = _safe_parquet_path(MASTER_OUT_DIR, MASTER_BASENAME, ts=True)
            _save_parquet(master, out_master)
            print(f"Master geschrieben → {out_master}")
else:
    print("\nREBUILD_MASTER=False – kein Master erstellt.")


Merge & Mini-Check

In [None]:
import os, glob, math
import pandas as pd
import numpy as np
from pathlib import Path

# Rich display inside notebooks, plain print everywhere else
try:
    from IPython.display import display
except Exception:
    def display(x):
        print(x)

# Preferred data directory; fall back to ./out_1hz_clean when it is absent
_PRIMARY_BASE = Path("/Users/tunahansari/football_ra/out_1hz_clean")
BASE_DIR = _PRIMARY_BASE if _PRIMARY_BASE.exists() else Path.cwd() / "out_1hz_clean"

PARQUET_GLOB = str(BASE_DIR / "*.parquet")
MASTER_OUT = str(BASE_DIR / "master_1hz_4s.parquet")

# Bounds used by the mini QC below
FIELD_WID = 53.33
T_MIN = 0
T_MAX = 3

# Columns the merged master is expected to provide
REQUIRED_COLS = [
    "play_uuid",
    "player_id",
    "t_sec",
    "x_norm",
    "position_code",
    "track_team_id",
    "offense_team_id",
    "defense_team_id",
    "play_yardline",
    "play_type",
    "home_abbr",
    "away_abbr",
    "game_id",
    "gsis_play_id",
]

# Alternative source names per required column, tried in list order
ALIASES = {
    "player_id":       ["player_id", "nfl_id", "nflId"],
    "gsis_play_id":    ["gsis_play_id", "play_id", "gsisPlayId"],
    "position_code":   ["position_code", "position"],
    "track_team_id":   ["track_team_id", "team_id", "teamId", "team"],
    "offense_team_id": ["offense_team_id", "offenseTeamId", "offense_team"],
    "defense_team_id": ["defense_team_id", "defenseTeamId", "defense_team"],
    "play_yardline":   ["play_yardline", "yardline", "yardLine"],
    "play_type":       ["play_type", "playType"],
    "home_abbr":       ["home_abbr", "homeTeamAbbr", "home_team"],
    "away_abbr":       ["away_abbr", "awayTeamAbbr", "away_team"],
    "game_id":         ["game_id", "gameId"],
}

def ensure_alias_cols(df, required_cols, aliases):
    """Fill in missing required columns from alias columns.

    For every name in *required_cols* that *df* lacks, the first alias from
    ``aliases[name]`` that exists in *df* is copied into a new column of the
    required name. ``x_norm``, ``t_sec`` and ``play_uuid`` are never
    aliased. *df* is modified in place.

    Returns ``(df, missing)`` where *missing* lists, in input order, the
    required columns that could not be provided.
    """
    never_aliased = {"x_norm", "t_sec", "play_uuid"}
    missing = []
    for col in required_cols:
        if col in df.columns:
            continue
        source = None
        if col not in never_aliased:
            source = next(
                (alias for alias in aliases.get(col, []) if alias in df.columns),
                None,
            )
        if source is None:
            missing.append(col)
        else:
            df[col] = df[source]
    return df, missing

# Collect the per-file parquets; files whose name starts with "master_"
# are excluded so that an earlier master is not merged into itself.
print("Suche Parquet-Dateien ...")
files = sorted(glob.glob(PARQUET_GLOB))
files = [f for f in files if not os.path.basename(f).startswith("master_")]
print(f"Gefunden: {len(files)} Dateien in {BASE_DIR}")

if not files:
    raise FileNotFoundError(f"Keine Dateien gefunden. Bitte prüfen: {PARQUET_GLOB}")

# The y column name is decided from the FIRST file only ("y" preferred over
# "step_y"); all other files are assumed to use the same name.
print("\nBestimme Spaltennamen für y (step_y vs. y) aus der ersten Datei ...")
probe = pd.read_parquet(files[0])
if "y" in probe.columns:
    Y_COL = "y"
elif "step_y" in probe.columns:
    Y_COL = "step_y"
else:
    raise KeyError("Weder 'y' noch 'step_y' in den Parquet-Dateien gefunden.")
print(f"y-Spalte: {Y_COL}")

# Load every file, tag its rows with the source file name and concatenate.
print("\nLade & merge alle Dateien (das dauert je nach Platte kurz) ...")
dfs = []
running_rows = 0
for i, fp in enumerate(files, 1):
    name = os.path.basename(fp)
    df = pd.read_parquet(fp)
    df["file"] = name
    dfs.append(df)
    running_rows += len(df)
    # Progress line every 10 files and after the last one
    if i % 10 == 0 or i == len(files):
        print(f"[{i:03d}/{len(files)}] geladen: {name}  (aktuelle Gesamtzeilen ~ {running_rows:,})")

master = pd.concat(dfs, ignore_index=True)
# Drop the references to the per-file frames and the probe frame
del dfs, probe

print("\nMerge fertig.")
print(f"master.shape = {master.shape[0]:,} Zeilen × {master.shape[1]} Spalten")
mem_mb = master.memory_usage(deep=True).sum() / (1024**2)
print(f"geschätzter Speicherbedarf: {mem_mb:.1f} MB")

# ---- Mini QC on the merged master (diagnostic output only) ----
print("\nMINI-QC startet ...")

# 1) Required columns: fill from aliases, then split what is still missing
#    into hard requirements (abort) and soft/meta columns (hint only).
print("\nPflichtspalten prüfen ...")
master, missing_after_alias = ensure_alias_cols(master, REQUIRED_COLS, ALIASES)

HARD_REQ = {"play_uuid", "player_id", "t_sec", "x_norm"}
hard_missing = [c for c in HARD_REQ if c not in master.columns]
soft_missing = [c for c in missing_after_alias if c not in HARD_REQ]

if hard_missing:
    print(f"Harte Pflichtspalten fehlen: {hard_missing}")
    raise KeyError(f"Pflichtspalten fehlen: {hard_missing}")
if soft_missing:
    for c in soft_missing:
        print(f"Hinweis: optionale/Meta-Spalte fehlt: {c}")
print("Pflichtspalten ok.")

# 2) t_sec range: report min/max, the first unique values and the share of
#    rows outside [T_MIN, T_MAX]; list the offending values if there are any.
print("\nt_sec-Check ... (erwartet 0..3)")
t_min, t_max = master["t_sec"].min(), master["t_sec"].max()
vals = np.sort(master["t_sec"].unique())
share_out_range = ((master["t_sec"] < T_MIN) | (master["t_sec"] > T_MAX)).mean()
print(f"t_sec Werte: min={t_min}, max={t_max}, Unique={vals[:10]}{' ...' if len(vals) > 10 else ''}")
print(f"Anteil außerhalb [{T_MIN},{T_MAX}]: {share_out_range:.4%}")
if share_out_range > 0:
    counts_out = master.loc[(master["t_sec"] < T_MIN) | (master["t_sec"] > T_MAX), "t_sec"].value_counts().sort_index()
    print("Werte außerhalb Range (Counts):")
    print(counts_out.to_string())

# 3) y bounds: count values below 0 and above FIELD_WID. Non-numeric
#    entries are coerced to NaN and therefore never count as out of bounds.
print("\ny-Grenzen (0 .. 53.33 yd) ...")
y = pd.to_numeric(master[Y_COL], errors="coerce")
oob_low = (y < 0).sum()
oob_high = (y > FIELD_WID).sum()
oob_share = ((y < 0) | (y > FIELD_WID)).mean()
print(f"y.min={float(np.nanmin(y)):.3f}, y.max={float(np.nanmax(y)):.3f}")
print(f"OOB y<0: {oob_low:,} | y>{FIELD_WID}: {oob_high:,}  → Anteil: {oob_share:.4%}")
if oob_share == 0:
    print("y liegt vollständig im Feld (Clip hat gegriffen).")
else:
    print("Es gibt noch Punkte außerhalb – ggf. stichprobenartig prüfen.")

# 4) x_norm at t=0: distribution summary; a mean within +/-0.25 is
#    reported as a plausible line-of-scrimmage normalisation.
print("\nx_norm @ t=0 ...")
t0 = master.loc[master["t_sec"] == 0, "x_norm"]
t0 = pd.to_numeric(t0, errors="coerce").dropna()
if len(t0) > 0:
    q = t0.quantile([0.01, 0.25, 0.5, 0.75, 0.99]).to_dict()
    mean_, std_ = float(t0.mean()), float(t0.std())
    print(f"count={t0.shape[0]:,} | mean={mean_:.3f} | std={std_:.3f}")
    print(f"quantiles: 1%={q[0.01]:.3f}, 25%={q[0.25]:.3f}, 50%={q[0.5]:.3f}, 75%={q[0.75]:.3f}, 99%={q[0.99]:.3f}")
    if abs(mean_) <= 0.25:
        print("LOS-Normalisierung sieht gut aus (Mittelwert ~0 yd).")
    else:
        print("Mittelwert ist weiter von 0 entfernt als erwartet – ggf. LOS-Offset verifizieren.")
else:
    print("Keine t=0-Zeilen gefunden (unerwartet).")

# 5) Row counts of the ten largest source files
print("\nZeilen pro Datei (Top 10):")
lines_per_file = master["file"].value_counts().head(10)
print(lines_per_file.to_string())

print("\nMINI-QC abgeschlossen – Daten sind bereit für RP/CRP/RQA & Clustering.")

# Persist the merged master (switchable via SAVE_MASTER)
SAVE_MASTER = True
if SAVE_MASTER:
    out_dir = os.path.dirname(MASTER_OUT)
    # Create the target directory only when a directory part is given and
    # nothing exists at that path yet.
    dir_missing = bool(out_dir) and not os.path.exists(out_dir)
    if dir_missing:
        os.makedirs(out_dir, exist_ok=True)
    master.to_parquet(MASTER_OUT, index=False)
    print(f"\nMaster-Parquet gespeichert: {MASTER_OUT}")
    print("(Beim Weiterarbeiten kannst du direkt dieses File laden)")


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

MASTER_OUT = "/Users/tunahansari/football_ra/out_1hz_clean/master_1hz_4s.parquet"

# Reuse the in-memory `master` when an earlier cell defined it; otherwise
# load it from disk.
try:
    master 
except NameError:
    master = pd.read_parquet(MASTER_OUT)

# All rows at t=0 with the absolute x distance from the normalised origin
t0 = master.loc[master["t_sec"] == 0, ["file","play_uuid","x_norm","position_code"]].copy()
t0["abs_x0"] = t0["x_norm"].abs()

print(f"t0 rows: {len(t0):,}")
print(f"Anteil |x_norm| @t0 > 12 yd: {(t0['abs_x0']>12).mean():.2%}")

# Source files ranked by their number of t=0 rows beyond 12 yd (top 15)
print("\nTop-Dateien mit vielen Ausreißern (|x_norm|>12yd) @t0:")
print(t0.loc[t0["abs_x0"]>12].groupby("file").size().sort_values(ascending=False).head(15).to_string())

# The ten t=0 rows with the largest |x_norm|
print("\nSchlimmste 10 Plays (|x_norm| @t0):")
cols = ["file","play_uuid","position_code","x_norm"]
print(t0.sort_values("abs_x0", ascending=False)[cols].head(10).to_string(index=False))


In [None]:
# x_norm-Korrektur für Tracking-Daten

import numpy as np
import pandas as pd
from pathlib import Path

MASTER_IN  = "/Users/tunahansari/football_ra/out_1hz_clean/master_1hz_4s.parquet"
MASTER_OUT = "/Users/tunahansari/football_ra/out_1hz_clean/master_1hz_4s_fix.parquet"

# Reuse the in-memory `master` when an earlier cell defined it; otherwise
# load it from MASTER_IN.
print("Lade Master …")
try:
    master  
    print("   (nutze vorhandenen DataFrame 'master')")
except NameError:
    master = pd.read_parquet(MASTER_IN)
    print(f"   geladen: {len(master):,} Zeilen")

# Safety: force numeric dtypes (unparseable values become NaN)
master["t_sec"]  = pd.to_numeric(master["t_sec"], errors="coerce")
master["x_norm"] = pd.to_numeric(master["x_norm"], errors="coerce")

# Baseline before the correction: share of t=0 rows beyond 12 yd
t0_before = master.loc[master["t_sec"]==0, "x_norm"].dropna()
share_bad_before = (t0_before.abs() > 12).mean()
print(f"\nVorher: |x_norm|@t0 > 12 yd = {share_bad_before:.2%}")
print(f"   t0 count={t0_before.shape[0]:,} | mean={t0_before.mean():.3f} | std={t0_before.std():.3f}")

# offset pro Play berechnen
def play_offset(g: pd.DataFrame) -> float:
    """Return the x_norm offset of one play, estimated at t_sec == 0.

    The offset is the median ``x_norm`` of the offense players at t=0, where
    a player counts as offense when its team id equals ``offense_team_id``.
    The median over all t=0 rows is used instead when fewer than 8 offense
    rows are found or when the team-id columns are not available. The
    track team id is read from ``track_team_id`` or, if that column is
    absent, from ``team_id``.

    Returns 0.0 when the play has no t=0 rows or no finite median exists.
    """
    t0 = g[g["t_sec"] == 0]
    if t0.empty:
        return 0.0

    x0 = pd.to_numeric(t0["x_norm"], errors="coerce")

    # Restrict to the offense only when both id columns exist; a missing
    # column falls through to the all-players median instead of raising.
    team_col = next((c for c in ("track_team_id", "team_id") if c in t0.columns), None)
    if team_col is not None and "offense_team_id" in t0.columns:
        off_mask = (t0[team_col] == t0["offense_team_id"])
        if off_mask.sum() >= 8:
            x0 = x0[off_mask]

    x0 = x0.dropna()
    if x0.empty:
        return 0.0
    med = float(np.median(x0.to_numpy()))
    return med if np.isfinite(med) else 0.0

# One offset per play_uuid (Series indexed by play_uuid)
print("\nBerechne Offsets pro play_uuid …")
offsets = master.groupby("play_uuid", sort=False).apply(play_offset)

# Short overview of the offset distribution
q = offsets.quantile([0.01,0.25,0.5,0.75,0.99]).to_dict()
print(f"   Offsets quantiles (yd): 1%={q[0.01]:.2f}, 25%={q[0.25]:.2f}, 50%={q[0.5]:.2f}, 75%={q[0.75]:.2f}, 99%={q[0.99]:.2f}")
print(f"   Anteil |Offset| > 12 yd: {(offsets.abs()>12).mean():.2%}")

# --- Apply: correct x_norm ---------------------------------------------------
# The original x_norm column is kept; the corrected values go to x_norm_fix.
print("\n Wende Offsets an (x_norm_fix = x_norm - Offset) …")
master["x_norm_fix"] = master["x_norm"] - master["play_uuid"].map(offsets)

# --- Diagnosis after the correction -----------------------------------------
t0_after = master.loc[master["t_sec"]==0, "x_norm_fix"].dropna()
share_bad_after = (t0_after.abs() > 12).mean()
print(f"\nNachher: |x_norm_fix|@t0 > 12 yd = {share_bad_after:.2%}")
print(f"   t0 count={t0_after.shape[0]:,} | mean={t0_after.mean():.3f} | std={t0_after.std():.3f}")

# Optional: flag very bad plays (in case stricter filtering is wanted).
# Simple quality criterion: after the correction, >=90% of a play's t=0 rows
# should lie within +/-12 yd. The result is only counted here, not applied.
t0_fix = master.loc[master["t_sec"]==0, ["play_uuid","x_norm_fix"]].copy()
t0_fix["ok"] = t0_fix["x_norm_fix"].abs() <= 12
good_share = t0_fix.groupby("play_uuid")["ok"].mean()
bad_plays = good_share[good_share < 0.90].index
print(f"\nPlays mit fraglicher Korrektur (t0 <90% in ±12 yd): {len(bad_plays):,}")

# --- Save --------------------------------------------------------------------
print("\n Speichere Master mit x_norm_fix …")
Path(MASTER_OUT).parent.mkdir(parents=True, exist_ok=True)
master.to_parquet(MASTER_OUT, index=False)
print(f"   geschrieben: {MASTER_OUT}  (Zeilen: {len(master):,})")



In [None]:
# Re-centre x_norm_fix globally, report plays with implausible t=0 positions
# and write the master used downstream.
# NOTE: this cell uses `pd` and `Path` without importing them itself, so it
# relies on an earlier cell having imported both.
BASE = "/Users/tunahansari/football_ra/out_1hz_clean"
IN_FIX = f"{BASE}/master_1hz_4s_fix.parquet"
OUT_REZERO = f"{BASE}/master_1hz_4s_rezero.parquet"
OUT_BADPLAYS = f"{BASE}/bad_plays_t0_lt90.csv"

master = pd.read_parquet(IN_FIX)
master["t_sec"] = pd.to_numeric(master["t_sec"], errors="coerce")
master["x_norm_fix"] = pd.to_numeric(master["x_norm_fix"], errors="coerce")

# 1) Remove the global residual offset at t=0 (centres the median on 0)
t0_fix = master.loc[master["t_sec"]==0, "x_norm_fix"].dropna()
global_residual = float(t0_fix.median()) if len(t0_fix) else 0.0
print(f" Globaler Rest-Offset (Median @t0): {global_residual:.3f} yd")

master["x_norm_final"] = master["x_norm_fix"] - global_residual

# Diagnosis after re-centring
t0_final = master.loc[master["t_sec"]==0, "x_norm_final"].dropna()
share_bad = (t0_final.abs() > 12).mean()
print(f" Nachher-final: |x_norm_final|@t0 > 12 yd = {share_bad:.2%}")
print(f"   t0 count={t0_final.shape[0]:,} | mean={t0_final.mean():.3f} | std={t0_final.std():.3f} | median={t0_final.median():.3f}")

# 2) Flag plays with <90% ok rows at t=0 and write a report
#    (requires the "file" column in the master)
t0 = master.loc[master["t_sec"]==0, ["play_uuid","x_norm_final","file"]].copy()
t0["ok"] = t0["x_norm_final"].abs() <= 12
per_play = t0.groupby("play_uuid").agg(
    share_ok=("ok", "mean"),
    n=("ok","size"),
    n_ok=("ok","sum")
).reset_index()

bad_plays = per_play.loc[per_play["share_ok"] < 0.90, "play_uuid"]
print(f"Plays mit t0<90% in ±12yd: {len(bad_plays):,}")

# Report: which files are affected and how strongly.
# One row per (play_uuid, file); the x_norm_final / ok values of that row
# come from the first t=0 row kept by drop_duplicates.
bad_report = (
    t0[t0["play_uuid"].isin(bad_plays)]
    .drop_duplicates(subset=["play_uuid","file"])
    .merge(per_play, on="play_uuid", how="left")
    .sort_values(["share_ok","file"])
)
Path(OUT_BADPLAYS).parent.mkdir(parents=True, exist_ok=True)
bad_report.to_csv(OUT_BADPLAYS, index=False)
print(f" Report gespeichert: {OUT_BADPLAYS} (Zeilen: {len(bad_report):,})")

# 3) Replace x_norm by the final values and save (for downstream use)
master_out = master.drop(columns=[c for c in ["x_norm","x_norm_fix"] if c in master.columns]) \
                   .rename(columns={"x_norm_final":"x_norm"})
master_out.to_parquet(OUT_REZERO, index=False)
print(f" geschrieben: {OUT_REZERO}  (Zeilen: {len(master_out):,})")

print("\nAlles fertig. Nutze ab jetzt dieses File für Clustering/RP/CRP/RQA:")
print(" →", OUT_REZERO)
print("Und schau ggf. in den Bad-Play-Report:")
print(" →", OUT_BADPLAYS)


In [None]:
# --- PREP B: Aus Zeitreihen eine Pro-Play-Feature-Tabelle bauen ---
import pandas as pd
import numpy as np

# Time-series input for the feature build.
# NOTE(review): a file with this name (master_1hz_4s_ready.parquet) is written
# by the bad-play filter cell further down in this notebook, so that cell
# presumably has to run first - confirm the intended cell order.
TS_PATH = "/Users/tunahansari/football_ra/out_1hz_clean/master_1hz_4s_ready.parquet"   # or another absolute path
TS = pd.read_parquet(TS_PATH)

# Spalten wie 'play_uuid' + Zeitreihen (z. B. speed, d_pos, v_rad, x_norm, y ...)
def make_features_from_timeseries(df, id_col="play_uuid"):
    """Build one feature row per *id_col* group from time-series rows.

    For each of the columns ``speed``, ``d_pos``, ``v_rad``, ``x_norm`` and
    ``y`` that exists in *df*, four robust statistics are computed per group:

    * ``<col>_med``      - median
    * ``<col>_mad``      - median absolute deviation from the median
    * ``<col>_iqr``      - interquartile range (75% - 25% quantile)
    * ``<col>_trend_lr`` - slope of a least-squares line over the row
      position inside the group (0.0 with fewer than 3 usable values)

    Missing or non-finite values are ignored for the trend fit, so a NaN in
    the series neither breaks ``np.polyfit`` nor yields a NaN slope; the
    remaining values keep their original row positions as x coordinates.

    Returns a DataFrame with *id_col*, ``n_samples`` (rows per group) and
    the statistic columns.
    """
    # The set of available statistic columns is the same for every group.
    stat_cols = [c for c in ("speed", "d_pos", "v_rad", "x_norm", "y") if c in df.columns]

    feats = []
    for group_id, g in df.groupby(id_col):
        row = {id_col: group_id, "n_samples": len(g)}
        for col in stat_cols:
            series = g[col]
            med = float(series.median())
            q25, q75 = series.quantile([0.25, 0.75])
            row[f"{col}_med"] = med
            row[f"{col}_mad"] = float((series - med).abs().median())
            row[f"{col}_iqr"] = float(q75 - q25)

            # Linear trend over the row position, using finite values only.
            values = series.to_numpy(dtype=float, na_value=np.nan)
            usable = np.isfinite(values)
            if usable.sum() >= 3:
                positions = np.arange(len(values))[usable]
                slope = float(np.polyfit(positions, values[usable], 1)[0])
            else:
                slope = 0.0
            row[f"{col}_trend_lr"] = slope
        feats.append(row)
    return pd.DataFrame(feats)

# One feature row per play; FEATURES is the input of the clustering cells.
FEATURES = make_features_from_timeseries(TS, id_col="play_uuid")
print(FEATURES.head())


In [None]:
# CLUSTER-BLOCK 
import numpy as np
from copy import deepcopy
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

id_col = "play_uuid"
if id_col not in FEATURES.columns:
    raise KeyError(f"Spalte '{id_col}' fehlt in FEATURES.")

# 1) Pick the feature columns automatically: numeric only, without the id
#    and without cluster labels from an earlier run of the clustering cells
#    (those are numeric too and would otherwise leak into the features).
label_cols = ["cl_kmeans", "cl_agg", "cl_dbscan"]
num_cols = FEATURES.select_dtypes(include=[np.number]).columns.tolist()
feature_cols = [c for c in num_cols if c != id_col and c not in label_cols]
if len(feature_cols) < 2:
    raise ValueError("Zu wenig numerische Feature-Spalten gefunden. Bitte Feature-Build prüfen.")

# 2) Working copy & NaN handling: column median first; columns that are
#    entirely NaN have no median and are filled with 0.0 so that the
#    scaler and PCA never see NaN.
DF = FEATURES[[id_col] + feature_cols].copy()
X = DF[feature_cols].astype(float)
X = X.fillna(X.median(numeric_only=True)).fillna(0.0)

# 3) Scale (+ optional PCA keeping 90% of the variance)
Xs = StandardScaler().fit_transform(X)
use_pca = True
Xc = PCA(n_components=0.90, svd_solver="full", random_state=0).fit_transform(Xs) if use_pca else Xs

# 4) Choose k by silhouette score (k = 2..8)
best = (-np.inf, None, None)
for k in range(2, 9):
    km = KMeans(n_clusters=k, n_init=20, random_state=0)
    lab = km.fit_predict(Xc)
    sil = silhouette_score(Xc, lab) if len(set(lab)) > 1 else -np.inf
    if sil > best[0]:
        best = (sil, k, km)
sil, k_best, km_best = best
labs_km = km_best.predict(Xc)

# 5) Agglomerative clustering (Ward) with the same k
agg = AgglomerativeClustering(n_clusters=k_best, linkage="ward")
labs_agg = agg.fit_predict(Xc)

# 6) Attach the labels to FEATURES. Labels of an earlier run are dropped
#    first so that re-running the cell replaces them instead of creating
#    cl_kmeans_x / cl_kmeans_y duplicates in the merge.
FEATURES = FEATURES.drop(columns=["cl_kmeans", "cl_agg"], errors="ignore").merge(
    DF[[id_col]].assign(cl_kmeans=labs_km, cl_agg=labs_agg),
    on=id_col, how="left"
)

print(f"Clusterzahl (K-Means): k={k_best}, Silhouette={sil:.3f}")
print(FEATURES[[id_col, 'cl_kmeans','cl_agg']].head())


In [None]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# Silhouette score per k (2..8) on the clustering input Xc
ks = []
sils = []
for k in range(2, 9):
    km = KMeans(n_clusters=k, n_init=20, random_state=0).fit(Xc)
    lab = km.labels_
    # The silhouette is undefined for a single cluster; skip such a k.
    if len(set(lab)) <= 1:
        continue
    ks.append(k)
    sils.append(silhouette_score(Xc, lab))
print(list(zip(ks, np.round(sils,3))))

# Optional: quick plot
import matplotlib.pyplot as plt
plt.figure()
plt.plot(ks, sils, marker='o')
plt.title('Silhouette je k')
plt.xlabel('k')
plt.ylabel('Silhouette')
plt.show()


In [None]:
# Per-cluster profile: feature medians plus the cluster size n
_by_kmeans = FEATURES.groupby('cl_kmeans')
cluster_profile = _by_kmeans[feature_cols].median()
cluster_profile = cluster_profile.assign(n=_by_kmeans.size())
cluster_profile


In [None]:
from sklearn.cluster import DBSCAN
# Small DBSCAN grid: one tuple (eps, min_samples, k_eff, noise share) per run
runs = []
best = None
for eps in (0.3, 0.5, 0.7, 1.0):
    for ms in (5, 10, 20):
        db = DBSCAN(eps=eps, min_samples=ms).fit(Xc)
        lab = db.labels_
        # Number of real clusters: distinct labels without the noise label -1
        k_eff = len(set(lab) - {-1})
        noise = (lab == -1).mean()
        runs.append((eps, ms, k_eff, round(noise, 3)))

# Pick a sensible combination (e.g. little noise, k_eff 2-10) and fit it:
db = DBSCAN(eps=0.5, min_samples=10).fit(Xc)
FEATURES['cl_dbscan'] = db.labels_
print('DBSCAN: -1 = Noise, sonst Cluster-ID')

In [None]:
import numpy as np
# Label counts, effective cluster number and noise share of the DBSCAN run
lab = FEATURES['cl_dbscan']
vals, cnts = np.unique(lab, return_counts=True)
print(dict(zip(vals, cnts)))
noise = float((lab == -1).mean())
labels_present = set(lab)
k_eff = len(labels_present - {-1})  # -1 marks noise, not a cluster
print(f"k_eff={k_eff}, Noise={noise:.1%}")


In [None]:
# Integrate the cluster labels into the dashboard DataFrame
df = FEATURES.copy()

# Snapshot of all non-label columns (for the merge check below)
cluster_label_cols = {'cl_kmeans', 'cl_agg', 'cl_dbscan'}
cols_before = set(df.columns) - cluster_label_cols
unchanged = df[sorted(cols_before)].copy()

# Label columns that are actually present; the same label set is used for
# the snapshot above and for this report.
added = cluster_label_cols & set(df.columns)
print("Neue Spalten (sollten nur die Cluster-Labels sein):", added)
# Real comparison instead of a hard-coded True: the non-label columns of the
# copy must be identical to those of FEATURES.
print("Alte Spalten unverändert:", unchanged.equals(FEATURES[sorted(cols_before)]))



In [None]:
import pandas as pd

# Remove the plays listed in the bad-play report and write the "ready" master
BASE = "/Users/tunahansari/football_ra/out_1hz_clean"
IN_MASTER = f"{BASE}/master_1hz_4s_rezero.parquet"
BAD = f"{BASE}/bad_plays_t0_lt90.csv"
OUT_READY = f"{BASE}/master_1hz_4s_ready.parquet"

master = pd.read_parquet(IN_MASTER)
bad = pd.read_csv(BAD)["play_uuid"].unique()
print("Bad plays:", len(bad))

# Keep only the rows whose play is not in the report
is_bad_row = master["play_uuid"].isin(bad)
clean = master.loc[~is_bad_row].copy()
clean.to_parquet(OUT_READY, index=False)
print(f"geschrieben: {OUT_READY}  (Zeilen: {len(clean):,})")


Dashboard

In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd

from dash import Dash, dcc, html, Input, Output, State, dash_table
import plotly.express as px
import plotly.graph_objects as go
from flask_caching import Cache

# Clustering
from copy import deepcopy
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

# ========================
# CONFIG
# ========================
# Input file; can be overridden through the FEATURES_PATH environment variable
DEFAULT_FEATURES_PATH = "out_1hz_clean/master_1hz_4s_ready.parquet"
FEATURES_PATH = os.environ.get("FEATURES_PATH", DEFAULT_FEATURES_PATH)

MAX_HEATMAP_POINTS = int(os.environ.get("MAX_HEATMAP_POINTS", 60_000))
SPEED_HIST_BINS = 36
RQA_SHOW_MAX_GAMES = 6

# RQA defaults
RQA_DEFAULT_RR = 0.10  # 10 %
RQA_DEFAULT_LMIN = 2
RQA_DEFAULT_VMIN = 2

# RQA - feature weights & standardisation
RQA_FEATURE_WEIGHTS = (1.0, 1.0, 0.6)  # weight speed slightly lower
RQA_STANDARDIZE = True                 # z-score per axis before the distance

# Limits for the classic RQA (performance); both overridable via environment
RQA_CLASSIC_DEFAULT_MAXPTS = int(os.environ.get("RQA_CLASSIC_MAXPTS", 3000))  # cap on the matrix edge length
RQA_CLASSIC_DEFAULT_DECIM = int(os.environ.get("RQA_CLASSIC_DECIM", 1))       # every k-th point

# ========================
# DATA LOADING
# ========================
# Columns read from the features file when present
NEEDED_COLS = [
    # identifiers and coordinates
    "play_uuid", "player_id", "t_sec", "x_norm", "y",
    # player / game metadata
    "player_name", "position_code", "team_id", "game_id",
    "home_abbr", "away_abbr", "offense_team_id", "defense_team_id",
    # play metadata
    "play_quarter", "play_down", "play_yards_to_go", "play_type",
    # kinematics
    "dx", "dy", "speed", "heading_deg",
]

def _detect_y_col(df: pd.DataFrame) -> str:
    if "y" in df.columns: return "y"
    if "step_y" in df.columns: return "step_y"
    raise KeyError("Neither 'y' nor 'step_y' found.")

def load_data(path: str) -> tuple[pd.DataFrame, str, dict]:
    """Load the feature parquet and prepare it for the dashboard.

    The file is read, reduced to the columns listed in ``NEEDED_COLS`` (plus
    the detected y column), id/label columns are cast to ``category``, rows are
    sorted by game/play/player/time, and ``dx``, ``dy``, ``speed`` and
    ``heading_deg`` are recomputed from the positions whenever they are
    missing or contain NaNs.

    Args:
        path: Location of the parquet file.

    Returns:
        ``(df, y_col, game_labels)`` where ``y_col`` is ``"y"`` or
        ``"step_y"`` and ``game_labels`` maps ``game_id`` to a display label
        (empty when the home/away abbreviation columns are absent).

    Raises:
        FileNotFoundError: If *path* does not exist.
        KeyError: If neither ``y`` nor ``step_y`` is present.
        ValueError: If one of the core columns is missing.
    """
    path = str(path)
    if not Path(path).exists():
        raise FileNotFoundError(f"Features file not found: {path}")

    df0 = pd.read_parquet(path, columns=None)
    y_col = _detect_y_col(df0)

    # Validate the core columns BEFORE selecting them: selecting a missing
    # column would raise a bare pandas KeyError and this message would never
    # be shown.
    need = ["game_id", "play_uuid", "player_id", "t_sec", "x_norm", y_col]
    missing = [c for c in need if c not in df0.columns]
    if missing:
        raise ValueError(f"Fehlende Kernspalten im FEATURES_PATH: {missing}")

    # All core columns except a possible "step_y" are part of NEEDED_COLS.
    cols = [c for c in NEEDED_COLS if c in df0.columns]
    if y_col not in cols:
        cols.append(y_col)

    df = df0[cols].copy()
    del df0

    for cat in ["player_id", "player_name", "position_code", "team_id", "game_id",
                "home_abbr", "away_abbr", "play_type", "play_uuid"]:
        if cat in df.columns:
            df[cat] = df[cat].astype("category")

    # Sort for a stable diff computation (mergesort keeps the order of ties).
    df = df.sort_values(["game_id", "play_uuid", "player_id", "t_sec"], kind="mergesort")

    # Make sure dx/dy/speed/heading exist; recompute a column if it has NaNs.
    grp = ["game_id", "play_uuid", "player_id"]
    need_recalc_dx = ("dx" not in df.columns) or df["dx"].isna().any()
    need_recalc_dy = ("dy" not in df.columns) or df["dy"].isna().any()
    need_recalc_sp = ("speed" not in df.columns) or df["speed"].isna().any()

    if need_recalc_dx:
        df["dx"] = df.groupby(grp, observed=True)["x_norm"].diff().fillna(0.0)
    if need_recalc_dy:
        df["dy"] = df.groupby(grp, observed=True)[y_col].diff().fillna(0.0)
    if need_recalc_sp or need_recalc_dx or need_recalc_dy:
        # 1 Hz sampling: the magnitude of the per-step change is the speed per second.
        df["speed"] = np.hypot(df["dx"], df["dy"]).astype(float)

    if ("heading_deg" not in df.columns) or df["heading_deg"].isna().any():
        # atan2(dy, dx) in degrees
        df["heading_deg"] = np.degrees(np.arctan2(df["dy"], df["dx"]).astype(float))
        df["heading_deg"] = df["heading_deg"].fillna(0.0)

    # Human-readable game labels: "<home> vs <away>  •  <game_id>"
    game_labels = {}
    if {"game_id", "home_abbr", "away_abbr"}.issubset(df.columns):
        gmeta = df.groupby("game_id", observed=True)[["home_abbr", "away_abbr"]].first()
        for gid, row in gmeta.iterrows():
            game_labels[gid] = f"{row['home_abbr']} vs {row['away_abbr']}  •  {gid}"

    return df, y_col, game_labels

# Load once at import time; the rest of the module works on these globals.
DF, Y_COL, GAME_LABELS = load_data(FEATURES_PATH)
# Plain-string copy of the play id (load_data casts play_uuid to a categorical).
DF["play_uuid_str"] = DF["play_uuid"].astype(str)

def opt(lst):
    """Build Dash dropdown options: label is ``str(v)``, value is ``v`` unchanged."""
    options = []
    for item in lst:
        options.append({"label": str(item), "value": item})
    return options

# Static option lists derived from the full table; each falls back to [] when
# its column is absent. Positions and play types are stringified and sorted,
# players are limited to the 200 most frequent names, games keep their raw ids.
positions_all = sorted(map(str, DF["position_code"].dropna().unique().tolist())) if "position_code" in DF else []
playtypes_all = sorted(map(str, DF["play_type"].dropna().unique().tolist()))    if "play_type" in DF else []
players_all   = DF["player_name"].dropna().value_counts().head(200).index.tolist() if "player_name" in DF else []
games_all     = DF["game_id"].dropna().unique().tolist() if "game_id" in DF else []

# ========================
# PLAY-FEATURES & CLUSTER (additiv)
# ========================
def make_features_from_timeseries(df: pd.DataFrame, id_col="play_uuid", y_col="y"):
    """Summarise each group of *df* (one group per ``id_col`` value) as one row.

    For each of ``x_norm``, *y_col* and ``speed`` that exists, four robust
    statistics are computed over the time-sorted samples of the group:
    median (``_med``), median absolute deviation (``_mad``), inter-quartile
    range (``_iqr``) and the slope of a straight-line fit against the sample
    index (``_trend_lr``). The y column is always reported under the prefix
    ``"y"``, whatever its real name is.

    Args:
        df: Long-format table containing ``t_sec`` and *id_col*.
        id_col: Column identifying a group.
        y_col: Name of the y column in *df*.

    Returns:
        DataFrame with *id_col*, ``n_samples`` and the statistic columns.
    """
    feats = []
    for pid, g in df.groupby(id_col, observed=True):
        # Stable sort by time so the index used for the trend is chronological.
        gg = g.sort_values("t_sec", kind="mergesort")
        row = {id_col: pid, "n_samples": int(len(gg))}
        for col in ["x_norm", y_col, "speed"]:
            if col not in gg.columns:
                continue
            vals = gg[col].to_numpy(float)
            med = float(np.nanmedian(vals))
            mad = float(np.nanmedian(np.abs(vals - med)))
            q75, q25 = np.nanpercentile(vals, 75), np.nanpercentile(vals, 25)
            iqr = float(q75 - q25)

            # The other statistics are NaN-aware; np.polyfit is not, so fit
            # only the finite samples (keeping their original positions as x).
            # Fewer than three usable samples -> no trend, as before.
            finite = np.isfinite(vals)
            if finite.sum() >= 3:
                positions = np.arange(len(vals))
                slope = float(np.polyfit(positions[finite], vals[finite], 1)[0])
            else:
                slope = 0.0

            base = col if col != y_col else "y"
            row[f"{base}_med"] = med
            row[f"{base}_mad"] = mad
            row[f"{base}_iqr"] = iqr
            row[f"{base}_trend_lr"] = slope
        feats.append(row)
    return pd.DataFrame(feats)

# 1) Build the per-play features
FEATURES = make_features_from_timeseries(DF, id_col="play_uuid", y_col=Y_COL)

# 2) Cluster fitten (Standardisierung + PCA + k per Silhouette)
def cluster_fit_add_labels(FEATURES: pd.DataFrame, id_col="play_uuid", use_pca=True, pca_var=0.90):
    """Cluster the rows of *FEATURES* and return a copy with label columns.

    Pipeline: numeric feature selection -> median imputation -> z-scoring ->
    optional PCA -> K-Means for every admissible k (the k with the best
    silhouette wins) -> Ward agglomerative clustering with the same k.

    Args:
        FEATURES: One row per play; numeric columns are used as features.
        id_col: Identifier column, excluded from the features.
        use_pca: Whether to project onto principal components first.
        pca_var: Passed to ``PCA(n_components=...)``.

    Returns:
        ``(OUT, meta)`` – *FEATURES* plus ``cl_kmeans`` / ``cl_agg`` and a dict
        with ``k_best``, ``silhouette``, ``n_features``, ``pca_used``,
        ``pca_var``.

    Raises:
        ValueError: With fewer than two numeric feature columns, fewer than
            three rows, or when no k yields more than one cluster.
    """
    # Feature selection: numeric, without id / counts / existing labels.
    drop = {'cl_kmeans', 'cl_agg', 'cl_dbscan', id_col, 'n_samples'}
    feature_cols = [c for c in FEATURES.select_dtypes(include=[np.number]).columns if c not in drop]
    if len(feature_cols) < 2:
        raise ValueError("Zu wenig numerische Feature-Spalten für Clustering.")

    # The silhouette score needs 2 <= k <= n_samples - 1, so at least 3 rows
    # are required and the scan must not go beyond n_samples - 1.
    n_samples = len(FEATURES)
    if n_samples < 3:
        raise ValueError("Zu wenig Plays für Clustering (mindestens 3 erforderlich).")
    k_max = min(8, n_samples - 1)

    X = FEATURES[feature_cols].astype(float)
    # Median imputation; a column that is entirely NaN has no median, so any
    # value still missing afterwards is set to 0.
    X = X.fillna(X.median(numeric_only=True)).fillna(0.0)
    scaler = StandardScaler()
    Xs = scaler.fit_transform(X)
    if use_pca:
        pca = PCA(n_components=pca_var, svd_solver="full", random_state=0)
        Xc = pca.fit_transform(Xs)
    else:
        pca, Xc = None, Xs

    # Pick k by silhouette (2..8, capped at n_samples - 1).
    best = (-np.inf, None, None)
    for k in range(2, k_max + 1):
        km = KMeans(n_clusters=k, n_init=20, random_state=0)
        lab = km.fit_predict(Xc)
        sil = silhouette_score(Xc, lab) if len(set(lab)) > 1 else -np.inf
        if sil > best[0]:
            best = (sil, k, km)
    sil, k_best, km_best = best
    if km_best is None:
        # Every candidate collapsed into a single cluster.
        raise ValueError("Kein gültiges Clustering gefunden (alle Punkte identisch?).")
    labs_km = km_best.predict(Xc)

    # Agglomerative (Ward) with the same k.
    agg = AgglomerativeClustering(n_clusters=k_best, linkage="ward")
    labs_agg = agg.fit_predict(Xc)

    # Attach the labels to a copy; the input frame is left untouched.
    OUT = FEATURES.copy()
    OUT["cl_kmeans"] = labs_km
    OUT["cl_agg"] = labs_agg
    meta = {"k_best": int(k_best), "silhouette": float(sil), "n_features": len(feature_cols),
            "pca_used": bool(use_pca), "pca_var": float(pca_var)}
    return OUT, meta

FEATURES, CLUSTER_META = cluster_fit_add_labels(FEATURES, id_col="play_uuid", use_pca=True, pca_var=0.90)

# 3) Merge the labels back onto DF (left join on play_uuid; adds the two label columns)
DF = DF.merge(FEATURES[["play_uuid","cl_kmeans","cl_agg"]], on="play_uuid", how="left")

# Cluster options for the dropdown: sorted integer K-Means labels present in DF
clusters_all = sorted(pd.Series(DF["cl_kmeans"].dropna().unique()).astype(int).tolist()) if "cl_kmeans" in DF else []

# ========================
# APP & CACHE
# ========================
app = Dash(__name__)
app.title = "Football RA • CRP • RQA Dashboard"
# Cache attached to the underlying Flask server; entries expire after 300 s
# by default. Used to memoize filtered_df_cache().
cache = Cache(app.server, config={"CACHE_TYPE": "SimpleCache", "CACHE_DEFAULT_TIMEOUT": 300})

def _key(x):
    """Turn a filter selection into a hashable value usable as a cache key.

    ``None`` becomes an empty tuple, lists/tuples become tuples, anything else
    is returned unchanged.

    ``None`` used to map to the string ``"Ø"``. That sentinel is truthy, so
    the filter function would have treated it as an active selection and
    passed a plain string to ``Series.isin``, which only accepts list-likes.
    An empty tuple is equally hashable and correctly means "no filter".
    """
    if x is None:
        return ()
    if isinstance(x, (list, tuple)):
        return tuple(x)
    return x

@cache.memoize()
def filtered_df_cache(positions, playtypes, players, games, clusters, t0, t1, cols_tuple):
    """Return a copy of the rows of ``DF`` matching all active filters.

    An empty selection disables the corresponding filter. ``t0``/``t1`` bound
    ``t_sec`` inclusively. Only the columns of ``cols_tuple`` that exist are
    returned. All arguments must be hashable because the call is memoized.
    """
    frame = DF

    # Plain membership filters: (column, selected values).
    membership_filters = (
        ("position_code", positions),
        ("play_type", playtypes),
        ("player_name", players),
        ("game_id", games),
    )
    for column, wanted in membership_filters:
        if wanted and column in frame:
            frame = frame[frame[column].isin(wanted)]

    if clusters is not None and len(clusters) > 0 and "cl_kmeans" in frame:
        frame = frame[frame["cl_kmeans"].isin(clusters)]

    if "t_sec" in frame:
        not_too_early = frame["t_sec"] >= t0
        not_too_late = frame["t_sec"] <= t1
        frame = frame[not_too_early & not_too_late]

    available = [name for name in cols_tuple if name in frame.columns]
    return frame[available].copy()

def filtered_df(sel, cols):
    """Look up the filtered table for the selection dict *sel*.

    *sel* may hold ``positions``, ``playtypes``, ``players``, ``games``,
    ``clusters`` and ``t_range`` (default ``(0, 3)``). The values are made
    hashable and forwarded positionally to the memoized ``filtered_df_cache``.
    """
    window = sel.get("t_range", (0, 3))
    selections = [
        _key(sel.get(name))
        for name in ("positions", "playtypes", "players", "games", "clusters")
    ]
    return filtered_df_cache(*selections, window[0], window[1], tuple(cols))

def valid_options_from(df):
    """Return ``(positions, play_types, players, games)`` present in *df*.

    The first three are sorted lists of strings; games keep their raw values
    in order of first appearance. A missing column yields an empty list.
    """
    def _sorted_labels(column):
        if column not in df:
            return []
        distinct = df[column].dropna().unique().tolist()
        return sorted(str(value) for value in distinct)

    if "game_id" in df:
        game_ids = df["game_id"].dropna().unique().tolist()
    else:
        game_ids = []

    return (
        _sorted_labels("position_code"),
        _sorted_labels("play_type"),
        _sorted_labels("player_name"),
        game_ids,
    )

# ========================
# LAYOUT
# ========================
# Filter bar shared by all tabs: five multi-select dropdowns plus the t_sec
# range slider, laid out as a wrapping flex row. User-facing labels are German.
controls = html.Div([
    # Position filter
    html.Div([html.Label("Position(en)"),
              dcc.Dropdown(id="positions", options=opt(positions_all), multi=True,
                           placeholder="z. B. WR, DB …", persistence=True)],
             style={"flex":1,"minWidth":220,"marginRight":12}),
    # Play-type filter
    html.Div([html.Label("Play-Typ(en)"),
              dcc.Dropdown(id="play_types", options=opt(playtypes_all), multi=True,
                           placeholder="z. B. Pass, Rush …", persistence=True)],
             style={"flex":1,"minWidth":220,"marginRight":12}),
    # Player filter ("Spieler" = players)
    html.Div([html.Label("Spieler"),
              dcc.Dropdown(id="players", options=opt(players_all), multi=True,
                           placeholder="Spieler wählen …", persistence=True)],
             style={"flex":1,"minWidth":260,"marginRight":12}),
    # Game filter ("Spiele" = games); labels come from GAME_LABELS when available
    html.Div([html.Label("Spiele"),
              dcc.Dropdown(id="games",
                           options=[{"label": GAME_LABELS.get(g, str(g)), "value": g} for g in games_all],
                           multi=True, placeholder="Optional Spiele …", persistence=True)],
             style={"flex":1,"minWidth":260,"marginRight":12}),
    # K-Means cluster filter; the "__ALL__" pseudo-entry is stripped in the callbacks
    html.Div([html.Label("Cluster (K-Means)"),
              dcc.Dropdown(id="clusters",
                           options=[{"label": "alle", "value": "__ALL__"}] + [{"label": str(c), "value": int(c)} for c in clusters_all],
                           multi=True, placeholder="Cluster wählen …", persistence=True)],
             style={"flex":1,"minWidth":220,"marginRight":12}),
    # Time window in seconds, fixed to 0..3 in steps of 1
    html.Div([html.Label("t_sec"),
              dcc.RangeSlider(id="t_range", min=0, max=3, step=1, value=[0,3],
                              marks={i:str(i) for i in range(4)}, updatemode="mouseup")],
             style={"flex":1,"minWidth":220}),
], style={"display":"flex","flexWrap":"wrap","gap":8,"alignItems":"flex-end","marginBottom":10})

# Page layout: title, data-source line, the shared filter bar and four tabs
# (Overview, CRP, per-game RQA, classic RQA over the full series).
app.layout = html.Div([
    html.H3("Football RA • CRP • RQA Dashboard"),
    # Shows the loaded file name and the row count ("Daten" = data, "Zeilen" = rows)
    html.Div([html.Span("Daten: "), html.Code(Path(FEATURES_PATH).name),
              html.Span(f"  | Zeilen: {len(DF):,}", style={"opacity": .7, "marginLeft": 10})],
             style={"marginBottom": 8}),
    controls,
    dcc.Tabs(id="tabs", value="tab-overview", children=[
        # --- Tab 1: KPI tiles + heatmap + per-second means + speed histogram
        dcc.Tab(label="Overview", value="tab-overview", children=[
            html.Div(id="kpi-row", style={"display":"flex","gap":12,"flexWrap":"wrap","marginBottom":8}),
            dcc.Graph(id="heatmap_xy",     style={"height":"420px"}),
            dcc.Graph(id="profile_means",  style={"height":"340px"}),
            dcc.Graph(id="speed_hist",     style={"height":"300px"}),
        ]),
        # --- Tab 2: offense vs defense comparison chart and table
        dcc.Tab(label="CRP (Offense vs Defense)", value="tab-crp", children=[
            dcc.Graph(id="crp_chart", style={"height":"420px","marginTop":"10px"}),
            dash_table.DataTable(id="crp_table", page_size=10,
                                 style_table={"overflowX":"auto"},
                                 style_cell={"padding":"6px","fontFamily":"monospace","fontSize":12}),
        ]),
        # --- Tab 3: per-game RQA; computed only when the button is pressed
        dcc.Tab(label="RQA (pro Spiel – illustrativ)", value="tab-rqa", children=[
            html.Div([
                # Target recurrence rate
                html.Div([html.Label("Ziel-Recurrence Rate (RR)"),
                          dcc.Slider(id="rqa_target_rr", min=0.02, max=0.15, step=0.005, value=RQA_DEFAULT_RR,
                                     marks={0.05:"5%",0.1:"10%",0.125:"12.5%",0.15:"15%"})],
                         style={"minWidth":280,"marginRight":16}),
                # Minimum diagonal line length
                html.Div([html.Label("l_min"),
                          dcc.Dropdown(id="rqa_lmin", options=[{"label":i,"value":i} for i in [2,3,4]],
                                       value=RQA_DEFAULT_LMIN, clearable=False)],
                         style={"width":220,"marginRight":16}),
                # Minimum vertical line length
                html.Div([html.Label("v_min"),
                          dcc.Dropdown(id="rqa_vmin", options=[{"label":i,"value":i} for i in [2,3,4]],
                                       value=RQA_DEFAULT_VMIN, clearable=False)],
                         style={"width":220,"marginRight":16}),
                # Trigger button ("RQA berechnen" = compute RQA)
                html.Div([html.Label(" "), html.Button("RQA berechnen", id="rqa_compute", n_clicks=0, style={"width":"200px","height":"38px"})]),
            ], style={"display":"flex","flexWrap":"wrap","alignItems":"flex-end","gap":8,"marginBottom":8}),
            # One recurrence plot per game is placed into this grid
            html.Div(id="rqa_plots_grid",
                     style={"display":"grid","gridTemplateColumns":"repeat(auto-fit, minmax(260px, 1fr))","gap":"12px"}),
            dash_table.DataTable(id="rqa_table", page_size=10,
                                 style_table={"overflowX":"auto"},
                                 style_cell={"padding":"6px","fontFamily":"monospace","fontSize":12}),
            html.Div(id="rqa_note", style={"opacity":.7,"marginTop":6})
        ]),
        # --- Tab 4: classic RQA over the complete series of the current selection
        dcc.Tab(label="RQA (klassisch • komplette Serie)", value="tab-rqa-classic", children=[
            html.Div([
                # RR source: slider ("Dynamisch") or preset dropdown ("Vorgefertigt")
                html.Div([html.Label("RR-Modus"),
                          dcc.RadioItems(
                              id="rqac_rr_mode",
                              options=[
                                  {"label":"Dynamisch (Slider)", "value":"dynamic"},
                                  {"label":"Vorgefertigt (5% / 10% / 15%)", "value":"preset"},
                              ],
                              value="dynamic",
                              inline=True
                          )],
                         style={"minWidth":360,"marginRight":16}),
                html.Div([html.Label("Ziel-RR (dynamisch)"),
                          dcc.Slider(id="rqac_target_rr", min=0.02, max=0.15, step=0.005, value=RQA_DEFAULT_RR,
                                     marks={0.05:"5%",0.10:"10%",0.125:"12.5%",0.15:"15%"})],
                         style={"minWidth":280,"marginRight":16}),
                html.Div([html.Label("RR (vorgefertigt)"),
                          dcc.Dropdown(id="rqac_rr_preset",
                                       options=[{"label":"5%","value":0.05},
                                                {"label":"10%","value":0.10},
                                                {"label":"15%","value":0.15}],
                                       value=RQA_DEFAULT_RR, clearable=False)],
                         style={"width":180,"marginRight":16}),
                html.Div([html.Label("l_min"),
                          dcc.Dropdown(id="rqac_lmin", options=[{"label":i,"value":i} for i in [2,3,4,5]],
                                       value=RQA_DEFAULT_LMIN, clearable=False)],
                         style={"width":160,"marginRight":16}),
                html.Div([html.Label("v_min"),
                          dcc.Dropdown(id="rqac_vmin", options=[{"label":i,"value":i} for i in [2,3,4,5]],
                                       value=RQA_DEFAULT_VMIN, clearable=False)],
                         style={"width":160,"marginRight":16}),
                # Decimation: keep every k-th point of the series
                html.Div([html.Label("Decimation (jeder k-te Punkt)"),
                          dcc.Input(id="rqac_decim", type="number", min=1, step=1, value=RQA_CLASSIC_DEFAULT_DECIM, style={"width":"120px"})],
                         style={"marginRight":16}),
                # Upper bound on the number of points (edge length of the matrix)
                html.Div([html.Label("Max. Punkte (Cap)"),
                          dcc.Input(id="rqac_maxpts", type="number", min=200, step=100, value=RQA_CLASSIC_DEFAULT_MAXPTS, style={"width":"140px"})],
                         style={"marginRight":16}),
                html.Div([html.Label(" "), html.Button("Klassische RQA berechnen", id="rqac_compute", n_clicks=0, style={"width":"240px","height":"38px"})]),
            ], style={"display":"flex","flexWrap":"wrap","alignItems":"flex-end","gap":8,"marginBottom":8}),
            dcc.Graph(id="rqac_plot", style={"height":"560px"}),
            dash_table.DataTable(id="rqac_table", page_size=5,
                                 style_table={"overflowX":"auto"},
                                 style_cell={"padding":"6px","fontFamily":"monospace","fontSize":12}),
            html.Div(id="rqac_note", style={"opacity":.7,"marginTop":6})
        ]),
    ]),
])

# ========================
# FILTER-SYNC (entkoppelt, stabil)
# ========================
def _options_union_keep_selected(all_values, filtered_values, selected_values):
    """Build dropdown options from the filtered values plus the current selection.

    All values are compared as strings. If both the filtered values and the
    selection are empty, every value of *all_values* is offered instead.
    The result is sorted and converted with ``opt``.
    """
    chosen = {str(v) for v in (selected_values or [])}
    candidates = {str(v) for v in (filtered_values or [])} | chosen
    if not candidates:
        candidates = {str(v) for v in (all_values or [])} | chosen
    return opt(sorted(candidates))

@app.callback(
    Output("positions","options"), Output("play_types","options"),
    Output("players","options"),  Output("games","options"),
    Input("positions","value"),   Input("play_types","value"),
    Input("players","value"),     Input("games","value"),
)
def sync_filters(pos_v, pt_v, pl_v, gm_v):
    """Narrow each dropdown's options to what the OTHER three filters allow.

    For every dropdown the table is filtered with all selections except its
    own (no cluster filter, full 0..3 s window) and the distinct values of its
    column become the options. Currently selected values are always kept, and
    an empty result falls back to the complete list.

    The games dropdown used to skip both rules, so a selected game could
    vanish from its own option list; it now behaves like the other three.
    """
    current = dict(positions=pos_v or [], playtypes=pt_v or [],
                   players=pl_v or [], games=gm_v or [])

    def _values_ignoring(own_key, column, fallback):
        # Apply every filter except the dropdown's own selection.
        sel = dict(current, clusters=[], t_range=(0, 3))
        sel[own_key] = []
        q = filtered_df(sel, [column])
        return q[column].dropna().unique().tolist() if column in q else fallback

    pos_opts = _options_union_keep_selected(
        positions_all, _values_ignoring("positions", "position_code", positions_all), pos_v)
    pt_opts = _options_union_keep_selected(
        playtypes_all, _values_ignoring("playtypes", "play_type", playtypes_all), pt_v)
    pl_opts = _options_union_keep_selected(
        players_all, _values_ignoring("players", "player_name", players_all), pl_v)

    # Games keep their raw ids as option values (labels come from GAME_LABELS),
    # so the union is built on the values themselves rather than on strings.
    gm_vals = _values_ignoring("games", "game_id", games_all)
    gm_keep = list(dict.fromkeys([*gm_vals, *(gm_v or [])]))
    if not gm_keep:
        gm_keep = list(games_all)
    gm_opts = [{"label": GAME_LABELS.get(g, str(g)), "value": g}
               for g in sorted(gm_keep, key=str)]

    return pos_opts, pt_opts, pl_opts, gm_opts

# ========================
# OVERVIEW CALLBACK
# ========================
@app.callback(
    Output("kpi-row","children"), Output("heatmap_xy","figure"),
    Output("profile_means","figure"), Output("speed_hist","figure"),
    Input("positions","value"), Input("play_types","value"),
    Input("players","value"), Input("games","value"), Input("clusters","value"), Input("t_range","value"),
)
def update_overview(pos_v, pt_v, pl_v, gm_v, cl_v, tr_v):
    """Rebuild the Overview tab for the current filter selection.

    Returns the KPI tiles, the x/y density heatmap, the per-second mean
    profile and the speed histogram, in that order.
    """
    # Drop the "__ALL__" pseudo-entry; an empty list means "no cluster filter".
    picked_clusters = [c for c in cl_v if c != "__ALL__"] if cl_v else []
    sel = {
        "positions": pos_v or [],
        "playtypes": pt_v or [],
        "players": pl_v or [],
        "games": gm_v or [],
        "clusters": picked_clusters,
        "t_range": tuple(tr_v or (0, 3)),
    }
    wanted_cols = ["play_uuid", "player_id", "game_id", "t_sec", "x_norm", Y_COL, "speed", "cl_kmeans"]
    q = filtered_df(sel, wanted_cols)

    def kpi_tile(caption, value):
        # Small bordered card: faint caption on top, bold value below.
        caption_div = html.Div(caption, style={"fontSize":12,"opacity":.7})
        value_div = html.Div(f"{value}", style={"fontSize":22,"fontWeight":600})
        return html.Div(
            [caption_div, value_div],
            style={"padding":"8px 12px","border":"1px solid #eee","borderRadius":8,"minWidth":140},
        )

    kpis = [kpi_tile("Zeilen", f"{len(q):,}")]
    for caption, column in (("Unique Plays", "play_uuid"),
                            ("Unique Spieler", "player_id"),
                            ("Unique Spiele", "game_id")):
        kpis.append(kpi_tile(caption, q[column].nunique()))
    # Clustering summary tiles
    if CLUSTER_META:
        kpis.append(kpi_tile("K-Means k", CLUSTER_META.get("k_best", "—")))
        kpis.append(kpi_tile("Silhouette", f"{CLUSTER_META.get('silhouette', float('nan')):.3f}"))

    # Heatmap on a reproducible random sample when the selection is large.
    heat_src = q.sample(MAX_HEATMAP_POINTS, random_state=42) if len(q) > MAX_HEATMAP_POINTS else q
    hm = px.density_heatmap(heat_src, x="x_norm", y=Y_COL, nbinsx=60, nbinsy=27, histnorm="")
    hm.update_layout(title="Dichte: x_norm vs. y (gesampelt)")
    hm.update_yaxes(scaleanchor="x", scaleratio=53.33/120)

    # Mean position and speed for every second.
    per_second = (
        q.groupby("t_sec")
         .agg(mean_x=("x_norm","mean"), mean_y=(Y_COL,"mean"), mean_v=("speed","mean"))
         .reset_index()
    )
    prof = go.Figure()
    trace_names = {"mean_x": "mean x_norm", "mean_y": "mean y", "mean_v": "mean speed (yd/s)"}
    for column, trace_name in trace_names.items():
        prof.add_trace(go.Scatter(x=per_second["t_sec"], y=per_second[column],
                                  mode="lines+markers", name=trace_name))
    prof.update_layout(title="Mittelwerte je t_sec", xaxis_title="t_sec", yaxis_title="Wert")

    hist = px.histogram(q, x="speed", nbins=SPEED_HIST_BINS, title="Geschwindigkeit (yd/s)")
    return kpis, hm, prof, hist

# ========================
# CRP CALLBACK (mit korrekter Off/Def-Trennung)
# ========================
def crp_off_vs_def(df: pd.DataFrame):
    """Compare per-second means of offense and defense.

    A row is offense when ``team_id == offense_team_id`` and defense when
    ``team_id == defense_team_id``. For each side the mean of ``x_norm``, the
    y column and ``speed`` is taken per ``t_sec``; gaps are forward/backward
    filled and the differences offense minus defense are added as
    ``dx``/``dy``/``dv``.

    If the team columns are missing, no split is possible: the overall means
    are reported for both sides and all differences are zero.
    """
    def _mean_by_second(frame, prefix):
        means = frame.groupby("t_sec").agg(
            x=("x_norm", "mean"), y=(Y_COL, "mean"), v=("speed", "mean")
        )
        return means.add_prefix(prefix).reset_index()

    required = {"t_sec", "x_norm", Y_COL, "speed", "team_id", "offense_team_id", "defense_team_id"}
    if not required.issubset(df.columns):
        # Fallback: no offense/defense split available.
        out = _mean_by_second(df, "off_")
        for axis in ("x", "y", "v"):
            out[f"def_{axis}"] = out[f"off_{axis}"]
        for delta in ("dx", "dy", "dv"):
            out[delta] = 0.0
        return out

    team = df["team_id"].astype("Int64")
    offense_rows = df[team == df["offense_team_id"].astype("Int64")]
    defense_rows = df[team == df["defense_team_id"].astype("Int64")]

    out = pd.merge(
        _mean_by_second(offense_rows, "off_"),
        _mean_by_second(defense_rows, "def_"),
        on="t_sec",
        how="outer",
    ).sort_values("t_sec")

    # A second present for only one side borrows the neighbouring value.
    value_cols = ["off_x", "off_y", "off_v", "def_x", "def_y", "def_v"]
    out[value_cols] = out[value_cols].ffill().bfill()

    for delta, axis in (("dx", "x"), ("dy", "y"), ("dv", "v")):
        out[delta] = out[f"off_{axis}"] - out[f"def_{axis}"]
    return out

@app.callback(
    Output("crp_chart","figure"), Output("crp_table","columns"), Output("crp_table","data"),
    Input("positions","value"), Input("play_types","value"),
    Input("players","value"), Input("games","value"), Input("clusters","value"), Input("t_range","value"),
)
def update_crp(pos_v, pt_v, pl_v, gm_v, cl_v, tr_v):
    """Rebuild the CRP tab: offense/defense line chart plus the underlying table."""
    # Drop the "__ALL__" pseudo-entry; an empty list means "no cluster filter".
    picked_clusters = [c for c in cl_v if c != "__ALL__"] if cl_v else []
    sel = {
        "positions": pos_v or [],
        "playtypes": pt_v or [],
        "players": pl_v or [],
        "games": gm_v or [],
        "clusters": picked_clusters,
        "t_range": tuple(tr_v or (0, 3)),
    }
    q = filtered_df(sel, ["t_sec", "x_norm", Y_COL, "speed", "team_id", "offense_team_id", "defense_team_id"])
    comp = crp_off_vs_def(q)

    # Position traces first, then speed traces.
    traces = [
        ("off_x", "Offense mean x"), ("def_x", "Defense mean x"), ("dx", "Δx (Off-Def)"),
        ("off_v", "Offense mean v"), ("def_v", "Defense mean v"), ("dv", "Δv (Off-Def)"),
    ]
    fig = go.Figure()
    for column, trace_name in traces:
        # Every column starting with "d" is dashed, i.e. the deltas and the def_* series.
        line_style = dict(dash="dash") if column.startswith("d") else None
        fig.add_trace(go.Scatter(x=comp["t_sec"], y=comp[column], mode="lines+markers",
                                 name=trace_name, line=line_style))
    fig.update_layout(title="CRP: Offense vs Defense (x & v)", xaxis_title="t_sec")

    cols = [{"name": column, "id": column} for column in comp.columns]
    data = comp.round(3).to_dict("records")
    return fig, cols, data

# ========================
# RQA HELFER
# ========================
def pairwise_dist(A: np.ndarray, w: np.ndarray):
    """Return the weighted Euclidean distance matrix of the rows of *A*.

    *A* has shape (N, D) and *w* holds D weights; entry (i, j) of the result
    is ``sqrt(sum_d w[d] * (A[i, d] - A[j, d])**2)``.
    """
    points = np.asarray(A, float)
    weights = np.asarray(w, float).reshape(1, -1)
    # Broadcasting yields an (N, N, D) array of coordinate differences.
    delta = points[:, None, :] - points[None, :, :]
    weighted_sq = delta**2 * weights
    return np.sqrt(weighted_sq.sum(axis=2))

def recurrence_matrix(arr: np.ndarray, target_rr: float, w=(1.0,1.0,1.0), standardize: bool = True):
    """Threshold the pairwise distances of *arr* into a 0/1 recurrence matrix.

    The threshold epsilon is the *target_rr* quantile of the distances above
    the main diagonal. With a single point there are no such distances and
    epsilon is 0.

    Args:
        arr: (N, D) trajectory.
        target_rr: Quantile in [0, 1] used to choose epsilon.
        w: Per-axis weights passed to ``pairwise_dist``.
        standardize: z-score each axis first (1e-9 is added to the std).

    Returns:
        ``(R, eps)`` – integer matrix with ``R[i, j] = 1`` where the distance
        is at most ``eps``, and the threshold itself.
    """
    points = np.asarray(arr, float)
    if standardize:
        center = points.mean(axis=0, keepdims=True)
        spread = points.std(axis=0, keepdims=True) + 1e-9
        points = (points - center) / spread

    D = pairwise_dist(points, np.asarray(w, float))
    upper = D[np.triu_indices_from(D, k=1)]
    eps = float(np.quantile(upper, target_rr)) if len(upper) else 0.0
    return (D <= eps).astype(int), eps

def _run_lengths(seq, min_len):
    """Return the lengths of all maximal runs of 1s in *seq* that are >= *min_len*."""
    hits = (np.asarray(seq) == 1).astype(np.int8)
    if hits.size == 0:
        return []
    # Pad with zeros so runs touching either end produce a start and an end edge.
    edges = np.diff(np.concatenate(([0], hits, [0])))
    starts = np.flatnonzero(edges == 1)
    ends = np.flatnonzero(edges == -1)
    runs = ends - starts
    return runs[runs >= min_len].tolist()


def _length_entropy(lengths):
    """Shannon entropy (natural log) of the distribution of line lengths."""
    if not lengths:
        return 0.0
    _, counts = np.unique(lengths, return_counts=True)
    p = counts / counts.sum()
    return float(-(p * np.log(p + 1e-12)).sum())


def rqa_metrics(R: np.ndarray, l_min=2, v_min=2, include_loi=False):
    """Compute recurrence quantification measures for a 0/1 matrix *R*.

    Args:
        R: Square recurrence matrix with entries 0/1.
        l_min: Minimum length for a diagonal line to count.
        v_min: Minimum length for a vertical line to count.
        include_loi: Keep the main diagonal (line of identity). It is excluded
            by default: every point trivially recurs with itself, so counting
            it makes ``Lmax`` equal to N for every input and inflates RR, DET
            and LAM. Pass ``True`` to get the former numbers.

    Returns:
        Dict with ``RR`` (recurrence rate), ``DET`` (share of recurrence
        points on diagonal lines), ``L`` / ``Lmax`` (mean / longest diagonal
        line), ``ENTR`` (entropy of diagonal line lengths), ``LAM`` (share of
        points on vertical lines), ``TT`` (mean vertical line length) and
        ``ENTR_V`` (entropy of vertical line lengths). All measures are 0
        when there are no qualifying lines.
    """
    R = np.asarray(R)
    n_cells = R.size
    if not include_loi:
        R = R.copy()  # never modify the caller's matrix
        np.fill_diagonal(R, 0)
        n_cells -= min(R.shape) if R.ndim == 2 else 0

    total = float(R.sum())
    RR = total / n_cells if n_cells > 0 else 0.0

    # Diagonal lines: scan every diagonal of the matrix.
    diag_lengths = []
    edge = R.shape[0]
    for k in range(-(edge - 1), edge):
        diag_lengths.extend(_run_lengths(np.diag(R, k), l_min))

    DET = (sum(diag_lengths) / total) if total > 0 and diag_lengths else 0.0
    Lmax = max(diag_lengths) if diag_lengths else 0
    L = float(np.mean(diag_lengths)) if diag_lengths else 0.0
    ENTR = _length_entropy(diag_lengths)

    # Vertical lines: scan every column.
    vert_lengths = []
    for j in range(R.shape[1]):
        vert_lengths.extend(_run_lengths(R[:, j], v_min))

    LAM = (sum(vert_lengths) / total) if total > 0 and vert_lengths else 0.0
    TT = float(np.mean(vert_lengths)) if vert_lengths else 0.0
    ENTR_V = _length_entropy(vert_lengths)

    return dict(RR=RR, DET=DET, L=L, Lmax=Lmax, ENTR=ENTR, LAM=LAM, TT=TT, ENTR_V=ENTR_V)

def game_traj(df_game: pd.DataFrame, y_col: str):
    """Return a 4x3 array of mean (x_norm, y, speed) for t_sec = 0, 1, 2, 3.

    Values are averaged over all rows sharing a second; seconds without data
    are filled from the previous, then the next, available second.
    """
    value_cols = ["x_norm", y_col, "speed"]
    per_second = df_game.groupby("t_sec")[value_cols].mean()
    per_second = per_second.reindex([0, 1, 2, 3])
    filled = per_second.ffill().bfill()
    return filled.to_numpy(float)

def build_full_series(df: pd.DataFrame, y_col: str, decim: int, maxpts: int) -> np.ndarray:
    """Build one continuous series from the current selection.

    Rows are averaged for every (game_id, play_uuid, t_sec) combination and
    concatenated in that key order. The series is then decimated (every
    *decim*-th point), capped at *maxpts* points and gaps are forward/backward
    filled.

    Args:
        df: Table with ``game_id``, ``play_uuid``, ``t_sec``, ``x_norm``,
            *y_col* and ``speed``.
        y_col: Name of the y column.
        decim: Decimation step; ``None``, non-numeric or < 1 means 1.
        maxpts: Maximum number of points; falsy or non-positive means no cap.
            Both numbers may arrive as floats from a numeric input widget and
            are truncated to int, because slices require integers.

    Returns:
        Array of shape (N, 3) with columns x, y, speed; (0, 3) for empty input.
    """
    if df.empty:
        return np.empty((0, 3), float)

    keys = ["game_id", "play_uuid", "t_sec"]
    value_cols = ["x_norm", y_col, "speed"]
    # groupby sorts by the keys itself, so no separate sort is needed.
    q = df[keys + value_cols].groupby(keys, observed=True)[value_cols].mean().reset_index()
    series = q[value_cols].to_numpy(float)

    try:
        step = int(decim) if decim is not None else 1
    except (TypeError, ValueError):
        step = 1
    series = series[::max(step, 1)]

    try:
        cap = int(maxpts) if maxpts else 0
    except (TypeError, ValueError):
        cap = 0
    if cap > 0 and series.shape[0] > cap:
        series = series[:cap, :]

    return pd.DataFrame(series, columns=["x", "y", "v"]).ffill().bfill().to_numpy(float)

# ========================
# RQA CALLBACK (pro Spiel)
# ========================
# RQA pro Spiel
@app.callback(
    Output("rqa_plots_grid","children"),
    Output("rqa_table","columns"),
    Output("rqa_table","data"),
    Output("rqa_note","children"),
    Input("rqa_compute","n_clicks"),
    State("positions","value"), State("play_types","value"),
    State("players","value"),  State("games","value"),
    State("t_range","value"),
    State("rqa_target_rr","value"), State("rqa_lmin","value"), State("rqa_vmin","value"),
    State("clusters","value"),
    prevent_initial_call=True
)
def compute_rqa(n_clicks, pos_v, pt_v, pl_v, gm_v, tr_v, target_rr, l_min, v_min, cl_v):
    """Run the per-game RQA when the button is pressed.

    For up to ``RQA_SHOW_MAX_GAMES`` games of the current selection a
    recurrence matrix is built from the game's 4-point mean trajectory.
    Returns the plot components, the table columns, the table rows (one per
    game) and an explanatory note.
    """
    # Drop the "__ALL__" pseudo-entry; an empty list means "no cluster filter".
    picked_clusters = [c for c in cl_v if c != "__ALL__"] if cl_v else []
    sel = {
        "positions": pos_v or [],
        "playtypes": pt_v or [],
        "players": pl_v or [],
        "games": gm_v or [],
        "clusters": picked_clusters,
        "t_range": tuple(tr_v or (0, 3)),
    }
    q = filtered_df(sel, ["game_id", "t_sec", "x_norm", Y_COL, "speed", "home_abbr", "away_abbr", "cl_kmeans"])

    if q.empty:
        return [html.Div("Keine Daten für die aktuelle Auswahl.", style={"padding":"8px"})], [], [], ""

    plots = []
    rows = []
    shown_games = list(q["game_id"].dropna().unique())[:RQA_SHOW_MAX_GAMES]
    for gid in shown_games:
        trajectory = game_traj(q[q["game_id"] == gid], Y_COL)
        R, eps = recurrence_matrix(trajectory, target_rr=target_rr,
                                   w=RQA_FEATURE_WEIGHTS, standardize=RQA_STANDARDIZE)
        mets = rqa_metrics(R, l_min=l_min, v_min=v_min)
        match_label = GAME_LABELS.get(gid, str(gid))

        # Black/white recurrence plot for this game.
        fig = px.imshow(R, origin="lower", aspect="equal",
                        labels=dict(x="t (s)", y="t (s)"),
                        color_continuous_scale=["#ffffff", "#000000"])
        fig.update_layout(title=f"RQA — {match_label}  (ε≈{eps:.3f}, RR≈{target_rr:.1%})",
                          margin=dict(l=30,r=10,t=46,b=30))
        plots.append(dcc.Graph(figure=fig, style={"height":"260px"}))

        row = {"game_id": gid, "match": match_label}
        row["RR"] = round(mets["RR"], 4)
        row["DET"] = round(mets["DET"], 4)
        row["L"] = round(mets["L"], 3)
        row["Lmax"] = int(mets["Lmax"])
        row["ENTR"] = round(mets["ENTR"], 3)
        row["LAM"] = round(mets["LAM"], 4)
        row["TT"] = round(mets["TT"], 3)
        row["ENTR_V"] = round(mets["ENTR_V"], 3)
        row["epsilon_auto"] = round(eps, 4)
        rows.append(row)

    table_fields = ["game_id", "match", "RR", "DET", "L", "Lmax", "ENTR", "LAM", "TT", "ENTR_V", "epsilon_auto"]
    cols = [{"name": field, "id": field} for field in table_fields]
    note = "".join([
        f"Es werden max. {RQA_SHOW_MAX_GAMES} Spiele visualisiert. ",
        f"Button gedrückt: {n_clicks}. ",
        "Hinweis: Die pro-Spiel-Ansicht ist explorativ/illustrativ (nur 4 Zeitpunkte @ 1 Hz).",
    ])

    return plots, cols, rows, note

# ========================
# RQA CALLBACK (klassisch • komplette Serie)
# ========================
# RQA klassisch
@app.callback(
    Output("rqac_plot","figure"),
    Output("rqac_table","columns"),
    Output("rqac_table","data"),
    Output("rqac_note","children"),
    Input("rqac_compute","n_clicks"),
    State("positions","value"), State("play_types","value"),
    State("players","value"),  State("games","value"),
    State("t_range","value"),
    State("rqac_rr_mode","value"),
    State("rqac_target_rr","value"), State("rqac_rr_preset","value"),
    State("rqac_lmin","value"), State("rqac_vmin","value"),
    State("rqac_decim","value"), State("rqac_maxpts","value"),
    State("clusters","value"),
    prevent_initial_call=True
)
def compute_rqa_classic(n_clicks, pos_v, pt_v, pl_v, gm_v, tr_v,
                        rr_mode, target_rr, rr_preset, l_min, v_min, decim, maxpts, cl_v):
    """Compute the recurrence plot and RQA metrics for the whole filtered series.

    Triggered by the "rqac_compute" button; all filter widgets are read as State.

    Returns:
        A 4-tuple ``(figure, table_columns, table_rows, note)`` matching the
        four callback outputs. When the selection is empty or the series has
        fewer than two points, an empty figure with an explanatory title is
        returned together with empty table definitions.
    """
    # The "__ALL__" dropdown entry means "no cluster restriction".
    cl_v_norm = [] if not cl_v else [c for c in cl_v if c != "__ALL__"]
    sel = dict(positions=pos_v or [], playtypes=pt_v or [], players=pl_v or [],
               games=gm_v or [], clusters=cl_v_norm, t_range=tuple(tr_v or (0,3)))
    q = filtered_df(sel, ["game_id","play_uuid","t_sec","x_norm",Y_COL,"speed","cl_kmeans"])
    if q.empty:
        empty_fig = go.Figure().update_layout(title="Keine Daten für die aktuelle Auswahl.")
        return empty_fig, [], [], ""
    print("RQA classic — Cluster-Filter:", cl_v_norm or "alle", "| rows:", len(q))  # Debug

    # Resolve the effective limits once so the series and the table agree,
    # and so a float coming from the number inputs cannot break slicing.
    decim_eff = int(decim or RQA_CLASSIC_DEFAULT_DECIM)
    maxpts_eff = int(maxpts or RQA_CLASSIC_DEFAULT_MAXPTS)

    # Build the series exactly once (the previous version built it twice).
    arr = build_full_series(q, Y_COL, decim=decim_eff, maxpts=maxpts_eff)
    if arr.shape[0] < 2:
        empty_fig = go.Figure().update_layout(title="Zu wenige Punkte für RQA.")
        return empty_fig, [], [], "Hinweis: Serie hat < 2 Punkte."

    # The target recurrence rate comes from the slider or from the preset dropdown.
    rr_val = float(target_rr if rr_mode == "dynamic" else rr_preset)
    R, eps = recurrence_matrix(arr, target_rr=rr_val, w=RQA_FEATURE_WEIGHTS, standardize=RQA_STANDARDIZE)
    mets = rqa_metrics(R, l_min=int(l_min), v_min=int(v_min))

    fig = px.imshow(R, origin="lower", aspect="equal",
                    labels=dict(x="time in s", y="time in s"),
                    color_continuous_scale=["#ffffff", "#000000"])
    fig.update_coloraxes(showscale=True)
    fig.update_layout(
        title=f"Klassische RQA — komplette Serie (ε≈{eps:.3f}, RR≈{rr_val:.1%}, N={R.shape[0]})",
        margin=dict(l=40,r=20,t=60,b=40)
    )

    cols = [{"name": c, "id": c} for c in ["N","RR","DET","L","Lmax","ENTR","LAM","TT","ENTR_V","epsilon_auto","rr_mode","decim","maxpts"]]
    data = [{
        "N": int(R.shape[0]),
        "RR": round(mets["RR"], 4),
        "DET": round(mets["DET"], 4),
        "L": round(mets["L"], 3),
        "Lmax": int(mets["Lmax"]),
        "ENTR": round(mets["ENTR"], 3),
        "LAM": round(mets["LAM"], 4),
        "TT": round(mets["TT"], 3),
        "ENTR_V": round(mets["ENTR_V"], 3),
        "epsilon_auto": round(eps, 4),
        "rr_mode": rr_mode,
        "decim": decim_eff,
        "maxpts": maxpts_eff,
    }]

    note = ("Serie wird aus der aktuellen Filterauswahl gebildet: geordnete Folge aller ausgewählten Plays (1 Hz), "
            "pro Zeitstempel Mittelwert über gefilterte Spieler. RR kann dynamisch (Slider) "
            "oder als vorgefertigte Variante (5 % / 10 % / 15 %) gewählt werden. "
            "‚Decimation‘ reduziert die Länge (jeder k-te Punkt), ‚Max. Punkte‘ deckelt die Kantenlänge der Matrix.")
    return fig, cols, data, note

# ========================
# MAIN
# ========================
if __name__ == "__main__":
    # Print a short startup summary of the loaded data and the clustering
    # result, then start the Dash server.
    print("\n Loaded:")
    print("  path:", FEATURES_PATH)
    print("  DF shape:", DF.shape)
    print("  DF columns:", DF.columns.tolist())
    print("  has speed?", ("speed" in DF.columns) and not pd.isna(DF["speed"]).any())
    print("\nClustering:")
    print("  PCA used:", CLUSTER_META.get("pca_used"))
    print("  PCA variance retained:", CLUSTER_META.get("pca_var"))
    print("  n_features (for clustering):", CLUSTER_META.get("n_features"))
    print("  k_best:", CLUSTER_META.get("k_best"))
    # Formatting None with ':.3f' raises TypeError, so guard a missing key.
    silhouette = CLUSTER_META.get("silhouette")
    print("  silhouette:", f"{silhouette:.3f}" if silhouette is not None else "n/a")
    # PORT can be overridden through the environment; the reloader is disabled.
    app.run(debug=False, port=int(os.getenv("PORT", 8050)), use_reloader=False)

In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd

from dash import Dash, dcc, html, Input, Output, State, dash_table
import plotly.express as px
import plotly.graph_objects as go
from flask_caching import Cache

# Clustering
from copy import deepcopy
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

# ========================
# CONFIG
# ========================
# Default feature table; the FEATURES_PATH environment variable overrides it.
DEFAULT_FEATURES_PATH = "out_1hz_clean/master_1hz_4s_ready.parquet"
FEATURES_PATH = os.getenv("FEATURES_PATH", DEFAULT_FEATURES_PATH)

# Row cap for the density heatmap sample, histogram bin count for the speed
# plot, and the number of games quoted in the per-game RQA note.
MAX_HEATMAP_POINTS = int(os.getenv("MAX_HEATMAP_POINTS", 60_000))
SPEED_HIST_BINS    = 36
RQA_SHOW_MAX_GAMES = 6

# RQA defaults (initial values of the corresponding widgets)
RQA_DEFAULT_RR   = 0.10  # 10 %
RQA_DEFAULT_LMIN = 2
RQA_DEFAULT_VMIN = 2

# RQA – feature weights & standardization
RQA_FEATURE_WEIGHTS = (1.0, 1.0, 0.6)  # weight speed slightly lower
RQA_STANDARDIZE     = True             # z-score each axis before computing distances

# Limits for the classic RQA (performance)
RQA_CLASSIC_DEFAULT_MAXPTS = int(os.getenv("RQA_CLASSIC_MAXPTS", 3000))  # cap on the matrix edge length
RQA_CLASSIC_DEFAULT_DECIM  = int(os.getenv("RQA_CLASSIC_DECIM", 1))      # keep every k-th point

# ========================
# DATA LOADING
# ========================
# Columns kept from the feature file when present; load_data() drops every
# other column after reading.
NEEDED_COLS = [
    "play_uuid","player_id","t_sec","x_norm","y",
    "player_name","position_code","team_id","game_id",
    "home_abbr","away_abbr","offense_team_id","defense_team_id",
    "play_quarter","play_down","play_yards_to_go","play_type",
    "dx","dy","speed","heading_deg"
]

def _detect_y_col(df: pd.DataFrame) -> str:
    """Return the name of the y-coordinate column, preferring 'y' over 'step_y'.

    Raises:
        KeyError: if the frame has neither column.
    """
    for candidate in ("y", "step_y"):
        if candidate in df.columns:
            return candidate
    raise KeyError("Neither 'y' nor 'step_y' found.")

def load_data(path: str) -> tuple[pd.DataFrame, str, dict]:
    """Load the feature table and make sure the motion columns are usable.

    Args:
        path: Location of the parquet file with the per-player time series.

    Returns:
        ``(df, y_col, game_labels)`` where ``df`` is restricted to the columns
        of NEEDED_COLS that exist (plus the detected y column), sorted by
        game/play/player/time; ``y_col`` is the detected y column name; and
        ``game_labels`` maps each game_id to a "HOME vs AWAY  •  id" label
        (empty when the abbreviation columns are missing).

    Raises:
        FileNotFoundError: if ``path`` does not exist.
        KeyError: if neither 'y' nor 'step_y' is present.
        ValueError: if one of the core columns is missing.
    """
    path = str(path)
    if not Path(path).exists():
        raise FileNotFoundError(f"Features file not found: {path}")

    df0 = pd.read_parquet(path, columns=None)
    y_col = _detect_y_col(df0)

    # Validate the core columns BEFORE slicing. Previously the missing names
    # were appended to the selection, so df0[cols] failed with a bare KeyError
    # and the descriptive ValueError below could never be raised.
    need = {"game_id","play_uuid","player_id","t_sec","x_norm",y_col}
    missing = sorted(need.difference(df0.columns))
    if missing:
        raise ValueError(f"Fehlende Kernspalten im FEATURES_PATH: {missing}")

    cols = [c for c in NEEDED_COLS if c in df0.columns]
    if y_col not in cols:
        cols.append(y_col)

    df = df0[cols].copy()
    del df0  # release the full table early

    # Low-cardinality identifiers are stored as categoricals.
    for cat in ["player_id","player_name","position_code","team_id","game_id",
                "home_abbr","away_abbr","play_type","play_uuid"]:
        if cat in df.columns:
            df[cat] = df[cat].astype("category")

    # Sort for a stable diff computation (mergesort keeps the order of ties).
    df = df.sort_values(["game_id","play_uuid","player_id","t_sec"], kind="mergesort")

    # Ensure dx/dy/speed/heading exist; recompute them when missing or when
    # they contain NaNs.
    grp = ["game_id","play_uuid","player_id"]
    need_recalc_dx = ("dx" not in df.columns) or df["dx"].isna().any()
    need_recalc_dy = ("dy" not in df.columns) or df["dy"].isna().any()
    need_recalc_sp = ("speed" not in df.columns) or df["speed"].isna().any()

    if need_recalc_dx:
        df["dx"] = df.groupby(grp, observed=True)["x_norm"].diff().fillna(0.0)
    if need_recalc_dy:
        df["dy"] = df.groupby(grp, observed=True)[y_col].diff().fillna(0.0)
    if need_recalc_sp or need_recalc_dx or need_recalc_dy:
        # 1 Hz sampling → magnitude of the per-step change, in yd/s
        df["speed"] = np.hypot(df["dx"], df["dy"]).astype(float)

    if ("heading_deg" not in df.columns) or df["heading_deg"].isna().any():
        # atan2(dy, dx) in degrees
        df["heading_deg"] = np.degrees(np.arctan2(df["dy"], df["dx"]).astype(float))
        df["heading_deg"] = df["heading_deg"].fillna(0.0)

    # Human-readable game labels
    game_labels = {}
    if {"game_id","home_abbr","away_abbr"}.issubset(df.columns):
        gmeta = df.groupby("game_id", observed=True)[["home_abbr","away_abbr"]].first()
        for gid, row in gmeta.iterrows():
            game_labels[gid] = f"{row['home_abbr']} vs {row['away_abbr']}  •  {gid}"

    return df, y_col, game_labels

# Load the feature table once at import time; every callback reads from DF.
DF, Y_COL, GAME_LABELS = load_data(FEATURES_PATH)
# Plain-string copy of the play id alongside the column returned by load_data.
DF["play_uuid_str"] = DF["play_uuid"].astype(str)

def opt(lst):
    """Turn a sequence of values into Dash dropdown options (label = str(value))."""
    return [dict(label=str(value), value=value) for value in lst]

# Static option lists for the filter dropdowns; each falls back to an empty
# list when its column is not part of DF.
if "position_code" in DF:
    positions_all = sorted(map(str, DF["position_code"].dropna().unique().tolist()))
else:
    positions_all = []

if "play_type" in DF:
    playtypes_all = sorted(map(str, DF["play_type"].dropna().unique().tolist()))
else:
    playtypes_all = []

# Only the 200 most frequent player names are offered initially.
if "player_name" in DF:
    players_all = DF["player_name"].dropna().value_counts().head(200).index.tolist()
else:
    players_all = []

if "game_id" in DF:
    games_all = DF["game_id"].dropna().unique().tolist()
else:
    games_all = []

# ========================
# PLAY-FEATURES & CLUSTER
# ========================
def make_features_from_timeseries(df: pd.DataFrame, id_col="play_uuid", y_col="y"):
    """Summarize each group's time series into robust per-group features.

    For every group of ``id_col`` and each of the columns ``x_norm``, ``y_col``
    and ``speed`` that exists, the result holds the median (``*_med``), the
    median absolute deviation (``*_mad``), the interquartile range (``*_iqr``)
    and the slope of a linear fit over the sample index (``*_trend_lr``).
    Features of ``y_col`` are always named with the prefix ``y``.

    NaN samples are ignored by all statistics. The slope is fitted on the
    finite samples only (keeping their original index positions) and is 0.0
    when fewer than three finite samples exist.

    Args:
        df: Long-format frame with at least ``id_col`` and ``t_sec``.
        id_col: Grouping column.
        y_col: Name of the y-coordinate column in ``df``.

    Returns:
        DataFrame with one row per group, including ``n_samples``.
    """
    feats = []
    for pid, g in df.groupby(id_col, observed=True):
        # Stable sort by time
        gg = g.sort_values("t_sec", kind="mergesort")
        row = {id_col: pid, "n_samples": int(len(gg))}
        for col in ["x_norm", y_col, "speed"]:
            if col not in gg.columns:
                continue
            vals = gg[col].to_numpy(float)

            if np.isnan(vals).all():
                # Nothing observed: keep NaN so the caller can impute.
                med = mad = iqr = float("nan")
            else:
                med = float(np.nanmedian(vals))
                mad = float(np.nanmedian(np.abs(vals - med)))
                q75, q25 = np.nanpercentile(vals, 75), np.nanpercentile(vals, 25)
                iqr = float(q75 - q25)

            # np.polyfit is not NaN-aware, so restrict the fit to finite samples.
            finite = np.isfinite(vals)
            if int(finite.sum()) >= 3:
                idx = np.arange(len(vals))
                slope = float(np.polyfit(idx[finite], vals[finite], 1)[0])
            else:
                slope = 0.0

            base = col if col != y_col else "y"
            row[f"{base}_med"] = med
            row[f"{base}_mad"] = mad
            row[f"{base}_iqr"] = iqr
            row[f"{base}_trend_lr"] = slope
        feats.append(row)
    return pd.DataFrame(feats)

# 1) Build the per-play features (one row per play_uuid)
FEATURES = make_features_from_timeseries(DF, id_col="play_uuid", y_col=Y_COL)

# 2) Cluster fitten (Standardisierung + PCA + k per Silhouette)
def cluster_fit_add_labels(FEATURES: pd.DataFrame, id_col="play_uuid", use_pca=True, pca_var=0.90):
    """Cluster the feature rows and return them with cluster labels attached.

    Numeric feature columns are median-imputed and standardized, optionally
    passed through a PCA constructed with ``n_components=pca_var``, and then
    clustered with K-Means for k = 2..8 (capped at ``n_samples - 1``). The k
    with the best silhouette score wins, and an agglomerative (Ward)
    clustering is fitted with the same k.

    Args:
        FEATURES: One row per play; numeric columns are used as features.
        id_col: Identifier column, excluded from the features.
        use_pca: Whether to apply PCA after standardization.
        pca_var: Value passed to ``PCA(n_components=...)``.

    Returns:
        ``(OUT, meta)``: a copy of FEATURES with ``cl_kmeans`` and ``cl_agg``
        columns, and a dict with ``k_best``, ``silhouette``, ``n_features``,
        ``pca_used`` and ``pca_var`` (the requested value).

    Raises:
        ValueError: with fewer than two numeric features, fewer than three
            rows, or when no k yields more than one distinct cluster.
    """
    # Feature selection: numeric, without id/count/label columns
    drop = {'cl_kmeans','cl_agg','cl_dbscan',id_col,'n_samples'}
    feature_cols = [c for c in FEATURES.select_dtypes(include=[np.number]).columns if c not in drop]
    if len(feature_cols) < 2:
        raise ValueError("Zu wenig numerische Feature-Spalten für Clustering.")

    # The silhouette score needs 2 <= n_clusters <= n_samples - 1.
    n_samples = len(FEATURES)
    if n_samples < 3:
        raise ValueError("Zu wenige Plays für Clustering (mindestens 3 benötigt).")

    X = FEATURES[feature_cols].astype(float)
    # Median imputation; a column that is entirely NaN has no median, so it
    # falls back to 0.0 instead of passing NaNs on to the scaler/PCA.
    X = X.fillna(X.median(numeric_only=True)).fillna(0.0)
    scaler = StandardScaler()
    Xs = scaler.fit_transform(X)
    if use_pca:
        pca = PCA(n_components=pca_var, svd_solver="full", random_state=0)
        Xc = pca.fit_transform(Xs)
    else:
        pca, Xc = None, Xs

    # Pick k by silhouette (2..8, never more clusters than n_samples - 1)
    k_max = min(8, n_samples - 1)
    best = (-np.inf, None, None)
    for k in range(2, k_max + 1):
        km = KMeans(n_clusters=k, n_init=20, random_state=0)
        lab = km.fit_predict(Xc)
        sil = silhouette_score(Xc, lab) if len(set(lab)) > 1 else -np.inf
        if sil > best[0]:
            best = (sil, k, km)
    sil, k_best, km_best = best
    if km_best is None:
        raise ValueError("Kein gültiges K-Means-Clustering gefunden (alle Punkte identisch?).")
    labs_km = km_best.predict(Xc)

    # Agglomerative clustering (Ward linkage) with the same k
    agg = AgglomerativeClustering(n_clusters=k_best, linkage="ward")
    labs_agg = agg.fit_predict(Xc)

    # Attach the labels
    OUT = FEATURES.copy()
    OUT["cl_kmeans"] = labs_km
    OUT["cl_agg"] = labs_agg
    meta = {"k_best": int(k_best), "silhouette": float(sil), "n_features": len(feature_cols), "pca_used": bool(use_pca), "pca_var": float(pca_var)}
    return OUT, meta

# 2b) Fit the clustering on the per-play features
FEATURES, CLUSTER_META = cluster_fit_add_labels(FEATURES, id_col="play_uuid", use_pca=True, pca_var=0.90)

# 3) Merge the labels back onto DF (left join on play_uuid, adds columns only)
DF = DF.merge(FEATURES[["play_uuid","cl_kmeans","cl_agg"]], on="play_uuid", how="left")

# Cluster options for the dropdown (sorted integer K-Means labels)
clusters_all = sorted(pd.Series(DF["cl_kmeans"].dropna().unique()).astype(int).tolist()) if "cl_kmeans" in DF else []

# ========================
# APP & CACHE
# ========================
# Dash application plus a cache used to memoize filtered_df_cache().
app = Dash(__name__)
app.title = "Football RA • CRP • RQA Dashboard"
# NOTE(review): CACHE_DEFAULT_TIMEOUT is presumably in seconds — confirm against flask_caching.
cache = Cache(app.server, config={"CACHE_TYPE": "SimpleCache", "CACHE_DEFAULT_TIMEOUT": 300})

def _key(x):
    """Normalize a filter value for use as a cache-key argument.

    Lists and tuples become tuples, None becomes the marker "Ø", and every
    other value is returned unchanged.
    """
    if isinstance(x, (list, tuple)):
        return tuple(x)
    return "Ø" if x is None else x

@cache.memoize()
def filtered_df_cache(positions, playtypes, players, games, clusters, t0, t1, cols_tuple):
    """Return a memoized, filtered copy of DF restricted to the requested columns.

    Each non-empty selection narrows DF with an ``isin`` filter on its column
    (skipped when the column does not exist), the rows are limited to
    ``t0 <= t_sec <= t1``, and only the columns of ``cols_tuple`` that are
    present are returned.
    """
    frame = DF

    membership_filters = (
        (positions, "position_code"),
        (playtypes, "play_type"),
        (players, "player_name"),
        (games, "game_id"),
    )
    for wanted, column in membership_filters:
        if wanted and column in frame:
            frame = frame[frame[column].isin(wanted)]

    if clusters is not None and len(clusters) > 0 and "cl_kmeans" in frame:
        frame = frame[frame["cl_kmeans"].isin(clusters)]

    if "t_sec" in frame:
        in_window = (frame["t_sec"] >= t0) & (frame["t_sec"] <= t1)
        frame = frame[in_window]

    keep = [name for name in cols_tuple if name in frame.columns]
    return frame[keep].copy()

def filtered_df(sel, cols):
    """Look up the filtered frame for a selection dict through the memoized helper.

    ``sel`` may contain positions, playtypes, players, games, clusters and
    t_range (default ``(0, 3)``); the values are normalized with ``_key`` so
    they can serve as cache-key arguments.
    """
    filter_names = ("positions", "playtypes", "players", "games", "clusters")
    keyed = [_key(sel.get(name)) for name in filter_names]
    window = sel.get("t_range", (0,3))
    return filtered_df_cache(*keyed, window[0], window[1], tuple(cols))

def valid_options_from(df):
    """Collect the distinct filter values present in ``df``.

    Returns a 4-tuple ``(positions, play_types, player_names, game_ids)``.
    The first three are sorted lists of strings; game ids keep their order of
    first appearance. A missing column yields an empty list.
    """
    def _sorted_labels(column):
        if column not in df:
            return []
        distinct = df[column].dropna().unique().tolist()
        return sorted(map(str, distinct))

    if "game_id" in df:
        game_ids = df["game_id"].dropna().unique().tolist()
    else:
        game_ids = []

    return (
        _sorted_labels("position_code"),
        _sorted_labels("play_type"),
        _sorted_labels("player_name"),
        game_ids,
    )

# ========================
# LAYOUT
# ========================
# Shared filter bar: positions, play types, players, games, K-Means clusters
# and the t_sec range. These component ids are read by every callback.
controls = html.Div([
    html.Div([html.Label("Position(en)"),
              dcc.Dropdown(id="positions", options=opt(positions_all), multi=True,
                           placeholder="z. B. WR, DB …", persistence=True)],
             style={"flex":1,"minWidth":220,"marginRight":12}),
    html.Div([html.Label("Play-Typ(en)"),
              dcc.Dropdown(id="play_types", options=opt(playtypes_all), multi=True,
                           placeholder="z. B. Pass, Rush …", persistence=True)],
             style={"flex":1,"minWidth":220,"marginRight":12}),
    html.Div([html.Label("Spieler"),
              dcc.Dropdown(id="players", options=opt(players_all), multi=True,
                           placeholder="Spieler wählen …", persistence=True)],
             style={"flex":1,"minWidth":260,"marginRight":12}),
    html.Div([html.Label("Spiele"),
              dcc.Dropdown(id="games",
                           options=[{"label": GAME_LABELS.get(g, str(g)), "value": g} for g in games_all],
                           multi=True, placeholder="Optional Spiele …", persistence=True)],
             style={"flex":1,"minWidth":260,"marginRight":12}),
    # The "__ALL__" entry is stripped by the callbacks and means "no cluster filter".
    html.Div([html.Label("Cluster (K-Means)"),
              dcc.Dropdown(id="clusters",
                           options=[{"label": "alle", "value": "__ALL__"}] + [{"label": str(c), "value": int(c)} for c in clusters_all],
                           multi=True, placeholder="Cluster wählen …", persistence=True)],
             style={"flex":1,"minWidth":220,"marginRight":12}),
    html.Div([html.Label("t_sec"),
              dcc.RangeSlider(id="t_range", min=0, max=3, step=1, value=[0,3],
                              marks={i:str(i) for i in range(4)}, updatemode="mouseup")],
             style={"flex":1,"minWidth":220}),
], style={"display":"flex","flexWrap":"wrap","gap":8,"alignItems":"flex-end","marginBottom":10})

# Page layout: header with the data source, the shared filter bar, and four
# tabs (overview, CRP, per-game RQA, classic RQA over the whole series).
app.layout = html.Div([
    html.H3("Football RA • CRP • RQA Dashboard"),
    html.Div([html.Span("Daten: "), html.Code(Path(FEATURES_PATH).name),
              html.Span(f"  | Zeilen: {len(DF):,}", style={"opacity": .7, "marginLeft": 10})],
             style={"marginBottom": 8}),
    controls,
    dcc.Tabs(id="tabs", value="tab-overview", children=[
        # Tab 1: KPI cards, density heatmap, mean profiles and speed histogram
        dcc.Tab(label="Overview", value="tab-overview", children=[
            html.Div(id="kpi-row", style={"display":"flex","gap":12,"flexWrap":"wrap","marginBottom":8}),
            dcc.Graph(id="heatmap_xy",     style={"height":"420px"}),
            dcc.Graph(id="profile_means",  style={"height":"340px"}),
            dcc.Graph(id="speed_hist",     style={"height":"300px"}),
        ]),
        # Tab 2: offense vs. defense comparison chart and table
        dcc.Tab(label="CRP (Offense vs Defense)", value="tab-crp", children=[
            dcc.Graph(id="crp_chart", style={"height":"420px","marginTop":"10px"}),
            dash_table.DataTable(id="crp_table", page_size=10,
                                 style_table={"overflowX":"auto"},
                                 style_cell={"padding":"6px","fontFamily":"monospace","fontSize":12}),
        ]),
        # Tab 3: per-game RQA, computed on button press
        dcc.Tab(label="RQA (pro Spiel – illustrativ)", value="tab-rqa", children=[
            html.Div([
                html.Div([html.Label("Ziel-Recurrence Rate (RR)"),
                          dcc.Slider(id="rqa_target_rr", min=0.02, max=0.15, step=0.005, value=RQA_DEFAULT_RR,
                                     marks={0.05:"5%",0.1:"10%",0.125:"12.5%",0.15:"15%"})],
                         style={"minWidth":280,"marginRight":16}),
                html.Div([html.Label("l_min"),
                          dcc.Dropdown(id="rqa_lmin", options=[{"label":i,"value":i} for i in [2,3,4]],
                                       value=RQA_DEFAULT_LMIN, clearable=False)],
                         style={"width":220,"marginRight":16}),
                html.Div([html.Label("v_min"),
                          dcc.Dropdown(id="rqa_vmin", options=[{"label":i,"value":i} for i in [2,3,4]],
                                       value=RQA_DEFAULT_VMIN, clearable=False)],
                         style={"width":220,"marginRight":16}),
                html.Div([html.Label(" "), html.Button("RQA berechnen", id="rqa_compute", n_clicks=0, style={"width":"200px","height":"38px"})]),
            ], style={"display":"flex","flexWrap":"wrap","alignItems":"flex-end","gap":8,"marginBottom":8}),
            html.Div(id="rqa_plots_grid",
                     style={"display":"grid","gridTemplateColumns":"repeat(auto-fit, minmax(260px, 1fr))","gap":"12px"}),
            dash_table.DataTable(id="rqa_table", page_size=10,
                                 style_table={"overflowX":"auto"},
                                 style_cell={"padding":"6px","fontFamily":"monospace","fontSize":12}),
            html.Div(id="rqa_note", style={"opacity":.7,"marginTop":6})
        ]),
        # Tab 4: classic RQA over the complete filtered series, computed on button press
        dcc.Tab(label="RQA (klassisch • komplette Serie)", value="tab-rqa-classic", children=[
            html.Div([
                # RR can be taken from the slider ("dynamic") or the preset dropdown ("preset").
                html.Div([html.Label("RR-Modus"),
                          dcc.RadioItems(
                              id="rqac_rr_mode",
                              options=[
                                  {"label":"Dynamisch (Slider)", "value":"dynamic"},
                                  {"label":"Vorgefertigt (5% / 10% / 15%)", "value":"preset"},
                              ],
                              value="dynamic",
                              inline=True
                          )],
                         style={"minWidth":360,"marginRight":16}),
                html.Div([html.Label("Ziel-RR (dynamisch)"),
                          dcc.Slider(id="rqac_target_rr", min=0.02, max=0.15, step=0.005, value=RQA_DEFAULT_RR,
                                     marks={0.05:"5%",0.10:"10%",0.125:"12.5%",0.15:"15%"})],
                         style={"minWidth":280,"marginRight":16}),
                html.Div([html.Label("RR (vorgefertigt)"),
                          dcc.Dropdown(id="rqac_rr_preset",
                                       options=[{"label":"5%","value":0.05},
                                                {"label":"10%","value":0.10},
                                                {"label":"15%","value":0.15}],
                                       value=RQA_DEFAULT_RR, clearable=False)],
                         style={"width":180,"marginRight":16}),
                html.Div([html.Label("l_min"),
                          dcc.Dropdown(id="rqac_lmin", options=[{"label":i,"value":i} for i in [2,3,4,5]],
                                       value=RQA_DEFAULT_LMIN, clearable=False)],
                         style={"width":160,"marginRight":16}),
                html.Div([html.Label("v_min"),
                          dcc.Dropdown(id="rqac_vmin", options=[{"label":i,"value":i} for i in [2,3,4,5]],
                                       value=RQA_DEFAULT_VMIN, clearable=False)],
                         style={"width":160,"marginRight":16}),
                html.Div([html.Label("Decimation (jeder k-te Punkt)"),
                          dcc.Input(id="rqac_decim", type="number", min=1, step=1, value=RQA_CLASSIC_DEFAULT_DECIM, style={"width":"120px"})],
                         style={"marginRight":16}),
                html.Div([html.Label("Max. Punkte (Cap)"),
                          dcc.Input(id="rqac_maxpts", type="number", min=200, step=100, value=RQA_CLASSIC_DEFAULT_MAXPTS, style={"width":"140px"})],
                         style={"marginRight":16}),
                html.Div([html.Label(" "), html.Button("Klassische RQA berechnen", id="rqac_compute", n_clicks=0, style={"width":"240px","height":"38px"})]),
            ], style={"display":"flex","flexWrap":"wrap","alignItems":"flex-end","gap":8,"marginBottom":8}),
            dcc.Graph(id="rqac_plot", style={"height":"560px"}),
            dash_table.DataTable(id="rqac_table", page_size=5,
                                 style_table={"overflowX":"auto"},
                                 style_cell={"padding":"6px","fontFamily":"monospace","fontSize":12}),
            html.Div(id="rqac_note", style={"opacity":.7,"marginTop":6})
        ]),
    ]),
])

# ========================
# FILTER-SYNC
# ========================
def _options_union_keep_selected(all_values, filtered_values, selected_values):
    """Build dropdown options from the filtered values plus the current selection.

    All values are compared as strings. If both the filtered values and the
    selection are empty, the complete value list is offered instead. The
    result is sorted and converted with ``opt``.
    """
    selected = {str(v) for v in (selected_values or [])}
    offered = {str(v) for v in (filtered_values or [])} | selected
    if not offered:
        offered = {str(v) for v in (all_values or [])} | selected
    return opt(sorted(offered))

@app.callback(
    Output("positions","options"), Output("play_types","options"),
    Output("players","options"),  Output("games","options"),
    Input("positions","value"),   Input("play_types","value"),
    Input("players","value"),     Input("games","value"),
)
def sync_filters(pos_v, pt_v, pl_v, gm_v):
    """Recompute the options of each filter dropdown from the other three filters.

    For every dropdown the data is filtered by the remaining selections (its
    own selection, the cluster filter and the time range are ignored), and the
    values that survive become its options. Positions, play types and players
    additionally keep their currently selected values; games are labelled via
    GAME_LABELS and sorted by their string form.
    """
    chosen = {
        "positions": pos_v or [],
        "playtypes": pt_v or [],
        "players": pl_v or [],
        "games": gm_v or [],
    }

    def _values_ignoring(own_key, column, fallback):
        # Filter by everything except the dropdown's own selection.
        sel = dict(chosen, clusters=[], t_range=(0,3))
        sel[own_key] = []
        subset = filtered_df(sel, [column])
        if column in subset:
            return subset[column].dropna().unique().tolist()
        return fallback

    pos_vals = _values_ignoring("positions", "position_code", positions_all)
    pos_opts = _options_union_keep_selected(positions_all, pos_vals, pos_v)

    pt_vals = _values_ignoring("playtypes", "play_type", playtypes_all)
    pt_opts = _options_union_keep_selected(playtypes_all, pt_vals, pt_v)

    pl_vals = _values_ignoring("players", "player_name", players_all)
    pl_opts = _options_union_keep_selected(players_all, pl_vals, pl_v)

    gm_vals = _values_ignoring("games", "game_id", games_all)
    gm_opts = [
        {"label": GAME_LABELS.get(g, str(g)), "value": g}
        for g in sorted(gm_vals, key=lambda x: str(x))
    ]

    return pos_opts, pt_opts, pl_opts, gm_opts

# ========================
# OVERVIEW CALLBACK
# ========================
@app.callback(
    Output("kpi-row","children"), Output("heatmap_xy","figure"),
    Output("profile_means","figure"), Output("speed_hist","figure"),
    Input("positions","value"), Input("play_types","value"),
    Input("players","value"), Input("games","value"), Input("clusters","value"), Input("t_range","value"),
)
def update_overview(pos_v, pt_v, pl_v, gm_v, cl_v, tr_v):
    """Refresh the overview tab for the current filter selection.

    Returns the KPI cards, the x/y density heatmap (sampled down to
    MAX_HEATMAP_POINTS rows), the per-t_sec mean profiles and the speed
    histogram, in the order of the callback outputs.
    """
    # "__ALL__" stands for "no cluster restriction".
    picked_clusters = [c for c in cl_v if c != "__ALL__"] if cl_v else []
    sel = {
        "positions": pos_v or [],
        "playtypes": pt_v or [],
        "players": pl_v or [],
        "games": gm_v or [],
        "clusters": picked_clusters,
        "t_range": tuple(tr_v or (0,3)),
    }
    q = filtered_df(sel, ["play_uuid","player_id","game_id","t_sec","x_norm",Y_COL,"speed","cl_kmeans"])

    def _card(label, val):
        caption = html.Div(label, style={"fontSize":12,"opacity":.7})
        number = html.Div(f"{val}", style={"fontSize":22,"fontWeight":600})
        return html.Div([caption, number],
                        style={"padding":"8px 12px","border":"1px solid #eee","borderRadius":8,"minWidth":140})

    kpis = [_card("Zeilen", f"{len(q):,}")]
    for title, column in (("Unique Plays", "play_uuid"),
                          ("Unique Spieler", "player_id"),
                          ("Unique Spiele", "game_id")):
        kpis.append(_card(title, q[column].nunique()))

    # Clustering metadata
    if CLUSTER_META:
        kpis.append(_card("K-Means k", CLUSTER_META.get("k_best", "—")))
        kpis.append(_card("Silhouette", f"{CLUSTER_META.get('silhouette', float('nan')):.3f}"))

    # Heatmap on a reproducible sample when the selection is large
    if len(q) > MAX_HEATMAP_POINTS:
        sampled = q.sample(MAX_HEATMAP_POINTS, random_state=42)
    else:
        sampled = q
    hm = px.density_heatmap(sampled, x="x_norm", y=Y_COL, nbinsx=60, nbinsy=27, histnorm="")
    hm.update_layout(title="Dichte: x_norm vs. y (gesampelt)")
    hm.update_yaxes(scaleanchor="x", scaleratio=53.33/120)

    # Mean x, y and speed per time step
    means = q.groupby("t_sec").agg(
        mean_x=("x_norm","mean"),
        mean_y=(Y_COL,"mean"),
        mean_v=("speed","mean"),
    ).reset_index()
    prof = go.Figure()
    legend = {"mean_x": "mean x_norm", "mean_y": "mean y", "mean_v": "mean speed (yd/s)"}
    for column, label in legend.items():
        prof.add_trace(go.Scatter(x=means["t_sec"], y=means[column], mode="lines+markers", name=label))
    prof.update_layout(title="Mittelwerte je t_sec", xaxis_title="t_sec", yaxis_title="Wert")

    hist = px.histogram(q, x="speed", nbins=SPEED_HIST_BINS, title="Geschwindigkeit (yd/s)")
    return kpis, hm, prof, hist

# ========================
# CRP CALLBACK (mit korrekter Off/Def-Trennung)
# ========================
def crp_off_vs_def(df: pd.DataFrame):
    """Compare the mean offense and defense trajectories per time step.

    Rows count as offense when ``team_id == offense_team_id`` and as defense
    when ``team_id == defense_team_id``. For both groups the means of x_norm,
    the y column and speed are computed per ``t_sec``; gaps are filled forward
    and then backward, and the differences offense − defense are added as
    ``dx``, ``dy`` and ``dv``.

    If the team columns are not available no split is possible: the overall
    means are reported for both sides and all differences are 0.0.
    """
    def _mean_profile(frame):
        return frame.groupby("t_sec").agg(x=("x_norm","mean"), y=(Y_COL,"mean"), v=("speed","mean"))

    axes = ("x", "y", "v")
    required = {"t_sec","x_norm",Y_COL,"speed","team_id","offense_team_id","defense_team_id"}

    if not required.issubset(df.columns):
        # Fallback: no offense/defense split possible
        out = _mean_profile(df).reset_index().rename(columns={"x":"off_x","y":"off_y","v":"off_v"})
        for axis in axes:
            out[f"def_{axis}"] = out[f"off_{axis}"]
        for axis in axes:
            out[f"d{axis}"] = 0.0
        return out

    offense_mask = df["team_id"].astype("Int64") == df["offense_team_id"].astype("Int64")
    defense_mask = df["team_id"].astype("Int64") == df["defense_team_id"].astype("Int64")

    g_off = _mean_profile(df[offense_mask]).add_prefix("off_").reset_index()
    g_def = _mean_profile(df[defense_mask]).add_prefix("def_").reset_index()

    out = pd.merge(g_off, g_def, on="t_sec", how="outer").sort_values("t_sec")
    value_cols = [f"{side}_{axis}" for side in ("off", "def") for axis in axes]
    out[value_cols] = out[value_cols].ffill().bfill()

    for axis in axes:
        out[f"d{axis}"] = out[f"off_{axis}"] - out[f"def_{axis}"]
    return out

@app.callback(
    Output("crp_chart","figure"), Output("crp_table","columns"), Output("crp_table","data"),
    Input("positions","value"), Input("play_types","value"),
    Input("players","value"), Input("games","value"), Input("clusters","value"), Input("t_range","value"),
)
def update_crp(pos_v, pt_v, pl_v, gm_v, cl_v, tr_v):
    """Refresh the offense-vs-defense chart and table for the current filters.

    Returns the figure with the x and speed traces, the table column
    definitions and the table rows (values rounded to three decimals).
    """
    picked_clusters = [c for c in cl_v if c != "__ALL__"] if cl_v else []
    sel = {
        "positions": pos_v or [],
        "playtypes": pt_v or [],
        "players": pl_v or [],
        "games": gm_v or [],
        "clusters": picked_clusters,
        "t_range": tuple(tr_v or (0,3)),
    }
    q = filtered_df(sel, ["t_sec","x_norm",Y_COL,"speed","team_id","offense_team_id","defense_team_id"])
    comp = crp_off_vs_def(q)

    trace_specs = [
        ("off_x", "Offense mean x"), ("def_x", "Defense mean x"), ("dx", "Δx (Off-Def)"),
        ("off_v", "Offense mean v"), ("def_v", "Defense mean v"), ("dv", "Δv (Off-Def)"),
    ]
    fig = go.Figure()
    for column, label in trace_specs:
        # NOTE(review): "def_*" also starts with "d", so the defense traces are
        # dashed as well as the Δ traces — confirm this is intended.
        line_style = dict(dash="dash") if column.startswith("d") else None
        fig.add_trace(go.Scatter(x=comp["t_sec"], y=comp[column], mode="lines+markers",
                                 name=label, line=line_style))
    fig.update_layout(title="CRP: Offense vs Defense (x & v)", xaxis_title="t_sec")

    cols = [{"name": name, "id": name} for name in comp.columns]
    data = comp.round(3).to_dict("records")
    return fig, cols, data

# ========================
# RQA HELFER
# ========================
def pairwise_dist(A: np.ndarray, w: np.ndarray):
    """Return the weighted Euclidean distance matrix of the rows of ``A``.

    ``A`` is an N×D array and ``w`` holds D weights that multiply the squared
    per-dimension differences; the result is an N×N array.
    """
    points = np.asarray(A, float)
    weights = np.asarray(w, float).reshape(1, -1)
    delta = points[:, None, :] - points[None, :, :]
    weighted_sq = delta**2 * weights
    return np.sqrt(weighted_sq.sum(axis=2))

def recurrence_matrix(arr: np.ndarray, target_rr: float, w=(1.0,1.0,1.0), standardize: bool = True):
    """Build a binary recurrence matrix with a data-driven threshold.

    The rows of ``arr`` are optionally z-scored per column, the weighted
    pairwise distances are computed, and the threshold ``eps`` is set to the
    ``target_rr`` quantile of the distances above the main diagonal. With
    fewer than two points ``eps`` is 0.0.

    Returns:
        ``(R, eps)`` where ``R`` is an integer matrix with 1 wherever the
        distance is at most ``eps``.
    """
    points = np.asarray(arr, float)
    if standardize:
        center = points.mean(axis=0, keepdims=True)
        spread = points.std(axis=0, keepdims=True) + 1e-9  # avoid division by zero
        points = (points - center) / spread

    dist = pairwise_dist(points, np.asarray(w, float))
    upper = dist[np.triu_indices_from(dist, k=1)]

    eps = float(np.quantile(upper, target_rr)) if len(upper) != 0 else 0.0
    return (dist <= eps).astype(int), eps

def _run_lengths(line: np.ndarray, min_len: int) -> list:
    """Return the lengths of all maximal runs of 1s in ``line`` that are >= ``min_len``."""
    hits = np.asarray(line) == 1
    if not hits.any():
        return []
    # Pad with False so every run has a start and an end transition.
    padded = np.concatenate(([False], hits, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    lengths = edges[1::2] - edges[0::2]
    return lengths[lengths >= min_len].tolist()


def _length_entropy(lengths: list) -> float:
    """Return the Shannon entropy (natural log) of the distribution of line lengths."""
    if not lengths:
        return 0.0
    _, counts = np.unique(lengths, return_counts=True)
    p = counts / counts.sum()
    return float(-(p * np.log(p + 1e-12)).sum())


def rqa_metrics(R: np.ndarray, l_min=2, v_min=2):
    """Compute recurrence quantification measures of a binary recurrence matrix.

    Diagonal lines are collected from every diagonal of ``R`` — including the
    main diagonal — and vertical lines from every column. Lines shorter than
    ``l_min`` / ``v_min`` are ignored.

    Args:
        R: 2-D array of 0/1 entries.
        l_min: Minimum diagonal line length.
        v_min: Minimum vertical line length.

    Returns:
        Dict with RR (share of recurrent cells), DET (share of recurrent cells
        on diagonal lines), L (mean diagonal length), Lmax, ENTR (entropy of
        diagonal lengths), LAM (share of recurrent cells on vertical lines),
        TT (mean vertical length) and ENTR_V (entropy of vertical lengths).
        All measures are 0 when no qualifying line exists.
    """
    N = R.size
    total = R.sum()
    RR = total / N if N > 0 else 0.0

    # Diagonal lines (run-length extraction is vectorized per diagonal)
    diag_lengths = []
    for k in range(-(R.shape[0]-1), R.shape[0]):
        diag_lengths.extend(_run_lengths(np.diag(R, k), l_min))

    DET  = (sum(diag_lengths) / total) if total > 0 and diag_lengths else 0.0
    Lmax = max(diag_lengths) if diag_lengths else 0
    L    = float(np.mean(diag_lengths)) if diag_lengths else 0.0
    ENTR = _length_entropy(diag_lengths)

    # Vertical lines
    vert_lengths = []
    for j in range(R.shape[1]):
        vert_lengths.extend(_run_lengths(R[:, j], v_min))

    LAM = (sum(vert_lengths) / total) if total > 0 and vert_lengths else 0.0
    TT  = float(np.mean(vert_lengths)) if vert_lengths else 0.0
    ENTR_V = _length_entropy(vert_lengths)

    return dict(RR=RR, DET=DET, L=L, Lmax=Lmax, ENTR=ENTR, LAM=LAM, TT=TT, ENTR_V=ENTR_V)

def game_traj(df_game: pd.DataFrame, y_col: str):
    """Return a 4x3 array of per-second means for one game.

    Rows correspond to ``t_sec`` 0, 1, 2 and 3; columns are ``x_norm``,
    *y_col* and ``speed``, each averaged over all rows sharing that second.
    Seconds without data are filled forward, then backward.
    """
    feature_cols = ["x_norm", y_col, "speed"]
    per_second = df_game.groupby("t_sec")[feature_cols].mean()
    # Force exactly the four expected time steps, in order.
    per_second = per_second.reindex(list(range(4)))
    filled = per_second.ffill().bfill()
    return filled.to_numpy(float)

def build_full_series(df: pd.DataFrame, y_col: str, decim: int, maxpts: int) -> np.ndarray:
    """Build one continuous series from the current selection.

    Rows are averaged per (game_id, play_uuid, t_sec) and laid out in that
    key order, giving one point per time stamp with the columns
    ``x_norm``, *y_col* and ``speed``.

    Args:
        df: Frame containing game_id, play_uuid, t_sec, x_norm, *y_col*, speed.
        y_col: Name of the y-coordinate column to use.
        decim: Keep every ``decim``-th point; ``None`` or values < 1 mean 1.
            Non-integer numbers (e.g. from a numeric UI input) are truncated.
        maxpts: Upper bound on the number of points after decimation; a falsy
            value disables the cap. Non-integer numbers are truncated.

    Returns:
        Float array of shape (n_points, 3); shape (0, 3) for an empty frame.
        Remaining NaNs are filled forward, then backward.
    """
    if df.empty:
        return np.empty((0, 3), float)

    keys = ["game_id", "play_uuid", "t_sec"]
    features = ["x_norm", y_col, "speed"]
    # groupby sorts the result by its keys, so no separate pre-sort is needed.
    q = df[keys + features].groupby(keys, observed=True)[features].mean().reset_index()

    series = q[features].to_numpy(float)

    # Slice bounds must be integers; coerce so float inputs do not raise.
    step = 1 if decim is None or decim < 1 else int(decim)
    series = series[::step]

    max_points = int(maxpts) if maxpts else 0
    if max_points and series.shape[0] > max_points:
        series = series[:max_points, :]

    series = pd.DataFrame(series, columns=["x", "y", "v"]).ffill().bfill().to_numpy(float)
    return series

# ========================
# RQA CALLBACK (per game)
# ========================
# Registered on `app`: fires when the "rqa_compute" button's n_clicks changes
# (not on initial load) and writes the plot grid, table columns, table data
# and the note text. All filter widgets are read as State only.
@app.callback(
    Output("rqa_plots_grid","children"),
    Output("rqa_table","columns"),
    Output("rqa_table","data"),
    Output("rqa_note","children"),
    Input("rqa_compute","n_clicks"),
    State("positions","value"), State("play_types","value"),
    State("players","value"),  State("games","value"),
    State("t_range","value"),
    State("rqa_target_rr","value"), State("rqa_lmin","value"), State("rqa_vmin","value"),
    State("clusters","value"),             # ← NEW
    prevent_initial_call=True
)
def compute_rqa(n_clicks, pos_v, pt_v, pl_v, gm_v, tr_v, target_rr, l_min, v_min, cl_v):  # ← NEW
    """Callback for the per-game RQA view.

    Parameters follow the decorator order: the button click count, then the
    values of the positions, play-types, players, games, time-range,
    target-RR, l_min, v_min and clusters widgets.

    Returns a 4-tuple ``(plots, cols, rows, note)`` matching the four Outputs.

    NOTE(review): in the code visible here, ``plots``, ``cols`` and ``rows``
    are never assigned, and ``q``, ``target_rr``, ``l_min`` and ``v_min`` are
    never used - the computation between the filter step and the note appears
    to be missing. As written, the return statement would raise NameError
    unless those names exist as globals. Confirm against the full source.
    """
    # Drop the "__ALL__" sentinel; an empty list is what gets reported as "alle" below.
    cl_v_norm = [] if not cl_v else [c for c in cl_v if c != "__ALL__"]
    # Selection dict handed to filtered_df; missing widget values become empty
    # lists, and a missing time range falls back to (0, 3).
    sel = dict(
        positions=pos_v or [], playtypes=pt_v or [], players=pl_v or [], games=gm_v or [],
        clusters=cl_v_norm, t_range=tuple(tr_v or (0,3))
    )
    # filtered_df is defined elsewhere; presumably it returns the rows matching
    # `sel` restricted to the listed columns - verify against its definition.
    q = filtered_df(sel, ["game_id","t_sec","x_norm",Y_COL,"speed","home_abbr","away_abbr","cl_kmeans"])

    # User-facing note (German) summarising the display limit, the active
    # cluster filter and the click count.
    note = (
        f"Es werden max. {RQA_SHOW_MAX_GAMES} Spiele visualisiert. "
        f"Cluster-Filter: {cl_v_norm if cl_v_norm else 'alle'}. "
        f"Button gedrückt: {n_clicks}. "
        "Hinweis: Die pro-Spiel-Ansicht ist explorativ und nutzt 4 Zeitpunkte @ 1 Hz."
    )
    return plots, cols, rows, note


# ========================
# RQA CALLBACK (classic • full series)
# ========================
# Registered on `app`: fires when the "rqac_compute" button's n_clicks changes
# (not on initial load) and writes the figure, table columns, table data and
# the note text. All filter and parameter widgets are read as State only.
@app.callback(
    Output("rqac_plot","figure"),
    Output("rqac_table","columns"),
    Output("rqac_table","data"),
    Output("rqac_note","children"),
    Input("rqac_compute","n_clicks"),
    State("positions","value"), State("play_types","value"),
    State("players","value"),  State("games","value"),
    State("t_range","value"),
    State("rqac_rr_mode","value"),
    State("rqac_target_rr","value"), State("rqac_rr_preset","value"),
    State("rqac_lmin","value"), State("rqac_vmin","value"),
    State("rqac_decim","value"), State("rqac_maxpts","value"),
    State("clusters","value"),                     
    prevent_initial_call=True
)
def compute_rqa_classic(n_clicks, pos_v, pt_v, pl_v, gm_v, tr_v,
                        rr_mode, target_rr, rr_preset, l_min, v_min, decim, maxpts, cl_v): 
    """Callback for the classic RQA view over one concatenated series.

    Parameters follow the decorator order: the button click count, then the
    values of the positions, play-types, players, games, time-range, RR-mode,
    target-RR, RR-preset, l_min, v_min, decimation, max-points and clusters
    widgets.

    Returns a 4-tuple matching the four Outputs. When the filtered frame is
    empty, that is an empty titled figure, two empty lists and an empty note.

    NOTE(review): in the code visible here, ``fig``, ``cols`` and ``data``
    are never assigned, and ``arr``, ``target_rr``, ``rr_preset``, ``l_min``
    and ``v_min`` are never used - the recurrence computation between
    building the series and the note appears to be missing. As written, the
    final return would raise NameError unless those names exist as globals.
    Confirm against the full source.
    """
    # Drop the "__ALL__" sentinel; an empty list is what gets reported as "alle" below.
    cl_v_norm = [] if not cl_v else [c for c in cl_v if c != "__ALL__"]
    # Selection dict handed to filtered_df; missing widget values become empty
    # lists, and a missing time range falls back to (0, 3).
    sel = dict(
        positions=pos_v or [], playtypes=pt_v or [], players=pl_v or [], games=gm_v or [],
        clusters=cl_v_norm, t_range=tuple(tr_v or (0,3))
    )
    # filtered_df is defined elsewhere; presumably it returns the rows matching
    # `sel` restricted to the listed columns - verify against its definition.
    q = filtered_df(sel, ["game_id","play_uuid","t_sec","x_norm",Y_COL,"speed","cl_kmeans"])
    if q.empty:
        # Nothing to analyse: show a placeholder figure and clear table and note.
        empty_fig = go.Figure().update_layout(title="Keine Daten für die aktuelle Auswahl.")
        return empty_fig, [], [], ""

    # Debug helper for cross-checking on the console: prints the active cluster
    # filter, the cluster ids present in q, and the row count.
    print("RQA classic — Cluster-Filter:", cl_v_norm if cl_v_norm else "alle",
          "| unique cl in q:", sorted(map(int, q["cl_kmeans"].dropna().unique())) if "cl_kmeans" in q else "—",
          "| N rows:", len(q))

    # Falsy widget values (None, 0) fall back to the module-level defaults.
    arr = build_full_series(q, Y_COL, decim=decim or RQA_CLASSIC_DEFAULT_DECIM,
                            maxpts=maxpts or RQA_CLASSIC_DEFAULT_MAXPTS)

    # User-facing note (German) echoing the cluster filter, RR mode,
    # decimation and point cap actually in effect.
    note = (
        f"Serie basiert auf Filterauswahl. Cluster-Filter: {cl_v_norm if cl_v_norm else 'alle'}. "
        f"RR-Modus: {rr_mode}. Decimation: {int(decim or RQA_CLASSIC_DEFAULT_DECIM)}. "
        f"Max. Punkte: {int(maxpts or RQA_CLASSIC_DEFAULT_MAXPTS)}."
    )
    return fig, cols, data, note


# ========================
# MAIN
# ========================
if __name__ == "__main__":
    # Serve on $PORT when it is set, otherwise on 8050; debug mode and the
    # reloader are both switched off.
    app.run(
        port=int(os.environ.get("PORT", 8050)),
        use_reloader=False,
        debug=False,
    )

Address already in use
Port 8050 is in use by another program. Either identify and stop that program, or start the server with a different port.


SystemExit: 1


To exit: use 'exit', 'quit', or Ctrl-D.

