In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Introduction

This notebook combines *all 3 blocks* into one:

1. *Data Caching* (load weekly CSVs, save Parquet for speed)  
2. *Exploratory Data Analysis (EDA)* with statistics & charts  
3. *Interactive Visualisations* (static plots, Voronoi diagrams, animations)  

#### Notes on Structure

- Each block *re-imports its own libraries*.  
  This is *intentional*:  
  - As a reminder of which libraries are needed.  
  - Keeps blocks self-contained if reused in a separate notebook.  
  - Makes it easier to swap or expand blocks in the future.

## Block 1_Data Caching (Inputs & Outputs)

This block finds all weekly input/output CSV files and caches each week as its own Parquet file; the cached weeks are concatenated on demand when they are read back.  

Speeds up subsequent runs and ensures EDA & viz can run smoothly.

In [None]:
import os, glob, pathlib, gc
import pandas as pd
import numpy as np

# --- Locations: raw competition data (read-only) and the Parquet cache (writable)
DATA_DIR = "/kaggle/input/nfl-big-data-bowl-2026-prediction"
TRAIN_DIR = f"{DATA_DIR}/train"
CACHE_DIR = "/kaggle/working/cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# --- Columns kept from the input CSVs (broadly useful across EDA/modelling)
USECOLS = [
    # identifiers
    "game_id", "play_id", "nfl_id", "frame_id",
    # player attributes
    "player_to_predict", "player_height", "player_weight", "player_birth_date",
    "player_position", "player_side", "player_role",
    # tracking values
    "x", "y", "s", "a", "o", "dir",
    # play-level values
    "num_frames_output", "ball_land_x", "ball_land_y",
    "play_direction", "absolute_yardline_number",
]

# --- Target dtypes, grouped by type (insertion order matches the column list below)
_FLOAT32_COLS = [
    "player_weight", "absolute_yardline_number",
    "x", "y", "s", "a", "o", "dir",
    "num_frames_output", "ball_land_x", "ball_land_y",
]
DTYPES = {
    **dict.fromkeys(["game_id", "play_id", "nfl_id"], "int64"),
    "frame_id": "int32",
    "player_to_predict": "boolean",
    **dict.fromkeys(_FLOAT32_COLS, "float32"),
}
# Columns stored as pandas categoricals
CAT_COLS = ["player_position", "player_side", "player_role", "play_direction"]

# --- Discover the weekly CSVs (sorted so weeks are processed in name order)
input_paths = glob.glob(f"{TRAIN_DIR}/input_2023_w*.csv")
input_paths.sort()
output_paths = glob.glob(f"{TRAIN_DIR}/output_2023_w*.csv")
output_paths.sort()

print(f"Found {len(input_paths)} input files, {len(output_paths)} output files")

def load_and_cache(csv_path, prefix):
    """Convert one weekly CSV into a typed Parquet file inside CACHE_DIR.

    Parameters
    ----------
    csv_path : str
        Path to the weekly CSV. The text after the last "_" in its base name
        (e.g. "w01") becomes the week tag of the cache file.
    prefix : str
        Cache-file prefix. When it equals "input" only the USECOLS columns
        are read; otherwise every column is read.

    Returns
    -------
    str
        Path of the Parquet file ("{CACHE_DIR}/{prefix}_{week}.parquet").

    Notes
    -----
    If the Parquet file already exists it is reused without being re-read.
    New files are written to a temporary path and moved into place with
    os.replace, so an interrupted write cannot leave a truncated file that
    later runs would skip as "exists".
    """
    week = os.path.splitext(os.path.basename(csv_path))[0].split("_")[-1]  # e.g. w01
    pq_path = f"{CACHE_DIR}/{prefix}_{week}.parquet"
    if os.path.exists(pq_path):
        print(f"[skip] {pq_path} exists")
        return pq_path

    df = pd.read_csv(csv_path, usecols=USECOLS if prefix=="input" else None)

    # Downcast numerics and convert categoricals in a single astype call,
    # restricted to the columns this file actually contains.
    dtype_map = {c: dt for c, dt in DTYPES.items() if c in df.columns}
    dtype_map.update({c: "category" for c in CAT_COLS if c in df.columns})
    if dtype_map:
        df = df.astype(dtype_map)

    tmp_path = f"{pq_path}.tmp"
    try:
        df.to_parquet(tmp_path, index=False)
        os.replace(tmp_path, pq_path)
    finally:
        # Only still present if the write or the rename failed part-way.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    print(f"[ok] cached {pq_path} -> {df.shape}")
    del df; gc.collect()
    return pq_path

# Cache every weekly file: all inputs first, then all outputs
for prefix, paths in (("input", input_paths), ("output", output_paths)):
    for p in paths:
        load_and_cache(p, prefix)

# Quick schema & quality snapshot on a small subset
SAMPLE_WEEKS = ["w01","w02"]
def read_inputs(weeks):
    """Load the cached input Parquet files for `weeks` into one DataFrame.

    Parameters
    ----------
    weeks : iterable of str
        Week tags such as "w01". Weeks without a cache file are skipped.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the available weeks with a fresh index.

    Raises
    ------
    FileNotFoundError
        If none of the requested weeks has a cache file. Previously this
        surfaced as pandas' generic "No objects to concatenate" error.
    """
    weeks = list(weeks)
    candidates = [f"{CACHE_DIR}/input_{w}.parquet" for w in weeks]
    available = [p for p in candidates if os.path.exists(p)]
    if not available:
        raise FileNotFoundError(
            f"No cached input Parquet files found in {CACHE_DIR} for weeks {weeks}"
        )
    return pd.concat([pd.read_parquet(p) for p in available], ignore_index=True)

# Load the sample weeks and report the basic schema
df_small = read_inputs(SAMPLE_WEEKS)
print("Sample shape:", df_small.shape)
print("\nColumns & dtypes:\n", df_small.dtypes)

# Share of missing values per column, worst 15 columns, as percentages
missing_share = df_small.isna().mean().sort_values(ascending=False)
na = missing_share.head(15) * 100
print("\nMissing values (%, top 15):\n", na)

# Coordinates are expected inside these (low, high) limits; report the share outside
FIELD_BOUNDS = {
    "x": (0, 120),
    "y": (0, 53.3),
    "ball_land_x": (0, 120),
    "ball_land_y": (0, 53.3),
}
for c in FIELD_BOUNDS:
    if c not in df_small.columns:
        continue
    lo, hi = FIELD_BOUNDS[c]
    values = df_small[c]
    outside = values.lt(lo) | values.gt(hi)
    bad = outside.mean()*100
    print(f"{c} out-of-bounds: {bad:.3f}%")

print("\n Cache ready at:", CACHE_DIR)

## Block 2_Exploratory Data Analysis (EDA)

Analyse:  
- Dataset shape & missing values  
- Player roles, speeds, and accelerations  
- Ball landing, play directions, and positional distributions  
- Correlations between features

In [None]:
import os, glob, gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Where Block 1 stored the per-week Parquet files
CACHE_DIR = "/kaggle/working/cache"
# SMALL=True restricts the EDA to weeks 1-3; SMALL=False uses weeks 1-18
SMALL = True
_week_numbers = [1, 2, 3] if SMALL else range(1, 19)
WEEKS = [f"w{w:02d}" for w in _week_numbers]

def _read_cached(kind, weeks, columns=None):
    """Concatenate the cached "{kind}_{week}.parquet" files that exist for `weeks`."""
    weeks = list(weeks)
    candidates = [f"{CACHE_DIR}/{kind}_{w}.parquet" for w in weeks]
    available = [p for p in candidates if os.path.exists(p)]
    if not available:
        # pd.concat([]) would otherwise fail with "No objects to concatenate".
        raise FileNotFoundError(
            f"No cached {kind} Parquet files found in {CACHE_DIR} for weeks {weeks}"
        )
    return pd.concat(
        [pd.read_parquet(p, columns=columns) for p in available],
        ignore_index=True,
    )

def read_inputs(weeks, columns=None):
    """Load cached input files for `weeks` into one DataFrame.

    Parameters
    ----------
    weeks : iterable of str
        Week tags such as "w01"; weeks without a cache file are skipped.
    columns : list of str, optional
        Columns to read from each file; None reads all columns.

    Raises
    ------
    FileNotFoundError
        If no cache file exists for any requested week.
    """
    return _read_cached("input", weeks, columns)

def read_outputs(weeks, columns=None):
    """Load cached output files for `weeks` into one DataFrame.

    Same contract as read_inputs, but for the "output_{week}.parquet" files.
    """
    return _read_cached("output", weeks, columns)

# Input columns needed by the EDA below
USECOLS = [
    # identifiers
    "game_id", "play_id", "nfl_id", "frame_id",
    # categorical descriptors
    "player_position", "player_side", "player_role", "play_direction",
    # tracking values
    "x", "y", "s", "a", "o", "dir",
    # play-level values
    "num_frames_output", "ball_land_x", "ball_land_y", "absolute_yardline_number",
]
OUTPUT_COLS = ["game_id","play_id","nfl_id","frame_id","x","y"]

df_in = read_inputs(WEEKS, columns=USECOLS)
df_out = read_outputs(WEEKS, columns=OUTPUT_COLS)

print("Inputs:", df_in.shape, "Outputs:", df_out.shape)

# Downsample for plotting (keeps EDA snappy); small frames are copied unchanged
PLOT_SAMPLE_ROWS = 250_000
if len(df_in) > PLOT_SAMPLE_ROWS:
    plot_df = df_in.sample(n=PLOT_SAMPLE_ROWS, random_state=42)
else:
    plot_df = df_in.copy()

# ---- Basic distributions: one histogram panel per kinematic column
fig, ax = plt.subplots(1,2, figsize=(12,4))
hist_panels = zip(ax, ("s", "a"), ("Speed (y/s)", "Acceleration (y/s²)"))
for panel, column, title in hist_panels:
    plot_df[column].dropna().plot.hist(bins=40, ax=panel, edgecolor='black')
    panel.set_title(title)
plt.tight_layout()
plt.show()

# ---- Roles / positions: frequency of each role (missing values counted) and the 15 most common positions
role_counts = plot_df["player_role"].value_counts(dropna=False)
pos_counts = plot_df["player_position"].value_counts().head(15)

fig, ax = plt.subplots(1,2, figsize=(14,5))
role_ax, pos_ax = ax

role_counts.plot.bar(ax=role_ax)
role_ax.set_title("Player Role Counts")

pos_counts.plot.barh(ax=pos_ax)
pos_ax.invert_yaxis()  # most frequent position at the top
pos_ax.set_title("Top 15 Player Positions")

plt.tight_layout()
plt.show()

# ---- Field heatmaps (hexbin) on downsampled data
fig, ax = plt.subplots(1,2, figsize=(14,5))
# (axes, x column, y column, gridsize, colormap, title) for each panel
density_panels = (
    (ax[0], "x", "y", 40, "YlOrRd", "Player Position Density"),
    (ax[1], "ball_land_x", "ball_land_y", 30, "Greens", "Ball Landing Density"),
)
for panel, x_col, y_col, grid, cmap_name, title in density_panels:
    hb = panel.hexbin(plot_df[x_col], plot_df[y_col], gridsize=grid, cmap=cmap_name, mincnt=1)
    panel.set_xlim(0,120)
    panel.set_ylim(0,53.3)
    panel.set_title(title)
    plt.colorbar(hb, ax=panel)
plt.tight_layout()
plt.show()

# ---- Feature engineering (quick, no heavy loops)
# Straight-line distance from each sampled row's (x, y) to the ball landing point
dx_ball = plot_df["ball_land_x"] - plot_df["x"]
dy_ball = plot_df["ball_land_y"] - plot_df["y"]
plot_df = plot_df.assign(dist_to_ball=np.sqrt(dx_ball**2 + dy_ball**2))

# The boxplot uses a copy whose role column is converted to plain strings
plot_df2 = plot_df.assign(player_role=plot_df["player_role"].astype(str))

fig, ax = plt.subplots(1,2, figsize=(14,5))
box_ax, hex_ax = ax

sns.boxplot(data=plot_df2, x="player_role", y="dist_to_ball", ax=box_ax)
box_ax.set_title("Initial Distance to Ball by Role")
box_ax.tick_params(axis='x', rotation=45)

hex_ax.hexbin(plot_df["s"], plot_df["a"], gridsize=35, cmap="magma")
hex_ax.set_xlabel("Speed")
hex_ax.set_ylabel("Acceleration")
hex_ax.set_title("Speed vs Accel")

plt.tight_layout()
plt.show()

# ---- Quick temporal snapshot: mean displacement over frame_id (small join)
# Use only a sample of the outputs to keep it quick
out_small = df_out.sample(n=300_000, random_state=42) if len(df_out)>300_000 else df_out

# Last input row per (game, play, player).
# groupby().last() is deliberately avoided: it returns the last NON-NULL value of
# each column independently, so x_last and y_last could come from different frames
# when a coordinate is missing. Sorting and keeping the final row per key always
# yields one coherent row.
_PLAYER_KEYS = ["game_id","play_id","nfl_id"]
last_pre = (df_in[_PLAYER_KEYS + ["frame_id","x","y"]]
                .sort_values(_PLAYER_KEYS + ["frame_id"])
                .drop_duplicates(subset=_PLAYER_KEYS, keep="last")
                [_PLAYER_KEYS + ["x","y"]]
                .rename(columns={"x":"x_last","y":"y_last"}))

# Left join: output rows without a matching input row get NaN displacement,
# which the mean below ignores.
merged = out_small.merge(last_pre, on=_PLAYER_KEYS, how="left")
merged["dx"] = merged["x"]-merged["x_last"]
merged["dy"] = merged["y"]-merged["y_last"]
# NOTE(review): dx/dy are not normalised by play_direction, so plays heading in
# opposite directions may partly cancel in the mean - confirm this is intended.
frame_mean = merged.groupby("frame_id")[["dx","dy"]].mean().reset_index().sort_values("frame_id").head(40)

ax = frame_mean.plot(x="frame_id", y=["dx","dy"], figsize=(10,4), title="Mean displacement by frame (first 40)")
ax.set_xlabel("frame_id"); ax.set_ylabel("yards"); plt.show()

print(" Core EDA complete. (Tip: set SMALL=False to broaden to all weeks later.)")


## Block 3_Interactive Play Visualizations
- Loads only the chosen week (CSV -> cached Parquet per week)
- Role-based colours/markers
- Interactive widgets (choose Week, Game, Play)
- Animated with adjustable frame count

In [None]:
import os, glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import matplotlib.patches as patches
from IPython.display import HTML, display
import ipywidgets as W

# ---------- Paths & cache ----------
# Writable Parquet cache; created up front so later writes cannot fail on a missing folder
CACHE_DIR = "/kaggle/working/cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# Read-only competition data
DATA_DIR  = "/kaggle/input/nfl-big-data-bowl-2026-prediction"
TRAIN_DIR = f"{DATA_DIR}/train"

def _wk_str(w):
    return f"w{int(w):02d}"

def _week_paths(w):
    wk = _wk_str(w)
    return (
        f"{TRAIN_DIR}/input_2023_{wk}.csv",
        f"{TRAIN_DIR}/output_2023_{wk}.csv",
        f"{CACHE_DIR}/input_{wk}.parquet",
        f"{CACHE_DIR}/output_{wk}.parquet",
    )

# ---------- Lazy loader (one week at a time) ----------
_week_cache = {}  # {week number (int): (df_in, df_out)}

def _load_table(csv_path, pq_path):
    """Read one table from its Parquet cache if present, else from CSV (and write the cache)."""
    if os.path.exists(pq_path):
        return pd.read_parquet(pq_path)
    df = pd.read_csv(csv_path)
    # write parquet so future runs can skip the CSV parse
    df.to_parquet(pq_path, index=False)
    return df

def load_week(week:int):
    """Return (df_in, df_out) for one week, loading each table at most once per session.

    Parameters
    ----------
    week : int
        Week number. It is normalised with int(), so 3 and "3" share one
        in-memory cache entry instead of loading the week twice.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Input and output tracking tables for the week.

    Notes
    -----
    Each table is resolved on its own: an existing Parquet file is read
    directly and only a missing one triggers a CSV read, so a single missing
    cache file no longer forces both CSVs to be re-read and both caches to be
    rewritten.

    NOTE(review): the cache file names match those written by load_and_cache
    in Block 1, which stores only a subset of the input columns; whichever
    block writes a week first decides the cached schema.
    """
    week = int(week)
    if week in _week_cache:
        return _week_cache[week]

    in_csv, out_csv, in_pq, out_pq = _week_paths(week)
    tables = (_load_table(in_csv, in_pq), _load_table(out_csv, out_pq))

    _week_cache[week] = tables
    return tables

# ---------- Styling ----------
# Marker colour, shape and size used for each known player role
ROLE_STYLES = {
    "Targeted Receiver":  {"color": "#E63946", "marker": "o", "size": 90},
    "Passer":             {"color": "#1D3557", "marker": "s", "size": 90},
    "Defensive Coverage": {"color": "#457B9D", "marker": "^", "size": 80},
    "Other Route Runner": {"color": "#F4A261", "marker": "o", "size": 70},
}
# Style applied to any role that is not listed above
DEF_FALLBACK = {"color": "#6c757d", "marker": "o", "size": 70}

def role_style(row):
    """Return the style dict for `row`'s "player_role", or DEF_FALLBACK if the role is unknown or absent."""
    role_name = str(row.get("player_role", ""))
    if role_name in ROLE_STYLES:
        return ROLE_STYLES[role_name]
    return DEF_FALLBACK

# ---------- Field drawing ----------
def draw_field(ax):
    """Draw a 120 x 53.3 field on `ax`: grass, two endzones, yard lines and hash marks.

    The axes limits are set one unit beyond the field, the aspect ratio is
    made equal and the axis decorations are switched off.
    """
    field_width = 53.3
    # (lower-left corner, length along x, Rectangle keyword arguments)
    rectangles = (
        ((0, 0), 120, dict(facecolor="#e9f7ec", edgecolor="#2a9d8f", lw=2, zorder=0)),  # main grass
        ((0, 0), 10, dict(facecolor="#e3f2fd", zorder=0)),                              # left endzone
        ((110, 0), 10, dict(facecolor="#fde2e4", zorder=0)),                            # right endzone
    )
    for corner, length, style in rectangles:
        ax.add_patch(patches.Rectangle(corner, length, field_width, **style))

    # yard lines every 10 units between the endzones
    for yard_x in range(10, 120, 10):
        ax.axvline(yard_x, color="white", lw=1.2, alpha=0.7, zorder=1)

    # subtle hash marks: a short tick at two y-levels for every unit along x
    hash_marks = [(hx, hy) for hx in np.arange(10, 110.1, 1.0) for hy in (18.37, 34.93)]
    for hx, hy in hash_marks:
        ax.plot([hx, hx], [hy, hy+0.4], color="white", lw=0.6, alpha=0.45, zorder=1)

    ax.set_xlim(-1, 121)
    ax.set_ylim(-1, 54.3)
    ax.set_aspect("equal")
    ax.axis("off")

def make_legend(fig):
    """Attach a single horizontal role legend above the axes of `fig`.

    One handle is built per entry of ROLE_STYLES. The legend is tagged with
    the gid "role-legend"; any legend carrying that tag from an earlier call
    is removed first, so calling this repeatedly on the same figure (for
    example once per animation frame) no longer stacks duplicate legends.

    Returns
    -------
    The legend object returned by fig.legend.
    """
    from matplotlib.lines import Line2D

    # Remove role legends left by earlier calls; other figure legends are kept.
    for stale in list(fig.legends):
        if stale.get_gid() == "role-legend":
            stale.remove()

    handles = [
        Line2D([0],[0], marker=st["marker"], color='w',
               markerfacecolor=st["color"], markersize=8,
               markeredgecolor="k", label=role)
        for role, st in ROLE_STYLES.items()
    ]
    # Anchored slightly above the figure's top edge so it does not overlap the title
    leg = fig.legend(handles=handles, ncol=len(handles), loc="upper center",
                     bbox_to_anchor=(0.5, 1.02), frameon=False)
    leg.set_gid("role-legend")
    return leg

# ---------- Animation ----------
def animate_play(df_in, df_out, game_id:int, play_id:int, max_frames:int=30, fps:int=10):
    """Build a matplotlib animation of the output-frame movement for one play.

    Parameters
    ----------
    df_in, df_out : pandas.DataFrame
        Input and output tracking tables for the week. The code reads
        game_id, play_id, nfl_id, frame_id, x, y from both, plus
        ball_land_x, ball_land_y and player_role from df_in.
    game_id, play_id : int
        Play to animate.
    max_frames : int, default 30
        Upper bound on the last animated frame; capped at the play's largest
        output frame_id.
    fps : int, default 10
        Playback speed; the frame interval is 1000 // fps milliseconds.

    Returns
    -------
    matplotlib.animation.FuncAnimation
        The figure is closed before returning so it is not also displayed
        as a static image.

    Raises
    ------
    ValueError
        If the play has no rows in df_in or in df_out.
    """
    in_play  = df_in .query("game_id==@game_id & play_id==@play_id").sort_values("frame_id")
    out_play = df_out.query("game_id==@game_id & play_id==@play_id").sort_values("frame_id")
    if in_play.empty or out_play.empty:
        raise ValueError("No data for the chosen game/play in this week.")

    ball_x = float(in_play["ball_land_x"].iloc[0])
    ball_y = float(in_play["ball_land_y"].iloc[0])
    max_frame = int(out_play["frame_id"].max())
    max_frames = int(min(max_frames, max_frame))

    # Role per player, looked up once
    role_map = in_play.groupby("nfl_id")["player_role"].first()

    # Last input row per player. in_play is sorted by frame_id, so keep="last"
    # picks the final input frame - the position the output track continues
    # from. (Taking the first row made every trail start with a jump from
    # the beginning of the input track.)
    start_pos = in_play.drop_duplicates(subset="nfl_id", keep="last").set_index("nfl_id")

    # Per-player tracks are prepared once so each frame only slices lists
    # instead of filtering DataFrames.
    tracks = []  # (frame_ids, xs, ys, number of leading start points, style)
    for nfl_id, g in out_play.groupby("nfl_id"):
        if nfl_id in start_pos.index:
            lead_x = [start_pos.at[nfl_id, "x"]]
            lead_y = [start_pos.at[nfl_id, "y"]]
        else:
            # No input rows for this player: draw the output track on its own
            lead_x, lead_y = [], []
        role = role_map.get(nfl_id, "Other Route Runner")
        st = ROLE_STYLES.get(role, DEF_FALLBACK)
        tracks.append((
            g["frame_id"].to_numpy(),
            lead_x + g["x"].tolist(),
            lead_y + g["y"].tolist(),
            len(lead_x),
            st,
        ))

    fig, ax = plt.subplots(figsize=(12, 6))
    draw_field(ax)
    # The legend lives on the figure, not the axes, so it survives ax.clear()
    # and is created only once rather than once per frame.
    make_legend(fig)

    # clear, re-draw each frame
    def draw_frame(t):
        ax.clear()
        draw_field(ax)

        # Title with padding so it doesn't collide with legend
        ax.set_title(
            f"Game {game_id}  |  Play {play_id}  |  t = {t/10:.1f}s",
            loc="center", pad=26, fontsize=13, fontweight="bold"
        )

        # ball landing
        ax.scatter(ball_x, ball_y, s=180, c="black", marker="X", zorder=4, label="Ball landing")

        # plot players
        for frame_ids, xs, ys, n_lead, st in tracks:
            # frame_ids is ascending, so this counts the rows with frame_id <= t
            n_seen = int(np.searchsorted(frame_ids, t, side="right"))
            if n_seen == 0:  # not yet started
                continue
            end = n_lead + n_seen

            # trail
            ax.plot(xs[:end], ys[:end], lw=1.6, alpha=0.7, color=st["color"], zorder=3)
            # current point
            ax.scatter(xs[end-1], ys[end-1], s=st["size"], marker=st["marker"],
                       facecolors=st["color"], edgecolors="k", linewidth=0.7, zorder=5)

        # neat frame label
        ax.text(0.01, 0.02, f"Frame {t}/{max_frame}", transform=ax.transAxes,
                fontsize=9, bbox=dict(boxstyle="round", facecolor="w", alpha=0.8))

        return ax

    ani = FuncAnimation(fig, draw_frame, frames=range(0, max_frames+1),
                        interval=1000 // max(1, int(fps)), blit=False)
    plt.close(fig)
    return ani

# ---------- Widgets ----------
# Selection controls: week -> game -> play
_week_choices = [(f"Week {w:02d}", w) for w in range(1, 19)]
week_dd = W.Dropdown(
    options=_week_choices,
    value=12,
    description="Week:",
    layout=W.Layout(width="180px"),
)
game_dd = W.Dropdown(
    options=[],
    description="Game:",
    layout=W.Layout(width="220px"),
)
play_dd = W.Dropdown(
    options=[],
    description="Play:",
    layout=W.Layout(width="220px"),
)

# Animation controls; continuous_update=False is set on both sliders
frames_sl = W.IntSlider(
    value=30, min=5, max=100, step=5,
    description="Frames:", continuous_update=False,
)
fps_sl = W.IntSlider(
    value=10, min=5, max=20, step=1,
    description="FPS:", continuous_update=False,
)
render_bt = W.Button(
    description="Render Animation",
    button_style="primary",
    icon="film",
)

# Output area for status messages and the rendered animation
out = W.Output()

def refresh_games(*_):
    """Reload the selected week and repopulate the game and play dropdowns.

    A "Loading week…" message is shown in `out` while the week loads and is
    cleared once the dropdowns are populated. If loading fails, the error is
    printed in `out` (same "Error: ..." format as render) and both dropdowns
    are emptied.
    """
    with out:
        out.clear_output()
        print("Loading week…")
    try:
        df_in, df_out = load_week(week_dd.value)
    except Exception as e:
        with out:
            out.clear_output()
            print("Error:", e)
        game_dd.options = []
        play_dd.options = []
        return
    games = df_out["game_id"].drop_duplicates().sort_values().tolist()
    game_dd.options = games
    game_dd.value = games[0] if games else None
    refresh_plays()
    # Loading finished: remove the progress message
    out.clear_output()

def refresh_plays(*_):
    """Fill the play dropdown with the sorted unique play ids of the selected game.

    With no game selected the play options are emptied and nothing else is
    touched; otherwise the first play (or None when there is none) is selected.
    """
    selected_game = game_dd.value
    if selected_game is None:
        play_dd.options = []
        return

    _, week_outputs = load_week(week_dd.value)
    in_game = week_outputs["game_id"] == selected_game
    plays = week_outputs.loc[in_game, "play_id"].drop_duplicates().sort_values().tolist()

    play_dd.options = plays
    if plays:
        play_dd.value = plays[0]
    else:
        play_dd.value = None

def render(*_):
    """Render the selected play as a JS/HTML animation inside `out`.

    The output area is cleared first. If no game or play is selected, a short
    message is printed instead of attempting to render (previously this
    surfaced as a cryptic int(None) error). Any exception raised while
    loading or animating is printed as "Error: ..." rather than propagated.
    """
    out.clear_output()
    with out:
        if game_dd.value is None or play_dd.value is None:
            print("Error: select a game and a play before rendering.")
            return
        try:
            df_in, df_out = load_week(week_dd.value)
            ani = animate_play(df_in, df_out,
                               game_id=int(game_dd.value),
                               play_id=int(play_dd.value),
                               max_frames=int(frames_sl.value),
                               fps=int(fps_sl.value))
            display(HTML(ani.to_jshtml()))
        except Exception as e:
            print("Error:", e)

# Wire the callbacks: changing the week reloads the game list, changing the
# game reloads the play list, and the button renders the selected play.
week_dd.observe(refresh_games, names="value")
game_dd.observe(refresh_plays, names="value")
render_bt.on_click(render)

# Initial populate: load the default week so the game and play dropdowns are
# filled before the UI is shown.
refresh_games()

# Layout: selectors on the first row, animation controls on the second; the
# output area is displayed below the controls.
ui = W.VBox([
    W.HBox([week_dd, game_dd, play_dd]),
    W.HBox([frames_sl, fps_sl, render_bt]),
])
display(ui, out)