In [None]:
# ===============
# libraries
# ===============
import os, gc, glob, pickle, warnings
import random, math, time
import joblib, pickle, itertools
from pathlib import Path
from collections import defaultdict

import numpy as np
import pandas as pd
from pandas import DataFrame
import polars as pl
from tqdm import tqdm

# visualization
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
sns.set(style="whitegrid")  # ノートブック用スタイル設定（任意）

from category_encoders import OrdinalEncoder

from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

import lightgbm as lgb
from lightgbm import LGBMRegressor

import warnings
warnings.filterwarnings('ignore')

from datetime import datetime
# Run date as a YYYYMMDD string, echoed so the notebook log records when it ran.
date = f"{datetime.now():%Y%m%d}"
print(f"TODAY is {date}")

In [None]:
# ===============
# utils
# ===============
def sep(word, num=80):
    """Print `word` framed above and below by a rule of '=' characters.

    Args:
        word: Text printed between the two rules.
        num (int): Width of both rules in characters.
    """
    rule = "=" * num
    # The closing rule used to be hard-coded to 80 characters, ignoring `num`.
    print(rule)
    print(word)
    print(rule)

def show_df(df, num=3, showtail=False):
    """Print the shape of `df` and display its first `num` rows.

    When `showtail` is truthy the last `num` rows are displayed as well.
    Uses the notebook-provided `display` for rich output.
    """
    print(df.shape)
    previews = [df.head(num)]
    if showtail:
        previews.append(df.tail(num))
    for preview in previews:
        display(preview)

def glob_walk(root: Path, glob_str: str) -> list:
    """Return the paths under `root` that match `glob_str`, in sorted order."""
    return sorted(Path(root).glob(glob_str))

def seed_everything(seed, GPU=False):
    """Seed the random number generators used in this notebook.

    Seeds Python's `random`, NumPy's global RNG and sets PYTHONHASHSEED.
    When `GPU` is true, PyTorch is seeded as well and cuDNN is switched to
    deterministic mode.

    Args:
        seed (int): Seed value applied to every generator.
        GPU (bool): Also seed PyTorch / CUDA and configure cuDNN.
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    if GPU:
        # torch is not imported at the top of the notebook, so the GPU branch
        # used to fail with NameError; import it only when it is needed.
        import torch

        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
seed_everything(0)

In [None]:
class PATHS:
    """Directory locations used by this notebook."""

    # Competition data mounted by Kaggle
    input_dir = Path("/kaggle/input/nfl-big-data-bowl-2026-prediction")
    # Writable working directory
    output_dir = Path("/kaggle/working")

### Check Directories

In [None]:
# List every entry directly under the competition input directory.
glob_walk(PATHS.input_dir, "*")

In [None]:
# List the files inside the train/ sub-directory.
glob_walk(PATHS.input_dir, "train/*")

> Note: The train folder contains input_*.csv and output_*.csv files; each group has 18 files.

# Check Dataset

In [None]:
# ==================
# Submission
# ==================
# Load the sample submission and preview it.
sub = pl.read_csv(PATHS.input_dir.joinpath("sample_submission.csv"))
sep("Submission")
show_df(sub)

> NOTE: id is a concatenated identifier in format {game_id}_{play_id}_{nfl_id}_{frame_id}.

| Column | Explanation | Detail | 
| ---|--- |--- |
| x |  Player position along the long axis of the field| generally within 0 - 120 yards. (numeric)
| y |  Player position along the short axis of the field| generally within 0 - 53.3 yards. (numeric)

In [None]:
# ==================
# train (input)
# ==================
# Collect the training input files and preview the first one.
train_inputs = glob_walk(PATHS.input_dir.joinpath("train"), "input*.csv")
train_ip = pl.read_csv(train_inputs[0])
sep(train_inputs[0].name)
show_df(train_ip)

In [None]:
# ==================
# test_input.csv
# ==================
# Load the test input file and preview it.
test_ip = pl.read_csv(PATHS.input_dir.joinpath("test_input.csv"))
sep("test_input.csv")
show_df(test_ip)

| Column | Explanation | Detail | 
| ---|--- |--- | 
| game_id | Game identifier | unique (numeric) |
| play_id | Play identifier | not unique across games (numeric) |
| player_to_predict | whether or not the x/y prediction for this player will be scored (bool) |
| nfl_id | Player identification number | unique across players (numeric) |
| frame_id | Frame identifier for each play/type | starting at 1 for each game_id/play_id/file type (input or output) (numeric) |
| play_direction | Direction that the offense is moving (left or right) |
| absolute_yardline_number | Distance from end zone for possession team (numeric) |
| player_name | |
| player_height | player height (ft-in) |
| player_weight | player weight (lbs) |
| player_birth_date | birth date (yyyy-mm-dd) |
| player_position | the player's position (the specific role on the field that they typically play) |
| player_side | team player is on (Offense or Defense) |
| player_role | role player has on play (Defensive Coverage, Targeted Receiver, Passer, or Other Route Runner) |
| x | Player position along the long axis of the field | generally within 0 - 120 yards. (numeric) |
| y | Player position along the short axis of the field | generally within 0 - 53.3 yards. (numeric) |
| s | Speed in yards/second (numeric) |
| a | Acceleration in yards/second^2 (numeric) |
| o | orientation of player (deg) |
| dir | angle of player motion (deg) |
| num_frames_output | Number of frames to predict in output data for the given game_id/play_id/nfl_id. (numeric) |
| ball_land_x | Ball landing position along the long axis of the field | generally within 0 - 120 yards. (numeric) |
| ball_land_y | Ball landing position along the short axis of the field | generally within 0 - 53.3 yards. (numeric) |

> NOTE: The columns are the same as in test_input.csv.

In [None]:
# ==================
# train (output)
# ==================
# Collect the training output files and preview the first one.
train_outputs = glob_walk(PATHS.input_dir.joinpath("train"), "output*.csv")
train_op = pl.read_csv(train_outputs[0])
sep(train_outputs[0].name)
show_df(train_op)

In [None]:
# ==================
# test.csv (output)
# ==================
# Load test.csv and preview it.
test = pl.read_csv(PATHS.input_dir.joinpath("test.csv"))
sep("test.csv")
show_df(test)

| Column | Explanation | Detail | 
| ---|--- |--- | 
| game_id|  Game identifier|  unique (numeric) | 
| play_id|  Play identifier|  not unique across games (numeric) |
| nfl_id|  Player identification number|  unique across players. (numeric) |
| frame_id|  Frame identifier for each play/type | starting at 1 for each game_id/play_id/ file type (input or output). The maximum value for a given game_id, play_id and nfl_id will be the same as the num_frames_output value from the corresponding input file. (numeric) |
| x|  Player position along the long axis of the field |  generally within 0-120 yards. (TARGET TO PREDICT) |
| y|  Player position along the short axis of the field |  generally within 0 - 53.3 yards. (TARGET TO PREDICT) |

> NOTE: Columns of "x" and "y" are added to test.csv

# Check & understand columns

In [None]:
from matplotlib_venn import venn2
def show_duplicated(df_train, df_test, columns, labels=("input", "output")):
    """Draw one Venn diagram per column comparing the unique values of two frames.

    Args:
        df_train (polars.DataFrame): First frame (left circle).
        df_test (polars.DataFrame): Second frame (right circle).
        columns (list[str]): Columns, present in both frames, to compare.
        labels (tuple[str, str]): Labels for the two circles. Defaults to
            ('input', 'output'), the labels this function always used.

    Returns:
        None. The figure is shown with `plt.show()`.
    """
    columns = list(columns)
    if not columns:
        # Nothing to compare, so there is no grid to build.
        return

    # Subplot grid: three diagrams per row
    n_cols = 3
    n_rows = (len(columns) + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 2))
    axes = axes.flatten()  # 2-D grid -> flat sequence of axes

    for ax, col in zip(axes, columns):
        # Unique values of this column in each frame
        left_values = set(df_train[col].unique())
        right_values = set(df_test[col].unique())

        venn2([left_values, right_values], labels, ax=ax)
        ax.set_title(col)

    # Remove the panels left over in the last row
    for ax in axes[len(columns):]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()

In [None]:
# ===============
# Read data
# ===============
def _read_tagged_csvs(paths, desc):
    """Read every CSV in `paths`, add a `filename` column, and stack the frames."""
    frames = []
    # total must be the number of files; it used to be the length of the
    # still-empty accumulator list, i.e. always 0.
    for _path in tqdm(paths, total=len(paths), desc=desc):
        _df = pl.read_csv(_path)
        _df = _df.with_columns(pl.lit(str(os.path.basename(_path))).alias("filename"))
        frames.append(_df)
    return pl.concat(frames)


input_df = _read_tagged_csvs(train_inputs, "input")
output_df = _read_tagged_csvs(train_outputs, "output")
gc.collect()

In [None]:
# Preview and summary statistics for the stacked input frame
sep("input_df")
show_df(input_df)
display(input_df.describe())
print()

# Same for the stacked output frame
sep("output_df")
show_df(output_df)
display(output_df.describe())
print()

> - There is no null data.

# Columns which are in both input and output
---
### game_id,play_id,nfl_id,frame_id

In [None]:
# Venn diagrams of the unique id values shared between the input and output frames.
show_duplicated(input_df, output_df, ["game_id","play_id","nfl_id","frame_id"])

### game_id 
- Game identifier | unique (numeric)

In [None]:
# ===================
# Check elements
# ===================
# Compare the unique game_id values of each week's input and output file.
for i in range(1, 7): # 19
    _file_ip = f"input_2023_w{i:02d}.csv"
    _file_op = f"output_2023_w{i:02d}.csv"

    _ip_df = input_df.filter(pl.col("filename") == _file_ip)
    _op_df = output_df.filter(pl.col("filename") == _file_op)

    _ip_id = sorted(_ip_df["game_id"].unique().to_list())
    _op_id = sorted(_op_df["game_id"].unique().to_list())
    print(f"w{i:02d}:{len(_ip_id)}|{len(_op_id)}\n\t{_ip_id}\n\t{_op_id}")

In [None]:
# Running count of distinct game_id values as each weekly input file is added.
ip_ids = set()
for i in range(1, 19):
    _file_ip = f"input_2023_w{i:02d}.csv"
    _ip_df = input_df.filter(pl.col("filename") == _file_ip)
    _ip_id = set(_ip_df["game_id"].unique().to_list())
    ip_ids |= _ip_id
    print(f"{i}: {len(ip_ids)}", end=", ")

> - Exact match for every input–output element.

### play_id
- Play identifier | not unique across games (numeric)

In [None]:
# ===================
# Check elements
# ===================
# Compare the unique play_id values of each week's input and output file (first 20 shown).
for i in range(1, 4): # 19
    _file_ip = f"input_2023_w{i:02d}.csv"
    _file_op = f"output_2023_w{i:02d}.csv"

    _ip_df = input_df.filter(pl.col("filename") == _file_ip)
    _op_df = output_df.filter(pl.col("filename") == _file_op)

    _ip_id = sorted(_ip_df["play_id"].unique().to_list())
    _op_id = sorted(_op_df["play_id"].unique().to_list())
    print(f"w{i:02d}:{len(_ip_id)}|{len(_op_id)}\n\t{_ip_id[:20]}\n\t{_op_id[:20]}")

> - Exact match for every input–output element.
> - Common elements exist between the CSV files.

### nfl_id
- Player identification number | unique across players (numeric)


In [None]:
# ===================
# Check elements
# ===================
# Compare the unique nfl_id values of each week's input and output file (first 20 shown).
for i in range(1, 4): # 19
    _file_ip = f"input_2023_w{i:02d}.csv"
    _file_op = f"output_2023_w{i:02d}.csv"

    _ip_df = input_df.filter(pl.col("filename") == _file_ip)
    _op_df = output_df.filter(pl.col("filename") == _file_op)

    _ip_id = sorted(_ip_df["nfl_id"].unique().to_list())
    _op_id = sorted(_op_df["nfl_id"].unique().to_list())
    print(f"w{i:02d}:{len(_ip_id)}|{len(_op_id)}\n\t{_ip_id[:20]}\n\t{_op_id[:20]}")

> - Not an exact match for every input–output element.
> - Common elements exist between the CSV files.

### frame_id
- Frame identifier for each play/type | starting at 1 for each game_id/play_id/file type (input or output) (numeric)

In [None]:
# ===================
# Check elements
# ===================
# Compare the unique frame_id values of each week's input and output file (first 20 shown).
for i in range(1, 4): # 19
    _file_ip = f"input_2023_w{i:02d}.csv"
    _file_op = f"output_2023_w{i:02d}.csv"

    _ip_df = input_df.filter(pl.col("filename") == _file_ip)
    _op_df = output_df.filter(pl.col("filename") == _file_op)

    _ip_id = sorted(_ip_df["frame_id"].unique().to_list())
    _op_id = sorted(_op_df["frame_id"].unique().to_list())
    print(f"w{i:02d}:{len(_ip_id)}|{len(_op_id)}\n\t{_ip_id[:20]}\n\t{_op_id[:20]}")

---

# Columns which are only in input

### player_to_predict 
- whether or not the x/y prediction for this player will be scored (bool)

In [None]:
# Per-file row count, number of True flags and (by subtraction) number of False flags.
_stat = (
    input_df
    .group_by("filename")
    .agg(
        pl.col("player_to_predict").count().name.suffix("_count"),
        pl.col("player_to_predict").sum().name.suffix("_true"),
    )
    .with_columns(
        (pl.col("player_to_predict_count") - pl.col("player_to_predict_true"))
        .alias("player_to_predict_false")
    )
    .sort("filename")
)

show_df(_stat)

In [None]:
# visualization
# Grouped bar chart of True / False counts, one group per file.
df_plot = (
    _stat
    .select(["filename", "player_to_predict_true", "player_to_predict_false"])
    .to_pandas()
    .set_index("filename")
)

ax = df_plot.plot(kind="bar", figsize=(10, 2))
ax.set_title("True/False counts per filename")
ax.set_ylabel("count")
ax.set_xlabel("filename")
plt.xticks(rotation=45, ha="right")
ax.legend(["True", "False"])
plt.tight_layout()
plt.show()

### play_direction
- Direction that the offense is moving (left or right)

In [None]:
# Row counts per file, pivoted so each play_direction value becomes its own column.
# NOTE(review): newer polars releases may expect `on=` instead of `columns=` in
# pivot — confirm against the installed version.
_stat = (
    input_df
    .group_by(["filename", "play_direction"])
    .agg(pl.len().alias("count"))
    .pivot(
        values="count",
        index="filename",
        columns="play_direction"
    )
    .sort("filename")
)

show_df(_stat)

In [None]:
# visualization
# Grouped bar chart of right / left row counts, one group per file.
df_plot = (
    _stat
    .select(["filename", "right", "left"])
    .to_pandas()
    .set_index("filename")
)

ax = df_plot.plot(kind="bar", figsize=(10,2))
# The title used to read "True/False counts per filename", copied from the
# player_to_predict chart; this chart shows play_direction counts.
plt.title("play_direction counts per filename")
plt.ylabel("count")
plt.xlabel("filename")
plt.xticks(rotation=45, ha="right")
plt.legend(["right", "left"])  # same order as the selected columns
plt.tight_layout()
plt.show()

# Columns which are player features
--- 
1. player_name, 2. player_height, 3. player_weight, 4. player_birth_date, 5. player_position, 6. player_side, 7. player_role

In [None]:
# Distinct combinations of the player attribute columns.
player_cols = [
    "player_name",
    "player_height",
    "player_weight",
    "player_birth_date",
    "player_position",
    "player_side",
    "player_role",
]
df_players = input_df.select(player_cols).unique()
show_df(df_players)

### player_name

In [None]:
# Distinct player names
unique_players = df_players["player_name"].unique().to_list()
print(f"n of unique_players: {len(unique_players)}")

In [None]:
# Number of distinct attribute rows per player name, most frequent first.
# pl.len() replaces the deprecated GroupBy.count() and matches the aggregation
# style used for play_direction above; the output column is still "count".
(
    df_players
    .group_by("player_name")
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

In [None]:
# Inspect the attribute rows recorded for one player with several entries.
df_players.filter(pl.col("player_name")=="Taysom Hill")

In [None]:
# Inspect the attribute rows recorded for a second player.
df_players.filter(pl.col("player_name")=="Derrick Henry")

In [None]:
# Distinct (name, height, weight, birth date) combinations, ignoring position/side/role.
df_players.select(["player_name", "player_height", "player_weight", "player_birth_date"]).unique()

> - There are no duplicated player_name values
> - Nevertheless, some players have 2 or 3 player_roles

### player_position
- the player's position (the specific role on the field that they typically play)	

In [None]:
# Distinct player positions
unique_position = df_players["player_position"].unique().to_list()
print(f"n of unique_position: {len(unique_position)}")
print(unique_position)

In [None]:
# Distinct (position, side, role) combinations, ordered by position then side.
(df_players.select(["player_position", "player_side", "player_role"])
.unique()
.sort(["player_position", "player_side"]))

| Category        | Abbr. | Position (English) | Position (Japanese) | Main role |
|-----------------|-------|--------------------|----------------------|-----------|
| **Offense**     | QB    | Quarterback        | クォーターバック     | Leader of the offense. Receives the snap and decides whether to pass, hand off, or run. |
| Offense         | RB    | Running Back       | ランニングバック     | Primary ball carrier. Runs with the ball on rushing plays. |
| Offense         | FB    | Fullback           | フルバック           | Powerful blocker, also used for short-yardage runs. |
| Offense         | WR    | Wide Receiver      | ワイドレシーバー     | Specializes in catching passes. Lines up wide on the field. |
| Offense         | TE    | Tight End          | タイトエンド         | Hybrid role: blocks like a lineman, catches passes like a receiver. |
| Offense         | T     | Offensive Tackle   | タックル（攻撃側）  | Protects the QB and blocks for the RB. Positioned at the ends of the offensive line. |
| **Defense**     | DT    | Defensive Tackle   | ディフェンスタックル | Interior lineman. Stops the run and pressures the QB. |
| Defense         | NT    | Nose Tackle        | ノーズタックル       | Lines up directly over the center. Specializes in run stopping. |
| Defense         | DE    | Defensive End      | ディフェンスエンド   | Edge rusher. Contains outside runs and rushes the QB from the edge. |
| Defense         | LB    | Linebacker         | ラインバッカー       | Versatile defender: tackles runners, covers passes, blitzes QB. |
| Defense         | ILB   | Inside Linebacker  | インサイドLB         | Central LB. Strong in run defense, often defensive leader. |
| Defense         | MLB   | Middle Linebacker  | ミドルLB             | Similar to ILB. Directs defensive plays. |
| Defense         | OLB   | Outside Linebacker | アウトサイドLB       | Covers outside runs, rushes the QB, may cover TE/RB. |
| Defense         | CB    | Cornerback         | コーナーバック       | Covers WRs in man-to-man or zone coverage. Defends passes. |
| Defense         | S     | Safety             | セイフティ           | Last line of defense. Protects against deep passes. |
| Defense         | FS    | Free Safety        | フリーセイフティ     | Focuses on pass coverage with freedom to roam. |
| Defense         | SS    | Strong Safety      | ストロングセイフティ | Plays closer to the line, strong in run defense, covers TE. |
| **Special Teams** | K   | Kicker             | キッカー             | Kicks field goals, extra points, and kickoffs. |
| Special Teams   | P     | Punter             | パンター             | Punts the ball to change field position. |

### Player height / weight
- player height (ft-in)	/ player weight (lbs)	

In [None]:
def _height_to_cm(height):
    """Convert a 'feet-inches' string to centimetres (1 in = 2.54 cm)."""
    parts = height.split("-")
    return int(parts[0]) * 12 * 2.54 + int(parts[1]) * 2.54


# Add metric versions of height (cm) and weight (kg).
df_players = df_players.with_columns(
    pl.col("player_height")
    .map_elements(_height_to_cm, return_dtype=pl.Float32)
    .alias("player_height_cm"),
    (pl.col("player_weight") * 0.453592).alias("player_weight_kg"),
)

In [None]:
# Scatter plot: height vs weight, one color per player position
import matplotlib.cm as cm  # kept: the x/y scatter cell below still references `cm`

df_plot = (
    df_players
    .select(["player_height_cm", "player_weight_kg", "player_position"])
    .drop_nulls()
    .to_pandas()
)

fig, ax = plt.subplots(figsize=(8, 6))
positions = sorted(df_plot["player_position"].dropna().unique())
# matplotlib.cm.get_cmap is no longer available in current matplotlib releases;
# pyplot.get_cmap takes the same (name, lut) arguments.
cmap = plt.get_cmap("tab20", len(positions))

for i, pos in enumerate(positions):
    # Not named `sub`: that would overwrite the sample-submission frame loaded earlier.
    pos_rows = df_plot[df_plot["player_position"] == pos]
    ax.scatter(
        pos_rows["player_height_cm"],
        pos_rows["player_weight_kg"],
        s=10, alpha=0.6,
        label=pos,
        color=cmap(i)
    )

ax.set_xlabel("Player height (cm)")
ax.set_ylabel("Player weight (kg)")
ax.set_title("Height vs Weight colored by Player Position")
ax.grid(True, linestyle="--", linewidth=0.5, alpha=0.5)
ax.legend(ncol=2, fontsize=8, frameon=False)

plt.tight_layout()
plt.show()

---

# Columns related to position and movement

| Column            | Meaning | Explanation |
|-------------------|---------|-------------|
| **x**             | Player’s position (long axis) | Position of the player along the **long axis** of the field. Numeric, generally within 0–120 yards. |
| **y**             | Player’s position (short axis) | Position of the player along the **short axis** of the field. Numeric, generally within 0–53.3 yards. |
| **s**             | Speed | Speed of the player in yards/second. |
| **a**             | Acceleration | Acceleration of the player in yards/second². Positive = speeding up, negative = slowing down. |
| **o**             | Orientation | Orientation of the player in degrees. Direction the body is facing (0° = north, 90° = east, etc.). |
| **dir**           | Motion direction | Angle of the player’s actual motion (movement direction) in degrees. May differ from orientation. |
| **ball_land_x**   | Ball landing (long axis) | Ball landing position along the **long axis** of the field. Numeric, generally within 0–120 yards. |
| **ball_land_y**   | Ball landing (short axis) | Ball landing position along the **short axis** of the field. Numeric, generally within 0–53.3 yards. |

 ### absolute_yardline_number	
 - Distance from end zone for possession team (numeric)

In [None]:
# Histogram of absolute_yardline_number over all input rows
series = input_df["absolute_yardline_number"].to_pandas()
fig, ax = plt.subplots(figsize=(8, 3))
series.hist(bins=30, edgecolor="black", ax=ax)
ax.set_title("Distribution of Absolute Yardline Number")
ax.set_xlabel("Yards from offense end zone")
ax.set_ylabel("Frequency")
ax.grid(axis="y", linestyle="--", alpha=0.7)
fig.tight_layout()
plt.show()

### x / y
- x | Player position along the long axis of the field | generally within 0 - 120 yards. (numeric)
- y | Player position along the short axis of the field | generally within 0 - 53.3 yards. (numeric)

In [None]:
# Scatter plot
# Scatter of player x/y positions, one color per player position
df_plot = (
    input_df
    .select(["x", "y", "player_position"])
    .drop_nulls()
    .to_pandas()
)
# Plot at most MAX_POINTS rows, sampled with a fixed seed
MAX_POINTS = 50_000
if len(df_plot) > MAX_POINTS:
    df_plot = df_plot.sample(n=MAX_POINTS, random_state=42)

fig, ax = plt.subplots(figsize=(10, 3))
positions = sorted(df_plot["player_position"].dropna().unique())
# pyplot.get_cmap instead of matplotlib.cm.get_cmap: the latter is no longer
# available in current matplotlib, and `cm` was only imported in another cell.
cmap = plt.get_cmap("tab20", len(positions))

for i, pos in enumerate(positions):
    # Not named `sub`: that would overwrite the sample-submission frame loaded earlier.
    pos_rows = df_plot[df_plot["player_position"] == pos]
    ax.scatter(
        pos_rows["x"],
        pos_rows["y"],
        s=10, alpha=0.6,
        label=pos,
        color=cmap(i)
    )

ax.set_xlabel("x (yards)")
ax.set_ylabel("y (yards)")
ax.set_title("Player Position")
ax.grid(True, linestyle="--", linewidth=0.5, alpha=0.5)
ax.legend(ncol=2, fontsize=8, frameon=False)

plt.tight_layout()
plt.show()

In [None]:
cols = ["x", "y", "s", "a", "o", "dir", "ball_land_x", "ball_land_y"]
df_plot = input_df.select(cols).to_pandas()

# One histogram panel per column on a 2 x 4 grid
fig, axes = plt.subplots(2, 4, figsize=(15, 8))
axes = axes.flatten()

for i, col in enumerate(cols):
    ax = axes[i]
    df_plot[col].hist(bins=30, edgecolor="black", ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax.grid(axis="y", linestyle="--", alpha=0.7)

plt.tight_layout()
plt.show()

In [None]:
cols = ["x", "y", "s", "a", "o", "dir", "ball_land_x", "ball_land_y"]

# Pull play_direction alongside the numeric columns (Polars -> pandas)
df_plot = input_df.select(["play_direction"] + cols).to_pandas()

fig, axes = plt.subplots(2, 4, figsize=(15, 8))
axes = axes.flatten()

directions = ["left", "right"]
# Fixed color per direction; previously defined but never passed to hist().
colors = {"left": "tab:blue", "right": "tab:orange"}

for i, col in enumerate(cols):
    ax = axes[i]
    # Shared bin edges, computed from both directions together
    data_all = df_plot[col].dropna().to_numpy()
    bins = np.histogram_bin_edges(data_all, bins=30)

    for d in directions:
        data = df_plot.loc[df_plot["play_direction"] == d, col].dropna().to_numpy()
        ax.hist(
            data,
            bins=bins,
            alpha=0.45,          # semi-transparent so the overlay stays readable
            label=d,
            color=colors[d],
            edgecolor="black",
            density=False        # set True to compare shapes (probability density)
        )

    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax.grid(axis="y", linestyle="--", alpha=0.7)
    ax.legend(title="play_direction")

plt.tight_layout()
plt.show()

In [None]:
cols = ["x", "y", "s", "a", "o", "dir", "ball_land_x", "ball_land_y"]

# Pull player_side alongside the numeric columns (Polars -> pandas).
# (The comment here used to say play_direction, copied from the previous cell.)
df_plot = input_df.select(["player_side"] + cols).to_pandas()

fig, axes = plt.subplots(2, 4, figsize=(15, 8))
axes = axes.flatten()

# `directions` holds the two player_side values in this cell.
directions = ["Offense", "Defense"]
# Fixed color per side; previously defined but never passed to hist().
colors = {"Offense": "tab:blue", "Defense": "tab:orange"}

for i, col in enumerate(cols):
    ax = axes[i]
    # Shared bin edges, computed from both sides together
    data_all = df_plot[col].dropna().to_numpy()
    bins = np.histogram_bin_edges(data_all, bins=30)

    for d in directions:
        data = df_plot.loc[df_plot["player_side"] == d, col].dropna().to_numpy()
        ax.hist(
            data,
            bins=bins,
            alpha=0.45,          # semi-transparent so the overlay stays readable
            label=d,
            color=colors[d],
            edgecolor="black",
            density=False        # set True to compare shapes (probability density)
        )

    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax.grid(axis="y", linestyle="--", alpha=0.7)
    ax.legend(title="player_side")

plt.tight_layout()
plt.show()

In [None]:
# Add x/y components of speed and acceleration
input_df = (
    input_df
    # Convert the tracking angle (deg; 0 = north, 90 = east per the column
    # notes) to a math angle in radians. The motion direction `dir` is used:
    # `s` is speed along the direction of movement, while `o` is only where the
    # body is facing, so projecting along `o` (as before) gives wrong components.
    .with_columns(
        ( (90.0 - pl.col("dir")) * np.pi / 180.0 ).alias("theta_rad")
    )
    # Vector components. `a` is a scalar, so a_x / a_y are its projection along
    # the motion direction — NOTE(review): confirm this is the intended meaning.
    .with_columns([
        (pl.col("s") * pl.col("theta_rad").cos()).alias("s_x"),
        (pl.col("s") * pl.col("theta_rad").sin()).alias("s_y"),
        (pl.col("a") * pl.col("theta_rad").cos()).alias("a_x"),
        (pl.col("a") * pl.col("theta_rad").sin()).alias("a_y"),
    ])
    .drop("theta_rad")  # intermediate column, not needed afterwards
)

show_df(input_df)

In [None]:
cols = ["x", "y", "s_x", "s_y", "a_x", "a_y"]

# Pull play_direction alongside the numeric columns (Polars -> pandas)
df_plot = input_df.select(["play_direction"] + cols).to_pandas()

fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.flatten()

directions = ["left", "right"]
# Fixed color per direction; previously defined but never passed to hist().
colors = {"left": "tab:blue", "right": "tab:orange"}

for i, col in enumerate(cols):
    ax = axes[i]
    # Shared bin edges, computed from both directions together
    data_all = df_plot[col].dropna().to_numpy()
    bins = np.histogram_bin_edges(data_all, bins=30)

    for d in directions:
        data = df_plot.loc[df_plot["play_direction"] == d, col].dropna().to_numpy()
        ax.hist(
            data,
            bins=bins,
            alpha=0.45,          # semi-transparent so the overlay stays readable
            label=d,
            color=colors[d],
            edgecolor="black",
            density=False        # set True to compare shapes (probability density)
        )

    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax.grid(axis="y", linestyle="--", alpha=0.7)
    ax.legend(title="play_direction")

plt.tight_layout()
plt.show()

In [None]:
x_center, y_center = 60.0, 26.65
cols = ["x", "y", "s_x", "s_y", "a_x", "a_y", "o", "dir", "ball_land_x", "ball_land_y"]

_is_right = pl.col("play_direction") == "right"


def _when_right(flipped, name):
    """Use `flipped` on right-moving plays and keep column `name` otherwise."""
    return pl.when(_is_right).then(flipped).otherwise(pl.col(name)).alias(name)


# Rotate every "right" play by 180 degrees about the field centre so all plays
# share one direction: positions are mirrored in x and y, vector components
# change sign, angles shift by 180 degrees.
# s_y / a_y were previously left untouched although y was mirrored, which left
# the y-components inconsistent with the mirrored positions.
# NOTE: not idempotent — play_direction is left unchanged, so running this cell
# a second time flips the "right" plays back.
input_df = input_df.with_columns([
    # positions
    _when_right(2 * x_center - pl.col("x"), "x"),
    _when_right(2 * y_center - pl.col("y"), "y"),

    # velocity / acceleration components
    _when_right(-pl.col("s_x"), "s_x"),
    _when_right(-pl.col("s_y"), "s_y"),
    _when_right(-pl.col("a_x"), "a_x"),
    _when_right(-pl.col("a_y"), "a_y"),

    # orientation / motion direction (angles)
    _when_right((pl.col("o") + 180) % 360, "o"),
    _when_right((pl.col("dir") + 180) % 360, "dir"),

    # ball landing point
    _when_right(2 * x_center - pl.col("ball_land_x"), "ball_land_x"),
    _when_right(2 * y_center - pl.col("ball_land_y"), "ball_land_y"),
])

# Pull play_direction alongside the numeric columns (Polars -> pandas)
df_plot = input_df.select(["play_direction"] + cols).to_pandas()

fig, axes = plt.subplots(3, 4, figsize=(15, 8))
axes = axes.flatten()

directions = ["left", "right"]
colors = {"left": "tab:blue", "right": "tab:orange"}  # fixed color per direction

for i, col in enumerate(cols):
    ax = axes[i]
    # Shared bin edges, computed from both directions together
    data_all = df_plot[col].dropna().to_numpy()
    bins = np.histogram_bin_edges(data_all, bins=30)

    for d in directions:
        data = df_plot.loc[df_plot["play_direction"] == d, col].dropna().to_numpy()
        ax.hist(
            data,
            bins=bins,
            alpha=0.45,          # semi-transparent so the overlay stays readable
            label=d,
            color=colors[d],
            edgecolor="black",
            density=False        # set True to compare shapes (probability density)
        )

    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax.grid(axis="y", linestyle="--", alpha=0.7)
    ax.legend(title="play_direction")

# The 3 x 4 grid has more panels than columns; drop the empty ones.
for unused_ax in axes[len(cols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

---

### num_frames_output
- Number of frames to predict in output data for the given game_id/play_id/nfl_id. (numeric)	


In [None]:
# Distribution of num_frames_output over all training input rows
series = input_df["num_frames_output"].to_pandas()
print(f"MAX:{series.max()}\tMIN:{series.min()}\tMEAN:{series.mean():.1f}")

fig, ax = plt.subplots(figsize=(8, 3))
series.hist(bins=30, edgecolor="black", ax=ax)
ax.set_title("Distribution of num_frames_output")
ax.set_xlabel("num_frames_output")
ax.set_ylabel("Frequency")
ax.grid(axis="y", linestyle="--", alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Largest input frame_id per (game_id, play_id), i.e. input frames per play
series = (
    input_df
    .group_by(["game_id", "play_id"])
    .agg(pl.col("frame_id").max().name.suffix("_max"))
    .get_column("frame_id_max")
    .to_pandas()
)
print(f"MAX:{series.max()}\tMIN:{series.min()}\tMEAN:{series.mean():.1f}")

fig, ax = plt.subplots(figsize=(8, 3))
series.hist(bins=30, edgecolor="black", ax=ax)
ax.set_title("Distribution of num_frames_input")
ax.set_xlabel("num_frames_input")
ax.set_ylabel("Frequency")
ax.grid(axis="y", linestyle="--", alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Distribution of num_frames_output over the test input rows
series = test_ip["num_frames_output"].to_pandas()
print(f"MAX:{series.max()}\tMIN:{series.min()}\tMEAN:{series.mean():.1f}")

fig, ax = plt.subplots(figsize=(8, 3))
series.hist(bins=30, edgecolor="black", ax=ax)
ax.set_title("Distribution of num_frames_output (test)")
ax.set_xlabel("num_frames_output (test)")
ax.set_ylabel("Frequency")
ax.grid(axis="y", linestyle="--", alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Largest test input frame_id per (game_id, play_id)
series = (
    test_ip
    .group_by(["game_id", "play_id"])
    .agg(pl.col("frame_id").max().name.suffix("_max"))
    .get_column("frame_id_max")
    .to_pandas()
)
print(f"MAX:{series.max()}\tMIN:{series.min()}\tMEAN:{series.mean():.1f}")

fig, ax = plt.subplots(figsize=(8, 3))
series.hist(bins=30, edgecolor="black", ax=ax)
ax.set_title("Distribution of num_frames_input (test)")
ax.set_xlabel("num_frames_input (test)")
ax.set_ylabel("Frequency")
ax.grid(axis="y", linestyle="--", alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Per play: largest input frame_id and largest num_frames_output
series = (
    input_df
    .group_by(["game_id", "play_id"])
    .agg(
        pl.col("frame_id").max().name.suffix("_max"),
        pl.col("num_frames_output").max().name.suffix("_max"),
    )
)
x = series["frame_id_max"].to_pandas()
y = series["num_frames_output_max"].to_pandas()

print(f"Input frames:  MAX:{x.max():.0f}\tMIN:{x.min():.0f}\tMEAN:{x.mean():.1f}")
print(f"Output frames: MAX:{y.max():.0f}\tMIN:{y.min():.0f}\tMEAN:{y.mean():.1f}")

fig, ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(x=x, y=y, alpha=0.4, s=25, edgecolor="none", ax=ax)

# Show the correlation coefficient in the title
r = x.corr(y)
ax.set_title(f"Frames Input vs Output (r={r:.3f})")
ax.set_xlabel("num_frames_input")
ax.set_ylabel("num_frames_output")
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Per play: number of input frames and number of output frames
series = (
    input_df
    .group_by(["game_id", "play_id"])
    .agg(
        pl.col("frame_id").max().alias("num_frames_input"),
        pl.col("num_frames_output").max().alias("num_frames_output"),
    )
)

# Coerce both counts to float and drop rows where either is missing
_frame_cols = ["num_frames_input", "num_frames_output"]
df = series.to_pandas()
for _col in _frame_cols:
    df[_col] = pd.to_numeric(df[_col], errors="coerce")
df = df.dropna(subset=_frame_cols).astype({_col: float for _col in _frame_cols})

# Layout: hexbin in the lower-left, marginal histograms on top and on the right
fig = plt.figure(figsize=(7, 6))
gs = fig.add_gridspec(
    2, 2,
    width_ratios=(7, 1), height_ratios=(1, 7),
    wspace=0.05, hspace=0.05,
)

ax_scatter = fig.add_subplot(gs[1, 0])
hb = ax_scatter.hexbin(
    df["num_frames_input"], df["num_frames_output"],
    gridsize=35, cmap="viridis", mincnt=1,
)
ax_scatter.set_xlabel("num_frames_input")
ax_scatter.set_ylabel("num_frames_output")
fig.colorbar(hb, ax=ax_scatter, label="count")

# Top marginal: input frame counts, sharing the x axis with the hexbin
ax_histx = fig.add_subplot(gs[0, 0], sharex=ax_scatter)
sns.histplot(data=df, x="num_frames_input", bins=30, ax=ax_histx)
ax_histx.set_ylabel("")
ax_histx.tick_params(axis="x", labelbottom=False)

# Right marginal: output frame counts, sharing the y axis with the hexbin
ax_histy = fig.add_subplot(gs[1, 1], sharey=ax_scatter)
ax_histy.hist(
    df["num_frames_output"].values.astype(float),
    bins=20,
    orientation="horizontal",
    color=sns.color_palette()[1],
    edgecolor="black",
    alpha=0.8,
)
ax_histy.set_xlabel("")
ax_histy.tick_params(axis="y", labelleft=False)

plt.show()