# **FOREWORD**

This is my version of the NFL Big Data Bowl 2026 Prediction baseline model, developed from the source notebook [here](https://www.kaggle.com/code/hiwe0305/nfl-big-data-baseline). This version scores 0.94 on the leaderboard.

What I do here:
- I extend this work by customizing the seed: the stacked models are trained with several random seeds and their predictions are averaged. This is a good way to add a bit of diversity to the solution.

In [None]:
%%time 

import torch
import os, math, gc, random
import numpy as np, pandas as pd
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from sklearn.model_selection import GroupKFold
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error

# Optional gradient-boosting backends. Each import is attempted once; the
# matching HAS_* flag records whether the library is usable, and the class
# name is bound to None when it is not.
try:
    from lightgbm import LGBMRegressor
except Exception:
    LGBMRegressor, HAS_LGBM = None, False
else:
    HAS_LGBM = True

try:
    from xgboost import XGBRegressor
except Exception:
    XGBRegressor, HAS_XGB = None, False
else:
    HAS_XGB = True

try:
    from catboost import CatBoostRegressor
except Exception:
    CatBoostRegressor, HAS_CAT = None, False
else:
    HAS_CAT = True

print(f"HAS_LGBM={HAS_LGBM}  HAS_XGB={HAS_XGB}  HAS_CAT={HAS_CAT}")

In [None]:
# Global seed: seeds Python's `random`, NumPy's legacy global RNG, and a
# dedicated NumPy Generator (`rng`).
SEED = 42
rng = np.random.default_rng(SEED)
random.seed(SEED)
np.random.seed(SEED)

# `test_req=True` switches to a tiny number of boosting rounds (quick run);
# otherwise the full 2500 rounds are used.
test_req = False
nest = 10 if test_req else 2500

# **PREPROCESSING**

In [None]:
%%time 

# Bounds used later to clip predicted coordinates.
FIELD_X_MIN, FIELD_Y_MIN = 0.0, 0.0
FIELD_X_MAX, FIELD_Y_MAX = 120.0, 53.3

# Competition data layout: one input/output CSV pair per week number 1..18,
# plus the test input and the test row template.
DATA_DIR = "/kaggle/input/nfl-big-data-bowl-2026-prediction/"
TEST_INPUT_PATH = f"{DATA_DIR}test_input.csv"
TEST_TARGETS_PATH = f"{DATA_DIR}test.csv"

_WEEKS = range(1, 19)
TRAIN_IN_FILES = [f"{DATA_DIR}train/input_2023_w{week:02d}.csv" for week in _WEEKS]
TRAIN_OUT_FILES = [f"{DATA_DIR}train/output_2023_w{week:02d}.csv" for week in _WEEKS]

def height_to_inches(ht):
    """Convert a ``"feet-inches"`` height string such as ``"6-2"`` to inches.

    Args:
        ht: Height value as read from the data; expected to be a string of
            two integer parts separated by a single ``-``.

    Returns:
        ``feet * 12 + inches`` as an int, or ``np.nan`` when ``ht`` is not a
        string, does not consist of exactly two ``-``-separated parts, or
        either part is not an integer. Malformed values never raise, so the
        function is safe to use with ``Series.apply``.
    """
    if not isinstance(ht, str):
        return np.nan
    parts = ht.split("-")
    # Exactly two parts are required; "6-1-2" or "72" are treated as missing.
    if len(parts) != 2:
        return np.nan
    try:
        return int(parts[0]) * 12 + int(parts[1])
    except ValueError:
        # Non-numeric or empty part, e.g. "six-two" or "6-".
        return np.nan

def prepareLast(df_in: pd.DataFrame) -> pd.DataFrame:
    """Reduce tracking rows to one row per (game_id, play_id, nfl_id).

    Rows are ordered by ``frame_id`` within each player/play and the group-wise
    ``last()`` is taken (pandas' ``GroupBy.last`` picks, per column, the last
    non-null entry). The ``x``/``y`` columns of the result are renamed to
    ``x_last``/``y_last``.
    """
    player_keys = ["game_id", "play_id", "nfl_id"]
    ordered = df_in.sort_values(player_keys + ["frame_id"])
    latest = ordered.groupby(player_keys, as_index=False).last()
    return latest.rename(columns={"x": "x_last", "y": "y_last"})

def inject_target_receiver_xy(df_last: pd.DataFrame) -> pd.DataFrame:
    """Attach the targeted receiver's last position to every row of its play.

    Rows whose ``player_role`` is ``"Targeted Receiver"`` supply
    ``x_last``/``y_last``; these are left-joined back on (game_id, play_id) as
    ``target_x``/``target_y``. Plays without such a row get NaN in both.
    """
    play_keys = ["game_id", "play_id"]
    is_receiver = df_last["player_role"] == "Targeted Receiver"
    receiver_xy = df_last.loc[is_receiver, play_keys + ["x_last", "y_last"]]
    receiver_xy = receiver_xy.rename(columns={"x_last": "target_x", "y_last": "target_y"})
    return df_last.merge(receiver_xy, on=play_keys, how="left")

def add_features(df: pd.DataFrame, is_train=True) -> pd.DataFrame:
    """Add the engineered model features to ``df`` in place and return it.

    Reads ``frame_id``, ``num_frames_output``, ``ball_land_x/y``,
    ``x_last/y_last``, ``target_x/y``, ``player_role``, ``dir`` and ``s``.
    When ``is_train`` is true it also reads ``x``/``y`` and writes the
    regression targets ``dx``/``dy`` (displacement from the anchor position).
    """
    anchor_x = df["x_last"]
    anchor_y = df["y_last"]

    # Timing: raw frame index, frame index / 10, and frame index relative to
    # the number of output frames (floored at 1 to avoid division by zero).
    df["frame_offset"] = df["frame_id"].astype(float)
    df["time_offset"] = df["frame_offset"] / 10.0
    df["T"] = np.clip(df["num_frames_output"].astype(float), 1.0, None)
    df["t_rel"] = df["frame_offset"] / df["T"]

    # Distance and bearing from the anchor position to the ball landing spot.
    to_ball_x = df["ball_land_x"] - anchor_x
    to_ball_y = df["ball_land_y"] - anchor_y
    df["dist_to_ball"] = np.sqrt(to_ball_x**2 + to_ball_y**2).astype(float)
    bearing = np.arctan2(to_ball_y, to_ball_x).astype(float)
    df["sin_ab"] = np.sin(bearing)
    df["cos_ab"] = np.cos(bearing)

    # Distance to the targeted receiver and a flag for being that receiver.
    gap_x = df["target_x"] - anchor_x
    gap_y = df["target_y"] - anchor_y
    df["dist_to_target"] = np.sqrt(gap_x**2 + gap_y**2)
    df["is_target"] = (df["player_role"] == "Targeted Receiver").astype(int)

    # Velocity components: `dir` is in degrees; x uses sin and y uses cos.
    heading = np.deg2rad(df["dir"].astype(float))
    df["speed_x"] = df["s"] * np.sin(heading)
    df["speed_y"] = df["s"] * np.cos(heading)

    # Speed along / across the unit vector pointing at the ball; a zero
    # distance is replaced by 1e-6 so the division stays finite.
    safe_dist = df["dist_to_ball"].replace(0, 1e-6)
    unit_x = to_ball_x / safe_dist
    unit_y = to_ball_y / safe_dist
    df["v_par"] = df["speed_x"] * unit_x + df["speed_y"] * unit_y
    df["v_perp"] = df["speed_x"] * unit_y - df["speed_y"] * unit_x

    # Normalised position and log-scaled distances (missing target -> 0).
    df["x_norm"] = anchor_x / 120.0
    df["y_norm"] = anchor_y / 53.3
    df["log_dist_ball"] = np.log1p(df["dist_to_ball"])
    df["log_dist_target"] = np.log1p(df["dist_to_target"].fillna(0.0))

    if not is_train:
        return df

    df["dx"] = df["x"] - anchor_x
    df["dy"] = df["y"] - anchor_y
    return df

# Numeric model inputs, grouped by where they come from.
NUM_FEATURES = (
    # anchor position and raw tracking columns
    ["x_last", "y_last", "s", "a", "o", "dir"]
    # timing
    + ["frame_offset", "time_offset", "t_rel", "T"]
    # geometry relative to the ball landing spot / targeted receiver
    + ["dist_to_ball", "sin_ab", "cos_ab", "dist_to_target"]
    # velocity components and their projections toward the ball
    + ["speed_x", "speed_y", "v_par", "v_perp"]
    # normalised position and log distances
    + ["x_norm", "y_norm", "log_dist_ball", "log_dist_target"]
    # play / player attributes
    + ["absolute_yardline_number", "player_height", "player_weight"]
)

# Categorical model inputs.
CAT_FEATURES = ["player_role", "player_side", "play_direction"]

# Regression targets created by add_features(is_train=True).
TARGETS = ["dx", "dy"]

In [None]:
%%time 

# Keep any path constants that are already defined in this session; otherwise
# fall back to the competition's default layout.
DATA_DIR = globals().get("DATA_DIR", "/kaggle/input/nfl-big-data-bowl-2026-prediction/")
TRAIN_IN_FILES = globals().get(
    "TRAIN_IN_FILES",
    [f"{DATA_DIR}train/input_2023_w{w:02d}.csv" for w in range(1, 19)],
)
TRAIN_OUT_FILES = globals().get(
    "TRAIN_OUT_FILES",
    [f"{DATA_DIR}train/output_2023_w{w:02d}.csv" for w in range(1, 19)],
)
TEST_INPUT_PATH = globals().get("TEST_INPUT_PATH", f"{DATA_DIR}test_input.csv")
TEST_TARGETS_PATH = globals().get("TEST_TARGETS_PATH", f"{DATA_DIR}test.csv")

# Bounds used later to clip predicted coordinates.
FIELD_X_MIN, FIELD_Y_MIN = 0.0, 0.0
FIELD_X_MAX, FIELD_Y_MAX = 120.0, 53.3

def height_to_inches(ht):
    """Convert a ``"feet-inches"`` height string such as ``"6-2"`` to inches.

    Args:
        ht: Height value as read from the data; expected to be a string of
            two integer parts separated by a single ``-``.

    Returns:
        ``feet * 12 + inches`` as an int, or ``np.nan`` when ``ht`` is not a
        string, does not consist of exactly two ``-``-separated parts, or
        either part is not an integer. Malformed values never raise, so the
        function is safe to use with ``Series.apply``.
    """
    if not isinstance(ht, str):
        return np.nan
    parts = ht.split("-")
    # Exactly two parts are required; previously "6-1-2" raised on unpacking
    # because the split happened outside the try block.
    if len(parts) != 2:
        return np.nan
    try:
        return int(parts[0]) * 12 + int(parts[1])
    except ValueError:
        # Only the parse failure is expected here; anything else should surface.
        return np.nan

def prepareLast(df_in: pd.DataFrame) -> pd.DataFrame:
    """Take the last pre-throw frame for each (game, play, player) from input_*.

    Rows are sorted by ``frame_id`` inside each group before the group-wise
    ``last()``; ``x``/``y`` come back renamed to ``x_last``/``y_last``.
    """
    group_cols = ["game_id", "play_id", "nfl_id"]
    by_frame = df_in.sort_values([*group_cols, "frame_id"])
    final_rows = by_frame.groupby(group_cols, as_index=False).last()
    final_rows = final_rows.rename(columns={"x": "x_last", "y": "y_last"})
    return final_rows

def inject_target_receiver_xy(df_last: pd.DataFrame) -> pd.DataFrame:
    """Attach the Targeted Receiver's position at the throw to all players in the play.

    The receiver's ``x_last``/``y_last`` are left-joined on (game_id, play_id)
    as ``target_x``/``target_y``; plays without a targeted receiver get NaN.
    """
    join_keys = ["game_id", "play_id"]
    receiver_mask = df_last["player_role"] == "Targeted Receiver"
    target_xy = df_last.loc[receiver_mask, [*join_keys, "x_last", "y_last"]]
    target_xy = target_xy.rename(columns={"x_last": "target_x", "y_last": "target_y"})
    return df_last.merge(target_xy, on=join_keys, how="left")

def add_features(df: pd.DataFrame, is_train=True) -> pd.DataFrame:
    """Compute the engineered features, write them onto ``df`` and return it.

    All derived columns are computed first and then assigned to ``df`` in a
    fixed order. With ``is_train`` true, the targets ``dx``/``dy`` (true
    position minus anchor position) are appended as the last two columns.
    """
    x0, y0 = df["x_last"], df["y_last"]

    # Timing; the output-frame count is floored at 1 before dividing.
    frame = df["frame_id"].astype(float)
    horizon = np.clip(df["num_frames_output"].astype(float), 1.0, None)

    # Vector, distance and bearing from the anchor to the ball landing spot.
    ball_dx = df["ball_land_x"] - x0
    ball_dy = df["ball_land_y"] - y0
    ball_dist = np.sqrt(ball_dx**2 + ball_dy**2).astype(float)
    ball_angle = np.arctan2(ball_dy, ball_dx).astype(float)

    # Distance to the targeted receiver (NaN when the play has none).
    target_dist = np.sqrt((df["target_x"] - x0)**2 + (df["target_y"] - y0)**2)

    # Velocity components: `dir` is in degrees; x uses sin and y uses cos.
    heading = np.deg2rad(df["dir"].astype(float))
    vel_x = df["s"] * np.sin(heading)
    vel_y = df["s"] * np.cos(heading)

    # Unit vector toward the ball; zero distance becomes 1e-6 to stay finite.
    denom = ball_dist.replace(0, 1e-6)
    unit_x = ball_dx / denom
    unit_y = ball_dy / denom

    derived = {
        "frame_offset": frame,
        "time_offset": frame / 10.0,
        "T": horizon,
        "t_rel": frame / horizon,
        "dist_to_ball": ball_dist,
        "sin_ab": np.sin(ball_angle),
        "cos_ab": np.cos(ball_angle),
        "dist_to_target": target_dist,
        "is_target": (df["player_role"] == "Targeted Receiver").astype(int),
        "speed_x": vel_x,
        "speed_y": vel_y,
        "v_par": vel_x * unit_x + vel_y * unit_y,
        "v_perp": vel_x * unit_y - vel_y * unit_x,
        "x_norm": x0 / 120.0,
        "y_norm": y0 / 53.3,
        "log_dist_ball": np.log1p(ball_dist),
        "log_dist_target": np.log1p(target_dist.fillna(0.0)),
    }
    if is_train:
        derived["dx"] = df["x"] - x0
        derived["dy"] = df["y"] - y0

    for column, values in derived.items():
        df[column] = values
    return df

# Feature lists (re-declared here so the downstream cells can use them).
NUM_FEATURES = [
    # anchor position and raw tracking columns
    *("x_last", "y_last", "s", "a", "o", "dir"),
    # timing
    *("frame_offset", "time_offset", "t_rel", "T"),
    # geometry relative to the ball landing spot / targeted receiver
    *("dist_to_ball", "sin_ab", "cos_ab", "dist_to_target"),
    # velocity components and their projections toward the ball
    *("speed_x", "speed_y", "v_par", "v_perp"),
    # normalised position and log distances
    *("x_norm", "y_norm", "log_dist_ball", "log_dist_target"),
    # play / player attributes
    *("absolute_yardline_number", "player_height", "player_weight"),
]
CAT_FEATURES = [*("player_role", "player_side", "play_direction")]

# ---------- Build the training frame ----------
# Read every weekly input (tracking rows) and output (rows holding the true
# x/y used as labels below) file and stack each set into one frame.
df_in  = pd.concat([pd.read_csv(p) for p in TRAIN_IN_FILES],  ignore_index=True)
df_out = pd.concat([pd.read_csv(p) for p in TRAIN_OUT_FILES], ignore_index=True)

# last pre-throw per player (no frame_id in merge key)
last_all = prepareLast(df_in)
# convert height if column exists
if "player_height" in last_all.columns:
    last_all["player_height"] = last_all["player_height"].apply(height_to_inches)
last_all = inject_target_receiver_xy(last_all)

# Columns carried over from the last-frame table. frame_id is deliberately
# absent so the output rows keep their own frame_id after the merge.
cols_keep_no_fid = [
    "game_id","play_id","nfl_id",
    "x_last","y_last","s","a","o","dir",
    "player_role","player_side","num_frames_output","ball_land_x","ball_land_y",
    "target_x","target_y","play_direction","absolute_yardline_number",
    "player_height","player_weight"
]

# Merge output_* with last_* WITHOUT frame_id in the key (avoids NaN anchors).
# validate="many_to_one" makes pandas raise if last_all has duplicate keys.
train = df_out.merge(last_all[cols_keep_no_fid],
                     on=["game_id","play_id","nfl_id"], how="left", validate="many_to_one")

# Features + targets
train = add_features(train, is_train=True)

# Clean labels (XGB/LGBM need finite labels)
lbl_mask = np.isfinite(train["dx"].values) & np.isfinite(train["dy"].values)
n_bad = int((~lbl_mask).sum())
if n_bad > 0:
    print(f"[Hotfix] Drop rows with NaN/Inf dx,dy: {n_bad}")
    train = train.loc[lbl_mask].reset_index(drop=True)

# Drop rows whose anchor position (x_last, y_last) is missing or non-finite.
anc_mask = np.isfinite(train["x_last"].values) & np.isfinite(train["y_last"].values)
if (~anc_mask).any():
    print(f"[Hotfix] Drop rows with NaN anchor: {int((~anc_mask).sum())}")
    train = train.loc[anc_mask].reset_index(drop=True)

# Cast the categorical inputs to pandas "category" dtype.
for c in CAT_FEATURES:
    if c in train.columns:
        train[c] = train[c].astype("category")

# Sample weights: a time factor (relative frame position min-max scaled into
# the range 1.0..1.6) multiplied by a role factor (2.0 for the targeted
# receiver, 1.0 for everyone else).
w_time = train["frame_offset"] / train["T"].clip(lower=1.0)
w_time = 1.0 + 0.6*(w_time - w_time.min())/(w_time.max()-w_time.min() + 1e-9)  # 1.0→1.6
w_role = np.where(train["player_role"].astype(str)=="Targeted Receiver", 2.0, 1.0)
train["sample_weight"] = w_time * w_role

print("Train rows (clean):", len(train))

# ---------- Load test ----------
# test_in holds the tracking rows; test_tmpl lists the rows to predict
# (its game_id/play_id/nfl_id/frame_id columns are used for the submission ids).
test_in   = pd.read_csv(TEST_INPUT_PATH)
test_tmpl = pd.read_csv(TEST_TARGETS_PATH)

# Same anchor construction as for the training data.
last_test = prepareLast(test_in)
if "player_height" in last_test.columns:
    last_test["player_height"] = last_test["player_height"].apply(height_to_inches)
last_test = inject_target_receiver_xy(last_test)

# Attach the anchor row to every template row (frame_id not in the key).
test_rows = test_tmpl.merge(last_test[cols_keep_no_fid],
                            on=["game_id","play_id","nfl_id"], how="left", validate="many_to_one")
# is_train=False: no dx/dy targets are computed for the test rows.
test = add_features(test_rows, is_train=False)

# Cast the categorical inputs to pandas "category" dtype, as done for train.
for c in CAT_FEATURES:
    if c in test.columns:
        test[c] = test[c].astype("category")

print("Test rows (rebuilt):", len(test))

# **MODEL TRAINING**

In [None]:
%%time 

def make_base_models(SEED, nest):
    """Create fresh, unfitted base regressors for every available library.

    Args:
        SEED: Random seed handed to each estimator.
        nest: Number of boosting rounds (``n_estimators`` / ``iterations``).

    Returns:
        List of ``(name, estimator)`` pairs in the order ``"lgbm"``, ``"xgb"``,
        ``"cat"``, restricted to the libraries whose ``HAS_*`` flag is true.
        Each estimator is asked to run on the GPU when
        ``torch.cuda.is_available()`` is true, otherwise on the CPU.

    Raises:
        RuntimeError: If none of LightGBM / XGBoost / CatBoost is available.
    """
    # Collect (name, class, kwargs) first, then instantiate in one place.
    specs = []

    if HAS_LGBM:
        specs.append((
            "lgbm",
            LGBMRegressor,
            dict(
                n_estimators=nest,
                learning_rate=0.05,
                max_depth=9,
                num_leaves=64,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=SEED,
                verbose=-1,
                device="gpu" if torch.cuda.is_available() else "cpu",
            ),
        ))

    if HAS_XGB:
        specs.append((
            "xgb",
            XGBRegressor,
            dict(
                n_estimators=nest,
                learning_rate=0.06,
                max_depth=9,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=SEED,
                tree_method="hist",
                enable_categorical=True,
                verbosity=0,
                device="cuda:0" if torch.cuda.is_available() else "cpu",
            ),
        ))

    if HAS_CAT:
        specs.append((
            "cat",
            CatBoostRegressor,
            dict(
                iterations=nest,
                learning_rate=0.05,
                depth=8,
                random_seed=SEED,
                verbose=False,
                task_type="GPU" if torch.cuda.is_available() else "CPU",
            ),
        ))

    if not specs:
        raise RuntimeError("Không có base model nào sẵn. Cần ít nhất 1 trong LightGBM / XGBoost / CatBoost.")

    return [(label, estimator_cls(**kwargs)) for label, estimator_cls, kwargs in specs]

def encode_for_model(X: pd.DataFrame, model_name: str):
    """Return a copy of ``X`` prepared for the named base model.

    For ``"lgbm"``, ``"cat"`` and ``"xgb"`` the copy is returned unchanged, so
    the columns keep whatever dtype they already have. Any other model name
    gets the ``CAT_FEATURES`` columns one-hot encoded, including a NaN
    indicator column.
    """
    frame = X.copy()
    if model_name not in ("lgbm", "cat", "xgb"):
        frame = pd.get_dummies(frame, columns=CAT_FEATURES, dummy_na=True)
    return frame

def fit_stacked_oof(
    train_df,
    features,
    cat_features,
    target,
    groups,
    n_splits=5,
    SEED : int = 42,
    nest : int = 10,
):
    """Fit the base models with grouped K-fold CV and stack them with RidgeCV.

    For every fold each base model is cloned, fitted on the training part and
    used to predict the validation part, which fills the out-of-fold (OOF)
    base predictions. A RidgeCV meta-learner is then fitted on those OOF base
    predictions.

    Args:
        train_df: Training frame containing ``features``, ``cat_features``,
            ``target`` and optionally a ``sample_weight`` column.
        features: Names of the numeric feature columns.
        cat_features: Names of the categorical feature columns.
        target: Name of the target column.
        groups: Group label per row; rows of one group never span folds.
        n_splits: Number of GroupKFold splits.
        SEED: Random seed forwarded to ``make_base_models``.
        nest: Number of boosting rounds forwarded to ``make_base_models``.

    Returns:
        Dict with keys ``"base_models"`` (list of ``(name, template)``),
        ``"fold_models"`` (name -> list of fitted fold models),
        ``"meta_learner"`` (RidgeCV fitted on all OOF base predictions),
        ``"oof_pred"`` (stacked OOF prediction per training row) and
        ``"preds_each"`` (name -> OOF base predictions).

    Notes:
        ``oof_pred`` for a fold comes from a meta-learner fitted on the OOF
        base predictions of the *other* folds. Fitting and scoring the
        meta-learner on the same validation rows would make the OOF score
        optimistic.
    """
    gkf         = GroupKFold(n_splits=n_splits)
    base_models = make_base_models(SEED = SEED, nest = nest)
    names       = [name for name, _ in base_models]

    n_rows      = len(train_df)
    oof         = np.zeros(n_rows, dtype=np.float32)
    preds_each  = {name: np.zeros(n_rows, dtype=np.float32) for name in names}
    fold_models = {name: [] for name in names}

    X_full = train_df[features + cat_features].copy()
    y_full = train_df[target].values
    if "sample_weight" in train_df.columns:
        w_full = train_df["sample_weight"].values
    else:
        w_full = np.ones(n_rows)

    # Materialise the splits: they are needed again for the meta-learner OOF
    # pass, and a list gives tqdm a known total.
    folds = list(gkf.split(train_df, groups=groups))

    for fold, (tr_idx, va_idx) in enumerate(tqdm(folds)):

        print(f"---> Fold {fold + 1}")

        X_tr = X_full.iloc[tr_idx].copy()
        y_tr = y_full[tr_idx]
        w_tr = w_full[tr_idx]
        X_va = X_full.iloc[va_idx].copy()

        for name, template in base_models:
            Xtr_enc = encode_for_model(X_tr, name)
            Xva_enc = encode_for_model(X_va, name)

            # Unfitted clone of the template so folds never share state.
            mdl2 = template.__class__(**template.get_params())
            try:
                if name == "cat":
                    mdl2.fit(
                        Xtr_enc,
                        y_tr,
                        sample_weight=w_tr,
                        cat_features=[Xtr_enc.columns.get_loc(c) for c in cat_features]
                    )
                elif name == "lgbm":
                    mdl2.fit(Xtr_enc, y_tr, sample_weight=w_tr, categorical_feature=cat_features)
                else:
                    mdl2.fit(Xtr_enc, y_tr, sample_weight=w_tr)
            except Exception as exc:
                # Best-effort fallback kept on purpose, but no longer silent:
                # report why the native categorical fit failed, then retry on
                # one-hot features with a fresh (not half-fitted) estimator.
                print(
                    f"[fit_stacked_oof] {name} fold {fold + 1}: native fit failed "
                    f"({type(exc).__name__}: {exc}); retrying with one-hot features"
                )
                Xtr_enc = pd.get_dummies(X_tr, columns=cat_features, dummy_na=True)
                Xva_enc = pd.get_dummies(X_va, columns=cat_features, dummy_na=True)
                Xva_enc = Xva_enc.reindex(columns=Xtr_enc.columns, fill_value=0)
                mdl2 = template.__class__(**template.get_params())
                mdl2.fit(Xtr_enc, y_tr, sample_weight=w_tr)

            preds_each[name][va_idx] = mdl2.predict(Xva_enc).astype(np.float32)
            fold_models[name].append(mdl2)

    meta_full = np.column_stack([preds_each[name] for name in names])
    alphas = (0.01, 0.1, 1.0, 10.0)

    # Leak-free stacked OOF: with K-fold, a fold's training indices are exactly
    # the validation rows of the other folds, and all of them now hold OOF
    # base predictions.
    for tr_idx, va_idx in folds:
        fold_meta = RidgeCV(alphas=alphas).fit(meta_full[tr_idx], y_full[tr_idx])
        oof[va_idx] = fold_meta.predict(meta_full[va_idx]).astype(np.float32)

    # Final meta-learner used at inference time: fitted on every OOF row.
    meta_full_learner = RidgeCV(alphas=alphas).fit(meta_full, y_full)

    return {
        "base_models": base_models,
        "fold_models": fold_models,
        "meta_learner": meta_full_learner,
        "oof_pred": oof,
        "preds_each": preds_each,
    }

def _trained_feature_names(model):
    """Return the column names a fitted model was trained on, or None.

    Tries the scikit-learn convention ``feature_names_in_`` first, then
    CatBoost's ``feature_names_`` and LightGBM's ``feature_name_``.
    """
    for attr in ("feature_names_in_", "feature_names_", "feature_name_"):
        try:
            cols = getattr(model, attr, None)
        except Exception:
            # Some libraries expose these as properties that raise when unset.
            cols = None
        if cols is not None and len(cols) > 0:
            return list(cols)
    return None


def predict_stacked(models_pack, X_df, cat_features):
    """Predict with a stacked ensemble produced by ``fit_stacked_oof``.

    For each base model the predictions of its fold models are averaged; the
    per-model averages form the meta-features passed to the meta-learner.

    Args:
        models_pack: Dict with ``"base_models"``, ``"fold_models"`` and
            ``"meta_learner"`` entries.
        X_df: Feature frame with the same columns used for training.
        cat_features: Names of the categorical columns (used by the one-hot
            fallback).

    Returns:
        Array of meta-learner predictions, one per row of ``X_df``.
    """
    base_models = models_pack["base_models"]
    fold_models = models_pack["fold_models"]
    meta        = models_pack["meta_learner"]

    X_onehot = None  # built lazily, at most once, if any fold model needs it

    base_preds = []
    for name, _ in base_models:
        X_enc = encode_for_model(X_df, name)
        preds = []
        for m in fold_models[name]:
            try:
                fold_pred = m.predict(X_enc)
            except Exception as exc:
                # The fold model cannot consume the native frame (e.g. it was
                # trained on one-hot features): retry with one-hot columns
                # aligned to the names the model was trained on.
                print(
                    f"[predict_stacked] {name}: native predict failed "
                    f"({type(exc).__name__}: {exc}); retrying with one-hot features"
                )
                if X_onehot is None:
                    X_onehot = pd.get_dummies(X_df, columns=cat_features, dummy_na=True)
                trained_cols = _trained_feature_names(m)
                if trained_cols is None:
                    X_oh = X_onehot
                else:
                    X_oh = X_onehot.reindex(columns=trained_cols, fill_value=0)
                fold_pred = m.predict(X_oh)
            preds.append(fold_pred)
        base_preds.append(np.mean(np.column_stack(preds), axis=1))

    meta_infer = np.vstack(base_preds).T
    return meta.predict(meta_infer)

In [None]:
%%time 

# Train one stacked ensemble per seed for each target (dx, dy) and collect the
# clipped test predictions; the submission cell averages them over seeds.
games = train["game_id"].to_numpy()
Mdl_Preds_x = []
Mdl_Preds_y = []

# The test feature matrix does not depend on the seed, so build it once
# instead of copying it on every iteration.
X_test = test[NUM_FEATURES + CAT_FEATURES].copy()

for myseed in tqdm( [1, 42, 55, 1000, 61846,] , "Random seeds"):

    if test_req:
        print(f"\n\n---> Test run for syntax check - Seed = {myseed}")
    else:
        print(f"\n\n---> Seed = {myseed}")

    # Arguments shared by the dx and dy fits; only the target differs.
    fit_kwargs = dict(
        train_df=train,
        features=NUM_FEATURES,
        cat_features=CAT_FEATURES,
        groups=games,
        n_splits=5,
        SEED=myseed,
        nest=nest,
    )

    print(f"\n---> Models for x")
    pack_dx = fit_stacked_oof(target="dx", **fit_kwargs)

    print(f"\n---> Models for y")
    pack_dy = fit_stacked_oof(target="dy", **fit_kwargs)

    # OOF score: predicted position = anchor + predicted displacement; the
    # squared x and y errors are averaged over rows and both axes.
    x_pred_oof = train["x_last"].values + pack_dx["oof_pred"]
    y_pred_oof = train["y_last"].values + pack_dy["oof_pred"]
    rmse_2d    = np.sqrt(
        ((x_pred_oof - train["x"].values)**2 +
         (y_pred_oof - train["y"].values)**2
        ).mean() / 2.0
    )
    print(f"---> Score = {rmse_2d:.8f}")

    pred_dx_test = predict_stacked(pack_dx, X_test, CAT_FEATURES)
    pred_dy_test = predict_stacked(pack_dy, X_test, CAT_FEATURES)

    # Convert displacements back to positions and clip them to the field bounds.
    pred_x = np.clip(test["x_last"].values + pred_dx_test, FIELD_X_MIN, FIELD_X_MAX)
    pred_y = np.clip(test["y_last"].values + pred_dy_test, FIELD_Y_MIN, FIELD_Y_MAX)

    Mdl_Preds_x.append(pred_x)
    Mdl_Preds_y.append(pred_y)

# **SUBMISSION**

In [None]:
%%time 

# Average the per-seed predictions (one column per seed, mean across columns).
pred_x = np.stack(Mdl_Preds_x, axis=1).mean(axis=1)
pred_y = np.stack(Mdl_Preds_y, axis=1).mean(axis=1)

# Row id = game_id, play_id, nfl_id and frame_id joined with underscores.
id_cols = ["game_id", "play_id", "nfl_id", "frame_id"]
sub = test[id_cols].copy()

id_parts = [sub[col].astype(str) for col in id_cols]
row_id = id_parts[0]
for part in id_parts[1:]:
    row_id = row_id + "_" + part
sub["id"] = row_id

# Final frame: id plus the averaged coordinates.
submission = sub[["id"]].assign(x=pred_x, y=pred_y)

submission.to_csv("submission.csv", index=False)
print("Saved submission.csv:", len(submission))

print()
!head submission.csv