# Feature Engineering & Synthetic Data Walkthrough

Quick tour of the 167-feature pipeline (`src/data/preprocessing.py`) and how to prep sequences for training/inference.

- Runs the same feature steps used in the competition models
- Stays lightweight by generating a demo sample when raw data isn't available
- Points to the synthetic datasets stored in `/mnt/raid0/BigData2` (research only)


## Setup

- Set `RAW_INPUT` / `RAW_OUTPUT` to a small slice of the official tracking data (or leave as-is to use the demo sample)
- `WINDOW_SIZE` (passed to `preprocess_pipeline` as `window_size`) controls how many past frames feed each model input
- All feature logic lives in `src/data/preprocessing.py`; augmentation lives in `src/data/augmentation.py`


In [1]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd

# Add parent directory to path for src imports
sys.path.insert(0, str(Path.cwd().parent))

from src.data.preprocessing import (
    add_basic_features,
    add_ball_relative_features,
    add_temporal_features,
    add_geometric_features,
    preprocess_pipeline,
)

# Folder holding the sample CSVs; point it elsewhere if your data lives in another place.
DATA_ROOT = Path("..") / "data"
RAW_INPUT = DATA_ROOT.joinpath("input_sample.csv")
RAW_OUTPUT = DATA_ROOT.joinpath("output_sample.csv")

# Context-window length (number of past frames per model input).
# Production models use WINDOW_SIZE 9-13; 5 keeps this demo quick.
WINDOW_SIZE = 5

In [2]:
def load_or_create_sample(input_path: Path, output_path: Path, seed=None) -> tuple:
    """Load a small slice of tracking data or build a tiny demo sample.

    Args:
        input_path: CSV file with the input tracking frames.
        output_path: CSV file with the output (target) frames.
        seed: Optional seed for the generated demo sample. ``None`` (the
            default) draws from NumPy's global random state, so existing
            callers behave as before; any other value is handed to
            ``np.random.default_rng`` and makes the demo sample reproducible.
            Ignored when both CSVs exist.

    Returns:
        A ``(input_df, output_df, source)`` tuple, where ``source`` is
        ``"local CSVs"`` when both files exist and ``"generated demo sample"``
        otherwise.
    """
    # Both files are required; if either one is missing we fall back to the demo.
    if input_path.exists() and output_path.exists():
        input_df = pd.read_csv(input_path)
        output_df = pd.read_csv(output_path)
        return input_df, output_df, "local CSVs"

    # The ``np.random`` module and a ``Generator`` both expose ``uniform``, so
    # one code path serves the seeded and the unseeded case.
    rng = np.random if seed is None else np.random.default_rng(seed)

    # Demo: two players across 10 frames
    frames = np.arange(10)
    input_df = pd.DataFrame({
        "game_id": [1] * 20,
        "play_id": [1] * 20,
        "nfl_id": [101] * 10 + [102] * 10,
        "frame_id": list(frames) * 2,
        "x": rng.uniform(0, 120, 20),
        "y": rng.uniform(0, 53.3, 20),
        "s": rng.uniform(0, 10, 20),
        "a": rng.uniform(-2, 2, 20),
        "dir": rng.uniform(0, 360, 20),
        "o": rng.uniform(0, 360, 20),
        "player_height": ["6-2"] * 20,
        "player_weight": [205] * 20,
        "player_side": ["Offense"] * 10 + ["Defense"] * 10,
        # NOTE(review): player 101 switches from "Targeted Receiver" to "Other"
        # after five frames; confirm whether a 10/10 split was intended.
        "player_role": ["Targeted Receiver"] * 5 + ["Other"] * 15,
        "ball_land_x": [60] * 20,
        "ball_land_y": [26.65] * 20,
        # NOTE(review): the demo output below only holds 10 frames per player,
        # yet this column says 94; confirm the pipeline does not rely on it.
        "num_frames_output": [94] * 20,
    })

    # Output frames continue right after the input frames (ids 10-19).
    output_df = pd.DataFrame({
        "game_id": [1] * 20,
        "play_id": [1] * 20,
        "nfl_id": [101] * 10 + [102] * 10,
        "frame_id": list(frames + 10) * 2,
        "x": rng.uniform(0, 120, 20),
        "y": rng.uniform(0, 53.3, 20),
    })

    return input_df, output_df, "generated demo sample"


# Resolve the working frames: local CSV slices when both exist, demo sample otherwise.
input_df, output_df, sample_source = load_or_create_sample(
    input_path=RAW_INPUT,
    output_path=RAW_OUTPUT,
)
print(f"Loaded {len(input_df):,} input rows from {sample_source}.")

# Preview the first rows of each frame.
display(input_df.head())
display(output_df.head())


Loaded 20 input rows from generated demo sample.


Unnamed: 0,game_id,play_id,nfl_id,frame_id,x,y,s,a,dir,o,player_height,player_weight,player_side,player_role,ball_land_x,ball_land_y,num_frames_output
0,1,1,101,0,102.384065,19.187633,0.862296,0.587007,279.291549,215.329078,6-2,205,Offense,Targeted Receiver,60,26.65,94
1,1,1,101,1,20.874216,38.392939,2.431892,-1.70728,231.637516,18.296402,6-2,205,Offense,Targeted Receiver,60,26.65,94
2,1,1,101,2,72.066097,8.331787,8.569785,1.210435,251.082984,129.996538,6-2,205,Offense,Targeted Receiver,60,26.65,94
3,1,1,101,3,34.570852,32.924878,6.427107,1.971271,110.68698,49.275193,6-2,205,Offense,Targeted Receiver,60,26.65,94
4,1,1,101,4,119.358598,24.232251,8.602465,-1.789028,266.253821,88.966165,6-2,205,Offense,Targeted Receiver,60,26.65,94


Unnamed: 0,game_id,play_id,nfl_id,frame_id,x,y
0,1,1,101,10,111.382886,10.064575
1,1,1,101,11,47.17356,1.026775
2,1,1,101,12,21.459309,22.477687
3,1,1,101,13,55.848715,48.522001
4,1,1,101,14,37.650937,8.339097


## Engineer features step by step

- `add_basic_features`: velocity, acceleration, BMI, role flags, kinetic energy
- `add_ball_relative_features`: distance/angle/alignment to ball landing spot
- `add_temporal_features`: lags, rolling stats, EMA, velocity deltas
- `add_geometric_features`: endpoint projection and alignment

This mirrors the training pipeline used for the leaderboard models.


In [3]:
# Apply the four feature steps in order; each one receives the previous step's frame.
fe_df = (
    input_df
    .pipe(add_basic_features)
    .pipe(add_ball_relative_features)
    .pipe(add_temporal_features, lags=[1, 2], windows=[3])
    .pipe(add_geometric_features)
)

# Count numeric columns, leaving out the identifier/key columns.
numeric_cols = [
    col
    for col in fe_df.columns
    if col not in {"game_id", "play_id", "nfl_id", "frame_id"}
    and pd.api.types.is_numeric_dtype(fe_df[col])
]
print(f"Engineered features: {len(numeric_cols)} columns")

# A handful of engineered columns to eyeball next to the identifiers.
preview_cols = [
    "velocity_x", "velocity_y",
    "acceleration_x", "acceleration_y",
    "distance_to_ball",
    "orientation_diff",
    "geo_endpoint_x",
    "geo_velocity_error",
]
fe_df[["game_id", "play_id", "nfl_id", *preview_cols]].head()


Engineered features: 79 columns


Unnamed: 0,game_id,play_id,nfl_id,velocity_x,velocity_y,acceleration_x,acceleration_y,distance_to_ball,orientation_diff,geo_endpoint_x,geo_velocity_error
0,1,1,101,-0.850982,0.139225,0.094777,-0.579305,43.035984,63.962471,60.0,3.667886
1,1,1,101,-1.906846,-1.509316,1.059597,1.338678,40.850013,146.658886,60.0,6.031541
2,1,1,101,-8.106923,-2.77831,-0.392421,-1.145058,21.935078,121.086446,60.0,8.30021
3,1,1,101,6.012715,-2.270454,-0.696376,1.844172,26.1919,61.411787,60.0,3.704127
4,1,1,101,-8.584084,-0.562056,0.116889,1.785205,59.407816,177.287656,60.0,2.474424


## Build model-ready sequences

Runs the full `preprocess_pipeline` to assemble `(window_size, num_features)` inputs and trajectory targets.


In [4]:
# Run the full pipeline on the raw frames (not on fe_df from the previous cell)
# and unpack its four results.
sequences, targets, metadata, feature_cols = preprocess_pipeline(
    input_df,
    output_df,
    window_size=WINDOW_SIZE,
    add_ball_features=True,
)

# One-line summary, then example shapes (skipped when no sequences were built).
print(f"Sequences: {len(sequences)} | Window: {WINDOW_SIZE} | Features: {len(feature_cols)}")
if sequences:
    print(f"Example sequence shape: {sequences[0].shape}")
    print(f"Example target shape:   {targets[0].shape}")
    print("First 5 features:", feature_cols[:5])


Preprocessing pipeline...
  1. Adding basic features...
  2. Adding ball-relative features...
  3. Adding temporal features...
  4. Preparing sequences...
 Preprocessing complete
  Sequences: 2
  Features: 93
  Window size: 5
Sequences: 2 | Window: 5 | Features: 93
Example sequence shape: (5, 93)
Example target shape:   (10, 2)
First 5 features: ['x', 'y', 's', 'a', 'dir']


## Synthetic datasets (research-only)

Synthetic trajectories live in `/mnt/raid0/BigData2`. They provided slight cross-validation (CV) improvements but cannot be used for competition submissions, so use them only for offline analysis or ablation studies.


In [5]:
# Root folder of the research-only synthetic datasets.
synthetic_root = Path("/mnt/raid0/BigData2")

# Keep only directories whose name starts with "synthetic_", in sorted order.
synthetic_dirs = sorted(filter(Path.is_dir, synthetic_root.glob("synthetic_*")))

print(f"Found {len(synthetic_dirs)} synthetic dataset folders:")
for p in synthetic_dirs:
    print(f"- {p.name}")

# Example usage (left commented so large files are not read by default):
# sample_dir = synthetic_root / "synthetic_w9s42_v3"
# sample_input = sample_dir / "input_synthetic_w9s42_v3_w01.csv"
# sample_output = sample_dir / "output_synthetic_w9s42_v3_w01.csv"
# if sample_input.exists() and sample_output.exists():
#     synth_in = pd.read_csv(sample_input, nrows=2000)  # take a small slice
#     synth_out = pd.read_csv(sample_output, nrows=2000)
#     synth_sequences, synth_targets, _, _ = preprocess_pipeline(
#         synth_in, synth_out, window_size=WINDOW_SIZE
#     )
#     print(f"Synthetic sample → sequences: {len(synth_sequences)}")


Found 10 synthetic dataset folders:
- synthetic_concepts
- synthetic_fixed
- synthetic_plays
- synthetic_season_1
- synthetic_season_2
- synthetic_season_3
- synthetic_season_4
- synthetic_season_5
- synthetic_v2
- synthetic_w9s42_v3
