NFL Big Data Bowl 2026 - Exploratory Data Analysis
===========================================================

In [None]:
# -*- coding: utf-8 -*-


# --- 0. SETUP & IMPORTS ---
import os
import glob
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from tqdm import tqdm
from IPython.display import display
import warnings

# --- 1. CONFIGURATION & UTILITY FUNCTIONS ---
# Suppress warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Let pandas show up to 100 columns on a 200-character-wide display.
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 200)

# Plot styling: seaborn theme first, then the matplotlib style sheet,
# then the explicit rcParams overrides below.
sns.set_theme(style="whitegrid", palette="viridis")
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (14, 7),
    'axes.facecolor': '#f0f0f0',
    'figure.facecolor': 'white',
    # Render ticks, axis labels, titles and free text in black.
    'xtick.color': 'black',
    'ytick.color': 'black',
    'axes.labelcolor': 'black',
    'axes.titlecolor': 'black',
    'text.color': 'black',
})


def load_and_concat_data(directory, file_pattern):
    """Load every weekly CSV matching ``file_pattern`` and stack them.

    Args:
        directory: Folder to search. May be a ``pathlib.Path`` or a plain
            string path.
        file_pattern: Glob pattern for the weekly files, e.g.
            ``'input_*.csv'``.

    Returns:
        A single DataFrame (fresh integer index) holding the rows of all
        matching files, read in sorted file-name order, with an added
        integer ``week`` column parsed from the text after the last
        ``'_w'`` in each file stem. If nothing matches, a warning is
        printed and an empty DataFrame is returned.

    Raises:
        ValueError: If a matching file's stem does not end in an integer
            week number after ``'_w'``.
    """
    # Coerce so that string paths work with the ``/`` operator below.
    directory = Path(directory)
    files = sorted(glob.glob(str(directory / file_pattern)))
    if not files:
        print(f"Warning: No files found for pattern {file_pattern} in {directory}")
        return pd.DataFrame()

    all_dfs = []
    for f in tqdm(files, desc=f"Loading {file_pattern.split('*')[0]}"):
        try:
            week = int(Path(f).stem.split('_w')[-1])
        except ValueError as err:
            raise ValueError(
                f"Cannot parse a week number from file name {Path(f).name!r}; "
                "expected the stem to end in '_w<week>'"
            ) from err
        df = pd.read_csv(f)
        df['week'] = week
        all_dfs.append(df)
    return pd.concat(all_dfs, ignore_index=True)

def summarize_data(df, name="DataFrame"):
    """Print a multi-part overview of ``df``.

    The report contains, in order: shape, ``df.info`` output, the
    percentage of missing values per column (only columns that have any),
    the number of fully duplicated rows, the first three rows, and
    ``describe()`` tables for the default and the object/categorical
    columns.

    Args:
        df: The DataFrame to summarize.
        name: Label used in the report header.

    Returns:
        None. Everything is written via ``print`` / ``display``.
    """
    footer = "-" * (len(name) + 20) + "\n"
    print(f"--- Summary for {name} ---")
    print(f"Shape: {df.shape}")

    if df.shape[1] == 0:
        # ``describe()`` raises ValueError on a frame without columns
        # (e.g. a bare ``pd.DataFrame()``), so stop here instead of crashing.
        print("DataFrame has no columns; nothing to summarize.")
        print(footer)
        return

    df.info(verbose=False)

    missing_percent = df.isnull().sum() / len(df) * 100
    if missing_percent.sum() > 0:
        print("\nMissing Values (%):")
        print(missing_percent[missing_percent > 0].sort_values(ascending=False))

    print(f"\nDuplicate Rows: {df.duplicated().sum()}")

    print("\nSample Data:")
    display(df.head(3))

    print("\nDescriptive Statistics (Numeric):")
    display(df.describe())

    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    if not categorical_cols.empty:
        print("\nDescriptive Statistics (Object/Categorical):")
        display(df.describe(include=['object', 'category']))
    else:
        print("\nNo Object/Categorical columns to describe.")
    print(footer)

def create_football_field(ax):
    """Draw a football-field backdrop on a Matplotlib Axes.

    The field is drawn in yards: 120 long (two 10-yard end zones plus 100
    yards of play) by 53.3 wide. The axes limits and axis labels are set
    to match.

    Args:
        ax: The Matplotlib Axes to draw on.

    Returns:
        The same ``ax``, for chaining.
    """
    # Playing surface.
    ax.add_patch(patches.Rectangle((0, 0), 120, 53.3, facecolor='#6aaa5b', zorder=0))
    # End zones, drawn darker on top of the surface.
    for endzone_start in (0, 110):
        ax.add_patch(
            patches.Rectangle((endzone_start, 0), 10, 53.3, facecolor='#4a8c3a', zorder=1)
        )
    # Vertical lines every 10 yards from one goal line (x=10) to the other
    # (x=110). The upper bound is 111 so the far goal line is included.
    for line_x in range(10, 111, 10):
        ax.axvline(line_x, color='white', linestyle='--', linewidth=0.8, zorder=2)
    ax.axvline(60, color='white', linewidth=1.5, zorder=2)  # Midfield

    ax.set_xlim(0, 120)
    ax.set_ylim(0, 53.3)
    ax.set_xlabel("Yardline")
    ax.set_ylabel("Field Width (yards)")
    return ax

# --- 2. DATA LOADING, PRE-PROCESSING & FEATURE ENGINEERING ---
print("=" * 50 + "\nSection 2: DATA LOADING & PREPARATION\n" + "=" * 50)

# Location of the competition files inside the Kaggle runtime.
BASE_DIR = Path('/kaggle/input/nfl-big-data-bowl-2026-prediction')
TRAIN_DIR = BASE_DIR.joinpath('train')

# One concatenated frame per file family (weekly input and output CSVs).
input_df = load_and_concat_data(TRAIN_DIR, file_pattern='input_*.csv')
output_df = load_and_concat_data(TRAIN_DIR, file_pattern='output_*.csv')

print("\n--- Initial Data Overview ---")
summarize_data(input_df, name="Input Data (Train)")
summarize_data(output_df, name="Output Data (Train)")

print("\n--- Engineering Base Features ---")
def feature_engineer(df, season_year=2023):
    """Add player age and height-in-inches columns to ``df``.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a ``player_birth_date`` column (anything
            ``pd.to_datetime`` can parse) and a ``player_height`` column
            holding ``'feet-inches'`` strings such as ``'6-2'``.
        season_year: Reference year used for the age calculation.

    Returns:
        The same ``df`` object with ``player_birth_date`` converted to
        datetime and two new columns: ``player_age`` (``season_year``
        minus birth year) and ``player_height_inches`` (NaN where the
        height is missing or malformed).
    """
    def height_to_inches(h):
        """Convert a 'feet-inches' string to total inches, else NaN."""
        if not isinstance(h, str):
            return np.nan
        feet, sep, inches = h.partition('-')
        if not sep:
            return np.nan
        try:
            return int(feet) * 12 + int(inches)
        except ValueError:
            # Malformed values such as '6-' or '6-2-1' count as missing
            # rather than aborting the whole column conversion.
            return np.nan

    df['player_birth_date'] = pd.to_datetime(df['player_birth_date'])
    # Whole-year difference only; birth month and day are ignored.
    df['player_age'] = season_year - df['player_birth_date'].dt.year
    df['player_height_inches'] = df['player_height'].apply(height_to_inches)
    return df

input_df = feature_engineer(input_df)

# For every (game, play, player) keep the single row with the highest
# frame_id, copied so later column assignments do not touch input_df.
last_frame_df = (
    input_df
    .loc[input_df.groupby(['game_id', 'play_id', 'nfl_id'])['frame_id'].idxmax()]
    .copy()
)

print("\n--- Engineering Hypothesis-Driven Features on Last Input Frame ---")
def create_hypothesis_features(df):
    """Add distance and heading-offset features relative to the ball landing spot.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with columns ``x``, ``y``, ``ball_land_x``,
            ``ball_land_y`` and ``dir`` (direction of motion in degrees).

    Returns:
        The same ``df`` with two new columns:

        * ``dist_to_ball_land`` - Euclidean distance from the player to
          the ball landing spot.
        * ``angle_diff_to_ball_land`` - angle in degrees (0-180) between
          the player's direction of motion and the straight line to the
          landing spot. Rows where the angle is undefined (player exactly
          on the spot, or missing inputs) are filled with 0.
    """
    to_ball_x = df['ball_land_x'] - df['x']
    to_ball_y = df['ball_land_y'] - df['y']
    dist_to_ball = np.sqrt(to_ball_x**2 + to_ball_y**2)
    df['dist_to_ball_land'] = dist_to_ball

    # NFL tracking data measures ``dir`` clockwise from the +y axis
    # (0 deg points along +y, 90 deg along +x), per the competition's data
    # description, so the unit heading vector is (sin, cos), not (cos, sin).
    dir_rad = np.radians(df['dir'])
    heading_x = np.sin(dir_rad)
    heading_y = np.cos(dir_rad)

    # The heading is a unit vector, so cos(theta) = dot / |to_ball|.
    dot_product = heading_x * to_ball_x + heading_y * to_ball_y
    with np.errstate(divide='ignore', invalid='ignore'):
        cos_theta = dot_product / dist_to_ball

    # Clip guards arccos against values a hair outside [-1, 1] from rounding.
    df['angle_diff_to_ball_land'] = np.degrees(np.arccos(np.clip(cos_theta, -1.0, 1.0)))
    df['angle_diff_to_ball_land'] = df['angle_diff_to_ball_land'].fillna(0)
    return df
last_frame_df = create_hypothesis_features(last_frame_df)

print("\n--- Creating Analysis DataFrames (Displacement & Difficulty) ---")

# Row with the highest frame_id per (game, play, player) in the output data.
last_output_df = output_df.loc[
    output_df.groupby(['game_id', 'play_id', 'nfl_id'])['frame_id'].idxmax()
]

# Pair each player's last input position (_start) with the last output
# position (_end); only players present in both frames are kept.
displacement_df = last_frame_df[['game_id', 'play_id', 'nfl_id', 'x', 'y', 'player_role']].merge(
    last_output_df[['game_id', 'play_id', 'nfl_id', 'x', 'y']],
    on=['game_id', 'play_id', 'nfl_id'],
    suffixes=('_start', '_end'),
)

# Straight-line distance between the start and end positions.
displacement_df['total_displacement'] = np.sqrt(
    (displacement_df['x_end'] - displacement_df['x_start'])**2
    + (displacement_df['y_end'] - displacement_df['y_start'])**2
)

# Attach prediction length and initial distance to the landing spot.
difficulty_df = displacement_df.merge(
    last_frame_df[['game_id', 'play_id', 'nfl_id', 'num_frames_output', 'dist_to_ball_land']],
    on=['game_id', 'play_id', 'nfl_id'],
)
print("Analysis DataFrames created successfully.")


# --- 3. TARGET VARIABLE & PREDICTION DIFFICULTY ANALYSIS ---
print("\n" + "="*50 + "\nSection 3: TARGET VARIABLE & DIFFICULTY ANALYSIS\n" + "="*50)

# Density of player positions at the end of each play. The title promises
# end-of-play positions, so plot each player's final output frame
# (last_output_df) rather than every output frame.
fig, ax = plt.subplots(figsize=(16, 8))
sns.kdeplot(data=last_output_df, x='x', y='y', fill=True, cmap="Greens", ax=ax, thresh=0.05)
# Drawn after the KDE so the field's axis limits and labels take effect.
create_football_field(ax)
ax.set_title('Player Position Heatmap at End of Play (Output Data)', fontsize=16)
plt.show()

# Distribution of straight-line displacement, one layer per player role.
fig, ax = plt.subplots()
sns.histplot(data=displacement_df, x='total_displacement', hue='player_role', multiple="layer", kde=True, ax=ax)
ax.set_title('Total Player Displacement from Pass Release to End of Play', fontsize=16)
ax.set_xlabel('Displacement (yards)')
plt.show()

# Displacement grouped by how many frames have to be predicted.
fig, ax = plt.subplots(figsize=(18, 8))
sns.boxplot(data=difficulty_df, x='num_frames_output', y='total_displacement', ax=ax)
ax.set_title('Total Displacement vs. Prediction Length', fontsize=16)
ax.set_xlabel('Number of Frames to Predict')
ax.set_ylabel('Total Displacement (yards)')
plt.xticks(rotation=45)
plt.show()


# --- 4. VALIDATING PROBLEM FRAMER'S WARNINGS ---
print("\n" + "="*50 + "\nSection 4: VALIDATING PROBLEM FRAMER'S WARNINGS\n" + "="*50)

print("\n--- Analysis of Temporal Drift ---")
# One boxplot per feature, split by week, to check for drift across weeks.
features_to_check_drift = ['s', 'a', 'dist_to_ball_land']
for feature in features_to_check_drift:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(data=last_frame_df, x='week', y=feature, ax=ax)
    ax.set_title(f'Distribution of "{feature}" by Week', fontsize=16)
    plt.show()

print("\n--- Analysis of Generalization Risk ---")
# last_frame_df has one row per (game, play, player), so value_counts on
# nfl_id gives the number of plays each player appears in.
player_play_counts = last_frame_df['nfl_id'].value_counts()
fig, ax = plt.subplots()
sns.histplot(player_play_counts, log_scale=(False, True), ax=ax)
ax.set_title('Distribution of Play Counts per Player (Log Scale)', fontsize=16)
# Histogram axes: x is the per-player play count, y is how many players
# fall into each bin (log-scaled).
ax.set_xlabel('Number of Plays per Player')
ax.set_ylabel('Number of Players (Log Scale)')
plt.show()

print("\n--- Analysis of Prediction Difficulty by Role (Initial State) ---")
role_order = ['Passer', 'Targeted Receiver', 'Other Route Runner', 'Defensive Coverage']
for feat in ['s', 'a']:
    fig, ax = plt.subplots()
    sns.violinplot(data=last_frame_df, x='player_role', y=feat, ax=ax, order=role_order, inner='box')
    ax.set_title(f'Initial {feat.upper()} (Speed/Acceleration) by Player Role', fontsize=16)
    plt.xticks(rotation=15)
    plt.show()

# --- 5. VALIDATING PROBLEM FRAMER'S HYPOTHESES ---
print("\n" + "="*50 + "\nSection 5: VALIDATING PROBLEM FRAMER'S HYPOTHESES\n" + "="*50)

# lmplot creates its own figure, so we don't use plt.subplots.
# Samples are seeded so the figures are reproducible between runs.
displacement_grid = sns.lmplot(
    data=difficulty_df.sample(min(5000, len(difficulty_df)), random_state=42),
    x='dist_to_ball_land', y='total_displacement', hue='player_role',
    height=7, aspect=1.5, scatter_kws={'alpha':0.6}
)
# ``figure`` is the supported accessor for the grid's Figure; ``fig`` is a
# deprecated alias.
displacement_grid.figure.suptitle('Initial Distance to Ball Landing vs. Total Displacement', y=1.02, fontsize=16)
plt.show()

fig, ax = plt.subplots()
sns.scatterplot(
    data=last_frame_df.sample(min(20000, len(last_frame_df)), random_state=42),
    x='angle_diff_to_ball_land', y='s', hue='player_role', alpha=0.5, ax=ax
)
ax.set_title('Player Speed vs. Angle Towards Ball Landing Spot', fontsize=16)
ax.set_xlabel('Angle Difference to Ball (0=towards, 180=away)')
ax.set_ylabel('Speed (yards/s)')
plt.show()

# --- 6. DEEP DIVE: TRAJECTORY KINEMATICS & INTERACTIONS ---
print("\n" + "=" * 50 + "\nSection 6: DEEP DIVE ANALYSIS\n" + "=" * 50)

# Per-player identifiers and attributes taken from the last input frame.
static_features = ['game_id', 'play_id', 'nfl_id', 'player_role', 'player_side']

# Left merge: every output row is kept and gains the player's role, side
# and the ball landing coordinates.
output_context_df = output_df.merge(
    last_frame_df[[*static_features, 'ball_land_x', 'ball_land_y']],
    how='left',
    on=['game_id', 'play_id', 'nfl_id'],
)

def calculate_trajectory_kinematics(df, frame_dt=0.1):
    """Derive per-frame displacement, speed and acceleration from positions.

    Rows are sorted by player and frame, then consecutive rows of the same
    (game, play, player) are differenced. Consecutive rows are assumed to
    be ``frame_dt`` seconds apart.

    Args:
        df: DataFrame with ``game_id``, ``play_id``, ``nfl_id``,
            ``frame_id``, ``x`` and ``y`` columns. Not modified.
        frame_dt: Time between consecutive frames in seconds.

    Returns:
        A sorted copy of ``df`` with added columns ``dx``, ``dy``
        (position change since the previous frame), ``s_out`` (speed) and
        ``a_out`` (change in speed per second). A player's first frame has
        no predecessor, so ``dx``/``dy``/``s_out`` are NaN there and
        ``a_out`` is NaN on the first two frames. Filling them with 0
        would fabricate a standstill and then a spurious acceleration
        spike on the following frame.
    """
    player_keys = ['game_id', 'play_id', 'nfl_id']
    df = df.sort_values(player_keys + ['frame_id'])

    df['dx'] = df.groupby(player_keys)['x'].diff()
    df['dy'] = df.groupby(player_keys)['y'].diff()
    df['s_out'] = np.sqrt(df['dx']**2 + df['dy']**2) / frame_dt
    df['a_out'] = df.groupby(player_keys)['s_out'].diff() / frame_dt
    return df

output_kinematics_df = calculate_trajectory_kinematics(output_context_df)

# Mean speed and acceleration per output frame, one line per player role.
fig, axes = plt.subplots(1, 2, figsize=(20, 7), sharey=False)
sns.lineplot(data=output_kinematics_df, x='frame_id', y='s_out', hue='player_role', ax=axes[0], errorbar=None)
axes[0].set_title('Average Speed During Play', fontsize=16)
axes[0].set_xlabel('Frame ID (after pass)')
axes[0].set_ylabel('Speed (yards/s)')
sns.lineplot(data=output_kinematics_df, x='frame_id', y='a_out', hue='player_role', ax=axes[1], errorbar=None)
axes[1].set_title('Average Acceleration During Play', fontsize=16)
axes[1].set_xlabel('Frame ID (after pass)')
axes[1].set_ylabel('Acceleration (yards/s^2)')
fig.suptitle("Average Player Kinematics by Role After Pass is Thrown", fontsize=20)
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

# Distance from each output position to the ball landing spot.
output_kinematics_df['dist_to_ball_land'] = np.sqrt((output_kinematics_df['x'] - output_kinematics_df['ball_land_x'])**2 + (output_kinematics_df['y'] - output_kinematics_df['ball_land_y'])**2)
# Closing speed = negative rate of change of that distance (0.1 s between
# frames). A player's first output frame has no predecessor, so it is left
# as NaN instead of being filled with 0; fabricated zeros would otherwise
# enter the boxplot below.
output_kinematics_df['closing_speed'] = -output_kinematics_df.groupby(['game_id', 'play_id', 'nfl_id'])['dist_to_ball_land'].diff() / 0.1
fig, ax = plt.subplots()
sns.boxplot(data=output_kinematics_df, x='player_role', y='closing_speed', ax=ax, order=role_order)
ax.set_title('"Closing Speed" Towards Ball Landing Spot by Player Role', fontsize=16)
ax.set_ylabel('Closing Speed (yards/s) - Positive means getting closer')
ax.set_xlabel('Player Role')
plt.xticks(rotation=15)
plt.show()


# --- 7. ADVANCED VISUALIZATION & MULTIVARIATE ANALYSIS ---
print("\n" + "="*50 + "\nSection 7: ADVANCED VISUALIZATION & MULTIVARIATE ANALYSIS\n" + "="*50)

def visualize_full_play(game_id, play_id, input_df, output_context_df):
    """Plot every player's input and output trajectory for one play.

    Input trajectories are drawn dashed, output trajectories solid and
    thicker, coloured by player role on a football-field backdrop. The
    ball landing spot is marked with a yellow cross when output rows exist.

    Args:
        game_id: Game identifier of the play to draw.
        play_id: Play identifier of the play to draw.
        input_df: Tracking rows with ``game_id``, ``play_id``, ``nfl_id``,
            ``player_role``, ``x`` and ``y`` columns.
        output_context_df: Output rows with the same columns plus
            ``ball_land_x`` and ``ball_land_y``.

    Returns:
        None. The figure is shown with ``plt.show()``. If the play has no
        rows in either frame, a warning is printed and nothing is drawn.
    """
    play_input = input_df[(input_df['game_id'] == game_id) & (input_df['play_id'] == play_id)]
    play_output = output_context_df[(output_context_df['game_id'] == game_id) & (output_context_df['play_id'] == play_id)]

    if play_input.empty and play_output.empty:
        print(f"Warning: No tracking rows found for Game {game_id}, Play {play_id}; nothing to plot.")
        return

    fig, ax = plt.subplots(figsize=(16, 8))
    create_football_field(ax)

    # Colour map over the roles of BOTH frames, so a role that only occurs
    # in the output rows cannot cause a KeyError below.
    roles = pd.concat([play_input['player_role'], play_output['player_role']]).dropna().unique()
    palette = sns.color_palette("viridis", n_colors=len(roles))
    role_color_map = dict(zip(roles, palette))

    # Draw trajectories: one line per player, grouped by role.
    trajectory_layers = (
        (play_input, 'Input', {'linestyle': '--'}),
        (play_output, 'Output', {'linestyle': '-', 'linewidth': 2.5}),
    )
    for frames, phase, line_style in trajectory_layers:
        for role, role_rows in frames.groupby('player_role'):
            for _, player in role_rows.groupby('nfl_id'):
                ax.plot(player['x'], player['y'], color=role_color_map[role],
                        label=f"{role} ({phase})", **line_style)

    # Ball landing spot, read from the first output row of the play.
    if not play_output.empty:
        ball_land_x = play_output['ball_land_x'].iloc[0]
        ball_land_y = play_output['ball_land_y'].iloc[0]
        ax.scatter(ball_land_x, ball_land_y, marker='x', color='yellow', s=200, label='Ball Landing Spot', zorder=10)

    ax.set_title(f'Full Player Trajectories for Game {game_id}, Play {play_id}', fontsize=18)
    # Create a clean legend: many lines share a label, keep one entry each.
    handles, labels = ax.get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    ax.legend(by_label.values(), by_label.keys(), bbox_to_anchor=(1.02, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

# Pick the play containing the row with the largest num_frames_output.
sample_play = last_frame_df.nlargest(1, 'num_frames_output').iloc[0]
SAMPLE_GAME_ID = sample_play['game_id']
SAMPLE_PLAY_ID = sample_play['play_id']
print(f"Visualizing a sample play: Game ID {SAMPLE_GAME_ID}, Play ID {SAMPLE_PLAY_ID}")
visualize_full_play(SAMPLE_GAME_ID, SAMPLE_PLAY_ID, input_df, output_context_df)

print("\n--- Correlation Analysis of Initial Player State ---")
correlation_cols = ['player_age', 'player_height_inches', 'player_weight', 'x', 'y', 's', 'a',
                    'num_frames_output', 'dist_to_ball_land', 'angle_diff_to_ball_land']
corr_matrix = last_frame_df[correlation_cols].corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='RdBu_r', ax=ax, vmin=-1, vmax=1)
ax.set_title('Correlation Matrix of Key Numerical Features (Last Input Frame)', fontsize=16)
plt.show()

print("\n--- t-SNE Visualization of Player States (by Role) ---")
TSNE_SAMPLE_SIZE = 5000
tsne_features = ['player_age', 'player_height_inches', 'player_weight', 'x', 'y', 's', 'a', 'o', 'dir',
                 'dist_to_ball_land', 'angle_diff_to_ball_land', 'num_frames_output']
# Drop incomplete rows BEFORE sampling so the sample keeps its full size,
# and cap the request at the rows available (sample() raises otherwise).
tsne_candidates = last_frame_df.dropna(subset=tsne_features)
tsne_df_sample = tsne_candidates.sample(min(TSNE_SAMPLE_SIZE, len(tsne_candidates)), random_state=42)
scaler = StandardScaler()
tsne_data_scaled = scaler.fit_transform(tsne_df_sample[tsne_features])
# Newer scikit-learn releases renamed TSNE's ``n_iter`` to ``max_iter``;
# use whichever name the installed version exposes.
tsne_iter_kwarg = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'
tsne = TSNE(n_components=2, perplexity=30, random_state=42, verbose=0, **{tsne_iter_kwarg: 300})
tsne_results = tsne.fit_transform(tsne_data_scaled)
tsne_df_sample['tsne-1'] = tsne_results[:,0]
tsne_df_sample['tsne-2'] = tsne_results[:,1]
fig, ax = plt.subplots()
sns.scatterplot(data=tsne_df_sample, x='tsne-1', y='tsne-2', hue='player_role', ax=ax, alpha=0.8)
ax.set_title('t-SNE Visualization of Player States (Colored by Role)', fontsize=16)
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left')
plt.show()

print("\n" + "="*50 + "\nADVANCED EDA SCRIPT COMPLETE\n" + "="*50)