# NFL Big Data Bowl - Random Forest Model for X, Y Prediction

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import glob
import os

## Data Loading

In [None]:
from pathlib import Path

# Pick the data root: the Kaggle mount point when /kaggle/input exists,
# otherwise a local ./datasets folder.
running_on_kaggle = os.path.exists("/kaggle/input")
DATA_DIR = (
    Path("/kaggle/input/nfl-big-data-bowl-2026-prediction")
    if running_on_kaggle
    else Path("./datasets")
)

# Collect every training CSV; sorting gives both lists a stable name order.
train_dir = DATA_DIR / 'train'
train_input_files = sorted(glob.glob(str(train_dir / 'input_*.csv')))
train_output_files = sorted(glob.glob(str(train_dir / 'output_*.csv')))

print(f'Input files: {len(train_input_files)}')
print(f'Output files: {len(train_output_files)}')

In [None]:
# Combine data
# Fail early with a readable message instead of pd.concat's
# "No objects to concatenate" when the glob patterns matched nothing.
if not train_input_files or not train_output_files:
    raise FileNotFoundError(
        f'No training CSVs found (input: {len(train_input_files)}, '
        f'output: {len(train_output_files)}); check the data directory.'
    )
if len(train_input_files) != len(train_output_files):
    print(
        f'Warning: {len(train_input_files)} input files but '
        f'{len(train_output_files)} output files; loading all of them.'
    )

# Inputs and outputs are concatenated independently (they are joined on key
# columns in a later cell), so each list is loaded on its own. Pairing the two
# lists with zip() would silently skip files whenever the counts differ.
input_dfs = [pd.read_csv(path) for path in train_input_files]
output_dfs = [pd.read_csv(path) for path in train_output_files]

train_input = pd.concat(input_dfs, ignore_index=True)
train_output = pd.concat(output_dfs, ignore_index=True)

print(f'Train input shape: {train_input.shape}')
print(f'Train output shape: {train_output.shape}')

In [None]:
# Data Inspection: list the columns of both frames, then preview each one.
input_column_names = list(train_input.columns)
output_column_names = list(train_output.columns)

print("Input columns:")
print(input_column_names)
print("\nOutput columns:")
print(output_column_names)

# NOTE(review): display is not imported in this file; presumably the
# IPython/Jupyter built-in that is available inside a notebook.
print("\nInput sample:")
display(train_input.head(5))
print("\nOutput sample:")
display(train_output.head(5))

## Data Preprocessing

In [None]:
# Keep only the rows whose player_to_predict flag is True, as an independent
# copy so later column assignments do not touch train_input.
is_target_player = train_input['player_to_predict'].eq(True)
train_input_filtered = train_input.loc[is_target_player].copy()
print(f'Filtered input shape: {train_input_filtered.shape}')

In [None]:
# Join the filtered inputs with the outputs (inner join on the key columns).
# Columns present on both sides, other than the keys, get the
# _input / _output suffix.
# NOTE(review): this pairs input and output rows that share the same frame_id;
# confirm both files number their frames on the same timeline.
merge_keys = ['game_id', 'play_id', 'nfl_id', 'frame_id']
merged_data = pd.merge(
    train_input_filtered,
    train_output,
    how='inner',
    on=merge_keys,
    suffixes=('_input', '_output'),
)

print(f'Merged data shape: {merged_data.shape}')
display(merged_data.head(5))

In [None]:
# Feature Selection
# Base feature columns taken straight from the merged frame.
feature_columns = [
    'absolute_yardline_number',
    'player_height',
    'player_weight',
    'x_input',  # Current x coordinate
    'y_input',  # Current y coordinate
    's',  # Speed
    'a',  # Acceleration
    'dir',  # Direction
    'o',  # Orientation
    'num_frames_output',  # Number of frames to predict
    'ball_land_x',  # Ball landing x
    'ball_land_y',  # Ball landing y
]

# Binary flag: 1 when play_direction is 'right', 0 for anything else.
is_right_play = merged_data['play_direction'].eq('right')
merged_data['play_direction_encoded'] = is_right_play.astype(int)
feature_columns.append('play_direction_encoded')

# One-hot encode player_position and player_role, attach the indicator
# columns to the merged frame and register them as features.
position_dummies = pd.get_dummies(merged_data['player_position'], prefix='position')
role_dummies = pd.get_dummies(merged_data['player_role'], prefix='role')

merged_data = pd.concat([merged_data, position_dummies, role_dummies], axis=1)
feature_columns += list(position_dummies.columns)
feature_columns += list(role_dummies.columns)

# Convert player_height to numeric (e.g., "6-1" -> 73 inches)
def height_to_inches(height_str):
    """Convert a "feet-inches" height string to total inches.

    Args:
        height_str: Height formatted as "<feet>-<inches>", e.g. "6-1".

    Returns:
        The height in inches as an int, or ``np.nan`` when the value is not a
        string in that format (missing value, non-string, wrong number of
        parts, or non-numeric parts).
    """
    try:
        feet, inches = height_str.split('-')
        return int(feet) * 12 + int(inches)
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: value is not a str (e.g. NaN or None).
        # ValueError: wrong number of '-' separated parts or non-integer text.
        # Anything else (KeyboardInterrupt, ...) is deliberately not swallowed.
        return np.nan

# Replace the raw height strings with inches.
# NOTE: height_to_inches returns NaN for anything that is not a "feet-inches"
# string, so re-running this cell on already converted values blanks the column.
merged_data['player_height'] = merged_data['player_height'].map(height_to_inches)

# Build the model inputs; any missing feature value becomes 0.
X = merged_data.loc[:, feature_columns].fillna(0)
# Targets: the output-side coordinates produced by the merge.
y_x, y_y = merged_data['x_output'], merged_data['y_output']

print(f'Features shape: {X.shape}')
print(f'Target X shape: {y_x.shape}')
print(f'Target Y shape: {y_y.shape}')

## Data Split

In [None]:
# Split into training and validation data
# Stack the targets into a 2D array so x and y are predicted simultaneously.
y = np.column_stack([y_x, y_y])

# Hold out whole plays instead of individual rows. merged_data is keyed by
# frame_id, so one play contributes many rows; a row-wise random split would
# put frames of the same play on both sides and make the validation error
# look better than it is. About 20% of the plays (not rows) go to validation.
play_group = merged_data.groupby(['game_id', 'play_id'], sort=False).ngroup()
_, val_plays = train_test_split(
    play_group.unique(), test_size=0.2, random_state=42
)
# Positional mask: X and y keep merged_data's row order.
val_mask = play_group.isin(val_plays).to_numpy()

X_train, X_val = X.loc[~val_mask], X.loc[val_mask]
y_train, y_val = y[~val_mask], y[val_mask]

print(f'Training set size: {X_train.shape[0]}')
print(f'Validation set size: {X_val.shape[0]}')
print(f'Target shape: {y_train.shape} (samples, [x, y])')

## Model Training

In [None]:
# Fit one Random Forest on the two-column target so X and Y coordinates are
# predicted by the same model.
print("Training Random Forest for X and Y coordinates...")
rf_params = dict(
    n_estimators=100,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,  # fixed seed for reproducible forests
    n_jobs=-1,
    verbose=1,
)
rf = RandomForestRegressor(**rf_params)
rf.fit(X_train, y_train)
print("Model training completed!")

## Model Evaluation

In [None]:
def _axis_metrics(y_true, y_pred):
    """Return (mse, rmse, mae) for a single coordinate axis."""
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    return mse, np.sqrt(mse), mae


# Predict the validation set; column 0 is x, column 1 is y.
predictions = rf.predict(X_val)
y_x_pred, y_y_pred = predictions[:, 0], predictions[:, 1]
y_x_val, y_y_val = y_val[:, 0], y_val[:, 1]

# Per-axis error metrics for the X coordinate.
mse_x, rmse_x, mae_x = _axis_metrics(y_x_val, y_x_pred)

print("X coordinate metrics:")
print(f"  MSE: {mse_x:.4f}")
print(f"  RMSE: {rmse_x:.4f}")
print(f"  MAE: {mae_x:.4f}")

# Per-axis error metrics for the Y coordinate.
mse_y, rmse_y, mae_y = _axis_metrics(y_y_val, y_y_pred)

print("\nY coordinate metrics:")
print(f"  MSE: {mse_y:.4f}")
print(f"  RMSE: {rmse_y:.4f}")
print(f"  MAE: {mae_y:.4f}")

# Straight-line distance between each predicted and actual position.
x_error = y_x_val - y_x_pred
y_error = y_y_val - y_y_pred
euclidean_distances = np.sqrt(x_error**2 + y_error**2)
mean_euclidean_distance = euclidean_distances.mean()

print(f"\nMean Euclidean Distance: {mean_euclidean_distance:.4f}")

# Overall RMSE: root of the mean of the two per-axis squared RMSEs.
overall_rmse = np.sqrt((rmse_x**2 + rmse_y**2) / 2)
print(f"\n=== Final RMSE: {overall_rmse:.4f} ===")

## Create Submission

In [None]:
# Sanity check: predict one validation row and compare it with the truth.
sample_idx = 0
# Double brackets keep the selection a one-row DataFrame for predict().
sample_input = X_val.iloc[[sample_idx]]

prediction = rf.predict(sample_input)[0]
pred_x, pred_y = prediction[0], prediction[1]
actual_x, actual_y = y_val[sample_idx, 0], y_val[sample_idx, 1]

# Euclidean distance between the predicted and the actual position.
sample_error = np.sqrt((pred_x - actual_x)**2 + (pred_y - actual_y)**2)

print(f"Predicted: ({pred_x:.2f}, {pred_y:.2f})")
print(f"Actual: ({actual_x:.2f}, {actual_y:.2f})")
print(f"Error: {sample_error:.2f}")

In [None]:
# Read the two test files from the data root.
test_input_path = DATA_DIR / 'test_input.csv'
test_path = DATA_DIR / 'test.csv'
test_input = pd.read_csv(test_input_path)
test = pd.read_csv(test_path)

print(f'Test input shape: {test_input.shape}')
print(f'Test shape: {test.shape}')

# Keep only the rows whose player_to_predict flag is True (same filter as
# the training data), as an independent copy.
needs_prediction = test_input['player_to_predict'].eq(True)
test_input_filtered = test_input.loc[needs_prediction].copy()
print(f'Filtered test input shape: {test_input_filtered.shape}')

In [None]:
# Test data feature processing (mirrors the training-data steps)
# Rename the coordinate columns to the _input names used in feature_columns.
test_input_filtered = test_input_filtered.rename(columns={'x': 'x_input', 'y': 'y_input'})

# Convert player_height to numeric inches.
test_input_filtered['player_height'] = test_input_filtered['player_height'].map(height_to_inches)

# Binary flag: 1 when play_direction is 'right', 0 otherwise.
test_input_filtered['play_direction_encoded'] = (
    test_input_filtered['play_direction'].eq('right').astype(int)
)

# One-hot encode player_position and player_role
test_position_dummies = pd.get_dummies(test_input_filtered['player_position'], prefix='position')
test_role_dummies = pd.get_dummies(test_input_filtered['player_role'], prefix='role')

# Add an all-zero column for every training category missing from the test
# data, so every dummy column listed in feature_columns exists.
for train_dummies, test_dummies in (
    (position_dummies, test_position_dummies),
    (role_dummies, test_role_dummies),
):
    absent_columns = [col for col in train_dummies.columns if col not in test_dummies.columns]
    for col in absent_columns:
        test_dummies[col] = 0

test_features = pd.concat([test_input_filtered, test_position_dummies, test_role_dummies], axis=1)

# Select the features in training order; categories seen only in the test
# data are left out by this selection.
X_test = test_features.loc[:, feature_columns].fillna(0)

print(f'Test features shape: {X_test.shape}')

In [None]:
# Run the trained forest on the test features; column 0 is x, column 1 is y.
print('Predicting test data...')
test_predictions = rf.predict(X_test)
test_pred_x, test_pred_y = test_predictions[:, 0], test_predictions[:, 1]

# Attach the predictions to the filtered test rows (X_test keeps their order).
test_input_filtered['pred_x'] = test_pred_x
test_input_filtered['pred_y'] = test_pred_y

preview_columns = ['game_id', 'play_id', 'nfl_id', 'frame_id', 'pred_x', 'pred_y']
print(f'Predictions completed: {len(test_pred_x)} samples')
print('Sample predictions:')
print(test_input_filtered[preview_columns].head())

In [None]:
# Create submission.csv
# Row ID layout: <game_id>_<play_id>_<nfl_id>_<frame_id>
submission = test.copy()
id_columns = ['game_id', 'play_id', 'nfl_id', 'frame_id']
id_pieces = [submission[column].astype(str) for column in id_columns]
row_id = id_pieces[0]
for piece in id_pieces[1:]:
    row_id = row_id + '_' + piece
submission['id'] = row_id

# Left-join the predictions so every row of test.csv is kept, including rows
# that have no matching prediction.
test_pred_df = test_input_filtered[id_columns + ['pred_x', 'pred_y']].copy()
submission = pd.merge(submission, test_pred_df, how='left', on=id_columns)

# Final layout is id, x, y; rows without a prediction fall back to 0.
submission_final = (
    submission[['id', 'pred_x', 'pred_y']]
    .rename(columns={'pred_x': 'x', 'pred_y': 'y'})
    .fillna(0)
)

# Save to CSV
submission_final.to_csv("./submission.csv", index=False)