In [13]:
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from IPython.display import display
from joblib import Parallel, delayed
from scipy.spatial import Voronoi, voronoi_plot_2d
from sklearn.utils import shuffle

# Import custom modules for data preparation, preprocessing, cleaning, and feature extraction
from prepare_data import aggregate_play_types, aggregate_play_level_features, final_merge
from preprocess_tracking_data import filter_for_presnap
from clean_data import clean_data, downcast_ints_and_floats, convert_to_DICT
from defensive_features import get_defensive_features
from offensive_features import get_offensive_features
from constants import RUN_PASS_DICT, MOTION_CAT_DICT, OFFENSIVE_FORMATION_DICT, REC_ALIGNMENT_DICT

# Uncomment these lines during development to reload custom modules automatically after changes
# %load_ext autoreload
# %autoreload 2

# Read every raw CSV input up front; the relative paths resolve against the
# notebook's working directory. The tuple order matches the unpacking targets:
# game-level, play-level, player metadata, player-play metadata, FPS data.
games_df, plays_df, players_df, player_play_df, fps_df = map(
    pd.read_csv,
    (
        '../data/games.csv',
        '../data/plays.csv',
        '../data/players.csv',
        '../data/player_play.csv',
        '../data/FINAL_FPS.csv',
    ),
)

# Accumulators: the weekly loop appends one defensive and one offensive
# feature frame per week to these lists
def_features_data, off_features_data = [], []

# Build the play-level frame from the plays, player-play and games tables
# (per the cell's log output: run and pass plays combined, with a week column added)
df = aggregate_play_types(plays_df, player_play_df, games_df)

# Attach play-level features, merging in the FPS data
agg_df = aggregate_play_level_features(df, fps_df, plays_df)

# Time the whole pipeline with its own timer. The per-week `start_time` below
# is reset on every iteration, so reusing it for the final runtime would only
# measure from the start of week 9 onwards.
pipeline_start_time = time.perf_counter()

# Loop over weeks 1 to 9
for week in range(1, 10):  # week goes from 1 to 9
    print(f"INFO: [ PROCESSING WEEK {week} ]")
    start_time = time.perf_counter()

    # Filter the tracking data to focus on pre-snap movements for the specified week
    presnap_tracking = filter_for_presnap(week, players_df).copy()

    # Clean the pre-snap tracking data before feature extraction
    df_clean = clean_data(presnap_tracking)

    # Extract defensive features from the cleaned tracking data
    def_features = get_defensive_features(df_clean, agg_df, players_df, player_play_df)

    # Extract offensive features from the cleaned tracking data
    off_features = get_offensive_features(df_clean, player_play_df)

    # Collect this week's feature frames; they are concatenated after the loop
    def_features_data.append(def_features)
    off_features_data.append(off_features)

    # Per-week duration (perf_counter is monotonic, so the difference is a reliable interval)
    end_time = time.perf_counter()
    duration = end_time - start_time

    # Report how long this week's tracking data took to process
    print(f"INFO: Tracking Features took {duration:.4f} seconds to process week {week} tracking data.\n")

# Stack the per-week frames into one defensive and one offensive DataFrame
def_features_data = pd.concat(def_features_data, ignore_index=True)
off_features_data = pd.concat(off_features_data, ignore_index=True)

# Merge aggregated play-level features with defensive and offensive features
merged_df = final_merge(agg_df, def_features_data, off_features_data).copy()

# Encode categorical columns using the lookup dictionaries from `constants`
# (presumably label -> code mappings; confirm in clean_data.convert_to_DICT)
final_df = convert_to_DICT(merged_df, RUN_PASS_DICT, MOTION_CAT_DICT, OFFENSIVE_FORMATION_DICT, REC_ALIGNMENT_DICT)

# Downcast final df for training
final_df_clean = downcast_ints_and_floats(final_df).copy()

# Shuffle rows reproducibly (fixed seed) before splitting
final_df_shuffle = shuffle(final_df_clean, random_state=42)

# Select training data for weeks 1 through 8
# NOTE(review): unlike testing_data below, training_data keeps the 'play_type'
# target column, so training_data.csv contains the label. Left unchanged because
# downstream readers of this CSV are not visible here - confirm whether the
# column should be dropped before fitting.
training_data = final_df_shuffle[final_df_shuffle['week'].isin(range(1, 9))]

# Create the target for training data
train_labels = training_data['play_type']

# Hold out week 9 for testing the model
week_9_data = final_df_shuffle[final_df_shuffle['week'] == 9]

# Remove target for testing data
testing_data = week_9_data.drop(columns=['play_type'])

# Create the target for testing data
test_labels = week_9_data['play_type']

# Total runtime of the weekly loop plus merge/split, measured from the pipeline timer
end_time = time.perf_counter()
duration = end_time - pipeline_start_time

print(f"INFO: [ TRAINING DATA ROWS: {training_data.shape[0]} rows ]")
print(f"INFO: [ TESTING DATA ROWS:  {testing_data.shape[0]}  rows ]")
print(f"INFO: Data for all weeks (1-9) is combined into one DataFrame with {final_df_clean.shape[0]} rows. (Runtime: {duration:.4f})")

print(f"Total shape: {final_df_clean.shape}")
print(f"Training shape: {training_data.shape}")
print(f"Train Target shape {train_labels.shape}")
print(f"Testing shape: {testing_data.shape}")
print(f"Test Target shape {test_labels.shape}")

# to_csv does not create missing directories, so make sure the output folder exists
Path('../data/processed_data').mkdir(parents=True, exist_ok=True)

# Persist the full shuffled dataset plus the train/test features and labels
final_df_shuffle.to_csv('../data/processed_data/all_weeks_processed_data.csv', index=False)
training_data.to_csv('../data/processed_data/training_data.csv', index=False)
testing_data.to_csv('../data/processed_data/testing_data.csv', index=False)
train_labels.to_csv('../data/processed_data/train_labels.csv', index=False)
test_labels.to_csv('../data/processed_data/test_labels.csv', index=False)

INFO: [ START OF DATA LOADING]
INFO: Filtering for run and pass plays...
INFO: Found 8728 unique PASS plays...
INFO: Found 6788 unique RUN plays...
INFO: Combining run-pass dataframes...
INFO: Combined DataFrame contains 15516 unique plays (pass and run), with week column added.
INFO: Merging FPS df for play level features.
INFO: Merged DataFrame contains 15516 rows and 23 columns.

INFO: [ PROCESSING WEEK 1 ]
INFO: Filtering 7104700 rows of tracking data for pre-snap frames in week 1...
INFO: Found 4692460 pre-snap frames of tracking data in week 1.
INFO: Found 1952 plays of tracking data in week 1.
INFO: Merging player position data into tracking data...
WARN: Found 204020 players without position information.
INFO: Merged position data into tracking data. New shape: (4692460, 19)
INFO: Transforming orientation and direction angles so that 0° points from left to right, and increasing angle goes counterclockwise...
INFO: Flipping plays so that they all run from left to right...
INFO: 

In [11]:
# Count the NaN entries in each column of the shuffled dataset and print the tally
missing_per_column = final_df_shuffle.isna().sum()
print(missing_per_column)

gameId                          0
playId                          0
play_type                       0
week                            0
quarter                         0
gameClockSeconds                0
gameQuarterWeight               0
down                            0
yardsToGo                       0
yardsGained                     0
expectedPoints                  0
expectedPointsAdded             0
absoluteYardlineNumber          0
offenseFormation                0
inMotionAtBallSnap              0
isDropback                      0
field_position_weight           0
scoreDifferential               0
playSuccessWeight               0
possessionTeamWinProbability    0
possessionTeamImpact            0
opponentTeamImpact              0
receiverAlignment               0
players_in_box_count            0
mismatchFound                   0
motion_type                     0
motion_player_count             0
pre_snap_time_duration          0
dtype: int64
