In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np
from scipy.spatial import Voronoi, voronoi_plot_2d
from IPython.display import display
import time
from joblib import Parallel, delayed

# Import custom modules for data preparation, preprocessing, cleaning, and feature extraction
from prepare_data import aggregate_play_types, aggregate_play_level_features, final_merge
from preprocess_tracking_data import filter_for_presnap
from clean_data import clean_data, convert_to_DICT
from defensive_features import get_defensive_features
from offensive_features import get_offensive_features
from constants import RUN_PASS_DICT, MOTION_CAT_DICT, OFFENSIVE_FORMATION_DICT

# Uncomment these lines during development to reload custom modules automatically after changes
# %load_ext autoreload
# %autoreload 2

# Specify the week number for filtering data
week = 9

# Record the start time to measure execution duration
start_time = time.time()

# Step 1: Load the datasets (ensure file paths are correct)
games_df = pd.read_csv('../data/games.csv')          # Game-level data
plays_df = pd.read_csv('../data/plays.csv')          # Play-level data
players_df = pd.read_csv('../data/players.csv')      # Player metadata
player_play_df = pd.read_csv('../data/player_play.csv')  # Player-play metadata
fps_df = pd.read_csv('../data/FINAL_FPS.csv')        # FPS data

# Step 2: Aggregate play types (e.g., run, pass) and merge play data with other game-level information
df = aggregate_play_types(plays_df, player_play_df, games_df)

# Step 3: Aggregate play-level features such as yards gained, possession, etc.
agg_df = aggregate_play_level_features(df, fps_df, plays_df)

# Step 4: Filter the tracking data to focus on pre-snap movements for the specified week
presnap_tracking = filter_for_presnap(week, players_df)

# Step 5: Clean the tracking data to remove invalid entries or apply transformations
df_clean = clean_data(presnap_tracking)

# Step 6: Extract defensive features, such as coverage schemes, based on the cleaned data
def_features = get_defensive_features(df_clean, agg_df, players_df, player_play_df)

# Step 7: Extract offensive features, such as formations and pre-snap motion
off_features = get_offensive_features(df_clean, player_play_df)

# Step 8: Merge aggregated play-level features with defensive and offensive features
merged_df = final_merge(agg_df, def_features, off_features)

# Step 9: Convert categorical features (e.g., run/pass type, motion categories, formations) into dictionaries
df_DICT = convert_to_DICT(merged_df, RUN_PASS_DICT, MOTION_CAT_DICT, OFFENSIVE_FORMATION_DICT)

# Step 10: Filter the final dataset for the specified week
final_df = df_DICT[df_DICT['week'] == week]

# Record the end time and calculate the duration of the data processing
end_time = time.time()
duration = end_time - start_time

# Print information about the process duration and the number of rows in the final dataset
print(f"INFO: Load Data took {duration:.4f} seconds with contains {final_df.shape[0]} rows.. DONE!")

# Display the first few rows of the final processed dataframe
display(final_df.head())


INFO: Filtering for run and pass plays...
INFO: Found 8728 unique PASS plays...
INFO: Found 6788 unique RUN plays...
INFO: Combining run-pass dataframes...
INFO: Combined DataFrame contains 15516 unique plays (pass and run), with week column added.
INFO: Merging FPS df for play level features.
INFO: Merged DataFrame contains 15516 rows and 23 columns.
INFO: Filtering 5671685 rows of tracking data for pre-snap frames in week 9...
INFO: Found 3729473 pre-snap frames of tracking data in week 9.
INFO: Found 1535 plays of tracking data in week 9.
INFO: Merging player position data into tracking data...
INFO: Merged position data into tracking data. New shape: (3729473, 19)
INFO: Transforming orientation and direction angles so that 0° points from left to right, and increasing angle goes counterclockwise...
INFO: Flipping plays so that they all run from left to right...
INFO: Converting geometry variables from floats to int...
INFO: Memory usage reduced from 2192096002 to 2057834974
INFO: Do

Unnamed: 0,gameId,playId,play_type,week,quarter,gameClockSeconds,gameQuarterWeight,down,yardsToGo,yardsGained,...,playSuccessWeight,possessionTeamWinProbability,possessionTeamImpact,opponentTeamImpact,receiverAlignment,players_in_box_count,mismatchFound,motion_type,motion_player_count,pre_snap_time_duration
3,2022110610,348,1,9,1,568,1.0,2,10,4,...,0.644503,0.884223,-0.001308,0.001308,2x2,6.0,0.0,1.0,1.0,12.2
8,2022110603,2325,1,9,3,456,1.5,2,9,12,...,5.273397,0.342289,0.044392,-0.044392,2x2,7.0,2.0,0.0,0.0,11.5
11,2022110607,3178,1,9,4,599,2.0,2,7,-2,...,0.08234,0.835948,-0.016307,0.016307,3x2,6.0,2.0,5.0,3.0,11.5
43,2022110606,120,1,9,1,857,1.0,2,10,0,...,-9.181442,0.838068,-0.036861,0.036861,2x2,7.0,2.0,1.0,1.0,9.5
93,2022110606,287,1,9,1,669,1.0,1,10,0,...,0.572409,0.824369,-0.001621,0.001621,2x2,6.0,1.0,0.0,0.0,13.2


In [19]:
final_df.to_csv(f'../data/processed_data/processed_data_week_{week}.csv', index=False)

In [22]:
# Define the target column name (play_type) and the path to save the final dataset
target_column = 'play_type'

# Specify the weeks for training (1-8) and testing (9)
train_weeks = range(1, 9)  # Weeks 1 to 8 for training
test_week = 9  # Week 9 for testing

# Initialize empty lists to store DataFrames
train_dfs = []
train_target_dfs = []  # Target for training (play_type)
test_features_df = None
test_target_df = None

# Load training data for Weeks 1-8
for week in train_weeks:
    file_path = f'../data/processed_data/processed_data_week_{week}.csv'
    try:
        week_df = pd.read_csv(file_path)
        train_dfs.append(week_df)
        
        # Extract target column and add to target dataframe
        if target_column in week_df.columns:
            train_target_dfs.append(week_df[[target_column]])
        else:
            print(f"Warning: '{target_column}' column not found in week {week}.")
    except Exception as e:
        print(f"Error loading week {week}: {e}")

# Concatenate all the training data (Weeks 1-8)
all_train_features_df = pd.concat(train_dfs, axis=0)
all_train_target_df = pd.concat(train_target_dfs, axis=0)

# Reset index after merging
all_train_features_df.reset_index(drop=True, inplace=True)
all_train_target_df.reset_index(drop=True, inplace=True)

# Load the test data (Week 9)
test_file_path = f'../data/processed_data/processed_data_week_{test_week}.csv'
try:
    test_df = pd.read_csv(test_file_path)
    
    # Extract test features (exclude play_type)
    test_features_df = test_df.drop(columns=[target_column], errors='ignore')
    
    # Extract the target labels (play_type)
    test_target_df = test_df[[target_column]]
    
    # Reset index after merging
    test_features_df.reset_index(drop=True, inplace=True)
    test_target_df.reset_index(drop=True, inplace=True)
except Exception as e:
    print(f"Error loading Week {test_week} data: {e}")

# Display shapes of the final data
print(f"Training data shape: {all_train_features_df.shape}")
print(f"Training target shape: {all_train_target_df.shape}")
print(f"Test data shape: {test_features_df.shape}")
print(f"Test target shape: {test_target_df.shape}")

# Optionally, save the merged training and test data for future use
all_train_features_df.to_csv('../data/processed_data/merged_train_data.csv', index=False)
all_train_target_df.to_csv('../data/processed_data/merged_train_target.csv', index=False)
test_features_df.to_csv(f'../data/processed_data/week_{test_week}_test_features.csv', index=False)
test_target_df.to_csv(f'../data/processed_data/week_{test_week}_test_target.csv', index=False)

print("Training and test data saved successfully.")

Training data shape: (14040, 28)
Training target shape: (14040, 1)
Test data shape: (1476, 27)
Test target shape: (1476, 1)
Training and test data saved successfully.
