<h2>Slicing Ball In Air Window for Dataset</h2>

In [6]:
#importing libraries
import pandas as pd
from pathlib import Path

In [72]:
#Load data for specified weeks using pandas
def load_weeks_pandas(weeks, base="../data", season=2023, window_frames=10,
                      positions=("WR", "CB")):
    """Load weekly tracking data and slice it to a short window per player.

    For every week in ``weeks`` the files ``input_<season>_w<ww>.csv`` and
    ``output_<season>_w<ww>.csv`` are read from ``base`` and concatenated.
    Play-level context from ``supplementary_data.csv`` is left-joined onto
    both frames on (game_id, play_id).

    Slicing, done per (game_id, play_id, nfl_id):
      * input  - keeps rows with ``frame_id >= throw_frame - window_frames``,
        where ``throw_frame`` is the player's largest input ``frame_id``.
      * output - keeps rows with ``frame_id >= end_frame - window_frames``,
        where ``end_frame`` is the player's largest ``num_frames_output``
        value in the input data.
    Both bounds are inclusive, so each player keeps at most
    ``window_frames + 1`` rows per play.

    Finally only players whose ``player_position`` is in ``positions`` are
    kept in the input frame, and the output frame is restricted to the
    (game_id, play_id, nfl_id) combinations that survive in the input frame.

    Parameters
    ----------
    weeks : iterable of int
        Week numbers to load (formatted as two digits in the file names).
    base : str or Path, default "../data"
        Directory containing the CSV files.
    season : int, default 2023
        Season used in the weekly file names.
    window_frames : int, default 10
        Number of frames kept before the throw / end frame.
    positions : str or iterable of str, default ("WR", "CB")
        Values of ``player_position`` to keep.

    Returns
    -------
    (input_df, output_df, supp_df) : tuple of pandas.DataFrame
        ``input_df`` carries an extra ``throw_frame`` column and no
        ``player_name`` column; ``output_df`` carries an extra ``end_frame``
        column; ``supp_df`` is the supplementary table as read.

    Raises
    ------
    ValueError
        If ``weeks`` is empty.
    """
    weeks = list(weeks)
    if not weeks:
        raise ValueError("weeks must contain at least one week number")

    if isinstance(positions, str):
        positions = [positions]
    positions = list(positions)

    play_keys = ["game_id", "play_id"]
    player_keys = play_keys + ["nfl_id"]

    # player_name is intentionally not loaded: it is never part of the result.
    use_in = ["game_id", "play_id", "nfl_id", "frame_id",
              "player_position", "player_role", "player_side",
              "x", "y", "s", "a", "o", "dir",
              "num_frames_output", "ball_land_x", "ball_land_y"]
    use_out = ["game_id", "play_id", "nfl_id", "frame_id", "x", "y"]
    supp_cols = ["game_id", "play_id", "season", "week", "pass_result",
                 "team_coverage_man_zone", "pass_length", "route_of_targeted_receiver",
                 "yards_gained", "expected_points", "expected_points_added"]

    data_dir = Path(base)
    inputs = [pd.read_csv(data_dir / f"input_{season}_w{w:02d}.csv", usecols=use_in)
              for w in weeks]
    outputs = [pd.read_csv(data_dir / f"output_{season}_w{w:02d}.csv", usecols=use_out)
               for w in weeks]

    input_df = pd.concat(inputs, ignore_index=True)
    output_df = pd.concat(outputs, ignore_index=True)
    supp_df = pd.read_csv(data_dir / "supplementary_data.csv", usecols=supp_cols)

    # Join play context into input/output.
    input_df = input_df.merge(supp_df, on=play_keys, how="left")
    output_df = output_df.merge(supp_df, on=play_keys, how="left")

    # throw = last input frame of the player in the play
    # end   = num_frames_output recorded for the player in the input data
    per_player = input_df.groupby(player_keys)
    end_frames = (
        per_player["num_frames_output"].max().rename("end_frame").reset_index()
    )
    input_df["throw_frame"] = per_player["frame_id"].transform("max")
    output_df = output_df.merge(end_frames, on=player_keys, how="left")

    # Keep the last `window_frames` frames before the throw, plus the throw frame.
    input_df = input_df[input_df["frame_id"] >= input_df["throw_frame"] - window_frames]

    # Keep the last `window_frames` frames before the end frame, plus the end frame.
    # Output rows with no matching input player have a NaN end_frame and drop out here.
    output_df = output_df[output_df["frame_id"] >= output_df["end_frame"] - window_frames]

    # Keep only the requested positions.
    input_df = input_df[input_df["player_position"].isin(positions)]

    # Restrict the output to the same player-in-play combinations. Matching on
    # the full key (not nfl_id alone) avoids keeping a player's output rows for
    # plays in which that player was filtered out of the input.
    kept_keys = pd.MultiIndex.from_frame(input_df[player_keys].drop_duplicates())
    output_keys = pd.MultiIndex.from_frame(output_df[player_keys])
    output_df = output_df[output_keys.isin(kept_keys)]

    return input_df, output_df, supp_df

# Example:
# inp, out, supp = load_weeks_pandas([1,2,3])


In [73]:
# Load weeks 1-3: sliced input frames, sliced output frames, and play-level context
df_input, df_output, df_supp = load_weeks_pandas(weeks=[1, 2, 3])

In [74]:
# Preview the first 20 rows of the key input columns
df_input[['nfl_id', 'frame_id', 'num_frames_output']].head(20)

Unnamed: 0,nfl_id,frame_id,num_frames_output
67,52546,16,21
68,52546,17,21
69,52546,18,21
70,52546,19,21
71,52546,20,21
72,52546,21,21
73,52546,22,21
74,52546,23,21
75,52546,24,21
76,52546,25,21


In [75]:
# Preview the first 20 rows of the key output columns
df_output[['nfl_id', 'frame_id', 'end_frame']].head(20)

Unnamed: 0,nfl_id,frame_id,end_frame
31,52546,11,21
32,52546,12,21
33,52546,13,21
34,52546,14,21
35,52546,15,21
36,52546,16,21
37,52546,17,21
38,52546,18,21
39,52546,19,21
40,52546,20,21


<h2> Overview </h2>
Input (before the ball is thrown):
The tracking data up to the throw, including player positions, velocities, etc.

Output (after the ball is thrown):
The ground-truth trajectories of all players while the ball is in the air.

**Input dataframe:** <br>

You want roughly **1 second** (10 frames) before the throw, plus the throw frame itself, so each player has at most 11 input frames per play.
So for each play:
- input_start = throw_frame - 10
- input_end   = throw_frame


**Output dataframe:** <br>

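You want the last **1 second** (10 frames) before the end of the play, plus the end frame itself, so each player has at most 11 output frames per play.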
So for outputs:
- output_start = end_frame - 10
- output_end   = end_frame

In [None]:
# Sanity check on the input frame: count the rows kept for each player in each play.
# The maximum is 11, meaning we only have the 10 frames before the throw plus the
# throw frame itself, so we're good.
(
    df_input
    .groupby(['game_id', 'nfl_id', 'play_id'])['frame_id']
    .count()
)


game_id     nfl_id  play_id
2023090700  38696   361        11
                    436        11
                    621        11
                    1201       11
                    1300       11
                               ..
2023092501  56042   4153       11
                    4176       11
                    4241       11
                    4318       11
            56071   870        11
Name: frame_id, Length: 13644, dtype: int64

In [77]:
# Sanity check on the output frame: count the distinct frames kept for each player
# in each play, largest first. The maximum is 11, meaning we only have the 10 frames
# before the end of the play plus the end frame itself, so we're good here too.
(
    df_output
    .groupby(['game_id', 'play_id', 'nfl_id'])['frame_id']
    .nunique()
    .sort_values(ascending=False)
)

game_id     play_id  nfl_id
2023090700  101      44930     11
2023091712  894      46279     11
            643      43299     11
                     45185     11
            683      42357     11
                               ..
2023091706  3266     41233      5
2023091705  4117     46095      5
2023091711  3073     46109      5
2023092403  2462     54597      5
2023091704  206      55944      5
Name: frame_id, Length: 3987, dtype: int64

<h2>Save The Cleaned Datasets as New CSV Files</h2>

In [78]:
# Take independent copies of the filtered frames under their "clean" names
df_input_clean, df_output_clean = df_input.copy(), df_output.copy()

# Write both copies to CSV (relative to the working directory), without the row index
df_input_clean.to_csv("input_cleaned_w1_2_3.csv", index=False)
df_output_clean.to_csv("output_cleaned_w1_2_3.csv", index=False)