<h2>Slicing Ball In Air Window for Dataset</h2>

In [3]:
#importing libraries
import pandas as pd
from pathlib import Path

In [None]:
#Load data for specified weeks using pandas
def load_weeks_pandas(weeks, base="../data"):
    use_in = ["game_id","play_id","nfl_id","frame_id",
              "player_name","player_position","player_role","player_side",
              "x","y","s","a","o","dir",
              "num_frames_output","ball_land_x","ball_land_y"]
    use_out = ["game_id","play_id","nfl_id","frame_id","x","y"]
    supp_cols = ["game_id","play_id","season","week","pass_result",
                 "team_coverage_man_zone","pass_length","route_of_targeted_receiver",
                 "yards_gained","expected_points","expected_points_added"]

    inputs  = [pd.read_csv(f"{base}/input_2023_w{w:02d}.csv", usecols=use_in)  for w in weeks]
    outputs = [pd.read_csv(f"{base}/output_2023_w{w:02d}.csv", usecols=use_out) for w in weeks]

    input_df  = pd.concat(inputs,  ignore_index=True)
    output_df = pd.concat(outputs, ignore_index=True)
    supp_df   = pd.read_csv(f"{base}/supplementary_data.csv", usecols=supp_cols)

    # join play context into input/output
    input_df  = input_df.merge(supp_df,  on=["game_id","play_id"], how="left")
    output_df = output_df.merge(supp_df, on=["game_id","play_id"], how="left")
    #Get only the first and last frames
    input_df = (
        input_df.sort_values(['play_id', 'nfl_id', 'frame_id'])
                .groupby(['play_id', 'nfl_id'], group_keys=False)
                .apply(lambda g: g.loc[g['frame_id'].isin([g['frame_id'].min(), g['frame_id'].max()])]).reset_index()
    )
    output_df = (
        output_df.sort_values(['play_id', 'nfl_id', 'frame_id'])
                .groupby(['play_id', 'nfl_id'], group_keys=False)
                .apply(lambda g: g.loc[g['frame_id'].isin([g['frame_id'].min(), g['frame_id'].max()])]).reset_index()
    )
    
    #filtering players so its only WR and CB
    input_df = input_df[(input_df.get('player_position') == 'WR') | (input_df.get('player_position') == 'CB')]

    output_df = output_df[output_df['nfl_id'].isin(input_df['nfl_id'])]

    #dropping unneccesary columns
    input_df = input_df.drop(columns=['index', 'player_name'])
    output_df = output_df.drop(columns=['index'])

    return input_df, output_df, supp_df

# Example:
# inp, out, supp = load_weeks_pandas([1,2,3])


In [6]:
#Load our data
df_input, df_output, df_supp = load_weeks_pandas([1,2,3])

  .apply(lambda g: g.loc[g['frame_id'].isin([g['frame_id'].min(), g['frame_id'].max()])]).reset_index()
  .apply(lambda g: g.loc[g['frame_id'].isin([g['frame_id'].min(), g['frame_id'].max()])]).reset_index()


<h2> Overview </h2>
Input (before the ball is thrown):
The tracking data up to the throw, including player positions, velocities, etc.

Output (after the ball is thrown):
The ground-truth trajectories of all players while the ball is in the air.

**Input dataframe:** <br>

You want roughly **1 second** (10 frames) before the throw.
So for each play:
- input_start = throw_frame - 10
- input_end   = throw_frame


**Output dataframe:** <br>

You want the ball-in-air period, and optionally **1 second** after the catch/landing.
So for outputs:
- output_start = throw_frame
- output_end   = catch_frame + 10

In [None]:
#in the input dataframe, the max number of unique frames per play_id is 7, meaning we don't have to slice anything
df_input.groupby('play_id')['frame_id'].nunique().sort_values(ascending=False)

play_id
55      7
77      7
102     6
1027    6
3383    5
       ..
1770    2
1767    2
1766    2
1764    2
4974    2
Name: frame_id, Length: 1973, dtype: int64

In [None]:
#in the output dataframe, the max number of unique frames per play_id is 5, meaning we don't have to slice anything either
df_output.groupby('play_id')['frame_id'].nunique().sort_values(ascending=False)

play_id
1027    5
3257    5
77      5
1200    5
2792    5
       ..
1724    2
1721    2
1720    2
1717    2
4974    2
Name: frame_id, Length: 1689, dtype: int64

<h2>Save The Cleaned Datasets as New CSV Files</h2>

In [16]:
#Create copies
df_input_clean = df_input.copy()
df_output_clean = df_output.copy()

# Save to CSV
df_input_clean.to_csv("input_cleaned_w1_2_3.csv", index=False)
df_output_clean.to_csv("output_cleaned_w1_2_3.csv", index=False)