### Importing

In [39]:
import os
import glob
import pandas as pd
import numpy as np

### Downloading and Cleaning

In [16]:
# Specify the relative path to the data directory
data_folder_path = "C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Data"
non_games_data_folder_path = "C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Non_Games_Data"

# List all files in the data folder
file_list = os.listdir(data_folder_path)
file_list_non_games = os.listdir(non_games_data_folder_path)

# Use glob to filter specific file types
csv_files = glob.glob(os.path.join(data_folder_path, "*.csv"))
csv_files_non_games = glob.glob(os.path.join(non_games_data_folder_path, "*.csv"))

In [13]:
# Read in the weekly game data and concat into one combined df
dfs = [pd.read_csv(file) for file in csv_files]
combined_df = pd.concat(dfs, ignore_index=True)

# Read in the supplementary data
games = pd.read_csv(csv_files_non_games[0])
nfl_colors = pd.read_csv(csv_files_non_games[1])
pff_scouting_data = pd.read_csv(csv_files_non_games[2])
players = pd.read_csv(csv_files_non_games[3])
plays = pd.read_csv(csv_files_non_games[4])

In [111]:
# Change the orientation of the plays
clean_df = combined_df.copy()
"""
clean_df['x'] = clean_df.apply(lambda row: 120 - row['x'] if row['playDirection'] == "left" else row['x'], axis=1)
clean_df['y'] = clean_df.apply(lambda row: 160 / 3 - row['y'] if row['playDirection'] == "left" else row['y'], axis=1)
clean_df['dir'] = clean_df.apply(lambda row: row['dir'] + 180 if row['playDirection'] == "left" else row['dir'], axis=1)
clean_df['dir'] = clean_df['dir'].apply(lambda val: val - 360 if val > 360 else val)
clean_df['o'] = clean_df.apply(lambda row: row['o'] + 180 if row['playDirection'] == "left" else row['o'], axis=1)
clean_df['o'] = clean_df['o'].apply(lambda val: val - 360 if val > 360 else val)
"""

# Merge nfl_colors and change the color of the football
clean_df = pd.merge(clean_df, nfl_colors, left_on="team", right_on="Code")
clean_df["primary"] = np.where(
    clean_df["primary"].isna(), "#8b4513", clean_df["primary"]
)

### Data prep and Filtering

In [112]:
# Mutate 'df'
filtered_df = clean_df.copy()

filtered_df["is_start"] = np.where(
    filtered_df["event"].isin(["autoevent_ballsnap", "ball_snap"]), 1, 0
)
filtered_df["is_end"] = np.where(
    filtered_df["event"].isin(
        [
            "fumble",
            "handoff",
            "lateral",
            "autoevent_passforward",
            "pass_forward",
            "qb_sack",
            "qb_strip_sack",
            "run",
        ]
    ),
    1,
    0,
)

# Group by and mutate 'df'
grouped = filtered_df.groupby(["gameId", "playId"])
filtered_df["any_start"] = grouped["is_start"].transform("any")
filtered_df["any_end"] = grouped["is_end"].transform("any")

# Filter and summarize 'df'
intermediate_df = filtered_df[(filtered_df["any_start"]) & (filtered_df["any_end"])]


# Define a function to calculate start_frame and end_frame
def calculate_frames(group):
    is_start_index = group[group["is_start"] == 1].index[0]
    start_frame = group.loc[is_start_index, "frameId"]

    is_end_index = group[
        (group["is_end"] == 1) & (group["frameId"] > start_frame)
    ].index[0]
    end_frame = group.loc[is_end_index, "frameId"]

    return pd.Series({"start_frame": start_frame, "end_frame": end_frame})


# Apply the function to each group and reset index
frames_of_interest = (
    intermediate_df.groupby(["gameId", "playId"]).apply(calculate_frames).reset_index()
)

# Mutate 'pff' for 'play_block_rush'
pff_scouting_data["pff_role"] = pff_scouting_data["pff_role"].str.replace("Pass ", "")
play_block_rush = pff_scouting_data[
    pff_scouting_data["pff_role"].isin(["Block", "Rush"])
][["gameId", "playId", "nflId", "pff_role", "pff_positionLinedUp"]]

# Filter 'pff' for 'pff_network'
pff_network = pff_scouting_data[
    pff_scouting_data["pff_role"].isin(["Pass Block", "Pass Rush", "Pass"])
][["gameId", "playId", "nflId", "pff_role", "pff_positionLinedUp"]]

In [113]:
# Creating a WorkDF that is filtered down to just the frames of interest
WorkDF = pd.merge(clean_df, frames_of_interest, on=["gameId", "playId"], how="inner")
WorkDF = WorkDF[
    (WorkDF["frameId"] >= WorkDF["start_frame"])
    & (WorkDF["frameId"] <= WorkDF["end_frame"])
]

### Feature Engineering

To create a feature on stunts, I would ideally want to differentiate between t-e stunts and e-t stunts

Potentially also want to create a feature that is what gap they ended up attacking. That way we can cluster based on the ending gap of each DL player and that would create more distinct clusters for the predicted path algorithm to train on.
Create gaps based on the ranges of y coordinates between the OL. I want to make it so that if you end up on the OL at the end of the play, it takes into account what orientation the OL is in. 


In [117]:
# Filtering the WorkDF down to just the blockers
StuntDF = pd.merge(
    WorkDF, play_block_rush, on=["gameId", "playId", "nflId"], how="inner"
)
GapDF = StuntDF[
    (StuntDF["pff_role"] == "Block")
    & (
        StuntDF["pff_positionLinedUp"].isin(
            ["LT", "LG", "C", "RG", "RT", "TE-L", "TE-R"]
        )
    )
]

# Creating a wider dataset with the y positions for each OL position as a new column
StuntWider = GapDF.pivot_table(
    index=["gameId", "playId", "frameId"], columns="pff_positionLinedUp", values="y"
).reset_index()

# Filling the missing data for TEs to help aid assignment later
StuntWider["TE-L"] = StuntWider["TE-L"].fillna(100)
StuntWider["TE-R"] = StuntWider["TE-R"].fillna(0)

# Making a new dataframe that is just the pass rushers
Starting_Gaps = StuntDF[(StuntDF["pff_role"] == "Rush")]

# Create Cutoff Point into the play
Time_Cutoff = 20

Starting_Gaps["Cutoff_Point"] = np.where(
    Starting_Gaps["end_frame"] < Starting_Gaps["start_frame"] + Time_Cutoff,
    Starting_Gaps["end_frame"],
    Starting_Gaps["start_frame"] + Time_Cutoff,
)

Starting_Gaps = Starting_Gaps[
    (Starting_Gaps["frameId"] >= Starting_Gaps["start_frame"])
    & (Starting_Gaps["frameId"] <= Starting_Gaps["Cutoff_Point"])
]

# Joining the OL Y positional data onto the Starting_Gaps df
Starting_Gaps = pd.merge(Starting_Gaps, StuntWider, on=["gameId", "playId", "frameId"])

# Assigning the Gaps based on y position data
conditions = [
    Starting_Gaps["y"] > Starting_Gaps["TE-L"],
    (Starting_Gaps["y"] < Starting_Gaps["TE-L"])
    & (Starting_Gaps["y"] > Starting_Gaps["LT"]),
    (Starting_Gaps["y"] < Starting_Gaps["LT"])
    & (Starting_Gaps["y"] > Starting_Gaps["LG"]),
    (Starting_Gaps["y"] < Starting_Gaps["LG"])
    & (Starting_Gaps["y"] > Starting_Gaps["C"]),
    (Starting_Gaps["y"] < Starting_Gaps["C"])
    & (Starting_Gaps["y"] > Starting_Gaps["RG"]),
    (Starting_Gaps["y"] < Starting_Gaps["RG"])
    & (Starting_Gaps["y"] > Starting_Gaps["RT"]),
    (Starting_Gaps["y"] < Starting_Gaps["RT"])
    & (Starting_Gaps["y"] > Starting_Gaps["TE-R"]),
    Starting_Gaps["y"] < Starting_Gaps["TE-R"],
]
choices = ["L-D", "L-C", "L-B", "L-A", "R-A", "R-B", "R-C", "R-D"]

Starting_Gaps["Gap_Assignment"] = np.select(conditions, choices, default=np.NAN)

# Filling any NAN Gaps (which occur when the DL is directly on the OL Y value) with their previous Gap Assignment
Starting_Gaps["Gap_Assignment"] = Starting_Gaps.groupby(["gameId", "playId"])[
    "Gap_Assignment"
].fillna(method="ffill")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Starting_Gaps["Cutoff_Point"] = np.where(Starting_Gaps["end_frame"] < Starting_Gaps["start_frame"] + Time_Cutoff, Starting_Gaps["end_frame"], Starting_Gaps["start_frame"] + Time_Cutoff)


The Starting Gaps DF is setup so that for each defensive player, you have all of their positional data for the entire play, along with the OL Y positions for the entire play, and the Gap assignment for each frame as well.

Depending on how the Clustering Algorithm wants the data in, it may be better to make new columns for starting and ending (x, y, gap) for each play. But given that I don't know what format the clustering algorithm needs, I will leave it in this format currently.