### Importing

In [79]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation, ArtistAnimation, Animation
from matplotlib.lines import Line2D
import matplotlib

# matplotlib.use('TkAgg')  # Use an appropriate backend (e.g., TkAgg)
import matplotlib.patheffects as path_effects
import matplotlib.patches as patches

### Downloading and Cleaning

In [2]:
# Specify the relative path to the data directory
data_folder_path = "C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Data"
non_games_data_folder_path = "C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Non_Games_Data"

# List all files in the data folder
file_list = os.listdir(data_folder_path)
file_list_non_games = os.listdir(non_games_data_folder_path)

# Use glob to filter specific file types
csv_files = glob.glob(os.path.join(data_folder_path, "*.csv"))
csv_files_non_games = glob.glob(os.path.join(non_games_data_folder_path, "*.csv"))

In [3]:
# Read in the weekly game data and concat into one combined df
dfs = [pd.read_csv(file) for file in csv_files]
combined_df = pd.concat(dfs, ignore_index=True)

# Read in the supplementary data
games = pd.read_csv(csv_files_non_games[0])
nfl_colors = pd.read_csv(csv_files_non_games[1])
pff_scouting_data = pd.read_csv(csv_files_non_games[2])
players = pd.read_csv(csv_files_non_games[3])
plays = pd.read_csv(csv_files_non_games[4])

In [53]:
# Change the orientation of the plays
clean_df = combined_df.copy()
"""
clean_df['x'] = clean_df.apply(lambda row: 120 - row['x'] if row['playDirection'] == "left" else row['x'], axis=1)
clean_df['y'] = clean_df.apply(lambda row: 160 / 3 - row['y'] if row['playDirection'] == "left" else row['y'], axis=1)
clean_df['dir'] = clean_df.apply(lambda row: row['dir'] + 180 if row['playDirection'] == "left" else row['dir'], axis=1)
clean_df['dir'] = clean_df['dir'].apply(lambda val: val - 360 if val > 360 else val)
clean_df['o'] = clean_df.apply(lambda row: row['o'] + 180 if row['playDirection'] == "left" else row['o'], axis=1)
clean_df['o'] = clean_df['o'].apply(lambda val: val - 360 if val > 360 else val)
"""

# Merge nfl_colors and change the color of the football
clean_df = pd.merge(clean_df, nfl_colors, left_on="team", right_on="Code", how="left")
clean_df["primary"] = np.where(
    clean_df["primary"].isna(), "#8b4513", clean_df["primary"]
)

### Data prep and Filtering

In [5]:
# Mutate 'df'
filtered_df = clean_df.copy()

filtered_df["is_start"] = np.where(
    filtered_df["event"].isin(["autoevent_ballsnap", "ball_snap"]), 1, 0
)
filtered_df["is_end"] = np.where(
    filtered_df["event"].isin(
        [
            "fumble",
            "handoff",
            "lateral",
            "autoevent_passforward",
            "pass_forward",
            "qb_sack",
            "qb_strip_sack",
            "run",
        ]
    ),
    1,
    0,
)

# Group by and mutate 'df'
grouped = filtered_df.groupby(["gameId", "playId"])
filtered_df["any_start"] = grouped["is_start"].transform("any")
filtered_df["any_end"] = grouped["is_end"].transform("any")

# Filter and summarize 'df'
intermediate_df = filtered_df[(filtered_df["any_start"]) & (filtered_df["any_end"])]


# Define a function to calculate start_frame and end_frame
def calculate_frames(group):
    is_start_index = group[group["is_start"] == 1].index[0]
    start_frame = group.loc[is_start_index, "frameId"]

    is_end_index = group[
        (group["is_end"] == 1) & (group["frameId"] > start_frame)
    ].index[0]
    end_frame = group.loc[is_end_index, "frameId"]

    return pd.Series({"start_frame": start_frame, "end_frame": end_frame})


# Apply the function to each group and reset index
frames_of_interest = (
    intermediate_df.groupby(["gameId", "playId"]).apply(calculate_frames).reset_index()
)

# Mutate 'pff' for 'play_block_rush'
pff_scouting_data["pff_role"] = pff_scouting_data["pff_role"].str.replace("Pass ", "")
play_block_rush = pff_scouting_data[
    pff_scouting_data["pff_role"].isin(["Block", "Rush"])
][["gameId", "playId", "nflId", "pff_role", "pff_positionLinedUp"]]

# Filter 'pff' for 'pff_network'
pff_network = pff_scouting_data[
    pff_scouting_data["pff_role"].isin(["Pass Block", "Pass Rush", "Pass"])
][["gameId", "playId", "nflId", "pff_role", "pff_positionLinedUp"]]

In [54]:
# Creating a WorkDF that is filtered down to just the frames of interest
WorkDF = pd.merge(clean_df, frames_of_interest, on=["gameId", "playId"], how="inner")
WorkDF = WorkDF[
    (WorkDF["frameId"] >= WorkDF["start_frame"])
    & (WorkDF["frameId"] <= WorkDF["end_frame"])
]

### Feature Engineering

To create a feature on stunts, I would ideally want to differentiate between t-e stunts and e-t stunts

Potentially also want to create a feature that is what gap they ended up attacking. That way we can cluster based on the ending gap of each DL player and that would create more distinct clusters for the predicted path algorithm to train on.
Create gaps based on the ranges of y coordinates between the OL. I want to make it so that if you end up on the OL at the end of the play, it takes into account what orientation the OL is in. 


In [7]:
# Filtering the WorkDF down to just the blockers
StuntDF = pd.merge(
    WorkDF, play_block_rush, on=["gameId", "playId", "nflId"], how="inner"
)
GapDF = StuntDF[
    (StuntDF["pff_role"] == "Block")
    & (
        StuntDF["pff_positionLinedUp"].isin(
            ["LT", "LG", "C", "RG", "RT", "TE-L", "TE-R"]
        )
    )
]

# Creating a wider dataset with the y positions for each OL position as a new column
StuntWider = GapDF.pivot_table(
    index=["gameId", "playId", "frameId"], columns="pff_positionLinedUp", values="y"
).reset_index()

# Filling the missing data for TEs to help aid assignment later
StuntWider["TE-L"] = StuntWider["TE-L"].fillna(100)
StuntWider["TE-R"] = StuntWider["TE-R"].fillna(0)

# Making a new dataframe that is just the pass rushers
Starting_Gaps = StuntDF[(StuntDF["pff_role"] == "Rush")]

# Create Cutoff Point into the play
Time_Cutoff = 20

Starting_Gaps["Cutoff_Point"] = np.where(
    Starting_Gaps["end_frame"] < Starting_Gaps["start_frame"] + Time_Cutoff,
    Starting_Gaps["end_frame"],
    Starting_Gaps["start_frame"] + Time_Cutoff,
)

Starting_Gaps = Starting_Gaps[
    (Starting_Gaps["frameId"] >= Starting_Gaps["start_frame"])
    & (Starting_Gaps["frameId"] <= Starting_Gaps["Cutoff_Point"])
]

# Joining the OL Y positional data onto the Starting_Gaps df
Starting_Gaps = pd.merge(Starting_Gaps, StuntWider, on=["gameId", "playId", "frameId"])

# Assigning the Gaps based on y position data
conditions = [
    Starting_Gaps["y"] > Starting_Gaps["TE-L"],
    (Starting_Gaps["y"] < Starting_Gaps["TE-L"])
    & (Starting_Gaps["y"] > Starting_Gaps["LT"]),
    (Starting_Gaps["y"] < Starting_Gaps["LT"])
    & (Starting_Gaps["y"] > Starting_Gaps["LG"]),
    (Starting_Gaps["y"] < Starting_Gaps["LG"])
    & (Starting_Gaps["y"] > Starting_Gaps["C"]),
    (Starting_Gaps["y"] < Starting_Gaps["C"])
    & (Starting_Gaps["y"] > Starting_Gaps["RG"]),
    (Starting_Gaps["y"] < Starting_Gaps["RG"])
    & (Starting_Gaps["y"] > Starting_Gaps["RT"]),
    (Starting_Gaps["y"] < Starting_Gaps["RT"])
    & (Starting_Gaps["y"] > Starting_Gaps["TE-R"]),
    Starting_Gaps["y"] < Starting_Gaps["TE-R"],
]
choices = ["L-D", "L-C", "L-B", "L-A", "R-A", "R-B", "R-C", "R-D"]

Starting_Gaps["Gap_Assignment"] = np.select(conditions, choices, default=np.NAN)

# Filling any NAN Gaps (which occur when the DL is directly on the OL Y value) with their previous Gap Assignment
Starting_Gaps["Gap_Assignment"] = Starting_Gaps.groupby(["gameId", "playId"])[
    "Gap_Assignment"
].fillna(method="ffill")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Starting_Gaps["Cutoff_Point"] = np.where(


The Starting Gaps DF is setup so that for each defensive player, you have all of their positional data for the entire play, along with the OL Y positions for the entire play, and the Gap assignment for each frame as well.

Depending on how the Clustering Algorithm wants the data in, it may be better to make new columns for starting and ending (x, y, gap) for each play. But given that I don't know what format the clustering algorithm needs, I will leave it in this format currently.

### Validation Viz

In [187]:
gameId = 2021090900
playId = 137

# Select specific columns from 'plays'
selected_plays = plays[["gameId", "playId", "playDescription"]]

# Filter rows based on conditions
filtered_plays = selected_plays[
    (selected_plays["gameId"] == gameId) & (selected_plays["playId"] == playId)
]

# Perform left join with 'df'
VizDF = filtered_plays.merge(clean_df, how="left")

VizDF["x_temp"] = VizDF["y"]
VizDF["y_temp"] = VizDF["x"]
VizDF["y"] = VizDF["y_temp"]
VizDF["x"] = VizDF["x_temp"]
VizDF = VizDF.drop(["x_temp", "y_temp"], axis=1)
VizDF["jerseyNumber"] = np.where(VizDF["jerseyNumber"].isna(), 0, VizDF["jerseyNumber"])
VizDF["sinO"] = np.sin(np.radians(VizDF["o"]))
VizDF["cosO"] = np.cos(np.radians(VizDF["o"]))

ymin = 0
ymax = 160 / 3
hash_right = 38.35
hash_left = 12
hash_width = 3.3
xmin = max(round(min(VizDF["y"]) - 10, -1), 0)
xmax = min(round(max(VizDF["y"]) + 10, -1), 120)

# Create a grid of coordinates (x, y)
y_values = [0, 23.36667, 29.96667, ymax]
x_values = list(range(10, 111))  # 10 to 110
grid = pd.DataFrame({"x": x_values * len(y_values), "y": y_values * len(x_values)})

# Filter rows based on conditions
df_hash = grid[(grid["x"] % 5 != 0) & (grid["x"] < xmax) & (grid["x"] > xmin)]
# Create the base plot
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)

for index, row in df_hash.iterrows():
    if row["y"] > 55 / 2:
        ax.annotate(
            "|", xy=(row["x"] - 0.25, row["y"]), va="top", color="gray", zorder=0
        )
    else:
        ax.annotate(
            "|", xy=(row["x"] - 0.25, row["y"]), va="bottom", color="gray", zorder=0
        )

# Add segment annotations
for x in range(int(max(9, xmin)), int(min(xmax, 111)), 5):
    ax.plot([x, x], [ymin, ymax], color="gray", zorder=0)


# Add text annotations on the sides
y_labels = (
    ["   G"]
    + [str(val) for val in range(10, 51, 10)]
    + [str(val) for val in reversed(range(10, 41, 10))]
    + ["G   "]
)

yard_labels = pd.DataFrame(zip(range(10, 120, 10), y_labels), columns=["yard", "label"])

for x in range(int(max(10, xmin)), int(min(xmax, 120)), 10):
    if (x != xmin) & (x != xmax):
        label_top = yard_labels.loc[yard_labels["yard"] == x, "label"].values[0]
        label_bottom = yard_labels.loc[yard_labels["yard"] == x, "label"].values[0]

        ax.text(
            x,
            hash_left - 2,
            label_top,
            size=12,
            color="gray",
            va="center",
            ha="center",
            zorder=0,
        )
        ax.text(
            x,
            ymax - hash_left + 2,
            label_bottom,
            size=12,
            color="gray",
            va="center",
            ha="center",
            zorder=0,
        )

# Add field boundary lines
ax.plot(
    [xmin, xmin, xmax, xmax, xmin],
    [ymin, ymax, ymax, ymin, ymin],
    color="gray",
    zorder=0,
)

# Turn off both axes and tick labels
ax.axis("off")

# Initialize the artists
point_artists = []
annotation_artists = []
arrow_artists = []


def init():
    # Initialize the artists (empty at the beginning)
    return arrow_artists + point_artists + annotation_artists


def update(frame):
    # Update the plot for each frame

    # Clear the artists from the previous frame
    for artist in arrow_artists + point_artists + annotation_artists:
        artist.remove()
    point_artists.clear()
    annotation_artists.clear()
    arrow_artists.clear()

    # Get the rows for the current frame from the DataFrame
    frame_rows = VizDF[VizDF["frameId"] == frame + 5]

    # Add/update dots for this frame
    for index, row in frame_rows.iterrows():
        x = row["y"]
        y = row["x"]
        color = row["primary"]
        jersey = row["jerseyNumber"]
        team = row["team"]
        sinO = row["sinO"]
        cosO = row["cosO"]

        if team == "football":
            dot_artist1 = Line2D([x], [y], marker="d", markersize=8, color=color)
            ax.add_artist(dot_artist1)
            point_artists.append(dot_artist1)

        else:
            dot_artist = Line2D(
                [x], [y], marker="o", markersize=20, color=color, zorder=2
            )
            ax.add_artist(dot_artist)
            point_artists.append(dot_artist)

            annotation_artist = ax.text(
                x,
                y,
                str(round(int(jersey), 0)),
                fontsize=10,
                ha="center",
                va="center",
                color="white",
                zorder=3,
                path_effects=[
                    path_effects.Stroke(linewidth=0.5, foreground="black"),
                    path_effects.Normal(),
                ],
            )
            annotation_artists.append(annotation_artist)

            arrow_artist = Line2D(
                [x, x + sinO * 1],
                [y, y + cosO * 1],
                linestyle="--",
                dash_capstyle="round",
                linewidth=10,
                zorder=1,
                color=color,
            )
            ax.add_artist(arrow_artist)
            arrow_artists.append(arrow_artist)

    return arrow_artists + annotation_artists + point_artists


ex_play_lengthVIZ = VizDF["frameId"].nunique()

# Create the animation
animation = FuncAnimation(
    fig,
    update,
    frames=range(ex_play_lengthVIZ),
    init_func=init,
    blit=True,
    interval=100,
)

animation.save("Sample Validation Viz.gif")
plt.show()

MovieWriter ffmpeg unavailable; using Pillow instead.
