### Importing

In [None]:
import os
import glob
import pandas as pd
import numpy as np
np.random.seed(42)
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder


### Downloading and Cleaning

In [None]:
# Relative path to the folder holding the weekly game data CSV files.
data_folder_path = "insert path"

# Raw directory listing (every entry, not only CSVs); not used further in
# this section.
file_list = os.listdir(data_folder_path)

# glob returns matches in arbitrary, OS-dependent order. Sorting makes the
# row order of combined_df identical on every run and machine.
csv_files = sorted(glob.glob(os.path.join(data_folder_path, "*.csv")))

# Read each weekly file and stack them into one frame with a fresh 0..n-1 index.
dfs = [pd.read_csv(file) for file in csv_files]
combined_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Relative path to the folder holding the supplementary (non-tracking) CSVs.
non_games_data_folder_path = "insert path"

# Raw directory listing (every entry, not only CSVs); not used further in
# this section.
file_list_non_games = os.listdir(non_games_data_folder_path)

# The tables below are selected by position, so the list order must be
# deterministic. glob gives no ordering guarantee; sort alphabetically.
# NOTE(review): assumes the folder holds exactly five CSVs whose alphabetical
# order is games, nfl_colors, players, plays, tackles -- confirm file names.
csv_files_non_games = sorted(
    glob.glob(os.path.join(non_games_data_folder_path, "*.csv"))
)

# Read in the supplementary data
games = pd.read_csv(csv_files_non_games[0])
nfl_colors = pd.read_csv(csv_files_non_games[1])
players = pd.read_csv(csv_files_non_games[2])
plays = pd.read_csv(csv_files_non_games[3])
tackles = pd.read_csv(csv_files_non_games[4])

In [None]:

# Load a cleaned table from disk into clean_df.
# NOTE(review): presumably this reloads the file written by the export cell
# further down, as a shortcut around the cleaning step -- confirm the path.
clean_df = pd.read_csv(filepath_or_buffer="insert path")

In [None]:
# Bring every play into one common orientation: rows whose playDirection is
# "left" get mirrored positions and angles rotated by 180 degrees; all other
# rows keep their original values.
clean_df = combined_df.copy()

# One boolean mask reused for all columns; this replaces per-row Python
# lambdas with whole-column operations (same values, NaN stays NaN).
moving_left = clean_df["playDirection"] == "left"

# Mirror the coordinates (the constants 120 and 160/3 are the mirror extents).
clean_df["clean_x"] = np.where(moving_left, 120 - clean_df["x"], clean_df["x"])
clean_df["clean_y"] = np.where(moving_left, 160 / 3 - clean_df["y"], clean_df["y"])

# Rotate the angles by half a turn, then wrap values above 360 back down.
# A value of exactly 360 is left as is, matching the strict ">" comparison.
for angle_col in ("dir", "o"):
    rotated = np.where(moving_left, clean_df[angle_col] + 180, clean_df[angle_col])
    clean_df[f"clean_{angle_col}"] = np.where(rotated > 360, rotated - 360, rotated)


# Attach the colour table by club code. Rows without a match (the football,
# per the original comment) have no primary colour and get "#8b4513".
clean_df = pd.merge(clean_df, nfl_colors, left_on="club", right_on="Code", how="left")
clean_df["primary"] = clean_df["primary"].fillna("#8b4513")

In [None]:
# Write clean_df to disk without the row index.
clean_df.to_csv(path_or_buf="insert path", index=False)

In [None]:
# Show each distinct value of the "event" column once (first occurrence kept,
# original row labels preserved).
distinct_events = clean_df["event"].drop_duplicates()
print(distinct_events)

### Data prep and Filtering

In [None]:
# Work on a copy so the flag columns added below do not end up in clean_df.
filtered_df = clean_df.copy()

# Event labels that open the window of interest within a play.
START_EVENTS = [
    "autoevent_ballsnap",
    "ball_snap",
    "pass_arrived",
    "pass_outcome_caught",
    "pass_outcome_touchdown",
    "run",
    "snap_direct",
    "handoff",
]

# Event labels that close the window of interest within a play.
# ("autoevent_passinterrupted" used to be listed twice; the duplicate had no
# effect on isin() and has been dropped.)
END_EVENTS = [
    "tackle",
    "touchdown",
    "out_of_bounds",
    "fumble",
    "lateral",
    "qb_sack",
    "autoevent_passinterrupted",
    "safety",
]

# Row-level 0/1 flags: does this row carry a start / end event?
filtered_df["is_start"] = np.where(filtered_df["event"].isin(START_EVENTS), 1, 0)
filtered_df["is_end"] = np.where(filtered_df["event"].isin(END_EVENTS), 1, 0)

# Play-level booleans, broadcast to every row of the play: does the play
# contain at least one start / end event anywhere?
grouped = filtered_df.groupby(["gameId", "playId"])
filtered_df["any_start"] = grouped["is_start"].transform("any")
filtered_df["any_end"] = grouped["is_end"].transform("any")

# Keep only the rows of plays that have both a start and an end event.
intermediate_df = filtered_df[filtered_df["any_start"] & filtered_df["any_end"]]


# Define a function to calculate start_frame and end_frame
def calculate_frames(group):
    """Return the first start frame and the matching end frame of one play.

    Rows are scanned in the order they appear in ``group``. All lookups are
    positional, so the result does not depend on the frame's index labels
    (label-based lookup breaks when the index contains duplicates).

    Args:
        group: DataFrame with the columns ``frameId``, ``is_start`` and
            ``is_end`` (the latter two compared against ``1``).

    Returns:
        A Series with the entries ``start_frame`` and ``end_frame``.
        ``start_frame`` is the ``frameId`` of the first row flagged as start.
        ``end_frame`` is the ``frameId`` of the first row flagged as end whose
        ``frameId`` is strictly greater than ``start_frame``; if there is no
        such row, it is the ``frameId`` of the last row of ``group``.
        Both entries are ``None`` when no row is flagged as start.
    """
    frame_ids = group["frameId"]

    start_candidates = frame_ids[group["is_start"] == 1]
    if start_candidates.empty:
        return pd.Series({"start_frame": None, "end_frame": None})

    start_frame = start_candidates.iloc[0]

    # Only end events after the start count; earlier ones are ignored.
    end_candidates = frame_ids[(group["is_end"] == 1) & (frame_ids > start_frame)]
    if end_candidates.empty:
        # No usable end event: fall back to the last row of the group.
        end_frame = frame_ids.iloc[-1]
    else:
        end_frame = end_candidates.iloc[0]

    return pd.Series({"start_frame": start_frame, "end_frame": end_frame})


# One row per (gameId, playId) with that play's start_frame / end_frame.
# Only the three columns calculate_frames reads are handed to apply(): this
# avoids the pandas deprecation about operating on the grouping columns and
# keeps the per-group frames small.
frames_of_interest = (
    intermediate_df.groupby(["gameId", "playId"])[["frameId", "is_start", "is_end"]]
    .apply(calculate_frames)
    .reset_index()
)

# Lookup table of (game, play, ball carrier) with a constant marker column.
# assign() returns a new frame, so nothing is written into a slice of `plays`
# (which would raise SettingWithCopyWarning).
GotTheBall = plays[["gameId", "playId", "ballCarrierId"]].assign(GotTheBall=1)


# Row-level 0/1 flag for the pass-related events.
filtered_df["Pass"] = np.where(
    filtered_df["event"].isin(
        [
            "pass_arrived",
            "pass_outcome_caught",
            "pass_outcome_touchdown",
        ]
    ),
    1,
    0,
)

# Play-level flag: 1 if any row of the play carries a pass-related event.
filtered_df["PassPlay"] = filtered_df.groupby(["gameId", "playId"])["Pass"].transform("max")

# One row per (gameId, playId) with its PassPlay flag.
PassPlay = filtered_df[["gameId", "playId", "PassPlay"]].drop_duplicates()


In [None]:
# Build WorkDF: the cleaned table restricted to each play's window
# [start_frame, end_frame] (both ends inclusive). The inner join also drops
# plays that have no entry in frames_of_interest.
WorkDF = pd.merge(clean_df, frames_of_interest, on=["gameId", "playId"], how="inner")
WorkDF = WorkDF[
    (WorkDF["frameId"] >= WorkDF["start_frame"])
    & (WorkDF["frameId"] <= WorkDF["end_frame"])
]

# Flag the ball carrier: only rows whose nflId equals the play's
# ballCarrierId get GotTheBall == 1; all other rows get NaN here.
WorkDF = pd.merge(
    WorkDF,
    GotTheBall,
    left_on=["gameId", "playId", "nflId"],
    right_on=["gameId", "playId", "ballCarrierId"],
    how="left",
)

# Attach the play-level PassPlay flag.
WorkDF = pd.merge(WorkDF, PassPlay, on=["gameId", "playId"], how="left")

# Drop the football rows. The explicit copy makes WorkDF an independent frame
# so the column assignments below do not raise SettingWithCopyWarning.
WorkDF = WorkDF[WorkDF["club"] != "football"].copy()

# Placeholder column; note that the fillna(0) below turns it into 0.
WorkDF["area"] = np.nan

# 1-based frame counter within each play, relative to the play's first kept frame.
first_frame = WorkDF.groupby(["gameId", "playId"])["frameId"].transform("min")
WorkDF["Time"] = WorkDF["frameId"] - first_frame + 1

# Replace every remaining NaN, in all columns, with 0. Among other things this
# sets GotTheBall to 0 for rows that are not the ball carrier.
WorkDF = WorkDF.fillna(0)

# Largest absolute value of column 'a' among the ball-carrier rows.
max_abs_value = WorkDF[WorkDF["GotTheBall"] == 1]["a"].abs().max()

# Despite the name this is not a rank: it is |a| divided by the ball-carrier
# maximum, so rows of other players can exceed 1.
WorkDF["PercentRankA"] = WorkDF["a"].abs() / max_abs_value

# Attach the full play table.
# NOTE(review): ballCarrierId is already present from the GotTheBall merge, so
# pandas will suffix the clashing column (ballCarrierId_x / ballCarrierId_y);
# other columns shared with `plays` are suffixed the same way.
WorkDF = pd.merge(WorkDF, plays, on=["gameId", "playId"])

In [None]:
# Write one CSV per game, named WorkDF<gameId>.csv.
gamesGroup = WorkDF.groupby("gameId")

# Output folder; set once because it does not change between games.
vor_filepath = "insert path"

for gameName, game in gamesGroup:
    vor_filename = f"WorkDF{gameName}.csv"

    # os.path.join inserts the path separator when the folder lacks a trailing
    # one; plain concatenation would glue the folder name onto the file name.
    game.to_csv(os.path.join(vor_filepath, vor_filename), index=False)


In [None]:
# Write the complete WorkDF to disk. index=False matches the other exports in
# this file: the row labels are left over from filtering and carry no meaning,
# and writing them would add an unnamed extra column to the CSV.
WorkDF.to_csv("insert path", index=False)