### Importing

In [1]:
import os
import glob
import pandas as pd
import numpy as np
np.random.seed(42)
import random
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost import plot_importance
from scipy.stats import multivariate_normal
import scipy.stats as stats
import scipy
import math
from scipy.spatial import Voronoi, voronoi_plot_2d

### Data Importing

In [2]:
# Absolute paths to the weekly tracking data and to the supplementary ("non games") tables.
data_folder_path = "C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Data"
non_games_data_folder_path = "C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Non_Games_Data"

# List all files in the data folders
file_list = os.listdir(data_folder_path)
file_list_non_games = os.listdir(non_games_data_folder_path)

# Use glob to keep only the CSV files.
# glob returns matches in arbitrary order, and the supplementary tables below are
# picked by position, so sort the paths to make that mapping deterministic.
csv_files = sorted(glob.glob(os.path.join(data_folder_path, "*.csv")))
csv_files_non_games = sorted(glob.glob(os.path.join(non_games_data_folder_path, "*.csv")))

# Read in the weekly game data and concat into one combined df
#dfs = [pd.read_csv(file) for file in csv_files]
#combined_df = pd.concat(dfs, ignore_index=True)

# Read in the supplementary data.
# NOTE(review): assumes the folder holds exactly four CSVs whose file names sort in the
# order games, nfl_colors, players, plays -- confirm against the directory contents.
games = pd.read_csv(csv_files_non_games[0])
nfl_colors = pd.read_csv(csv_files_non_games[1])
players = pd.read_csv(csv_files_non_games[2])
plays = pd.read_csv(csv_files_non_games[3])

### ModelingDF

In [3]:
# Width of the field in yards; the breakthrough grid later in this notebook uses the same 53.3.
FIELD_WIDTH_YARDS = 53.3

# Tracking rows prepared elsewhere, enriched with the play-level columns.
ModelingDF = pd.read_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/WorkDF.csv")
ModelingDF = pd.merge(ModelingDF, plays, on = ["gameId", "playId"])

# Unsigned distance from the line of scrimmage.
# NOTE(review): confirm whether absoluteYardlineNumber already includes the 10-yard
# end-zone offset; if it does, the "- 10" shifts every distance by ten yards.
ModelingDF['DistFromLOS'] = ((ModelingDF["x"] - 10) - ModelingDF["absoluteYardlineNumber"]).abs()

# Unsigned distance from the middle of the field. The midpoint is half the field width
# (26.65), not the 26.5 previously hard-coded.
ModelingDF['DistFromMOF'] = (ModelingDF["y"] - FIELD_WIDTH_YARDS / 2).abs()


### Voronoi Tessellations

In [None]:

# Player rows only: the football itself must not own a Voronoi cell.
VOR_df = ModelingDF.loc[ModelingDF["club"] != "football"].copy()

# Debug aid: uncomment to restrict the run to a single game.
#VOR_df = VOR_df[(VOR_df["gameId"] == 2022090800)]

# 'area' is a placeholder filled in by Calculate_Gravity; 'Time' is the 1-based frame
# count within each (gameId, playId). Every remaining NaN in the frame is then set to 0.
VOR_df = VOR_df.assign(
    area=np.nan,
    Time=lambda df: df['frameId'] - df.groupby(['gameId', 'playId'])['frameId'].transform('min') + 1,
).fillna(0)

# Largest distance from the line of scrimmage observed on any ball-carrier row
ball_carrier_rows = VOR_df["GotTheBall"] == 1
max_abs_value = VOR_df.loc[ball_carrier_rows, 'DistFromLOS'].abs().max()

# Linear weight: 1 at the line of scrimmage, 0 at the largest ball-carrier distance
# (despite the name this is a scaled distance, not a percentile rank)
VOR_df['PercentRankLOS'] = 1 - VOR_df['DistFromLOS'].abs().div(max_abs_value)

# Fixed scaling constant for the distance from the middle of the field
max_abs_valueMOF = 75

# Linear weight: 1 at the middle of the field, decreasing with lateral distance
VOR_df['PercentRankMOF'] = 1 - VOR_df['DistFromMOF'].abs().div(max_abs_valueMOF)

def calculate_adjusted_change(row):
    """Weight a row's area change by how far into the play it happens.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'Time' and 'area_percentage_change'.

    Returns
    -------
    float
        For Time <= 10 the change divided by log base 11 of (Time + 1), a divisor
        that grows to 1 at Time == 10. For later frames the change multiplied by
        exp(-(Time - 10) / 5), which decays from 1 towards 0.
    """
    elapsed = row['Time']
    if elapsed <= 10:
        return row['area_percentage_change'] / math.log(elapsed + 1, 11)
    decay = np.exp(-(elapsed - 10) / 5)
    return row['area_percentage_change'] * decay


def Calculate_Gravity(playerCoordinates=None):
    """Voronoi cell area of the ball carrier per frame, and its time-weighted change.

    For every (gameId, playId, frameId) the players of that frame are tessellated
    together with four bounding-box corner points placed one yard outside the
    players' extent. Each player's cell area is stored in 'area'. Only
    ball-carrier rows (GotTheBall == 1) are kept in the result.

    Parameters
    ----------
    playerCoordinates : pandas.DataFrame, optional
        Tracking rows. Columns read here: gameId, playId, frameId, nflId, x, y,
        GotTheBall, Time, a, PercentRankLOS, PercentRankMOF.

    Returns
    -------
    pandas.DataFrame
        Ball-carrier rows sorted by gameId, playId, nflId, frameId with 'area' plus:
        * 'area_percentage_change' -- frame-to-frame difference of 'area' within
          each (gameId, playId, nflId); an absolute difference, not a percentage;
          0 on the first frame.
        * 'adjusted_change' -- that difference passed through
          calculate_adjusted_change and multiplied by a, PercentRankLOS and
          PercentRankMOF.
        An empty DataFrame is returned for empty or missing input.
    """
    if playerCoordinates is None or len(playerCoordinates) == 0:
        return pd.DataFrame()

    # frameId restarts on every play, so a frame is only identified by all three keys.
    # Grouping on frameId alone would tessellate players from unrelated plays together.
    frame_keys = ["gameId", "playId", "frameId"]

    carrier_frames = []
    for _, frame_data in playerCoordinates.groupby(frame_keys, sort=False):

        # Corner points one yard outside the players' bounding box, added to close
        # off the outer cells
        x_low, x_high = frame_data['x'].min() - 1, frame_data['x'].max() + 1
        y_low, y_high = frame_data['y'].min() - 1, frame_data['y'].max() + 1
        corners = pd.DataFrame({'x': [x_low, x_high, x_low, x_high],
                                'y': [y_low, y_high, y_high, y_low]})
        frame_data = pd.concat([frame_data, corners], ignore_index=True)

        # Keep the row labels of the points actually tessellated so that areas are
        # written back to the right rows even if rows with missing x/y were dropped.
        located = frame_data[['x', 'y']].dropna()
        vor = Voronoi(located.values)

        for row_label, region_index in zip(located.index, vor.point_region):
            # -1 marks a vertex at infinity (unbounded cell); drop it.
            # NOTE(review): an unbounded cell is therefore measured on its finite
            # vertices only, and the shoelace formula assumes the vertices come back
            # in boundary order -- confirm both are acceptable.
            region_vertices = [vertex for vertex in vor.regions[region_index] if vertex != -1]

            if len(region_vertices) > 0:
                vertices = vor.vertices[region_vertices]

                # Shoelace formula
                area = 0.5 * np.abs(np.dot(vertices[:, 0], np.roll(vertices[:, 1], 1)) -
                                    np.dot(vertices[:, 1], np.roll(vertices[:, 0], 1)))

                frame_data.at[row_label, 'area'] = area

        # The corner rows have no GotTheBall value, so this also discards them.
        carrier_frames.append(frame_data[frame_data["GotTheBall"] == 1])

    if not carrier_frames:
        return pd.DataFrame()

    # Post-processing runs once over the collected rows instead of once per frame.
    TotalGravity = pd.concat(carrier_frames, ignore_index=True)
    TotalGravity = TotalGravity.dropna(subset=["gameId"])

    if TotalGravity.empty:
        TotalGravity['area_percentage_change'] = pd.Series(dtype=float)
        TotalGravity['adjusted_change'] = pd.Series(dtype=float)
        return TotalGravity

    # Order frames within each ball carrier's play so diff() compares consecutive frames
    TotalGravity = TotalGravity.sort_values(by=['gameId', 'playId', 'nflId', 'frameId'])

    TotalGravity['area_percentage_change'] = (
        TotalGravity.groupby(['gameId', 'playId', 'nflId'])['area'].diff().fillna(0)
    )

    # Time weighting, then scale by acceleration and the two positional weights
    TotalGravity['adjusted_change'] = TotalGravity.apply(calculate_adjusted_change, axis=1)
    TotalGravity['adjusted_change'] = (
        TotalGravity['adjusted_change']
        * TotalGravity['a']
        * TotalGravity["PercentRankLOS"]
        * TotalGravity["PercentRankMOF"]
    )

    return TotalGravity

# Run the gravity calculation on the prepared player rows; the result holds ball-carrier rows only.
VOR_df_output = Calculate_Gravity(VOR_df)

In [None]:
# Minimum number of distinct plays a ball carrier needs to be summarised
Min_Play_Count = 15

# Tukey fences computed over every ball-carrier frame
IQR = stats.iqr(VOR_df_output["adjusted_change"])
Q1, Q3 = np.percentile(VOR_df_output["adjusted_change"], [25, 75])
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Drop outlier frames with a plain boolean mask. The previous groupby('nflId').apply
# filter depended on apply() passing the grouping column through, which newer pandas
# deprecates. Rows without an nflId are removed and rows are ordered by nflId, as the
# grouped version did.
within_fences = (
    (VOR_df_output["adjusted_change"] >= lower_fence)
    & (VOR_df_output["adjusted_change"] <= upper_fence)
)
ExtraWork = (
    VOR_df_output[within_fences]
    .dropna(subset=["nflId"])
    .sort_values("nflId", kind="stable")
    .reset_index(drop=True)
)

# Distinct plays per ball carrier. playId is reused from game to game, so a play is
# identified by (gameId, playId); counting playId alone under-counts.
play_counts = (
    ExtraWork.drop_duplicates(subset=["nflId", "gameId", "playId"])
    .groupby("nflId")
    .size()
)

# Attach the count to every row and keep players with enough plays
ExtraWork['TotalPlays'] = ExtraWork['nflId'].map(play_counts)
ExtraWork = ExtraWork[ExtraWork["TotalPlays"] >= Min_Play_Count]

# Total adjusted change per ball carrier (the column is called MeanPlay but holds a sum).
# No drop_duplicates here: it compares only the MeanPlay value, so two different
# players with the same total would be collapsed into one.
Summarized = (
    ExtraWork.groupby("nflId")["adjusted_change"].sum()
    .rename("MeanPlay")
    .reset_index()
    .sort_values(by=['MeanPlay'], ascending=False)
)

player_name = players[["nflId", "position", "displayName"]]

Summarized = pd.merge(Summarized, player_name, on = "nflId")

TotalPlays = ExtraWork[["nflId", "club", "TotalPlays"]].drop_duplicates()

Summarized = pd.merge(Summarized, TotalPlays, on = "nflId")
Summarized = Summarized.drop_duplicates()

# Mean expected points added over the plays on which the player carried the ball
EPA = plays.groupby(["ballCarrierId"])["expectedPointsAdded"].mean()

Summarized = pd.merge(Summarized, EPA, left_on="nflId", right_on="ballCarrierId")

Summarized = Summarized.drop_duplicates()


In [None]:
# Persist the per-frame ball-carrier table and the per-player summary (row index not written).
VOR_df_output.to_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/AllGamesYAC.csv", index=False)
Summarized.to_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/AllGamesYACSummarized.csv", index=False)


### Defensive Closest Gravity

In [None]:
def ClosestToTheBall(playerCoordinates, max_distance=10):
    """Return the defenders within ``max_distance`` of the ball, frame by frame.

    The ball position is looked up separately for every (gameId, playId, frameId)
    from the row whose club is 'football'. Using the first football row of the
    whole table for every frame is only correct for single-frame input.

    Parameters
    ----------
    playerCoordinates : pandas.DataFrame
        Tracking rows with gameId, playId, frameId, nflId, club, defensiveTeam,
        clean_x and clean_y.
    max_distance : float, default 10
        Radius around the ball, in the units of clean_x / clean_y.

    Returns
    -------
    pandas.DataFrame
        Columns gameId, playId, frameId, nflId for every row whose club equals
        defensiveTeam and whose distance to the ball is at most ``max_distance``.
        Frames without a football row yield no rows.

    Side effects
    ------------
    Adds or overwrites the 'distanceFromBall' column on ``playerCoordinates``
    itself, as the previous implementation did.
    """
    frame_keys = ["gameId", "playId", "frameId"]

    # One ball position per frame (first football row if there are several)
    is_ball = playerCoordinates['club'] == 'football'
    ball_xy = (
        playerCoordinates.loc[is_ball, frame_keys + ['clean_x', 'clean_y']]
        .drop_duplicates(subset=frame_keys)
        .rename(columns={'clean_x': 'ball_x', 'clean_y': 'ball_y'})
    )

    # A left merge keeps the row order and row count of the left frame, so the ball
    # coordinates line up positionally with playerCoordinates.
    per_row_ball = playerCoordinates[frame_keys].merge(ball_xy, on=frame_keys, how='left')
    dx = playerCoordinates['clean_x'].to_numpy() - per_row_ball['ball_x'].to_numpy()
    dy = playerCoordinates['clean_y'].to_numpy() - per_row_ball['ball_y'].to_numpy()
    playerCoordinates['distanceFromBall'] = (dx**2 + dy**2)**(1/2)

    near_defender = (
        (playerCoordinates["distanceFromBall"] <= max_distance)
        & (playerCoordinates["club"] == playerCoordinates["defensiveTeam"])
    )
    output_df = playerCoordinates.loc[near_defender, ["gameId", "playId", "frameId", "nflId"]]

    return output_df

# NOTE: besides returning the defender rows, ClosestToTheBall writes a 'distanceFromBall' column onto ModelingDF.
ClosestPlayers = ClosestToTheBall(ModelingDF)


In [None]:

# Pair every ball-carrier frame with the defenders that were near the ball in that frame.
# After the merge nflId_x is the ball carrier and nflId_y the defender.
ClosestPlayersWork = pd.merge(VOR_df_output, ClosestPlayers, on = ["gameId", "playId", "frameId"])

# Same outlier fences as the ball-carrier summary (Q1, Q3 and IQR come from that cell).
# A boolean mask replaces the groupby('nflId_x').apply filter, which depended on
# apply() passing the grouping column through (deprecated in newer pandas). Row
# order and the removal of rows without nflId_x match the grouped version.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
within_fences = (
    (ClosestPlayersWork["adjusted_change"] >= lower_fence)
    & (ClosestPlayersWork["adjusted_change"] <= upper_fence)
)
ClosestPlayersWork = (
    ClosestPlayersWork[within_fences]
    .dropna(subset=["nflId_x"])
    .sort_values("nflId_x", kind="stable")
    .reset_index(drop=True)
)

# Distinct plays per defender. playId repeats across games, so count (gameId, playId).
play_counts = (
    ClosestPlayersWork.drop_duplicates(subset=["nflId_y", "gameId", "playId"])
    .groupby("nflId_y")
    .size()
)

# Attach the count to every row; the minimum-plays filter is left disabled here
ClosestPlayersWork['TotalPlays'] = ClosestPlayersWork['nflId_y'].map(play_counts)
#ClosestPlayersWork = ClosestPlayersWork[ClosestPlayersWork["TotalPlays"] >= Min_Play_Count]

# Total adjusted change of the ball carrier while this defender was near the ball.
# No drop_duplicates on the aggregate: it compares only the ClosingPlay value and
# would merge different defenders that happen to share a total.
ClosestSummary = (
    ClosestPlayersWork.groupby('nflId_y')["adjusted_change"].sum()
    .rename("ClosingPlay")
    .reset_index()
)
ClosestSummary = pd.merge(ClosestSummary, player_name, left_on = "nflId_y", right_on = "nflId")

# The 'club' column of the merged frame belongs to the ball carrier (VOR_df_output
# holds ball-carrier rows only). The defender's team is the play's defensiveTeam,
# so that is what gets exported under the 'club' name.
TotalPlaysClosest = (
    ClosestPlayersWork[["nflId_y", "defensiveTeam", "TotalPlays"]]
    .rename(columns={"defensiveTeam": "club"})
    .drop_duplicates()
)

ClosestSummary = pd.merge(ClosestSummary, TotalPlaysClosest, on = "nflId_y")

ClosestSummary = ClosestSummary.sort_values(by=['ClosingPlay'], ascending = False)

In [None]:
# Persist the raw near-ball defender rows (ClosestPlayers, not the filtered work table) and the per-defender summary.
ClosestPlayers.to_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/AllGamesDefenseClosest.csv", index=False)
ClosestSummary.to_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/AllGamesDefenseClosestSummary.csv", index=False)


### Team Gravity 

In [45]:
# Reload the ball-carrier table from the AllGamesYAC.csv file written by the gravity section
# (presumably so this section can run without repeating the Voronoi computation -- verify).
VOR_df_output = pd.read_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/AllGamesYAC.csv")


  VOR_df_output = pd.read_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/AllGamesYAC.csv")


In [71]:

# Tukey fences computed over every ball-carrier frame
IQR = stats.iqr(VOR_df_output["adjusted_change"])
Q1, Q3 = np.percentile(VOR_df_output["adjusted_change"], [25, 75])
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Drop outlier frames with a boolean mask. The previous groupby('nflId').apply filter
# depended on apply() passing the grouping column through (deprecated in newer pandas).
# Row order and the removal of rows without an nflId match the grouped version.
within_fences = (
    (VOR_df_output["adjusted_change"] >= lower_fence)
    & (VOR_df_output["adjusted_change"] <= upper_fence)
)
DefensiveGravityWork = (
    VOR_df_output[within_fences]
    .dropna(subset=["nflId"])
    .sort_values("nflId", kind="stable")
    .reset_index(drop=True)
)

#player_name = players[["nflId", "position", "displayName"]]
#DefensiveGravityWork = pd.merge(DefensiveGravityWork, player_name, on = "nflId")
#DefensiveGravityWork = DefensiveGravityWork[DefensiveGravityWork["position"] != "QB"]

# Total adjusted change allowed per defence, smallest first.
# No drop_duplicates on the aggregate: it compares only the summed value and would
# silently remove a team whose total equals another team's.
DefensiveGravity = (
    DefensiveGravityWork.groupby(["defensiveTeam"])["adjusted_change"].sum()
    .rename("MeanPlayD")
    .reset_index()
    .sort_values(by=['MeanPlayD'], ascending=True)
)

# Total adjusted change generated per offence, largest first
OffensiveGravity = (
    DefensiveGravityWork.groupby(["possessionTeam"])["adjusted_change"].sum()
    .rename("MeanPlayO")
    .reset_index()
    .sort_values(by=['MeanPlayO'], ascending=False)
)



In [50]:
# Persist the per-team offensive and defensive gravity totals (row index not written).
OffensiveGravity.to_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/OffensiveGravityTeam.csv", index=False)
DefensiveGravity.to_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/DefensiveGravityTeam.csv", index=False)


### Break Through

In [81]:
def find_rows_above_threshold(array, threshold=1e-3):
    """Locate the first and last row of a 2-D array whose maximum exceeds ``threshold``.

    Parameters
    ----------
    array : array-like, 2-D
        Values to scan; the maximum is taken along axis 1 (per row).
    threshold : float, default 1e-3
        A row counts as a hit when its maximum is strictly greater than this.

    Returns
    -------
    (first_index, last_index) : tuple of numpy integers
        Row indices of the first and last hit.

    Notes
    -----
    When no row exceeds the threshold, argmax of an all-False mask is 0, so the
    result is (0, number_of_rows - 1), the same as if every row were a hit.
    Callers that need to tell the two cases apart must check for it themselves.
    """
    row_peaks = np.max(array, axis=1)
    hits = row_peaks > threshold
    first_hit = np.argmax(hits)
    # Scan from the end: the offset of the first hit in the reversed mask,
    # counted back from the last row index
    offset_from_end = np.argmax(hits[::-1])
    last_hit = len(hits) - offset_from_end - 1
    return first_hit.astype(int), last_hit.astype(int)


def uncovered_frame_list(row):
    """Flag whether a player's y-range fully spans the ball carrier's y-range.

    Despite the name, the function returns 1 when the row IS covering.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide Ball_Min_y, Ball_Max_y, Min_y, Max_y and GotTheBall.

    Returns
    -------
    int
        1 when Min_y <= Ball_Min_y, Max_y >= Ball_Max_y and the row is not the
        ball carrier (GotTheBall != 1); otherwise 0.
    """
    carrier_low = row["Ball_Min_y"]
    carrier_high = row["Ball_Max_y"]
    reaches_below = row["Min_y"] <= carrier_low
    reaches_above = row["Max_y"] >= carrier_high
    is_other_player = row['GotTheBall'] != 1

    if np.all(reaches_below & reaches_above & is_other_player):
        return 1
    return 0



def CalculateBreakthrough(playerCoordinates=[]):
  """Flag, per row, whether a player's influence band spans the ball carrier's band in y.

  For each defender and ball-carrier row a bivariate normal density is evaluated on a
  one-yard grid (y = 0..53, x = 0..119). The first and last grid rows (y indices) whose
  density exceeds find_rows_above_threshold's default threshold are stored as
  'Min_y' / 'Max_y'. The ball carrier's band is copied to every row of the frame as
  'Ball_Min_y' / 'Ball_Max_y', and uncovered_frame_list turns the comparison into the
  0/1 'Covering' column.

  Parameters: playerCoordinates -- DataFrame with at least club, defensiveTeam,
  GotTheBall, frameId, clean_x, clean_y, dir and s.
  Returns: a DataFrame of the retained rows of every frame, with the columns above added.
  Raises: IndexError if the input has no 'football' row, or if a frame has no
  GotTheBall == 1 row.

  Side effects: reseeds NumPy's global RNG, and adds distanceFromBall, radiansDirection,
  xComponent, yComponent, xspeed and yspeed to the DataFrame that was passed in.

  NOTE(review): the ball position is the first 'football' row of the whole input and
  frames are selected by frameId only, so multi-play input mixes plays -- confirm the
  caller is meant to pass a single play.
  NOTE(review): the default [] is not usable; the body needs a DataFrame.
  """

  # No random draw is visible in this function; the reseed only resets global state.
  np.random.seed(42)
  
  # Ball position (first football row of the whole input), each row's distance to it,
  # and each row's velocity components
  xCoordinateBall=playerCoordinates.loc[(playerCoordinates['club']=='football')]['clean_x'].unique()[0]
  yCoordinateBall=playerCoordinates.loc[(playerCoordinates['club']=='football')]['clean_y'].unique()[0]
  playerCoordinates['distanceFromBall']=((playerCoordinates['clean_x']-xCoordinateBall)**2 + (playerCoordinates['clean_y']-yCoordinateBall)**2)**(1/2)
  playerCoordinates['radiansDirection'] = playerCoordinates['dir'].astype(float).apply(math.radians) #Converts angle in degrees to radians
  playerCoordinates['xComponent']=playerCoordinates['radiansDirection'].astype(float).apply(math.cos) #Converts angle into an x and y component
  playerCoordinates['yComponent']=playerCoordinates['radiansDirection'].astype(float).apply(math.sin)
  playerCoordinates['xspeed']=playerCoordinates['xComponent']*playerCoordinates['s'] #Scales the unit direction components by the speed s
  playerCoordinates['yspeed']=playerCoordinates['yComponent']*playerCoordinates['s']
  # Components with magnitude below 1e-2 are replaced by +1e-2 (sign is not preserved).
  # This happens after the speeds above were computed, so it only affects r_matrix.
  playerCoordinates['xComponent']= np.where((playerCoordinates['xComponent'] < 1e-2) & 
                                            (playerCoordinates['xComponent'] > -1e-2), 1e-2, playerCoordinates['xComponent'])
  playerCoordinates['yComponent']= np.where((playerCoordinates['yComponent'] < 1e-2) & 
                                            (playerCoordinates['yComponent'] > -1e-2), 1e-2, playerCoordinates['yComponent'])


  # Keep rows within 15 of the ball with clean_x at or beyond the ball's clean_x, plus every ball-carrier row
  playerCoordinates = playerCoordinates[((playerCoordinates["distanceFromBall"] <= 15) & (playerCoordinates["clean_x"] >= xCoordinateBall) )| 
                                        (playerCoordinates["GotTheBall"] == 1)]

  unique_frames = playerCoordinates["frameId"].unique()

  output_df = pd.DataFrame()
  
  # One-yard evaluation grid; locations has shape (54, 120, 2) with (x, y) on the last axis
  y, x = np.mgrid[0:53.3:1, 0:120:1]
  locations = np.dstack((x, y))
  
  for frame in unique_frames:

    # frame_data is a slice of playerCoordinates; the writes below are what trigger
    # pandas' SettingWithCopyWarning
    frame_data = playerCoordinates[playerCoordinates["frameId"] == frame]
    frame_list = []

    # Generate pdfs for the defensive players and the ball carrier
    for index, row in frame_data.iterrows():
      if((row['club'] == row['defensiveTeam'])) | (row["GotTheBall"] == 1):
        # Spread of the distribution: scales with distance to the ball and is stretched by speed
        speed_Ratio=(row['s']**2)/(100)
        topLeftSMatrix=(row['distanceFromBall']+row['distanceFromBall']*speed_Ratio)/2
        bottomRightSMatrix=(row['distanceFromBall']-row['distanceFromBall']*speed_Ratio)/2

        try:
          #Setting up R and S matrix in bivariate normal distribution
          r_matrix=[(row['xComponent'], -row['yComponent']),(row['yComponent'], row['xComponent'])]
          r_matrix=pd.DataFrame(data=r_matrix)
          # Small offsets to avoid a singular matrix for a stationary player. Note that one
          # is added and the other subtracted, so the second diagonal entry can be negative
          # (it is squared below, since the covariance is R * S * S * R^-1).
          s_matrix=[(topLeftSMatrix+0.00001,0), (0, bottomRightSMatrix-0.000001)]
          s_matrix=pd.DataFrame(data=s_matrix)
          inverse_r_Matrix=np.linalg.inv(r_matrix)
          multiplyingTogetherFirstTwoMatrices=r_matrix.dot(s_matrix)
          nextMatrix=multiplyingTogetherFirstTwoMatrices.dot(s_matrix)
          covariance_matrix=nextMatrix.dot(inverse_r_Matrix)
          # Mean is the position projected half a time unit along the current velocity
          mu_val_x=row['clean_x']+row['xspeed']*0.5
          mu_val_y=row['clean_y']+row['yspeed']*0.5
          mu=[mu_val_x,mu_val_y]
          player_pdf=multivariate_normal(mu,covariance_matrix).pdf(locations)

          # (first, last) grid row in y whose density exceeds the threshold
          Player_max = find_rows_above_threshold(player_pdf)

          frame_data.at[index, "Max_y"] = int(Player_max[1])
          frame_data.at[index, "Min_y"] = int(Player_max[0])

          # Rows not processed so far get an empty band (Min_y 53 > Max_y 0)
          frame_data["Max_y"] = frame_data["Max_y"].fillna(0)
          frame_data["Min_y"] = frame_data["Min_y"].fillna(53) 
          
        # NOTE(review): a LinAlgError is swallowed silently and the row keeps whatever band it had
        except np.linalg.LinAlgError:
           pass

    # Band of the first ball-carrier row, broadcast to every row of the frame
    frame_data["Ball_Max_y"] = frame_data.loc[frame_data["GotTheBall"] == 1, "Max_y"].iloc[0].astype(int)
    frame_data["Ball_Min_y"] = frame_data.loc[frame_data["GotTheBall"] == 1, "Min_y"].iloc[0].astype(int)

    # 1 when the row's band spans the ball carrier's band and the row is not the ball carrier
    frame_data["Covering"] = frame_data.apply(uncovered_frame_list, axis=1)

    output_df = pd.concat([output_df, frame_data], ignore_index=True)

  
  return output_df
      

  
# Work on a copy: CalculateBreakthrough adds columns to the frame it receives.
VOR_df = ModelingDF.copy()

# Debug aid: uncomment to restrict the run to a single play.
#VOR_df = VOR_df[(VOR_df["gameId"] == 2022090800) & (VOR_df["playId"] == 3431)]

test_output = CalculateBreakthrough(VOR_df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame_data.at[index, "Max_y"] = int(Player_max[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame_data.at[index, "Min_y"] = int(Player_max[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame_data["Max_y"] = frame_data["Max_y"].fillna(0)
A value is trying to be set on a copy of a slice fr

In [86]:
Break_work = test_output[["gameId", "playId", "frameId", "club", "possessionTeam", "defensiveTeam", "GotTheBall", "nflId", "displayName", "Covering"]]
player_name = players[["nflId", "position"]]

# Number of covering players in each frame. After the merge Covering_x is the row's
# own flag and Covering_y the frame total.
BreakThrough = Break_work.groupby(["gameId", "playId", "frameId"])["Covering"].sum()

BreakThrough = pd.merge(Break_work, BreakThrough, on = ["gameId", "playId", "frameId"])

# A frame is a break-through when nobody in it is covering
BreakThrough["BreakThrough"] = np.where(BreakThrough["Covering_y"] == 0, 1, 0)

# Distinct plays per player. playId repeats across games, so count (gameId, playId)
# pairs; counting playId alone under-counts.
play_counts = (
    BreakThrough.drop_duplicates(subset=["nflId", "gameId", "playId"])
    .groupby("nflId")
    .size()
)

# Attach the count to every row; the minimum-plays filter is left disabled here
BreakThrough['TotalPlays'] = BreakThrough['nflId'].map(play_counts)
#BreakThrough = BreakThrough[BreakThrough["TotalPlays"] >= Min_Play_Count]
TotalPlays = BreakThrough[["nflId","TotalPlays"]].drop_duplicates()

TeamNameJoin = BreakThrough[['club', "nflId", "displayName"]].drop_duplicates()


# Per player: share of frames in which the player was covering, and share of frames
# that were break-throughs
Summarized_BreakThrough = pd.DataFrame()
Summarized_BreakThrough["CoveringPerc"] = BreakThrough.groupby("nflId")["Covering_x"].mean()
Summarized_BreakThrough["UnCoveredPerc"] = BreakThrough.groupby("nflId")["BreakThrough"].mean()
Summarized_BreakThrough = pd.merge(Summarized_BreakThrough, TotalPlays, on = "nflId")
Summarized_BreakThrough = pd.merge(Summarized_BreakThrough, TeamNameJoin, on = "nflId")
Summarized_BreakThrough = pd.merge(Summarized_BreakThrough, player_name, on = "nflId")






In [87]:
# Persist the per-row breakthrough table and the per-player summary (row index not written).
test_output.to_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/AllGamesBreakthrough.csv", index=False)
Summarized_BreakThrough.to_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/AllGamesBreakthroughSummarized.csv", index=False)


### Team BreakThrough Perc

In [51]:
# Reload the breakthrough table from the AllGamesBreakthrough.csv file written by the Break Through section.
test_output = pd.read_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/AllGamesBreakthrough.csv")


  test_output = pd.read_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/AllGamesBreakthrough.csv")


In [58]:
Break_work = test_output[["gameId", "playId", "frameId", "club", "possessionTeam", "defensiveTeam", "GotTheBall", "nflId", "displayName", "Covering"]]
player_name = players[["nflId", "position"]]

# Number of covering players in each frame. After the merge Covering_x is the row's
# own flag and Covering_y the frame total.
BreakThrough = Break_work.groupby(["gameId", "playId", "frameId"])["Covering"].sum()

BreakThrough = pd.merge(Break_work, BreakThrough, on = ["gameId", "playId", "frameId"])

# A frame is a break-through when nobody in it is covering
BreakThrough["BreakThrough"] = np.where(BreakThrough["Covering_y"] == 0, 1, 0)


# Break-through rate allowed per defence, lowest first.
# No drop_duplicates on the aggregate: it compares only the rate and would silently
# remove a team whose rate equals another team's.
DefensiveBreakThrough = (
    BreakThrough.groupby(["defensiveTeam"])["BreakThrough"].mean()
    .rename("BreakThroughD")
    .reset_index()
    .sort_values(by=['BreakThroughD'], ascending=True)
)

# Break-through rate achieved per offence, highest first
OffensiveBreakThrough = (
    BreakThrough.groupby(["possessionTeam"])["BreakThrough"].mean()
    .rename("BreakThroughO")
    .reset_index()
    .sort_values(by=['BreakThroughO'], ascending=False)
)



In [59]:
# Persist the per-team offensive and defensive break-through rates (row index not written).
OffensiveBreakThrough.to_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/OffensiveBreakThroughTeam.csv", index=False)
DefensiveBreakThrough.to_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/DefensiveBreakThroughTeam.csv", index=False)


## Full Enchilada

In [4]:
# Load the three per-player summaries written by the earlier sections:
# ball-carrier gravity, breakthrough rates, and near-ball defender gravity.
YACSummarized = pd.read_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/AllGamesYACSummarized.csv")
BreakthroughSummarized = pd.read_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/AllGamesBreakthroughSummarized.csv")
ClosestSummarized = pd.read_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/AllGamesDefenseClosestSummary.csv")


In [77]:
# Combine the gravity and breakthrough summaries for players present in both.
# TotalPlays exists on both sides, so the gravity-side count comes out as TotalPlays_x.
Full_Offense = YACSummarized.merge(
    BreakthroughSummarized, on=["nflId", "displayName", "club", "position"]
)

# Keep players with at least 15 plays on the gravity side
enough_plays = Full_Offense["TotalPlays_x"] >= 15
Full_Offense = Full_Offense[enough_plays]

# Scale each metric by the largest absolute value within the player's position group.
# Despite the column names these are ratios to the group maximum, not percentile ranks.
position_of_row = Full_Offense['position']
gravity_magnitude = Full_Offense['MeanPlay'].abs()
uncovered_magnitude = Full_Offense['UnCoveredPerc'].abs()

Full_Offense['PercentRankGravity'] = (
    gravity_magnitude / gravity_magnitude.groupby(position_of_row).transform('max')
)
Full_Offense['PercentRankBreakthrough'] = (
    uncovered_magnitude / uncovered_magnitude.groupby(position_of_row).transform('max')
)

# Composite score: gravity counts twice as much as breakthrough
Full_Offense["KEEPAWAY"] = (
    Full_Offense["PercentRankBreakthrough"] + 2 * Full_Offense["PercentRankGravity"]
) / 3

report_columns = [
    "displayName", "nflId", "position", "club", "TotalPlays_x", "MeanPlay",
    "UnCoveredPerc", "expectedPointsAdded",
    "PercentRankGravity", "PercentRankBreakthrough", "KEEPAWAY",
]
Full_Offense = Full_Offense[report_columns].sort_values(by=['KEEPAWAY'], ascending=False)

In [79]:
# Listed defensive position -> finer-grained role label (DT, LB, ED, CB, S).
# Not referenced by the cells visible in this part of the notebook.
# NOTE(review): "DB" maps to "LB" here and in AltPositionGroups -- confirm this is intended.
AltPositionNames = {
    "DT" : "DT",
    "NT" : "DT",
    "MLB" : "LB",
    "ILB" : "LB",
    "DE" : "ED",
    "OLB" : "ED",
    "CB" : "CB",
    "SS" : "S",
    "FS" : "S",
    "DB" : "LB",
}

# Listed defensive position -> coarse position group (DL, LB, DB); used to build
# 'Alt_Position' for the defensive rankings. Positions missing from the mapping
# become NaN under Series.map.
AltPositionGroups = {
    "DT" : "DL",
    "NT" : "DL",
    "MLB" : "LB",
    "ILB" : "LB",
    "DE" : "DL",
    "OLB" : "DL",
    "CB" : "DB",
    "SS" : "DB",
    "FS" : "DB",
    "DB" : "LB",
}

In [80]:
# Both summaries can carry a 'club' column. Merging them as-is would produce
# club_x / club_y and make the "club" selection below raise KeyError. The club is
# therefore taken from the breakthrough summary (the player's own club in the
# tracking rows). Rows of the closest-defender summary that differed only by club
# are collapsed.
closest_without_club = (
    ClosestSummarized.drop(columns=["club"], errors="ignore").drop_duplicates()
)

Full_Defense = pd.merge(closest_without_club, BreakthroughSummarized, on = ["nflId", "displayName", "position"])

# TotalPlays exists on both sides; TotalPlays_x is the closest-defender count
Full_Defense = Full_Defense[Full_Defense["TotalPlays_x"] >= 15].copy()

# Coarse position group; positions missing from AltPositionGroups become NaN
Full_Defense['Alt_Position'] = Full_Defense['position'].map(AltPositionGroups)

# Scale each metric by the largest absolute value within the position group (ratios to
# the group maximum, not percentile ranks). For gravity, lower is better for a defender,
# so the ratio is inverted.
Full_Defense['PercentRankGravity'] = Full_Defense.groupby('Alt_Position')['ClosingPlay'].transform(
    lambda x: 1 - (x.abs() / x.abs().max())
)

Full_Defense['PercentRankBreakthrough'] = Full_Defense.groupby('Alt_Position')['CoveringPerc'].transform(
    lambda x: (x.abs() / x.abs().max())
)


# Composite score: gravity counts twice as much as breakthrough
Full_Defense["KEEPAWAY"] = (Full_Defense["PercentRankBreakthrough"] + Full_Defense["PercentRankGravity"] * 2) / 3

Full_Defense = Full_Defense[["displayName", "nflId", "Alt_Position", "club", "TotalPlays_x", 'ClosingPlay', 'CoveringPerc',
                             "PercentRankGravity", "PercentRankBreakthrough", "KEEPAWAY"]]


Full_Defense = Full_Defense.sort_values(by=['KEEPAWAY'], ascending = False)

In [81]:
# Persist the final offensive and defensive player rankings (row index not written).
Full_Offense.to_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/Full_Offense.csv", index=False)
Full_Defense.to_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/Full_Defense.csv", index=False)


In [60]:
# Load the four per-team tables written by the Team Gravity and Team BreakThrough sections.
OGravTable = pd.read_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/OffensiveGravityTeam.csv")
OBreakTable = pd.read_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/OffensiveBreakThroughTeam.csv")
DGravTable = pd.read_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/DefensiveGravityTeam.csv")
DBreakTable = pd.read_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/DefensiveBreakThroughTeam.csv")


In [68]:
# One row per team on each side of the ball: gravity joined with break-through rate
OTeamTable = OGravTable.merge(OBreakTable, on="possessionTeam")
DTeamTable = DGravTable.merge(DBreakTable, on="defensiveTeam")

# Put a team's offensive and defensive numbers on one row; the team code is kept
# once, under 'possessionTeam'
FullTeamTable = (
    OTeamTable
    .merge(DTeamTable, left_on="possessionTeam", right_on="defensiveTeam")
    .drop(columns="defensiveTeam")
)

In [69]:
# Persist the combined per-team table (row index not written).
FullTeamTable.to_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/Full_TeamTable.csv", index=False)


## Pick Plays

In [3]:
# Reload the ball-carrier table from the AllGamesYAC.csv file written by the gravity section.
VOR_df_output = pd.read_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/AllGamesYAC.csv")

  VOR_df_output = pd.read_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/AllGamesYAC.csv")


In [97]:
gravplay = pd.DataFrame()

# Tukey fences computed over every ball-carrier frame
IQR = stats.iqr(VOR_df_output["adjusted_change"])
Q1, Q3 = np.percentile(VOR_df_output["adjusted_change"], [25, 75])
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Drop outlier frames with a boolean mask. The previous groupby('nflId').apply filter
# depended on apply() passing the grouping column through (deprecated in newer pandas).
# Row order and the removal of rows without an nflId match the grouped version.
within_fences = (
    (VOR_df_output["adjusted_change"] >= lower_fence)
    & (VOR_df_output["adjusted_change"] <= upper_fence)
)
ExtraWork = (
    VOR_df_output[within_fences]
    .dropna(subset=["nflId"])
    .sort_values("nflId", kind="stable")
    .reset_index(drop=True)
)

# Completed passes on plays moving to the right only
FiltVor = ExtraWork[(ExtraWork["passResult"] == "C") & (ExtraWork["playDirection"] == "right")]

# Total adjusted change per play (the column is called MeanPlay but holds a sum),
# smallest first, with gameId / playId restored as columns
gravplay["MeanPlay"] = FiltVor.groupby(["gameId", "playId"])["adjusted_change"].sum()

gravplay = gravplay.sort_values(by=['MeanPlay'], ascending = True).reset_index()




In [92]:
# Ad-hoc inspection: print every ball-carrier row of one game (truncated text output follows).
print(VOR_df_output[VOR_df_output["gameId"] == 2022100211])

              gameId  playId    nflId    displayName  frameId  \
184182  2.022100e+09    58.0  47870.0  Damien Harris      6.0   
184183  2.022100e+09    58.0  47870.0  Damien Harris      7.0   
184184  2.022100e+09    58.0  47870.0  Damien Harris      8.0   
184185  2.022100e+09    58.0  47870.0  Damien Harris      9.0   
184186  2.022100e+09    58.0  47870.0  Damien Harris     10.0   
...              ...     ...      ...            ...      ...   
187904  2.022100e+09  4057.0  52470.0    A.J. Dillon     50.0   
187905  2.022100e+09  4057.0  52470.0    A.J. Dillon     51.0   
187906  2.022100e+09  4057.0  52470.0    A.J. Dillon     52.0   
187907  2.022100e+09  4057.0  52470.0    A.J. Dillon     53.0   
187908  2.022100e+09  4057.0  52470.0    A.J. Dillon     54.0   

                              time  jerseyNumber club playDirection      x  \
184182  2022-10-02 16:26:04.200000          37.0   NE         right  29.20   
184183  2022-10-02 16:26:04.299999          37.0   NE         r