### Importing

In [1]:
import os
import glob
import pandas as pd
import numpy as np
np.random.seed(42)
import random
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost import plot_importance
from scipy.stats import multivariate_normal
import scipy.stats as stats
import scipy
import math
from scipy.spatial import Voronoi, voronoi_plot_2d

## Data Importing

In [2]:
# Specify the relative path to the data directory
data_folder_path = "C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Data"
non_games_data_folder_path = "C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Non_Games_Data"

# List all files in the data folder
file_list = os.listdir(data_folder_path)
file_list_non_games = os.listdir(non_games_data_folder_path)

# Use glob to filter specific file types
csv_files = glob.glob(os.path.join(data_folder_path, "*.csv"))
csv_files_non_games = glob.glob(os.path.join(non_games_data_folder_path, "*.csv"))

# Read in the weekly game data and concat into one combined df
#dfs = [pd.read_csv(file) for file in csv_files]
#combined_df = pd.concat(dfs, ignore_index=True)

# Read in the supplementary data
games = pd.read_csv(csv_files_non_games[0])
nfl_colors = pd.read_csv(csv_files_non_games[1])
players = pd.read_csv(csv_files_non_games[2])
plays = pd.read_csv(csv_files_non_games[3])

## ModelingDF

In [55]:
ModelingDF = pd.read_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/WorkDF.csv")


In [255]:
ModelingDF = pd.merge(ModelingDF, plays, on = ["gameId", "playId"])
ModelingDF['DistFromLOS'] = (ModelingDF["x"] - 10) - ( ModelingDF["absoluteYardlineNumber"])
ModelingDF["DistFromLOS"] = np.where(ModelingDF["DistFromLOS"] < 0, ModelingDF["DistFromLOS"]*-1, ModelingDF["DistFromLOS"])

  ModelingDF = pd.merge(ModelingDF, plays, on = ["gameId", "playId"])


## Voronoi Tesselations

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import Voronoi, voronoi_plot_2d

VOR_df = ModelingDF.copy()

VOR_df = pd.merge(VOR_df, team_join, on = ["gameId", "playId"])

VOR_df = VOR_df[(VOR_df["gameId"] == 2021090900) & (VOR_df["playId"] == 187) & (VOR_df["frameId"] == 6) ]

bounding_lowerL = {"gameId" : 2021090900, "playId" : 187, "frameId" : 6, "x" : VOR_df['x'].min() - 1, "y" : VOR_df['y'].min() - 1}
bounding_upperL = {"gameId" : 2021090900, "playId" : 187, "frameId" : 6, "x" : VOR_df['x'].max() + 1, "y" : VOR_df['y'].max() + 1}
bounding_lowerR = {"gameId" : 2021090900, "playId" : 187, "frameId" : 6, "x" : VOR_df['x'].min() - 1, "y" : VOR_df['y'].max() + 1}
bounding_upperR = {"gameId" : 2021090900, "playId" : 187, "frameId" : 6, "x" : VOR_df['x'].max() + 1, "y" : VOR_df['y'].min() - 1}

bounding_lowerL = pd.DataFrame([bounding_lowerL])
bounding_upperL = pd.DataFrame([bounding_upperL])
bounding_lowerR = pd.DataFrame([bounding_lowerR])
bounding_upperR = pd.DataFrame([bounding_upperR])


VOR_df = pd.concat([VOR_df, bounding_lowerL], ignore_index=True)
VOR_df = pd.concat([VOR_df, bounding_upperL], ignore_index=True)
VOR_df = pd.concat([VOR_df, bounding_lowerR], ignore_index=True)
VOR_df = pd.concat([VOR_df, bounding_upperR], ignore_index=True)


points = VOR_df[['x', 'y']].values

# Create a Voronoi diagram
vor = Voronoi(points)

# Plot the Voronoi diagram
fig, ax = plt.subplots()
voronoi_plot_2d(vor, ax=ax)

# Plot the original points if needed
ax.plot(points[:, 0], points[:, 1], 'ko')

# Customize the plot if desired
plt.xlim(VOR_df['x'].min() - 1, VOR_df['x'].max() + 1)
plt.ylim(VOR_df['y'].min() - 1, VOR_df['y'].max() + 1)

plt.gca().set_aspect('equal', adjustable='box')
plt.show()


In [6]:
StarPlayers = [52430, 41282, 42489, 43454, 44881, 41233, 40011, 53434, 47834]

In [256]:

VOR_df = ModelingDF.copy()

VOR_df = VOR_df[(VOR_df["gameId"] == 2022090800)]

VOR_df = VOR_df[(VOR_df["club"] != "football")]

VOR_df['area'] = np.nan

VOR_df['exp_area'] = np.nan

VOR_df['Time'] = VOR_df.groupby(['gameId', 'playId'])['frameId'].transform(lambda x: x - x.min() + 1)

VOR_df.fillna(0, inplace=True)

# Calculate the maximum absolute value in the column
max_abs_value = VOR_df[VOR_df["GotTheBall"] == 1]['DistFromLOS'].abs().max()

# Calculate the percent rank based on scaled absolute values
VOR_df['PercentRank'] = 1 - (VOR_df['DistFromLOS'].abs() / max_abs_value)

def calculate_adjusted_change(row):
    if row['Time'] <= 10:
        return row['area_percentage_change'] / (math.log(row['Time'] + 1, 11))
    else:
        return row['area_percentage_change'] * (np.exp(-(row["Time"] - 10) / 5))


def Calculate_Gravity(playerCoordinates = []):

    TotalGravity = pd.DataFrame()

    unique_frames = playerCoordinates["frameId"].unique()


    for frame in unique_frames:

        frame_data = playerCoordinates[playerCoordinates["frameId"] == frame]

        bounding_lowerL = {'x' : frame_data['x'].min() - 1, 'y' : frame_data['y'].min() - 1} 
        bounding_upperL = {'x' : frame_data['x'].max() + 1, 'y' : frame_data['y'].max() + 1}
        bounding_lowerR = {'x' : frame_data['x'].min() - 1, 'y' : frame_data['y'].max() + 1}
        bounding_upperR = {'x' : frame_data['x'].max() + 1, 'y' : frame_data['y'].min() - 1}

        bounding_lowerL = pd.DataFrame([bounding_lowerL])
        bounding_upperL = pd.DataFrame([bounding_upperL])
        bounding_lowerR = pd.DataFrame([bounding_lowerR])
        bounding_upperR = pd.DataFrame([bounding_upperR])


        frame_data = pd.concat([frame_data, bounding_lowerL], ignore_index=True)
        frame_data = pd.concat([frame_data, bounding_upperL], ignore_index=True)
        frame_data = pd.concat([frame_data, bounding_lowerR], ignore_index=True)
        frame_data = pd.concat([frame_data, bounding_upperR], ignore_index=True)


        points = frame_data[['x', 'y']].dropna().values

        # Create a Voronoi diagram
        vor = Voronoi(points)

        # Iterate through the input points
        for i, point in enumerate(points):
            # Find the Voronoi region index for the current point
            region_index = vor.point_region[i]
            
            # Get the vertices of the region
            region_vertices = vor.regions[region_index]
            
            # Filter out invalid vertices
            region_vertices = [vertex for vertex in region_vertices if vertex != -1]
            
            if len(region_vertices) > 0:
                # Get the vertices of the region
                vertices = vor.vertices[region_vertices]
                
                # Calculate the area using the Shoelace formula
                area = 0.5 * np.abs(np.dot(vertices[:, 0], np.roll(vertices[:, 1], 1)) -
                                np.dot(vertices[:, 1], np.roll(vertices[:, 0], 1)))
                
                # Store the area in the 'area' column of VOR_df for the current point
                frame_data.at[i, 'area'] = area


        TotalGravity = pd.concat([TotalGravity, frame_data], ignore_index=True) 

        TotalGravity.dropna(subset = ["gameId"], inplace = True)

        TotalGravity = TotalGravity[TotalGravity["GotTheBall"] == 1]

        # Sort the DataFrame by the grouping columns and frameId in ascending order
        TotalGravity.sort_values(by=['gameId', 'playId', 'nflId', 'frameId'], inplace=True)

        # Calculate the percentage change in the 'area' column within each group
        TotalGravity['area_percentage_change'] = TotalGravity.groupby(['gameId', 'playId', 'nflId'])['area'].diff()

        TotalGravity['area_percentage_change'].fillna(0, inplace=True)

        # Create the 'adjusted_change' column by applying the custom function to each row",
        TotalGravity['adjusted_change'] = TotalGravity.apply(calculate_adjusted_change, axis=1)

        TotalGravity['adjusted_change'] = TotalGravity['adjusted_change'] * TotalGravity['a'] * TotalGravity["PercentRank"]

    return(TotalGravity)

VOR_df_output = Calculate_Gravity(VOR_df)

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [66]:
Min_Play_Count = 15
ExtraWork = VOR_df_output.copy()

IQR = stats.iqr(VOR_df_output["adjusted_change"])
# Calculate the first quartile (Q1)
Q1 = np.percentile(VOR_df_output["adjusted_change"], 25)

# Calculate the third quartile (Q3)
Q3 = np.percentile(VOR_df_output["adjusted_change"], 75)


ExtraWork = ExtraWork.groupby('nflId').apply(lambda x: x[x['adjusted_change'] <= Q3 + 1.5 * IQR]).reset_index(drop=True)
ExtraWork = ExtraWork.groupby('nflId').apply(lambda x: x[x['adjusted_change'] >= Q1 - 1.5 * IQR]).reset_index(drop=True)


# Group by 'nflId' and count the unique 'frameId' values in each group
play_counts = ExtraWork.groupby('nflId')['playId'].nunique()

# Create a new column 'TotalPlays' in the original DataFrame
ExtraWork['TotalPlays'] = ExtraWork['nflId'].map(play_counts)
ExtraWork = ExtraWork[ExtraWork["TotalPlays"] >= Min_Play_Count]
TotalPlays = ExtraWork[["nflId", "club", "TotalPlays"]]

Summarized = pd.DataFrame()
Summarized["MeanPlay"] = ExtraWork.groupby(["nflId"])["adjusted_change"].sum()
Summarized = Summarized.drop_duplicates()
Summarized.reset_index(inplace=True)

Summarized = Summarized.sort_values(by=['MeanPlay'], ascending = False)

player_name = players[["nflId", "position", "displayName"]]

Summarized = pd.merge(Summarized, player_name, on = "nflId")

Summarized = pd.merge(Summarized, TotalPlays, on = "nflId")
Summarized = Summarized.drop_duplicates()

EPA = plays.groupby(["ballCarrierId"])["expectedPointsAdded"].mean()

Summarized = pd.merge(Summarized, EPA, left_on="nflId", right_on="ballCarrierId")

Summarized = Summarized.drop_duplicates()


In [67]:
VOR_df_output.to_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/AllGamesYAC.csv", index=False)
Summarized.to_csv("C:/Users/sethl/OneDrive/Important Stuff/R/R files/NFL/DataBowl/2024-Big-Data-Bowl/Created_DF/AllGamesYACSummarized.csv", index=False)


### Defensive Gravity Allowed

In [77]:
DefensiveGravityWork = VOR_df_output.copy()


IQR = stats.iqr(VOR_df_output["adjusted_change"])
# Calculate the first quartile (Q1)
Q1 = np.percentile(VOR_df_output["adjusted_change"], 25)

# Calculate the third quartile (Q3)
Q3 = np.percentile(VOR_df_output["adjusted_change"], 75)


DefensiveGravityWork = DefensiveGravityWork.groupby('nflId').apply(lambda x: x[x['adjusted_change'] <= Q3 + 1.5 * IQR]).reset_index(drop=True)
DefensiveGravityWork = DefensiveGravityWork.groupby('nflId').apply(lambda x: x[x['adjusted_change'] >= Q1 - 1.5 * IQR]).reset_index(drop=True)

DefensiveGravity = pd.DataFrame()
DefensiveGravity["MeanPlayD"] = DefensiveGravityWork.groupby(["defensiveTeam"])["adjusted_change"].sum()
DefensiveGravity = DefensiveGravity.drop_duplicates()
DefensiveGravity.reset_index(inplace=True)

DefensiveGravity = DefensiveGravity.sort_values(by=['MeanPlayD'], ascending = True)




OffensiveGravity = pd.DataFrame()
player_name = players[["nflId", "position", "displayName"]]
OffensiveGravityWork = pd.merge(DefensiveGravityWork, player_name, on = "nflId")
OffensiveGravity["MeanPlayO"] = OffensiveGravityWork.groupby(["position", "possessionTeam"])["adjusted_change"].sum()
OffensiveGravity = OffensiveGravity.drop_duplicates()
OffensiveGravity.reset_index(inplace=True)

OffensiveGravity = OffensiveGravity.sort_values(by=['MeanPlayO'], ascending = False)



### Break Through

In [247]:
def find_rows_above_threshold(array, threshold=1e-3):
    max_values = np.amax(array, axis=1)  # Get maximum values for each row
    above_threshold = max_values > threshold
    first_index = np.argmax(above_threshold)
    last_index = len(above_threshold) - np.argmax(above_threshold[::-1]) - 1
    return first_index.astype(int), last_index.astype(int)

def generate_whole_numbers(start, end):    
    if start > end:
        return []  # Return an empty list if start is greater than end
    
    # Use range to generate the list of whole numbers
    whole_numbers = list(range(start, end + 1))
    return whole_numbers

def uncovered_frame_list(Ball_List):
    if any(val not in frame_list for val in Ball_List):
        return 1  # Return 0 if the condition is true (value not in Player_list)
    else:
        return 0 


def covering_ball_list(row):
    Max_y = int(row["Max_y"])
    Min_y = int(row["Min_y"])
    Player_list = generate_whole_numbers(Min_y, Max_y)
    # Check if any value in Ball_list is not in Player_list
    if any(val not in Player_list for val in Ball_list):
        return 0  # Return 0 if the condition is true (value not in Player_list)
    else:
        return 1 



def CalculateBreakthrough(playerCoordinates=[]):

  np.random.seed(42)
  
  #Determined each speed in x and y, as well as distance from the QB
  xCoordinateBall=playerCoordinates.loc[(playerCoordinates['club']=='football')]['clean_x'].unique()[0]
  yCoordinateBall=playerCoordinates.loc[(playerCoordinates['club']=='football')]['clean_y'].unique()[0]
  playerCoordinates['distanceFromBall']=((playerCoordinates['clean_x']-xCoordinateBall)**2 + (playerCoordinates['clean_y']-yCoordinateBall)**2)**(1/2)
  playerCoordinates['radiansDirection'] = playerCoordinates['clean_dir'].astype(float).apply(math.radians) #Converts angle in degrees to radians
  playerCoordinates['xComponent']=playerCoordinates['radiansDirection'].astype(float).apply(math.cos) #Converts angle into an x and y component
  playerCoordinates['yComponent']=playerCoordinates['radiansDirection'].astype(float).apply(math.sin)
  playerCoordinates['xspeed']=playerCoordinates['xComponent']*playerCoordinates['s'] #Determines magnitude of speed by multiplying x and y component by magnitude of speed
  playerCoordinates['yspeed']=playerCoordinates['yComponent']*playerCoordinates['s']

  unique_frames = playerCoordinates["frameId"].unique()

  output_df = pd.DataFrame()
  
  y, x = np.mgrid[0:53.3:1, 0:120:1]
  locations = np.dstack((x, y))
  
  for frame in unique_frames:

    frame_data = playerCoordinates[playerCoordinates["frameId"] == frame]
    frame_list = []

    # Generate pdf's for the defensive players and the quarteback
    for index, row in frame_data.iterrows():
      if((row['club'] == row['defensiveTeam']) & (row["distanceFromBall"] <= 15)) | (row["GotTheBall"] == 1):
        speed_Ratio=(row['s']**2)/(100)
        topLeftSMatrix=(row['distanceFromBall']+row['distanceFromBall']*speed_Ratio)/2
        bottomRightSMatrix=(row['distanceFromBall']-row['distanceFromBall']*speed_Ratio)/2
        #Setting up R and S matrix in bivariate normal distribution
        r_matrix=[(row['xComponent'], -row['yComponent']),(row['yComponent'], row['xComponent'])]
        r_matrix=pd.DataFrame(data=r_matrix)
        #Adds very small value to ensure matrix is invertible even if player is completely stationary
        s_matrix=[(topLeftSMatrix+0.00001,0), (0, bottomRightSMatrix-0.000001)]
        s_matrix=pd.DataFrame(data=s_matrix)
        inverse_r_Matrix=np.linalg.inv(r_matrix)
        multiplyingTogetherFirstTwoMatrices=r_matrix.dot(s_matrix)
        nextMatrix=multiplyingTogetherFirstTwoMatrices.dot(s_matrix)
        covariance_matrix=nextMatrix.dot(inverse_r_Matrix)
        mu_val_x=row['clean_x']+row['xspeed']*0.5
        mu_val_y=row['clean_y']+row['yspeed']*0.5
        mu=[mu_val_x,mu_val_y]
        player_pdf=multivariate_normal(mu,covariance_matrix).pdf(locations)

        Player_max = find_rows_above_threshold(player_pdf)

        frame_data.at[index, "Max_y"] = Player_max[1]
        frame_data.at[index, "Min_y"] = Player_max[0]   

        if row["GotTheBall"] != 1:
          Player_list = generate_whole_numbers(Player_max[0], Player_max[1])
          
          frame_list.append(Player_list)


        frame_data["Max_y"] = frame_data["Max_y"].fillna(0)
        frame_data["Min_y"] = frame_data["Min_y"].fillna(53) 
          
    frame_list = list(set(item for sublist in frame_list for item in sublist))
    
    Ball_Max_y = frame_data.loc[frame_data["GotTheBall"] == 1, "Max_y"].iloc[0].astype(int)
    Ball_Min_y = frame_data.loc[frame_data["GotTheBall"] == 1, "Min_y"].iloc[0].astype(int)
    Ball_list = generate_whole_numbers(Ball_Min_y, Ball_Max_y)
  
    #frame_data["BreakThrough"] = uncovered_frame_list(Ball_list)
    frame_data["Covering"] = frame_data.apply(covering_ball_list, axis = 1)
      
    output_df = pd.concat([output_df, frame_data], ignore_index=True)

  return output_df
      

  
VOR_df = ModelingDF.copy()

VOR_df = VOR_df[(VOR_df["gameId"] == 2022090800)]

test_output = CalculateBreakthrough(VOR_df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame_data.at[index, "Max_y"] = Player_max[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame_data.at[index, "Min_y"] = Player_max[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame_data["Max_y"] = frame_data["Max_y"].fillna(0)
A value is trying to be set on a copy of a slice from a DataF

In [252]:
break_join = test_output[["gameId", "playId", "frameId", "nflId", "Covering"]]

Break_work = pd.merge(ExtraWork, break_join, on = ["gameId", "playId", "frameId", "nflId"])


Summarized_BreakThrough = pd.DataFrame()
Summarized_BreakThrough["BreakThrough"] = Break_work.groupby(["gameId", "playId", "frameId"])["Covering"].sum()
Summarized_BreakThrough["MeanPlay"] = Break_work.groupby(["nflId"])["adjusted_change"].sum()
Summarized_BreakThrough = Summarized_BreakThrough.drop_duplicates()
Summarized_BreakThrough.reset_index(inplace=True)

Summarized_BreakThrough = Summarized_BreakThrough.sort_values(by=['MeanPlay'], ascending = False)
