In [77]:
%matplotlib notebook
from matplotlib import pyplot as plt
from scipy.stats import linregress
import numpy as np
from sklearn import datasets
import pandas as pd
from matplotlib.animation import FuncAnimation


In [56]:
#this notebook is used to format and clean up the orignal tracking_week_x.csv files
#so a generic week_df variable is used, care must be taken when exporting the CSV 
#to ensure the correct week is labeled correctly
#this notebook could also contain ways to make CSV files with more specifics to make nalysis more pointed
#and easier down the road
week_df = pd.read_csv('../base_datasets/tracking_week_4.csv')
week_df.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
0,2022092900,57,42654.0,La'el Collins,1,2022-09-29 20:16:00.099999,71.0,CIN,left,86.21,30.88,0.0,0.0,0.0,262.6,246.29,
1,2022092900,57,42654.0,La'el Collins,2,2022-09-29 20:16:00.200000,71.0,CIN,left,86.21,30.88,0.0,0.0,0.0,263.32,234.76,
2,2022092900,57,42654.0,La'el Collins,3,2022-09-29 20:16:00.299999,71.0,CIN,left,86.21,30.87,0.01,0.23,0.01,263.32,173.85,
3,2022092900,57,42654.0,La'el Collins,4,2022-09-29 20:16:00.400000,71.0,CIN,left,86.21,30.86,0.07,0.69,0.01,263.32,191.83,
4,2022092900,57,42654.0,La'el Collins,5,2022-09-29 20:16:00.500000,71.0,CIN,left,86.21,30.85,0.19,1.01,0.02,263.32,192.85,


In [57]:
week_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1418686 entries, 0 to 1418685
Data columns (total 17 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   gameId         1418686 non-null  int64  
 1   playId         1418686 non-null  int64  
 2   nflId          1357004 non-null  float64
 3   displayName    1418686 non-null  object 
 4   frameId        1418686 non-null  int64  
 5   time           1418686 non-null  object 
 6   jerseyNumber   1357004 non-null  float64
 7   club           1418686 non-null  object 
 8   playDirection  1418686 non-null  object 
 9   x              1418686 non-null  float64
 10  y              1418686 non-null  float64
 11  s              1418686 non-null  float64
 12  a              1418686 non-null  float64
 13  dis            1418686 non-null  float64
 14  o              1357361 non-null  float64
 15  dir            1357361 non-null  float64
 16  event          125983 non-null   object 
dtypes: float

In [58]:
#looking at the csv I 
week_df['playId'].nunique()

1219

In [59]:
#jerseyNumber and nflId do not nead to be floats
#however, doing an .astype() conversion threw up a bunch of errors related to NaN values
#so, looking at the csv I found the ball position for the play has NA in the nflId and jerseyNumber fields
# also the nflid in the players csv is datatype int64
# the playId and other identifiers are still intact, so I filled those values with 0 
week_df['jerseyNumber'].fillna(0, inplace=True)  # Replace NaN with 0
week_df['nflId'].fillna(0, inplace=True)  # Replace NaN with 0
week_df['jerseyNumber'] = week_df['jerseyNumber'].astype(int)
week_df['nflId'] = week_df['nflId'].astype(int)

In [60]:
#verify datatypes were changed
week_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1418686 entries, 0 to 1418685
Data columns (total 17 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   gameId         1418686 non-null  int64  
 1   playId         1418686 non-null  int64  
 2   nflId          1418686 non-null  int64  
 3   displayName    1418686 non-null  object 
 4   frameId        1418686 non-null  int64  
 5   time           1418686 non-null  object 
 6   jerseyNumber   1418686 non-null  int64  
 7   club           1418686 non-null  object 
 8   playDirection  1418686 non-null  object 
 9   x              1418686 non-null  float64
 10  y              1418686 non-null  float64
 11  s              1418686 non-null  float64
 12  a              1418686 non-null  float64
 13  dis            1418686 non-null  float64
 14  o              1357361 non-null  float64
 15  dir            1357361 non-null  float64
 16  event          125983 non-null   object 
dtypes: float

In [61]:
week_df['event'].value_counts()

event
first_contact                27163
tackle                       27047
ball_snap                    17616
handoff                      16445
pass_outcome_caught          13983
pass_arrived                 12557
out_of_bounds                 4117
run                           2553
touchdown                     1380
man_in_motion                 1081
play_action                    575
shift                          368
fumble                         253
qb_slide                       253
pass_forward                   204
snap_direct                     92
lateral                         92
qb_sack                         92
pass_shovel                     46
line_set                        46
autoevent_passinterrupted        9
autoevent_passforward            9
autoevent_ballsnap               2
Name: count, dtype: int64

In [62]:
#reduce the floats to two decimal points
week_df[['x', 'y', 's', 'a', 'dis']] = week_df[['x', 'y', 's', 'a', 'dis']].round(2)


In [63]:
#select only cleveland
cle_df = week_df.loc[week_df['club'] == 'CLE']
cle_df.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
182609,2022100201,63,41264,Joel Bitonio,1,2022-10-02 13:03:56.400000,75,CLE,left,87.65,11.34,1.12,2.42,0.11,266.53,178.23,
182610,2022100201,63,41264,Joel Bitonio,2,2022-10-02 13:03:56.500000,75,CLE,left,87.63,11.26,0.78,2.49,0.08,274.95,187.33,
182611,2022100201,63,41264,Joel Bitonio,3,2022-10-02 13:03:56.599999,75,CLE,left,87.61,11.2,0.55,2.41,0.06,280.61,203.48,
182612,2022100201,63,41264,Joel Bitonio,4,2022-10-02 13:03:56.700000,75,CLE,left,87.58,11.16,0.41,2.17,0.05,282.36,230.57,
182613,2022100201,63,41264,Joel Bitonio,5,2022-10-02 13:03:56.799999,75,CLE,left,87.53,11.17,0.47,2.09,0.05,284.85,275.56,pass_arrived


In [64]:
#list of playId numbers, may or may not use it laster
cle_plays=cle_df['playId']
cle_plays.head()

182609    63
182610    63
182611    63
182612    63
182613    63
Name: playId, dtype: int64

In [65]:
#1012 touchdowns for week 1 seemed like a lot, so I looked at my favorite team, and found that each player on the field
#for the team that scored has 'event' equal to touchdown, makes more sense
TD_cle_df = week_df.loc[(week_df['event'] == 'touchdown') & (week_df['club'] == 'CLE')]
TD_cle_df.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
203280,2022100201,778,44903,John Johnson,62,2022-10-02 13:28:19.099999,43,CLE,right,110.12,46.41,7.07,2.18,0.72,7.92,19.73,touchdown
203346,2022100201,778,45038,Isaac Rochell,62,2022-10-02 13:28:19.099999,98,CLE,right,106.63,44.15,5.75,2.81,0.6,31.47,46.9,touchdown
203478,2022100201,778,46073,Denzel Ward,62,2022-10-02 13:28:19.099999,21,CLE,right,107.7,51.53,7.73,3.34,0.78,39.15,40.9,touchdown
203742,2022100201,778,47863,Sione Takitaki,62,2022-10-02 13:28:19.099999,44,CLE,right,107.19,29.84,0.37,0.23,0.04,12.63,2.99,touchdown
203874,2022100201,778,52452,Grant Delpit,62,2022-10-02 13:28:19.099999,22,CLE,right,108.61,44.9,6.73,2.07,0.68,20.78,24.19,touchdown


In [66]:
#looked up Chubb's run on the internet to get the playclock and quarter time,
chubb_run_df = week_df[week_df['playId'] == 2864]
chubb_run_df.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
263465,2022100201,2864,41264,Joel Bitonio,1,2022-10-02 15:17:43.200000,75,CLE,left,39.31,28.0,0.0,0.0,0.0,269.96,236.98,
263466,2022100201,2864,41264,Joel Bitonio,2,2022-10-02 15:17:43.299999,75,CLE,left,39.31,28.0,0.0,0.0,0.0,269.06,246.48,man_in_motion
263467,2022100201,2864,41264,Joel Bitonio,3,2022-10-02 15:17:43.400000,75,CLE,left,39.31,28.0,0.01,0.22,0.0,268.19,304.38,
263468,2022100201,2864,41264,Joel Bitonio,4,2022-10-02 15:17:43.500000,75,CLE,left,39.3,28.0,0.05,0.44,0.0,267.27,299.25,
263469,2022100201,2864,41264,Joel Bitonio,5,2022-10-02 15:17:43.599999,75,CLE,left,39.3,28.0,0.09,0.44,0.01,266.46,283.72,


In [67]:
chubb_run_cle_players_df = chubb_run_df.loc[chubb_run_df['club'] == 'CLE']
chubb_run_cle_players_df.head()


Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
263465,2022100201,2864,41264,Joel Bitonio,1,2022-10-02 15:17:43.200000,75,CLE,left,39.31,28.0,0.0,0.0,0.0,269.96,236.98,
263466,2022100201,2864,41264,Joel Bitonio,2,2022-10-02 15:17:43.299999,75,CLE,left,39.31,28.0,0.0,0.0,0.0,269.06,246.48,man_in_motion
263467,2022100201,2864,41264,Joel Bitonio,3,2022-10-02 15:17:43.400000,75,CLE,left,39.31,28.0,0.01,0.22,0.0,268.19,304.38,
263468,2022100201,2864,41264,Joel Bitonio,4,2022-10-02 15:17:43.500000,75,CLE,left,39.3,28.0,0.05,0.44,0.0,267.27,299.25,
263469,2022100201,2864,41264,Joel Bitonio,5,2022-10-02 15:17:43.599999,75,CLE,left,39.3,28.0,0.09,0.44,0.01,266.46,283.72,


In [68]:
chubb_run_atl_players_df = chubb_run_df.loc[chubb_run_df['club'] == 'ATL']
chubb_run_atl_players_df.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
263543,2022100201,2864,42480,Grady Jarrett,1,2022-10-02 15:17:43.200000,97,ATL,left,37.17,30.04,0.22,0.68,0.02,71.3,132.68,
263544,2022100201,2864,42480,Grady Jarrett,2,2022-10-02 15:17:43.299999,97,ATL,left,37.19,30.02,0.33,0.68,0.03,71.3,141.13,man_in_motion
263545,2022100201,2864,42480,Grady Jarrett,3,2022-10-02 15:17:43.400000,97,ATL,left,37.21,29.98,0.46,0.72,0.05,74.25,143.55,
263546,2022100201,2864,42480,Grady Jarrett,4,2022-10-02 15:17:43.500000,97,ATL,left,37.25,29.92,0.61,0.8,0.06,76.44,147.76,
263547,2022100201,2864,42480,Grady Jarrett,5,2022-10-02 15:17:43.599999,97,ATL,left,37.29,29.85,0.78,0.84,0.08,78.66,148.47,


In [69]:
chubb_run_frame_1=chubb_run_df.loc[chubb_run_df['frameId'] == 1]
chubb_run_frame_1.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
263465,2022100201,2864,41264,Joel Bitonio,1,2022-10-02 15:17:43.200000,75,CLE,left,39.31,28.0,0.0,0.0,0.0,269.96,236.98,
263543,2022100201,2864,42480,Grady Jarrett,1,2022-10-02 15:17:43.200000,97,ATL,left,37.17,30.04,0.22,0.68,0.02,71.3,132.68,
263621,2022100201,2864,43297,Jack Conklin,1,2022-10-02 15:17:43.200000,78,CLE,left,39.17,32.84,0.0,0.0,0.0,236.72,175.35,
263699,2022100201,2864,43380,Jacoby Brissett,1,2022-10-02 15:17:43.200000,7,CLE,left,39.61,29.6,0.0,0.0,0.0,272.57,78.55,
263777,2022100201,2864,44841,David Njoku,1,2022-10-02 15:17:43.200000,85,CLE,left,38.91,34.8,0.0,0.0,0.01,269.99,287.1,


In [85]:
chubb_run_frame_1_xy = chubb_run_frame_1[['displayName','jerseyNumber','club','x','y']]
chubb_run_frame_1_xy.reset_index()
chubb_run_frame_1_xy.head()

Unnamed: 0,displayName,jerseyNumber,club,x,y
263465,Joel Bitonio,75,CLE,39.31,28.0
263543,Grady Jarrett,97,ATL,37.17,30.04
263621,Jack Conklin,78,CLE,39.17,32.84
263699,Jacoby Brissett,7,CLE,39.61,29.6
263777,David Njoku,85,CLE,38.91,34.8


In [71]:
import matplotlib.pyplot as plt
import numpy as np  # Import NumPy for data manipulation

# Assuming you have already created the chubb_run_frame_1_xy DataFrame

# Create a scatter plot
plt.figure(figsize=(10, 6))  # Optional: Set the figure size

# Define an array of colors based on a condition (e.g., based on a column value)
# In this example, we set a different color for points where 'displayName' is 'Nick Chubb'
colors = np.where(chubb_run_frame_1_xy['club'] == 'CLE', 'orange', 'black')

# Use the 'colors' array to set the color of each point
plt.scatter(chubb_run_frame_1_xy['x'], chubb_run_frame_1_xy['y'], marker='o', c=colors)


# Add labels and title
plt.xlabel('X Coordinate')
plt.ylabel('Y Coordinate')
plt.title('Starting Positions')

# Annotate player names with custom font size
for i, row in chubb_run_frame_1_xy.iterrows():
    plt.annotate(row['displayName'], (row['x'], row['y']), textcoords="offset points", xytext=(0, 10), ha='center', fontsize=6)  # Adjust fontsize as needed

# Show the plot
plt.grid(False)  # Remove grid
plt.tight_layout()  # Ensure tight layout to prevent cutoff labels
plt.show()


<IPython.core.display.Javascript object>

In [100]:
chubb_run_players=chubb_run_df[['frameId','displayName','jerseyNumber','club','x','y']]
chubb_run_sorted_by_frame_df=chubb_run_players.sort_values('frameId')
chubb_run_sorted_by_frame_df.head()


Unnamed: 0,frameId,displayName,jerseyNumber,club,x,y
263465,1,Joel Bitonio,75,CLE,39.31,28.0
265103,1,Adetokunbo Ogundeji,92,ATL,36.92,24.7
263543,1,Grady Jarrett,97,ATL,37.17,30.04
264245,1,Wyatt Teller,77,CLE,39.06,31.36
265181,1,football,0,football,38.18,29.71


In [92]:

# Create a scatter plot
plt.figure(figsize=(10, 6))  # Optional: Set the figure size

# Define an array of colors based on multiple conditions
# In this example, we set colors based on 'club' and 'displayName'
colors = np.where((chubb_run_df['club'] == 'CLE') | (chubb_run_df['displayName'] == 'Nick Chubb'),
                   'orange',  # Color for CLE club and Nick Chubb
                   'black')  # Default color for others

# Use the 'colors' array to set the color of each point
plt.scatter(chubb_run_df['x'], chubb_run_df['y'], marker='o', c=colors)

# Add labels and title
plt.xlabel('X Coordinate')
plt.ylabel('Y Coordinate')
plt.title('Chubb Run - Player Positions')

# Show the plot
plt.grid(False)  # Remove grid
plt.tight_layout()
plt.show()


<IPython.core.display.Javascript object>

In [93]:
chubb_solo_run=chubb_run_df.loc[chubb_run_df['displayName'] == 'Nick Chubb']
chubb_solo_run.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
264089,2022100201,2864,46104,Nick Chubb,1,2022-10-02 15:17:43.200000,24,CLE,left,46.19,29.94,0.0,0.0,0.0,278.18,324.34,
264090,2022100201,2864,46104,Nick Chubb,2,2022-10-02 15:17:43.299999,24,CLE,left,46.19,29.94,0.0,0.0,0.0,277.57,319.83,man_in_motion
264091,2022100201,2864,46104,Nick Chubb,3,2022-10-02 15:17:43.400000,24,CLE,left,46.18,29.94,0.0,0.0,0.0,278.31,321.38,
264092,2022100201,2864,46104,Nick Chubb,4,2022-10-02 15:17:43.500000,24,CLE,left,46.18,29.94,0.0,0.0,0.0,278.31,317.17,
264093,2022100201,2864,46104,Nick Chubb,5,2022-10-02 15:17:43.599999,24,CLE,left,46.18,29.94,0.0,0.0,0.0,275.6,306.74,


In [94]:
%matplotlib notebook

# Create a figure and axis
fig, bx = plt.subplots()

# Define the update function
def update(frame):
    # Clear the previous frame's scatter plot
    bx.clear()
    
    # Extract 'x' and 'y' data for the current frame
    x_data = chubb_solo_run['x'][:frame]
    y_data = chubb_solo_run['y'][:frame]
    
    # Plot the scatter plot for the current frame
    scat = bx.scatter(x_data, y_data, marker='o', c=range(frame), cmap='viridis')
    # Set axis limits and labels if needed
    bx.set_xlim([0, 50])
    bx.set_ylim([0, 53])
    bx.set_xlabel('X Label')
    bx.set_ylabel('Y Label')
    
    return scat,

# Define the number of frames
num_frames = 78

# Create the animation
animate = FuncAnimation(fig=fig, func=update, frames=num_frames ,interval=100)


# Display the animation
plt.show()


<IPython.core.display.Javascript object>

In [101]:
# Create a figure and axis
fig, dx = plt.subplots()

# Define the update function
def update(frame):
    # Clear the previous frame's scatter plot
    dx.clear()
    
    # Extract 'x' and 'y' data for the current frame
    x_data = chubb_run_df['x'][:frame]
    y_data = chubb_run_df['y'][:frame]
    
    # Plot the scatter plot for the current frame
    scat = dx.scatter(x_data, y_data, marker='o', c=range(frame+1), cmap='viridis')
    
    # Set axis limits and labels if needed
    dx.set_xlim([0, 50])
    dx.set_ylim([0, 53])
    dx.set_xlabel('X Label')
    dx.set_ylabel('Y Label')
    
    return scat,

# Define the number of frames
num_frames = 78

# Create the animation
animate = FuncAnimation(fig=fig, func=update, frames=num_frames, interval=100)

# Display the animation
plt.show()




<IPython.core.display.Javascript object>

In [83]:
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

# Create a figure and axis
fig, ax = plt.subplots()

# Define the update function
def update(frame):
    # Clear the previous frame's scatter plot
    ax.clear()
    
    # Calculate the start and end indices for the current frame
    start_idx = frame * 23
    end_idx = (frame + 1) * 23
    
    # Extract 'x' and 'y' data for the current frame
    x_data = chubb_run_df['x'].iloc[start_idx:end_idx]
    y_data = chubb_run_df['y'].iloc[start_idx:end_idx]
    
    # Plot the scatter plot for the current frame with color based on the entire range of frames
    scat = ax.scatter(x_data, y_data, marker='o', c=range(num_frames), cmap='viridis')
    
    # Set axis limits and labels if needed
    ax.set_xlim([0, 50])
    ax.set_ylim([0, 53])
    ax.set_xlabel('X Label')
    ax.set_ylabel('Y Label')
    
    return scat,

# Define the number of frames
num_frames = len(chubb_run_df) // 23  # Calculate the number of frames based on 23 points per frame

# Create the animation
animate = FuncAnimation(fig=fig, func=update, frames=num_frames, interval=100)

# Display the animation
plt.show()


<IPython.core.display.Javascript object>