In [1]:
from matplotlib import pyplot as plt
from scipy.stats import linregress
import numpy as np
from sklearn import datasets
import pandas as pd

In [2]:
# This notebook formats and cleans up the original tracking_week_<N>.csv files.
# A generic `week_df` variable holds whichever week is loaded, so WEEK below is the
# single place that selects the week. Reuse WEEK when naming any exported CSV so the
# output is labeled with the week that was actually loaded.
# This notebook could also build more specific CSV files to make later analysis
# more pointed and easier down the road.
WEEK = 1
week_df = pd.read_csv(f'../base_datasets/tracking_week_{WEEK}.csv')
week_df.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
0,2022090800,56,35472.0,Rodger Saffold,1,2022-09-08 20:24:05.200000,76.0,BUF,left,88.37,27.27,1.62,1.15,0.16,231.74,147.9,
1,2022090800,56,35472.0,Rodger Saffold,2,2022-09-08 20:24:05.299999,76.0,BUF,left,88.47,27.13,1.67,0.61,0.17,230.98,148.53,pass_arrived
2,2022090800,56,35472.0,Rodger Saffold,3,2022-09-08 20:24:05.400000,76.0,BUF,left,88.56,27.01,1.57,0.49,0.15,230.98,147.05,
3,2022090800,56,35472.0,Rodger Saffold,4,2022-09-08 20:24:05.500000,76.0,BUF,left,88.64,26.9,1.44,0.89,0.14,232.38,145.42,
4,2022090800,56,35472.0,Rodger Saffold,5,2022-09-08 20:24:05.599999,76.0,BUF,left,88.72,26.8,1.29,1.24,0.13,233.36,141.95,


In [3]:
# Inspect the dtype and non-null count of every column in the freshly loaded frame.
week_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407439 entries, 0 to 1407438
Data columns (total 17 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   gameId         1407439 non-null  int64  
 1   playId         1407439 non-null  int64  
 2   nflId          1346246 non-null  float64
 3   displayName    1407439 non-null  object 
 4   frameId        1407439 non-null  int64  
 5   time           1407439 non-null  object 
 6   jerseyNumber   1346246 non-null  float64
 7   club           1407439 non-null  object 
 8   playDirection  1407439 non-null  object 
 9   x              1407439 non-null  float64
 10  y              1407439 non-null  float64
 11  s              1407439 non-null  float64
 12  a              1407439 non-null  float64
 13  dis            1407439 non-null  float64
 14  o              1346397 non-null  float64
 15  dir            1346397 non-null  float64
 16  event          130268 non-null   object 
dtypes: float

In [5]:
# Looking at the CSV, I picked a window where player rows are followed by a
# 'football' row, to see how the ball's rows differ from a player's rows.
week_df.iloc[2270:2275]

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
2270,2022090800,101,53522.0,Spencer Brown,46,2022-09-08 20:25:12.700000,79.0,BUF,left,67.31,44.18,2.92,2.55,0.3,314.88,351.11,
2271,2022090800,101,53522.0,Spencer Brown,47,2022-09-08 20:25:12.799999,79.0,BUF,left,67.26,44.46,2.68,2.52,0.28,316.86,350.09,
2272,2022090800,101,53522.0,Spencer Brown,48,2022-09-08 20:25:12.900000,79.0,BUF,left,67.22,44.71,2.41,2.51,0.25,329.03,349.35,
2273,2022090800,101,53522.0,Spencer Brown,49,2022-09-08 20:25:13.000000,79.0,BUF,left,67.18,44.93,2.12,2.48,0.22,342.33,349.86,
2274,2022090800,101,,football,1,2022-09-08 20:25:08.200000,,football,left,72.029999,29.530001,0.0,0.0,0.0,,,


In [6]:
# jerseyNumber and nflId do not need to be floats. However, a plain .astype()
# conversion threw a bunch of errors related to NaN values. Looking at the CSV,
# the rows that track the ball position for a play have no value in the nflId
# and jerseyNumber fields, as shown below.
week_df.iloc[2274:2278]

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
2274,2022090800,101,,football,1,2022-09-08 20:25:08.200000,,football,left,72.029999,29.530001,0.0,0.0,0.0,,,
2275,2022090800,101,,football,2,2022-09-08 20:25:08.299999,,football,left,72.029999,29.530001,0.0,0.0,0.0,,,
2276,2022090800,101,,football,3,2022-09-08 20:25:08.400000,,football,left,72.029999,29.530001,0.0,0.0,0.0,,,
2277,2022090800,101,,football,4,2022-09-08 20:25:08.500000,,football,left,72.029999,29.530001,0.0,0.0,0.0,,,


In [7]:
# The football rows keep playId and the other identifiers intact, so only the two
# player-identity columns need a placeholder: missing values are filled with 0 and
# both columns are then cast from float to integer.
#
# fillna/astype are applied to the whole frame and the result is rebound to week_df.
# Calling fillna(..., inplace=True) on a column selection (week_df['col']) is a
# chained in-place update: it is deprecated in pandas 2.x and does not modify
# week_df under Copy-on-Write, which would make the integer cast fail on the NaNs.
# The explicit 'int64' keeps the resulting dtype the same on every platform.
#
# NOTE(review): 0 is only a sentinel for "not a player"; confirm that no real player
# in the data uses jersey number 0, or identify the ball via club == 'football'.
week_df = (
    week_df
    .fillna({'nflId': 0, 'jerseyNumber': 0})
    .astype({'nflId': 'int64', 'jerseyNumber': 'int64'})
)
week_df.iloc[2274:2278]

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
2274,2022090800,101,0,football,1,2022-09-08 20:25:08.200000,0,football,left,72.029999,29.530001,0.0,0.0,0.0,,,
2275,2022090800,101,0,football,2,2022-09-08 20:25:08.299999,0,football,left,72.029999,29.530001,0.0,0.0,0.0,,,
2276,2022090800,101,0,football,3,2022-09-08 20:25:08.400000,0,football,left,72.029999,29.530001,0.0,0.0,0.0,,,
2277,2022090800,101,0,football,4,2022-09-08 20:25:08.500000,0,football,left,72.029999,29.530001,0.0,0.0,0.0,,,


In [8]:
# Verify the datatypes were changed: nflId and jerseyNumber should now be reported
# as integer columns with no missing values.
week_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407439 entries, 0 to 1407438
Data columns (total 17 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   gameId         1407439 non-null  int64  
 1   playId         1407439 non-null  int64  
 2   nflId          1407439 non-null  int64  
 3   displayName    1407439 non-null  object 
 4   frameId        1407439 non-null  int64  
 5   time           1407439 non-null  object 
 6   jerseyNumber   1407439 non-null  int64  
 7   club           1407439 non-null  object 
 8   playDirection  1407439 non-null  object 
 9   x              1407439 non-null  float64
 10  y              1407439 non-null  float64
 11  s              1407439 non-null  float64
 12  a              1407439 non-null  float64
 13  dis            1407439 non-null  float64
 14  o              1346397 non-null  float64
 15  dir            1346397 non-null  float64
 16  event          130268 non-null   object 
dtypes: float

In [9]:
# Tally how many tracking rows carry each event label, most frequent first.
event_counts = week_df['event'].value_counts()
event_counts

event
first_contact                28773
tackle                       26928
ball_snap                    16415
pass_outcome_caught          15870
handoff                      15364
pass_arrived                 13915
out_of_bounds                 5037
run                           2737
man_in_motion                 1288
play_action                   1035
touchdown                     1012
fumble                         621
shift                          368
qb_slide                       350
pass_forward                   248
snap_direct                     46
line_set                        46
lateral                         45
autoevent_ballsnap              30
run_pass_option                 23
qb_sack                         23
pass_shovel                     23
fumble_defense_recovered        23
fumble_offense_recovered        23
autoevent_passinterrupted       16
autoevent_passforward            9
Name: count, dtype: int64

In [14]:
# 1012 'touchdown' rows for week 1 seemed like a lot of touchdowns, so I pulled the
# rows for my favorite team (CLE). The event turned out to be recorded on each
# player's row rather than once per play, which makes the count more sensible.
# NOTE(review): 1012 = 44 x 23, and several rare events above occur exactly 23 times,
# which suggests the event is stamped on every tracked row of the frame (both teams
# plus the football), not only on the scoring team's players - confirm.
is_touchdown = week_df['event'] == 'touchdown'
is_cle = week_df['club'] == 'CLE'
touchdowns_df = week_df.loc[is_touchdown & is_cle]