In [1]:
from matplotlib import pyplot as plt
from scipy.stats import linregress
import numpy as np
from sklearn import datasets
import pandas as pd

In [2]:
#this notebook is used to format and clean up the orignal tracking_week_x.csv files
#so a generic week_df variable is used, care must be taken when exporting the CSV 
#to ensure the correct week is labeled correctly
#this notebook could also contain ways to make CSV files with more specifics to make nalysis more pointed
#and easier down the road
week_df = pd.read_csv('../base_datasets/tracking_week_1.csv')
week_df.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
0,2022090800,56,35472.0,Rodger Saffold,1,2022-09-08 20:24:05.200000,76.0,BUF,left,88.37,27.27,1.62,1.15,0.16,231.74,147.9,
1,2022090800,56,35472.0,Rodger Saffold,2,2022-09-08 20:24:05.299999,76.0,BUF,left,88.47,27.13,1.67,0.61,0.17,230.98,148.53,pass_arrived
2,2022090800,56,35472.0,Rodger Saffold,3,2022-09-08 20:24:05.400000,76.0,BUF,left,88.56,27.01,1.57,0.49,0.15,230.98,147.05,
3,2022090800,56,35472.0,Rodger Saffold,4,2022-09-08 20:24:05.500000,76.0,BUF,left,88.64,26.9,1.44,0.89,0.14,232.38,145.42,
4,2022090800,56,35472.0,Rodger Saffold,5,2022-09-08 20:24:05.599999,76.0,BUF,left,88.72,26.8,1.29,1.24,0.13,233.36,141.95,


In [4]:
week_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407439 entries, 0 to 1407438
Data columns (total 17 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   gameId         1407439 non-null  int64  
 1   playId         1407439 non-null  int64  
 2   nflId          1346246 non-null  float64
 3   displayName    1407439 non-null  object 
 4   frameId        1407439 non-null  int64  
 5   time           1407439 non-null  object 
 6   jerseyNumber   1346246 non-null  float64
 7   club           1407439 non-null  object 
 8   playDirection  1407439 non-null  object 
 9   x              1407439 non-null  float64
 10  y              1407439 non-null  float64
 11  s              1407439 non-null  float64
 12  a              1407439 non-null  float64
 13  dis            1407439 non-null  float64
 14  o              1346397 non-null  float64
 15  dir            1346397 non-null  float64
 16  event          130268 non-null   object 
dtypes: float

In [5]:
#looking at the csv I 
week_df['playId'].nunique()

1247

In [6]:
#jerseyNumber and nflId do not nead to be floats
#however, doing an .astype() conversion threw up a bunch of errors related to NaN values
#so, looking at the csv I found the ball position for the play has NA in the nflId and jerseyNumber fields
#also the nflid in the players csv is datatype int64
week_df.iloc[2274 : 2278] 

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
2274,2022090800,101,,football,1,2022-09-08 20:25:08.200000,,football,left,72.029999,29.530001,0.0,0.0,0.0,,,
2275,2022090800,101,,football,2,2022-09-08 20:25:08.299999,,football,left,72.029999,29.530001,0.0,0.0,0.0,,,
2276,2022090800,101,,football,3,2022-09-08 20:25:08.400000,,football,left,72.029999,29.530001,0.0,0.0,0.0,,,
2277,2022090800,101,,football,4,2022-09-08 20:25:08.500000,,football,left,72.029999,29.530001,0.0,0.0,0.0,,,


In [7]:
#the playId and other identifiers are still intact, so I filled those values with 0 
week_df.iloc[2270 : 2285]
week_df['jerseyNumber'].fillna(0, inplace=True)  # Replace NaN with 0
week_df['nflId'].fillna(0, inplace=True)  # Replace NaN with 0
week_df['jerseyNumber'] = week_df['jerseyNumber'].astype(int)
week_df['nflId'] = week_df['nflId'].astype(int)
week_df.iloc[2274 : 2278]

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
2274,2022090800,101,0,football,1,2022-09-08 20:25:08.200000,0,football,left,72.029999,29.530001,0.0,0.0,0.0,,,
2275,2022090800,101,0,football,2,2022-09-08 20:25:08.299999,0,football,left,72.029999,29.530001,0.0,0.0,0.0,,,
2276,2022090800,101,0,football,3,2022-09-08 20:25:08.400000,0,football,left,72.029999,29.530001,0.0,0.0,0.0,,,
2277,2022090800,101,0,football,4,2022-09-08 20:25:08.500000,0,football,left,72.029999,29.530001,0.0,0.0,0.0,,,


In [8]:
#verify datatypes were changed
week_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407439 entries, 0 to 1407438
Data columns (total 17 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   gameId         1407439 non-null  int64  
 1   playId         1407439 non-null  int64  
 2   nflId          1407439 non-null  int64  
 3   displayName    1407439 non-null  object 
 4   frameId        1407439 non-null  int64  
 5   time           1407439 non-null  object 
 6   jerseyNumber   1407439 non-null  int64  
 7   club           1407439 non-null  object 
 8   playDirection  1407439 non-null  object 
 9   x              1407439 non-null  float64
 10  y              1407439 non-null  float64
 11  s              1407439 non-null  float64
 12  a              1407439 non-null  float64
 13  dis            1407439 non-null  float64
 14  o              1346397 non-null  float64
 15  dir            1346397 non-null  float64
 16  event          130268 non-null   object 
dtypes: float

In [9]:
week_df['event'].value_counts()

event
first_contact                28773
tackle                       26928
ball_snap                    16415
pass_outcome_caught          15870
handoff                      15364
pass_arrived                 13915
out_of_bounds                 5037
run                           2737
man_in_motion                 1288
play_action                   1035
touchdown                     1012
fumble                         621
shift                          368
qb_slide                       350
pass_forward                   248
snap_direct                     46
line_set                        46
lateral                         45
autoevent_ballsnap              30
run_pass_option                 23
qb_sack                         23
pass_shovel                     23
fumble_defense_recovered        23
fumble_offense_recovered        23
autoevent_passinterrupted       16
autoevent_passforward            9
Name: count, dtype: int64

In [12]:
cle_wk1_track_df = week_df.loc[week_df['club'] == 'CLE']
cle_wk1_track_df.reset_index()
cle_wk1_track_df.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
172799,2022091101,85,41227,Jadeveon Clowney,1,2022-09-11 13:05:42.500000,90,CLE,right,26.32,17.19,4.18,2.14,0.42,14.9,258.77,
172800,2022091101,85,41227,Jadeveon Clowney,2,2022-09-11 13:05:42.599999,90,CLE,right,25.91,17.12,4.01,2.4,0.42,14.04,259.13,
172801,2022091101,85,41227,Jadeveon Clowney,3,2022-09-11 13:05:42.700000,90,CLE,right,25.55,17.05,3.66,2.82,0.37,14.04,259.46,
172802,2022091101,85,41227,Jadeveon Clowney,4,2022-09-11 13:05:42.799999,90,CLE,right,25.2,17.0,3.38,2.93,0.35,14.04,260.68,
172803,2022091101,85,41227,Jadeveon Clowney,5,2022-09-11 13:05:42.900000,90,CLE,right,24.9,16.97,3.01,3.15,0.31,11.37,262.32,pass_arrived


In [13]:
#list of playId numbers
cle_wk1_track_df['playId'].unique()

array([  85,  109,  158,  184,  213,  251,  272,  296,  361,  382,  417,
        489,  521,  542,  599,  620,  641,  662,  748,  850,  993, 1077,
       1101, 1192, 1213, 1384, 1516, 1564, 1590, 1616, 1691, 1720, 1744,
       1785, 1826, 1901, 1945, 1980, 2001, 2051, 2320, 2341, 2365, 2386,
       2407, 2431, 2452, 2478, 2501, 2545, 2629, 2683, 2783, 2832, 2909,
       2930, 2951, 3040, 3080, 3101, 3125, 3168, 3221, 3242, 3263, 3287,
       3315, 3336, 3357, 3378, 3399, 3454, 3510, 3545, 3569, 3591, 3615,
       3669, 3707, 3789, 3841, 3862, 3923, 3961, 4068, 4104, 4150])

In [14]:
#1012 touchdowns for week 1 seemed like a lot, so I looked at my favorite team, and found that each player on the field
#for the team that scored has 'event' equal to touchdown, makes more sense
touchdowns_cle_df = week_df.loc[(week_df['event'] == 'touchdown') & (week_df['club'] == 'CLE')]
touchdowns_cle_df.head(40)

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
203222,2022091101,1616,41264,Joel Bitonio,64,2022-09-11 14:14:03.000000,75,CLE,right,83.95,25.54,0.63,0.21,0.06,149.38,100.3,touchdown
203290,2022091101,1616,42347,Amari Cooper,64,2022-09-11 14:14:03.000000,2,CLE,right,101.74,26.06,3.2,2.25,0.33,172.68,148.61,touchdown
203426,2022091101,1616,43380,Jacoby Brissett,64,2022-09-11 14:14:03.000000,7,CLE,right,81.63,24.54,1.67,0.49,0.17,90.8,92.95,touchdown
203630,2022091101,1616,44841,David Njoku,64,2022-09-11 14:14:03.000000,85,CLE,right,83.94,19.88,0.77,0.48,0.07,62.33,67.64,touchdown
203698,2022091101,1616,44870,Ethan Pocic,64,2022-09-11 14:14:03.000000,55,CLE,right,94.7,23.47,2.39,0.4,0.25,89.15,91.43,touchdown
203766,2022091101,1616,44898,Kareem Hunt,64,2022-09-11 14:14:03.000000,27,CLE,right,110.3,8.01,9.41,3.07,0.95,80.26,121.3,touchdown
203970,2022091101,1616,46235,Wyatt Teller,64,2022-09-11 14:14:03.000000,77,CLE,right,89.42,26.79,1.19,0.82,0.12,40.55,57.25,touchdown
204106,2022091101,1616,52418,Jedrick Wills,64,2022-09-11 14:14:03.000000,71,CLE,right,84.53,29.74,0.69,0.39,0.07,98.26,107.09,touchdown
204310,2022091101,1616,52523,Harrison Bryant,64,2022-09-11 14:14:03.000000,88,CLE,right,100.21,18.64,4.7,1.07,0.47,113.93,120.19,touchdown
204378,2022091101,1616,52595,Donovan Peoples-Jones,64,2022-09-11 14:14:03.000000,11,CLE,right,95.13,15.86,6.5,1.61,0.66,129.88,108.22,touchdown
