In [2]:
from matplotlib import pyplot as plt
from scipy.stats import linregress
import numpy as np
from sklearn import datasets
import pandas as pd

In [3]:
#this notebook is used to format and clean up the orignal tracking_week_x.csv files
#so a generic week_df variable is used, care must be taken when exporting the CSV 
#to ensure the correct week is labeled correctly
#this notebook could also contain ways to make CSV files with more specifics to make nalysis more pointed
#and easier down the road
week_df = pd.read_csv('../base_datasets/tracking_week_8.csv')
week_df.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
0,2022102700,68,38557.0,Kevin Zeitler,1,2022-10-27 20:16:37.099999,70.0,BAL,right,25.82,28.56,0.92,1.22,0.1,62.16,327.7,
1,2022102700,68,38557.0,Kevin Zeitler,2,2022-10-27 20:16:37.200000,70.0,BAL,right,25.78,28.64,0.87,1.12,0.09,59.23,337.03,
2,2022102700,68,38557.0,Kevin Zeitler,3,2022-10-27 20:16:37.299999,70.0,BAL,right,25.77,28.72,0.78,1.14,0.08,58.48,348.42,pass_arrived
3,2022102700,68,38557.0,Kevin Zeitler,4,2022-10-27 20:16:37.400000,70.0,BAL,right,25.77,28.79,0.72,1.23,0.07,57.03,1.0,
4,2022102700,68,38557.0,Kevin Zeitler,5,2022-10-27 20:16:37.500000,70.0,BAL,right,25.79,28.86,0.7,1.26,0.07,54.68,15.53,


In [4]:
week_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1406772 entries, 0 to 1406771
Data columns (total 17 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   gameId         1406772 non-null  int64  
 1   playId         1406772 non-null  int64  
 2   nflId          1345608 non-null  float64
 3   displayName    1406772 non-null  object 
 4   frameId        1406772 non-null  int64  
 5   time           1406772 non-null  object 
 6   jerseyNumber   1345608 non-null  float64
 7   club           1406772 non-null  object 
 8   playDirection  1406772 non-null  object 
 9   x              1406772 non-null  float64
 10  y              1406772 non-null  float64
 11  s              1406772 non-null  float64
 12  a              1406772 non-null  float64
 13  dis            1406772 non-null  float64
 14  o              1345642 non-null  float64
 15  dir            1345642 non-null  float64
 16  event          123993 non-null   object 
dtypes: float

In [4]:
#looking at the csv I 
week_df['playId'].nunique()

1220

In [5]:
#jerseyNumber and nflId do not nead to be floats
#however, doing an .astype() conversion threw up a bunch of errors related to NaN values
#so, looking at the csv I found the ball position for the play has NA in the nflId and jerseyNumber fields
#also the nflid in the players csv is datatype int64
week_df.iloc[2274 : 2278] 

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
2274,2022102700,201,38557.0,Kevin Zeitler,21,2022-10-27 20:20:31.500000,70.0,BAL,right,103.12,24.34,1.47,1.08,0.16,44.43,346.11,
2275,2022102700,201,38557.0,Kevin Zeitler,22,2022-10-27 20:20:31.599999,70.0,BAL,right,103.09,24.48,1.38,1.02,0.14,41.5,345.74,run
2276,2022102700,201,38557.0,Kevin Zeitler,23,2022-10-27 20:20:31.700000,70.0,BAL,right,103.07,24.61,1.26,1.06,0.13,36.49,349.65,
2277,2022102700,201,38557.0,Kevin Zeitler,24,2022-10-27 20:20:31.799999,70.0,BAL,right,103.06,24.7,1.02,1.35,0.1,27.7,351.77,


In [5]:
#the playId and other identifiers are still intact, so I filled those values with 0 
week_df['jerseyNumber'].fillna(0, inplace=True)  # Replace NaN with 0
week_df['nflId'].fillna(0, inplace=True)  # Replace NaN with 0
week_df['jerseyNumber'] = week_df['jerseyNumber'].astype(int)
week_df['nflId'] = week_df['nflId'].astype(int)

In [6]:
#verify datatypes were changed
week_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1406772 entries, 0 to 1406771
Data columns (total 17 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   gameId         1406772 non-null  int64  
 1   playId         1406772 non-null  int64  
 2   nflId          1406772 non-null  int64  
 3   displayName    1406772 non-null  object 
 4   frameId        1406772 non-null  int64  
 5   time           1406772 non-null  object 
 6   jerseyNumber   1406772 non-null  int64  
 7   club           1406772 non-null  object 
 8   playDirection  1406772 non-null  object 
 9   x              1406772 non-null  float64
 10  y              1406772 non-null  float64
 11  s              1406772 non-null  float64
 12  a              1406772 non-null  float64
 13  dis            1406772 non-null  float64
 14  o              1345642 non-null  float64
 15  dir            1345642 non-null  float64
 16  event          123993 non-null   object 
dtypes: float

In [8]:
week_df['event'].value_counts()

event
first_contact                27186
tackle                       25231
ball_snap                    16905
handoff                      15571
pass_outcome_caught          14122
pass_arrived                 11684
out_of_bounds                 5405
run                           2829
touchdown                     1219
man_in_motion                  966
play_action                    713
qb_slide                       506
shift                          391
pass_forward                   322
autoevent_passforward          206
autoevent_passinterrupted      185
fumble                         138
snap_direct                    138
pass_shovel                     92
line_set                        46
lateral                         46
qb_sack                         23
penalty_flag                    23
fumble_defense_recovered        23
fumble_offense_recovered        23
Name: count, dtype: int64

In [7]:
cle_wk1_track_df = week_df.loc[week_df['club'] == 'CLE']
cle_wk1_track_df.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
1317968,2022103100,98,43341,Deion Jones,1,2022-10-31 20:17:20.299999,54,CLE,right,32.76,30.93,0.48,0.54,0.05,260.99,274.07,
1317969,2022103100,98,43341,Deion Jones,2,2022-10-31 20:17:20.400000,54,CLE,right,32.72,30.94,0.38,0.66,0.04,266.79,274.92,
1317970,2022103100,98,43341,Deion Jones,3,2022-10-31 20:17:20.500000,54,CLE,right,32.68,30.94,0.32,0.67,0.04,267.96,273.98,
1317971,2022103100,98,43341,Deion Jones,4,2022-10-31 20:17:20.599999,54,CLE,right,32.65,30.94,0.3,0.49,0.03,267.13,270.06,
1317972,2022103100,98,43341,Deion Jones,5,2022-10-31 20:17:20.700000,54,CLE,right,32.61,30.94,0.35,0.31,0.04,267.74,261.17,


In [8]:
#list of playId numbers
cle_wk1_track_df['playId'].unique()

array([  98,  119,  140,  186,  237,  261,  311,  334,  363,  389,  417,
        477,  521,  550,  574,  598,  626,  687,  744,  791,  853,  874,
        895,  916, 1034, 1058, 1127, 1150, 1171, 1195, 1216, 1237, 1258,
       1329, 1353, 1488, 1509, 1559, 1580, 1689, 1713, 1737, 1785, 1843,
       1867, 2051, 2077, 2098, 2119, 2140, 2169, 2193, 2214, 2235, 2261,
       2286, 2341, 2428, 2449, 2473, 2494, 2515, 2536, 2570, 2678, 2707,
       2744, 2768, 2789, 2847, 2890, 2974, 2997, 3020, 3041, 3064, 3087,
       3111, 3132, 3189, 3213, 3254, 3275, 3304, 3360, 3383, 3406, 3527,
       3573, 3596, 3674, 3697])

In [9]:
#1012 touchdowns for week 1 seemed like a lot, so I looked at my favorite team, and found that each player on the field
#for the team that scored has 'event' equal to touchdown, makes more sense
touchdowns_cle_df = week_df.loc[(week_df['event'] == 'touchdown') & (week_df['club'] == 'CLE')]
touchdowns_cle_df.reset_index(drop=True, inplace=True)
touchdowns_cle_df.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
0,2022103100,1353,41264,Joel Bitonio,42,2022-10-31 21:11:54.900000,75,CLE,right,108.73,21.52,2.33,2.41,0.24,118.13,126.73,touchdown
1,2022103100,1353,43297,Jack Conklin,42,2022-10-31 21:11:54.900000,78,CLE,right,107.61,22.07,1.91,0.3,0.2,54.33,67.06,touchdown
2,2022103100,1353,43380,Jacoby Brissett,42,2022-10-31 21:11:54.900000,7,CLE,right,110.46,16.56,2.66,1.1,0.27,29.24,68.68,touchdown
3,2022103100,1353,44870,Ethan Pocic,42,2022-10-31 21:11:54.900000,55,CLE,right,110.4,20.03,3.05,1.71,0.31,111.68,111.45,touchdown
4,2022103100,1353,45798,Michael Dunn,42,2022-10-31 21:11:54.900000,68,CLE,right,109.78,23.46,2.9,2.52,0.3,121.74,130.41,touchdown
