In [1]:
from matplotlib import pyplot as plt
from scipy.stats import linregress
import numpy as np
from sklearn import datasets
import pandas as pd

In [14]:
# Load the week-3 tracking CSV; the path is relative to the notebook's
# working directory.
week_df = pd.read_csv('../base_datasets/tracking_week_3.csv')
# Preview the first rows of the raw frame.
week_df.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
0,2022092200,56,35449.0,Tyson Alualu,1,2022-09-22 20:16:26.500000,94.0,PIT,left,84.1,23.83,0.0,0.0,0.0,97.54,272.43,
1,2022092200,56,35449.0,Tyson Alualu,2,2022-09-22 20:16:26.599999,94.0,PIT,left,84.1,23.83,0.0,0.0,0.0,97.54,269.87,
2,2022092200,56,35449.0,Tyson Alualu,3,2022-09-22 20:16:26.700000,94.0,PIT,left,84.1,23.83,0.0,0.0,0.0,97.54,269.98,
3,2022092200,56,35449.0,Tyson Alualu,4,2022-09-22 20:16:26.799999,94.0,PIT,left,84.1,23.83,0.0,0.0,0.01,97.54,284.87,
4,2022092200,56,35449.0,Tyson Alualu,5,2022-09-22 20:16:26.900000,94.0,PIT,left,84.1,23.83,0.0,0.0,0.0,97.54,281.79,


In [15]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(week_df):
    """Normalise column types of a tracking DataFrame.

    Steps, in order:

    1. Missing ``nflId`` / ``jerseyNumber`` values are filled with 0.
    2. Both columns are converted to the pandas ``string`` dtype and the
       trailing ``".0"`` left over from their float representation is
       removed (``35449.0`` -> ``"35449"``; a filled-in missing value
       becomes ``"0"``).
    3. ``club``, ``playDirection``, ``event`` and ``displayName`` are
       converted to the ``string`` dtype.
    4. ``x``, ``y``, ``s``, ``a`` and ``dis`` are rounded to two decimals.
    5. ``time`` is converted to ``datetime64[ns]``.

    Parameters
    ----------
    week_df : pandas.DataFrame
        Frame containing all of the columns named above.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. The first step (``fillna``) already produces a
        new object, so the caller's frame is not modified.
    """
    id_columns = ['nflId', 'jerseyNumber']
    text_columns = ['club', 'playDirection', 'event', 'displayName']
    rounded_columns = ['x', 'y', 's', 'a', 'dis']

    # Replace missing identifiers with 0 before the string conversion.
    week_df = week_df.fillna({column: 0 for column in id_columns})

    for column in id_columns:
        # Anchor the pattern at the end of the value so that only the float
        # suffix is stripped; an unanchored ".0" replacement would also eat
        # an interior ".0" (e.g. "7.05" -> "75").
        week_df[column] = (
            week_df[column]
            .astype('string')
            .str.replace(r'\.0$', '', regex=True)
        )

    # Plain text columns become the nullable string dtype.
    week_df = week_df.astype({column: 'string' for column in text_columns})

    # Reduce the float columns to two decimal places.
    week_df[rounded_columns] = week_df[rounded_columns].round(2)

    # Timestamps are converted last.
    week_df = week_df.astype({'time': 'datetime64[ns]'})
    return week_df

# Clean a copy so the raw `week_df` stays available for re-runs of this cell.
week_df_clean = clean_data(week_df.copy())
# Preview the cleaned frame.
week_df_clean.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
0,2022092200,56,35449,Tyson Alualu,1,2022-09-22 20:16:26.500000,94,PIT,left,84.1,23.83,0.0,0.0,0.0,97.54,272.43,
1,2022092200,56,35449,Tyson Alualu,2,2022-09-22 20:16:26.599999,94,PIT,left,84.1,23.83,0.0,0.0,0.0,97.54,269.87,
2,2022092200,56,35449,Tyson Alualu,3,2022-09-22 20:16:26.700000,94,PIT,left,84.1,23.83,0.0,0.0,0.0,97.54,269.98,
3,2022092200,56,35449,Tyson Alualu,4,2022-09-22 20:16:26.799999,94,PIT,left,84.1,23.83,0.0,0.0,0.01,97.54,284.87,
4,2022092200,56,35449,Tyson Alualu,5,2022-09-22 20:16:26.900000,94,PIT,left,84.1,23.83,0.0,0.0,0.0,97.54,281.79,


In [16]:
# Show per-column non-null counts and dtypes of the cleaned frame.
week_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1415788 entries, 0 to 1415787
Data columns (total 17 columns):
 #   Column         Non-Null Count    Dtype         
---  ------         --------------    -----         
 0   gameId         1415788 non-null  int64         
 1   playId         1415788 non-null  int64         
 2   nflId          1354232 non-null  string        
 3   displayName    1415788 non-null  string        
 4   frameId        1415788 non-null  int64         
 5   time           1415788 non-null  datetime64[ns]
 6   jerseyNumber   1354232 non-null  string        
 7   club           1415788 non-null  string        
 8   playDirection  1415788 non-null  string        
 9   x              1415788 non-null  float64       
 10  y              1415788 non-null  float64       
 11  s              1415788 non-null  float64       
 12  a              1415788 non-null  float64       
 13  dis            1415788 non-null  float64       
 14  o              1354417 non-null  f

In [17]:
# List the distinct gameId values present in the cleaned tracking data.
unique_values=week_df_clean['gameId'].unique()
print(unique_values)

[2022092200 2022092500 2022092501 2022092502 2022092503 2022092504
 2022092505 2022092506 2022092507 2022092508 2022092509 2022092510
 2022092511 2022092512 2022092513 2022092600]


In [35]:
def select_game(tracking_df, game_id):
    """Return the rows of ``tracking_df`` whose ``gameId`` equals ``game_id``."""
    return tracking_df[tracking_df['gameId'] == game_id]


# One frame per gameId found in the cleaned tracking data.
cle_pit_df = select_game(week_df_clean, 2022092200)
no_car_df = select_game(week_df_clean, 2022092500)
hou_chi_df = select_game(week_df_clean, 2022092501)
ind_kc_df = select_game(week_df_clean, 2022092502)
buf_mia_df = select_game(week_df_clean, 2022092503)
min_det_df = select_game(week_df_clean, 2022092504)
ne_bal_df = select_game(week_df_clean, 2022092505)
cin_nyj_df = select_game(week_df_clean, 2022092506)
ten_lv_df = select_game(week_df_clean, 2022092507)
phi_wash_df = select_game(week_df_clean, 2022092508)
jax_lac_df = select_game(week_df_clean, 2022092509)
ari_la_df = select_game(week_df_clean, 2022092510)
atl_sea_df = select_game(week_df_clean, 2022092511)
tb_gb_df = select_game(week_df_clean, 2022092512)
sf_den_df = select_game(week_df_clean, 2022092513)
# The recorded output of this cell lists the clubs NYG and DAL for this game,
# so the frame is named accordingly.
nyg_dal_df = select_game(week_df_clean, 2022092600)
# Backward-compatible alias for the previous (mis-typed) variable name, in
# case other cells still refer to it.
nyg_den_df = nyg_dal_df

# Sanity check: list the clubs that appear in the last game's frame.
# (The original referenced an undefined `_df` here, which fails on a fresh kernel.)
unique_values = nyg_dal_df['club'].unique()
print(unique_values)



<StringArray>
['NYG', 'DAL', 'football']
Length: 3, dtype: string


In [36]:
# Preview the rows selected for gameId 2022092200.
cle_pit_df.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
0,2022092200,56,35449,Tyson Alualu,1,2022-09-22 20:16:26.500000,94,PIT,left,84.1,23.83,0.0,0.0,0.0,97.54,272.43,
1,2022092200,56,35449,Tyson Alualu,2,2022-09-22 20:16:26.599999,94,PIT,left,84.1,23.83,0.0,0.0,0.0,97.54,269.87,
2,2022092200,56,35449,Tyson Alualu,3,2022-09-22 20:16:26.700000,94,PIT,left,84.1,23.83,0.0,0.0,0.0,97.54,269.98,
3,2022092200,56,35449,Tyson Alualu,4,2022-09-22 20:16:26.799999,94,PIT,left,84.1,23.83,0.0,0.0,0.01,97.54,284.87,
4,2022092200,56,35449,Tyson Alualu,5,2022-09-22 20:16:26.900000,94,PIT,left,84.1,23.83,0.0,0.0,0.0,97.54,281.79,
