# NFL 1st and Future 
An injury preventive gameplay EDA

In this notebook I provide insight into whether the type of stadium affects the chances of an injury.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import matplotlib.patches as patches
sns.set_style("whitegrid")

import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode,  iplot
init_notebook_mode(connected=True)

pd.options.mode.chained_assignment = None

# Data Prep


In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of a dataframe in place to reduce memory usage.

    Signed-integer columns are cast to the smallest signed integer type whose
    range (bounds inclusive) holds the column's min/max, float columns to the
    smallest allowed float type whose range holds the min/max, and object
    columns to ``category``. Columns of any other dtype (bool, unsigned int,
    datetime, category, ...) are left untouched, so calling the function a
    second time on an already reduced frame is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default True
        Allow float columns to be stored as ``float16``. float16 only keeps
        about three significant decimal digits; pass ``False`` when the
        values are later differenced or otherwise need more precision.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype
        # Extension dtypes (category, nullable ints, ...) are not np.dtype
        # instances and have no downcast rule here, so they are skipped.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind == 'O':
            df[col] = df[col].astype('category')
        elif kind == 'i':
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in int_types:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            # Falls back to float64 when no smaller type covers the range
            # (this includes all-NaN and infinite columns).
            target = np.float64
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


def import_data(file):
    """Read a CSV file into a dataframe and optimise its memory usage.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after ``reduce_mem_usage`` has downcast its columns.
    """
    # parse_dates=True only affects the index (none is read from the file) and
    # keep_date_col only applies when date columns are combined, so both were
    # no-ops here; keep_date_col is also deprecated in recent pandas releases.
    df = pd.read_csv(file)
    df = reduce_mem_usage(df)
    return df

In [None]:
# Only the player tracking file goes through import_data (which downcasts
# dtypes via reduce_mem_usage). The injury and play-list files are read with
# plain pd.read_csv in a later cell, so these two loads stay disabled:
# inj = import_data("../input/nfl-playing-surface-analytics/InjuryRecord.csv")
# playlist = import_data("../input/nfl-playing-surface-analytics/PlayList.csv")
trk = import_data("../input/nfl-playing-surface-analytics/PlayerTrackData.csv")

In [None]:
# Preview the first rows of the player tracking data.
trk.head()

In [None]:
# Input files: play list and injury records are loaded with plain pd.read_csv,
# i.e. without the dtype downcasting that import_data applies.
playlist = pd.read_csv('../input/nfl-playing-surface-analytics/PlayList.csv')
inj = pd.read_csv('../input/nfl-playing-surface-analytics/InjuryRecord.csv')
# The tracking data is loaded into `trk` through import_data in an earlier cell:
# trk = pd.read_csv('../input/nfl-playing-surface-analytics/PlayerTrackData.csv')

In [None]:
# Preview the first rows of the injury records.
inj.head()

In [None]:
# Preview the first rows of the play list.
playlist.head()

In [None]:
# Number of missing values in each key column of the injury records,
# printed in the order PlayerKey, GameID, PlayKey.
for key_column in ('PlayerKey', 'GameID', 'PlayKey'):
    print(inj[key_column].isna().sum())

Seems like there are 28 empty fields in PlayKey column.

In [None]:
# Distinct players, games and plays present in the play list.
allplayers, allgames, allplays = (
    playlist[key].nunique() for key in ('PlayerKey', 'GameID', 'PlayKey')
)

for label, total in (('Players', allplayers), ('Games', allgames), ('Plays', allplays)):
    print('Number of {}: {}'.format(label, total))

In [None]:
# Distinct combinations of the context columns in the play list.
df1 = playlist[['GameID', 'StadiumType', 'FieldType', 'Weather', 'Temperature',
                'Position', 'RosterPosition', 'PlayType']].drop_duplicates().reset_index(drop=True)

# Game-level context: one row per GameID.
# Assumes these columns are constant within a game - TODO confirm.
game_cols = ['GameID', 'StadiumType', 'FieldType', 'Weather', 'Temperature', 'RosterPosition']
games = df1[game_cols].drop_duplicates(subset='GameID')

# Play-level context: Position and PlayType differ from play to play, so they
# must be looked up by PlayKey. Joining them on GameID and keeping the first
# match would attach the first play of the game, not the injury play.
plays = (playlist[['PlayKey', 'Position', 'PlayType']]
         .dropna(subset=['PlayKey'])
         .drop_duplicates(subset='PlayKey'))

# Every injury record is kept (including several injuries in one game).
# Injuries without a PlayKey get NaN for Position and PlayType.
df = (inj.merge(games, on='GameID', how='left')
         .merge(plays, on='PlayKey', how='left')
         .drop(columns='Surface'))

# Keep the context columns in the same order as before.
context_cols = ['StadiumType', 'FieldType', 'Weather', 'Temperature',
                'Position', 'RosterPosition', 'PlayType']
df = df[[c for c in df.columns if c not in context_cols] + context_cols]
df.head()

In [None]:
# Number of rows in the merged injury frame.
df.shape[0]

The Weather and StadiumType columns contain many inconsistent category labels.
For the sake of the analysis I will regroup them under a small set of new labels.

In [None]:
# Frequency of each raw Weather label among the injury rows.
df.Weather.value_counts()

In [None]:

# Raw Weather labels grouped under four coarse categories.
weather_groups = {
    'Clear': ['Sunny', 'Clear', 'Mostly Sunny', 'Clear and warm', 'Fair', 'Clear Skies',
              'Clear skies', 'Mostly sunny', 'Clear and Sunny', 'Sunny and clear'],
    'Cloudy': ['Cloudy', 'Partly Cloudy', 'Cold', 'Mostly cloudy', 'Cloudy, 50% change of rain',
               'Coudy', 'Party Cloudy', 'Clear and cold', 'Cloudy and Cool', 'Sun & clouds'],
    'Rain': ['Rain', 'Rain shower', 'Light Rain',
             'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.'],
    'Indoor': ['Indoor', 'Indoors', 'Controlled Climate', 'N/A (Indoors)'],
}
weather_map = {raw: group for group, raws in weather_groups.items() for raw in raws}

# Assign the result back to the column: calling replace(..., inplace=True) on
# df['Weather'] acts on an intermediate object and does not update df when
# pandas Copy-on-Write is active.
df['Weather'] = df['Weather'].replace(weather_map)

# Any label missing from the mapping shows up here as an extra category.
assert df['Weather'].nunique() == 4, set(df['Weather'].dropna()) - set(weather_groups)

In [None]:
# Frequency of each raw StadiumType label among the injury rows.
df.StadiumType.value_counts()

In [None]:
# Raw StadiumType labels (including misspellings) grouped under four categories.
stadium_groups = {
    'Outdoor': ['Outdoors', 'Cloudy', 'Heinz Field', 'Outdor', 'Ourdoor', 'Outside',
                'Outddors', 'Outdoor Retr Roof-Open', 'Oudoor', 'Bowl'],
    'Indoor_closed': ['Indoors', 'Indoor', 'Indoor, Roof Closed', 'Retractable Roof',
                      'Retr. Roof-Closed', 'Retr. Roof - Closed', 'Retr. Roof Closed'],
    'Indoor_open': ['Indoor, Open Roof', 'Open', 'Retr. Roof-Open', 'Retr. Roof - Open'],
    'Dome_closed': ['Dome', 'Domed, closed', 'Closed Dome', 'Domed', 'Dome, closed'],
    # A 'Dome_open' group ('Domed, Open', 'Domed, open') is left disabled,
    # as in the original cell.
}
stadium_map = {raw: group for group, raws in stadium_groups.items() for raw in raws}

# Assign the result back to the column: calling replace(..., inplace=True) on
# df['StadiumType'] acts on an intermediate object and does not update df when
# pandas Copy-on-Write is active.
df['StadiumType'] = df['StadiumType'].replace(stadium_map)

# 'Outdoor' itself is not a key above but is already the target label.
assert df['StadiumType'].nunique() == 4, set(df['StadiumType'].dropna()) - set(stadium_groups)

In [None]:
# Preview the injury frame after regrouping Weather and StadiumType.
df.head()

# Analytics I
For the first part of the analysis I only use the PlayList and InjuryRecord data.
My hypothesis is that closed/indoor stadiums have fewer injuries.
If true, the research goes deeper into finding the reasons why.

In [None]:
# Injury count per field surface, smallest bar first.
field_counts = df.groupby('FieldType')['PlayerKey'].count().sort_values()
field_counts.plot(kind='bar', figsize=(15, 5),
                  title='Count of Field Surface in Injury cases', color='green')
plt.show()

In [None]:
# Injury count per roster position, smallest bar first.
# The title previously said "Stadium Type" although the grouping is RosterPosition.
df.groupby('RosterPosition').count()['PlayerKey'] \
    .sort_values() \
    .plot(kind='barh', figsize=(15, 5), title='Count of injuries by Roster Position', color='blue')
plt.show()

In [None]:
# Injury count per play type, smallest bar first.
# The title previously said "Stadium Type" although the grouping is PlayType.
df.groupby('PlayType').count()['PlayerKey'] \
    .sort_values() \
    .plot(kind='barh', figsize=(15, 5), title='Count of injuries by Play Type', color='orange')
plt.show()

In [None]:
# Injury count per regrouped weather label, smallest bar first.
weather_counts = df.groupby('Weather')['PlayerKey'].count().sort_values()
weather_counts.plot(kind='barh', figsize=(15, 5),
                    title='Count of injuries by Weather type', color='red')
plt.show()

These first plots prompt me to investigate combinations of features.
Do we see a higher count of injuries for a certain combination?

In [None]:
# Injury counts per stadium type, coloured by field type.
sns.set(style='darkgrid')
sns.catplot(data=df, kind="count", height=6,
            y="StadiumType", hue="FieldType");


In [None]:

# Same count plot as before, with one panel per weather label.
sns.set(style='darkgrid')
sns.catplot(data=df, kind="count", height=6,
            y="StadiumType", hue="FieldType", col="Weather")

I will focus on Outdoor and Indoor closed as the other StadiumType do not have significant counts.

In [None]:
# Outdoor injuries: play type counts by field type, one panel per weather label.
is_outdoor = df['StadiumType'] == "Outdoor"
dfst = df[is_outdoor]
sns.set(style='darkgrid')
sns.catplot(data=dfst, kind="count", height=6,
            y="PlayType", hue="FieldType", col="Weather")

In [None]:
# Closed indoor stadiums: play type counts by weather, one panel per field type.
# Note: only the synthetic field is applicable in this case.
is_indoor_closed = df['StadiumType'] == "Indoor_closed"
dfic = df[is_indoor_closed]
sns.set(style='darkgrid')
sns.catplot(data=dfic, kind="count", height=6,
            y="PlayType", hue="Weather", col="FieldType")

Outdoor Synthetic: Most injuries at Rush.
Outdoor Natural: Most injuries at Rush, Kickoff and Pass.

Indoor Synthetic: Most injuries occur at Rush and Pass.
Indoor Natural: None.

Let us take a look at severity of injuries


In [None]:
# Outdoor injuries split by the DM_M42 / DM_M28 / DM_M7 / DM_M1 flags,
# with one faceted count plot per flag.
dfo = df.loc[df["StadiumType"] == "Outdoor"]
dfom42, dfom28, dfom7, dfom1 = (
    dfo.loc[dfo[flag] == 1] for flag in ('DM_M42', 'DM_M28', 'DM_M7', 'DM_M1')
)
facet_args = dict(y="RosterPosition", hue="PlayType", col="FieldType", kind="count")

g = sns.catplot(data=dfom42, **facet_args)
g.set_titles("M42 {col_name} {col_var}")
h = sns.catplot(data=dfom28, **facet_args)
h.set_titles("M28 {col_name} {col_var}")
h1 = sns.catplot(data=dfom7, **facet_args)
h1.set_titles("M7 {col_name} {col_var}")
h2 = sns.catplot(data=dfom1, **facet_args)
h2.set_titles("M1 {col_name} {col_var}")
plt.show()

In [None]:
# Closed-indoor injuries split by the DM_M42 / DM_M28 / DM_M7 / DM_M1 flags,
# with one count plot per flag.
dficm42, dficm28, dficm7, dficm1 = (
    dfic.loc[dfic[flag] == 1] for flag in ('DM_M42', 'DM_M28', 'DM_M7', 'DM_M1')
)
for label, subset in (('M42', dficm42), ('M28', dficm28), ('M7', dficm7), ('M1', dficm1)):
    sns.catplot(data=subset, kind="count", y="RosterPosition", hue="PlayType")
    # plt.title applies to the axes of the figure catplot has just created.
    plt.title('Injury {}'.format(label), fontsize=16)
plt.show()

In [None]:
# Outdoor injuries: body part counts by weather, one panel per field type.
dfst = df.loc[df['StadiumType'] == "Outdoor"]
sns.set(style='darkgrid')
g = sns.catplot(data=dfst, kind="count", height=6,
                y="BodyPart", hue="Weather", col="FieldType")
g.set_titles("Outdoor, {col_name} {col_var}")

In [None]:
# Closed indoor stadiums: body part counts by weather, one panel per field type.
# Note: only the synthetic field is applicable in this case.
dfic = df.loc[df['StadiumType'] == "Indoor_closed"]
sns.set(style='darkgrid')
g = sns.catplot(data=dfic, kind="count", height=6,
                y="BodyPart", hue="Weather", col="FieldType")
g.set_titles("Indoor_closed, {col_name} {col_var}")

**Temperature and injury counts**

Are there bodypart injuries that occur at certain temperatures?
And are there changes between outdoor and indoor?

In [None]:
# Temperature histogram for outdoor injuries.
# -999 is presumably a missing-value placeholder (TODO confirm); exclude it.
# The frame is re-bound instead of dropped in place so the cell can be re-run.
dfo = dfo[dfo.Temperature != -999]
# `normed` was removed from matplotlib's hist; raw counts are the default.
plt.hist(dfo.Temperature.values, bins=30)
plt.xlabel('Temp (F)');
plt.ylabel('Count Injuries');

In [None]:
# Temperature histogram per injured body part for outdoor stadiums.
# Each body part is drawn once on its own panel. The previous loop redrew all
# five histograms on every iteration and passed a DataFrame as the legend label.
fig, axs = plt.subplots(3, 2, figsize=(25, 15))
# Listed in panel order: row by row, left to right.
bplist = ['Ankle', 'Knee', 'Heel', 'Toes', 'Foot']
# -999 is presumably a missing-value placeholder (TODO confirm); exclude it.
temps_valid = dfst[dfst.Temperature != -999]
panels = axs.ravel()
for panel, v in zip(panels, bplist):
    temps = temps_valid.loc[temps_valid.BodyPart == v, 'Temperature'].dropna()
    panel.hist(temps.values, bins=30, alpha=0.4, label=v)
    panel.set_title('Temperature for {} injuries'.format(v))
    panel.set_xlabel('Temperature')
    panel.legend()
# Hide the panel(s) of the 3x2 grid that have no body part assigned.
for panel in panels[len(bplist):]:
    panel.set_visible(False)
plt.suptitle('Histogram temperature distribution for bodypart injuries', fontsize=16)
plt.show()


In [None]:
# Temperature histogram for closed-indoor injuries, without the -999 rows.
placeholder_rows = dfic.index[dfic.Temperature == -999]
dfic = dfic.drop(index=placeholder_rows)
plt.hist(dfic['Temperature'].values, bins=30)
plt.xlabel('Temp (F)');
plt.ylabel('Count Injuries');

In [None]:
# Temperature histogram per injured body part for closed indoor stadiums.
# Each body part is drawn once on its own panel. The previous loop redrew all
# five histograms on every iteration and passed a DataFrame as the legend label.
fig, axs = plt.subplots(3, 2, figsize=(25, 15))
# Listed in panel order: row by row, left to right.
bplist = ['Ankle', 'Knee', 'Heel', 'Toes', 'Foot']
# -999 is presumably a missing-value placeholder (TODO confirm); exclude it.
temps_valid = dfic[dfic.Temperature != -999]
panels = axs.ravel()
for panel, v in zip(panels, bplist):
    temps = temps_valid.loc[temps_valid.BodyPart == v, 'Temperature'].dropna()
    panel.hist(temps.values, bins=30, alpha=0.4, label=v)
    panel.set_title('Temperature for {} injuries'.format(v))
    panel.set_xlabel('Temperature')
    panel.legend()
# Hide the panel(s) of the 3x2 grid that have no body part assigned.
for panel in panels[len(bplist):]:
    panel.set_visible(False)
plt.suptitle('Histogram temperature distribution for bodypart injuries', fontsize=16)
plt.show()

Although it is visible that more Ankle, Knee and Toes injuries occur at the extreme low or high temperatures,
there is no hard evidence that certain BodyPart have a higher count of injuries at certain temperatures.


# Analysis II: Speed, Acceleration and Jerk


In this section I go into my final hypothesis, which involves jerk.

Synthetic turf shows higher jerk magnitude than natural turf.

Injuries on synthetic turf will therefore show a higher jerk magnitude, causing more severe injuries.


First of all what is jerk?
In mathematical terms it is the derivative of acceleration.

Let me give a practical explanation of jerk:
An experienced driver gradually applies the brakes, causing a slowly increasing deceleration (small jerk). An inexperienced driver, or a driver responding to an emergency, applies the brakes suddenly, causing a rapid increase in deceleration (large jerk). The sensation of jerk is noticeable, causing the passenger’s head to jerk forward.

Source https://www.physicsforums.com/threads/what-is-jerk-and-jounce-conceptually.716152/



Note: Looks like an injury is always after one big acceleration. Never more than 1 big acceleration.

In [None]:
# Finite-difference acceleration and jerk, computed separately for each play.
# A plain shift(1) over the whole table would difference the first sample of a
# play against the last sample of the previous play.
# Assumes the rows of each PlayKey are already ordered by time - TODO confirm.
play_id = trk['PlayKey']

# Widen float16 columns before differencing; other dtypes are used as they are.
kin = {}
for name in ('time', 's'):
    column = trk[name]
    kin[name] = column.astype(np.float32) if column.dtype == np.float16 else column

# Time step between consecutive samples of the same play (NaN on the first sample).
dt = kin['time'].groupby(play_id, sort=False, observed=True).diff()

# Calculate acceleration
trk['a'] = kin['s'].groupby(play_id, sort=False, observed=True).diff() / dt

# Calculate instantaneous jerk
trk['j'] = trk['a'].groupby(play_id, sort=False, observed=True).diff() / dt


In the next blocks I look at a sequence of PlayKeys: the first plots show speed, acceleration and jerk before the injury,
and the last plot (far right) shows the PlayKey marked as the injury.
My hypothesis is that jerk will show a calm pattern in pre-injury PlayKeys and a rough pattern in injury PlayKeys.
Note that I applied a moving-average filter to jerk.

In [None]:
def movingaverage(interval, window_size):
    """Smooth a 1-D sequence with a uniform (boxcar) moving average.

    The sequence is convolved with ``int(window_size)`` equal weights of
    ``1 / window_size`` using ``np.convolve`` in ``'same'`` mode, so the
    result has the same length as ``interval``.
    """
    kernel = np.ones(int(window_size))
    kernel /= float(window_size)
    smoothed = np.convolve(interval, kernel, mode='same')
    return smoothed

# movingaverage(,2)

In [None]:
# idlist=['39873-4-1', '39873-4-2', '39873-4-3', '39873-4-4', '39873-4-5',
#        '39873-4-6', '39873-4-7', '39873-4-8', '39873-4-9', '39873-4-10',
#        '39873-4-11', '39873-4-12', '39873-4-13', '39873-4-14',
#        '39873-4-15', '39873-4-16', '39873-4-17', '39873-4-18',
#        '39873-4-19', '39873-4-20', '39873-4-21', '39873-4-22',

In [None]:
# PlayKey(s) recorded as the injury play for GameID '39873-4'.
inj.loc[inj['GameID'] == '39873-4', 'PlayKey'].unique()

In [None]:
# '39873-4-32' is the PlayKey marked as injury; all the other PlayKeys show a
# more stationary pattern of jerk.
idlist = ['39873-4-23', '39873-4-24', '39873-4-25', '39873-4-26',
          '39873-4-27', '39873-4-28', '39873-4-29', '39873-4-30',
          '39873-4-31', '39873-4-32']
number_of_subplots = len(idlist)
fig, ax = plt.subplots(1, number_of_subplots, figsize=(30, 10), sharey=True)
for j, pk in enumerate(idlist):
    # Filter the tracking table once per play and draw each curve once
    # (the nested loop repeated both for every subplot index).
    play = trk[trk.PlayKey == pk]
    sns.lineplot(x=play.time, y=play.s, ax=ax[j], color='blue', label='Speed {:}'.format(pk))
    sns.lineplot(x=play.time, y=play.a, ax=ax[j], color='orange', label='Acceleration {:}'.format(pk))
    # Jerk is smoothed with a 2-sample moving average.
    sns.lineplot(x=play.time, y=movingaverage(play.j, 2), ax=ax[j], color='red', label='Jerk {:}'.format(pk))
plt.suptitle('Speed, Acceleration, Jerk for different PlayKeys', fontsize=16)
plt.show()

In [None]:
# All PlayKeys of GameID '39671-12' in the play list.
playlist.loc[playlist['GameID'] == '39671-12', 'PlayKey'].unique().tolist()

In [None]:
# A player on a natural field with a severe injury.
# The injury record has no PlayKey (NaN), but the play with the injury data is
# easy to spot: this time it is the second-to-last plot, not the last one.
idlist = ['39671-12-19',
          '39671-12-20',
          '39671-12-21',
          '39671-12-22',
          '39671-12-23',
          '39671-12-24']
number_of_subplots = len(idlist)
fig, ax = plt.subplots(1, number_of_subplots, figsize=(30, 10), sharey=True)
for j, pk in enumerate(idlist):
    # Filter the tracking table once per play and draw each curve once.
    play = trk[trk.PlayKey == pk]
    sns.lineplot(x=play.time, y=play.s, ax=ax[j], color='blue', label='Speed {:}'.format(pk))
    sns.lineplot(x=play.time, y=play.a, ax=ax[j], color='orange', label='Acceleration {:}'.format(pk))
    # Jerk is smoothed with a 5-sample moving average.
    sns.lineplot(x=play.time, y=movingaverage(play.j, 5), ax=ax[j], color='red', label='Jerk {:}'.format(pk))
plt.suptitle('Speed, Acceleration, Jerk for different PlayKeys', fontsize=16)
plt.show()

In [None]:
# All PlayKeys of GameID '44492-3' in the play list.
playlist.loc[playlist['GameID'] == '44492-3', 'PlayKey'].unique().tolist()

In [None]:
# The injury is recorded on PlayKey 44492-3-23. The data suggest the player
# kept playing, probably injured, in PlayKeys 24 and 25, after which jerk
# normalizes from PlayKeys 26 and 27 onwards.
# (The plays before the injury are 44492-3-19 ... 44492-3-23.)
idlist = ['44492-3-23',
          '44492-3-24',
          '44492-3-25',
          '44492-3-26',
          '44492-3-27']

number_of_subplots = len(idlist)
fig, ax = plt.subplots(1, number_of_subplots, figsize=(30, 10), sharey=True)
for j, pk in enumerate(idlist):
    # Filter the tracking table once per play and draw each curve once.
    play = trk[trk.PlayKey == pk]
    sns.lineplot(x=play.time, y=play.s, ax=ax[j], color='blue', label='Speed {:}'.format(pk))
    sns.lineplot(x=play.time, y=play.a, ax=ax[j], color='orange', label='Acceleration {:}'.format(pk))
    # Jerk is smoothed with a 5-sample moving average.
    sns.lineplot(x=play.time, y=movingaverage(play.j, 5), ax=ax[j], color='red', label='Jerk {:}'.format(pk))
plt.suptitle('Speed, Acceleration, Jerk for different PlayKeys', fontsize=16)
plt.show()

In next two codeblocks I inspect all PlayKeys on Natural Turf that show severe injuries (M42=True). 
From the earlier examples I expect that all will show a remarkable pattern.

In [None]:
# PlayKeys of injuries flagged DM_M42 == 1 on natural turf.
dfsn = df.loc[df['FieldType'] == 'Natural']
dfs42 = dfsn.loc[dfsn['DM_M42'] == 1]
dfs42['PlayKey'].dropna().unique().tolist()

In [None]:
# All severe injury (M42=1) plays on natural turf. Only jerk is plotted here.
idlist = ['36621-13-58',
          '43505-2-49',
          '41094-1-55',
          '44434-10-31',
          '31070-3-7']
number_of_subplots = len(idlist)
fig, ax = plt.subplots(1, number_of_subplots, figsize=(30, 10), sharey=True)
for j, pk in enumerate(idlist):
    # Filter the tracking table once per play and draw the curve once.
    play = trk[trk.PlayKey == pk]
    # Jerk is smoothed with a 5-sample moving average.
    sns.lineplot(x=play.time, y=movingaverage(play.j, 5), ax=ax[j], color='red', label='Jerk {:}'.format(pk))
# The title names only the quantity that is actually drawn.
plt.suptitle('Jerk for different PlayKeys for Natural turf', fontsize=16)
plt.show()

In next two codeblocks I inspect all PlayKeys on Synthetic Turf that show severe injuries (M42=True). 
From the earlier examples I expect that all will show a remarkable pattern.

In [None]:
# PlayKeys of injuries flagged DM_M42 == 1 on synthetic turf.
dfss = df[df.FieldType=='Synthetic']
# Filter the synthetic subset: filtering `df` here listed every turf type.
dfs42 = dfss[dfss.DM_M42==1]
list(dfs42.PlayKey.dropna().unique())

In [None]:
# All severe injury (M42=1) plays on synthetic field. Only jerk is plotted here.
idlist = ['39873-4-32',
          '36557-1-70',
          '43532-5-69',
          '46014-10-22',
          '46331-4-44']
number_of_subplots = len(idlist)
fig, ax = plt.subplots(1, number_of_subplots, figsize=(30, 10), sharey=True)
for j, pk in enumerate(idlist):
    # Filter the tracking table once per play and draw the curve once.
    play = trk[trk.PlayKey == pk]
    # Jerk is smoothed with a 5-sample moving average.
    sns.lineplot(x=play.time, y=movingaverage(play.j, 5), ax=ax[j], color='red', label='Jerk {:}'.format(pk))
# The title names only the quantity that is actually drawn.
plt.suptitle('Jerk for different PlayKeys for Synthetic turf', fontsize=16)
plt.show()

Jerk is hard to interpret; however, in many cases the jerk magnitude shows a calm signal when players are not injured.
PlayKeys noted as injury, and PlayKeys shortly before or after, show a rough pattern of jerk magnitude.
Although the proof is not watertight, it gives enough evidence for further investigation into jerk and the probability of a player being injured.

Focusing on the differences between synthetic and natural turf, natural turf shows a more stationary signal than synthetic turf: on natural turf there are more time segments in which the jerk amplitude stays between -1 and 1.
However, the jerk signals from both turfs show a remarkable difference between injury and non-injury PlayKeys.

Next we look at difference in Jerk Magnitude for different BodyPart injuries.


In [None]:
# Synthetic-turf injuries split by body part; the cell shows the ankle PlayKeys.
dfss = df.loc[df['FieldType'] == 'Synthetic']
dfskn, dfsan, dfsto, dfsfo, dfshe = (
    dfss.loc[dfss['BodyPart'] == part] for part in ('Knee', 'Ankle', 'Toes', 'Foot', 'Heel')
)
dfsan['PlayKey'].dropna().unique().tolist()

In [None]:
# Some Knee injury PlayKeys on synthetic field
idlist = ['42398-15-33',
          '47220-4-16',
          '44482-20-21',
          '36607-16-19',
          '47307-10-18']
number_of_subplots = len(idlist)
fig, ax = plt.subplots(1, number_of_subplots, figsize=(30, 10), sharey=True)
for j, pk in enumerate(idlist):
    # Filter the tracking table once per play and draw each curve once.
    play = trk[trk.PlayKey == pk]
    sns.lineplot(x=play.time, y=play.s, ax=ax[j], color='blue', label='Speed {:}'.format(pk))
    sns.lineplot(x=play.time, y=play.a, ax=ax[j], color='orange', label='Acceleration {:}'.format(pk))
    # Jerk is smoothed with a 5-sample moving average.
    sns.lineplot(x=play.time, y=movingaverage(play.j, 5), ax=ax[j], color='red', label='Jerk {:}'.format(pk))
plt.suptitle('Speed, Acceleration, Jerk for different PlayKeys of Knee injuries', fontsize=16)
plt.show()

In [None]:
# Some Foot injury PlayKeys on synthetic field
idlist = ['38364-5-23', '47235-7-55']
number_of_subplots = len(idlist)
fig, ax = plt.subplots(1, number_of_subplots, figsize=(30, 10), sharey=True)
for j, pk in enumerate(idlist):
    # Filter the tracking table once per play and draw each curve once.
    play = trk[trk.PlayKey == pk]
    sns.lineplot(x=play.time, y=play.s, ax=ax[j], color='blue', label='Speed {:}'.format(pk))
    sns.lineplot(x=play.time, y=play.a, ax=ax[j], color='orange', label='Acceleration {:}'.format(pk))
    # Jerk is smoothed with a 5-sample moving average.
    sns.lineplot(x=play.time, y=movingaverage(play.j, 5), ax=ax[j], color='red', label='Jerk {:}'.format(pk))
plt.suptitle('Speed, Acceleration, Jerk for different PlayKeys of Foot injuries', fontsize=16)
plt.show()

In [None]:
# Some Ankle injury PlayKeys on synthetic field
idlist = ['36557-1-70',
          '43532-5-69',
          '46014-10-22',
          '45962-8-40',
          '46331-4-44']
number_of_subplots = len(idlist)
fig, ax = plt.subplots(1, number_of_subplots, figsize=(30, 10), sharey=True)
for j, pk in enumerate(idlist):
    # Filter the tracking table once per play and draw each curve once.
    play = trk[trk.PlayKey == pk]
    sns.lineplot(x=play.time, y=play.s, ax=ax[j], color='blue', label='Speed {:}'.format(pk))
    sns.lineplot(x=play.time, y=play.a, ax=ax[j], color='orange', label='Acceleration {:}'.format(pk))
    # Jerk is smoothed with a 5-sample moving average.
    sns.lineplot(x=play.time, y=movingaverage(play.j, 5), ax=ax[j], color='red', label='Jerk {:}'.format(pk))
plt.suptitle('Speed, Acceleration, Jerk for different PlayKeys of Ankle injuries', fontsize=16)
plt.show()

Some more for natural turf per Bodypart

In [None]:
# Natural-turf injuries split by body part; the cell shows the heel PlayKeys.
dfsn = df.loc[df['FieldType'] == 'Natural']
dfskn, dfsan, dfsto, dfsfo, dfshe = (
    dfsn.loc[dfsn['BodyPart'] == part] for part in ('Knee', 'Ankle', 'Toes', 'Foot', 'Heel')
)
dfshe['PlayKey'].dropna().unique().tolist()

In [None]:
# Injury PlayKeys on natural turf, plotted under the "Ankle injuries" title.
idlist = ['46646-3-30',
          '44492-3-23',
          '41943-1-12',
          '45950-8-18',
          '43540-3-14']
number_of_subplots = len(idlist)
fig, ax = plt.subplots(1, number_of_subplots, figsize=(30, 10), sharey=True)
for j, pk in enumerate(idlist):
    # Filter the tracking table once per play and draw each curve once.
    play = trk[trk.PlayKey == pk]
    sns.lineplot(x=play.time, y=play.s, ax=ax[j], color='blue', label='Speed {:}'.format(pk))
    sns.lineplot(x=play.time, y=play.a, ax=ax[j], color='orange', label='Acceleration {:}'.format(pk))
    # Jerk is smoothed with a 5-sample moving average.
    sns.lineplot(x=play.time, y=movingaverage(play.j, 5), ax=ax[j], color='red', label='Jerk {:}'.format(pk))
plt.suptitle('Speed, Acceleration, Jerk for different PlayKeys of Ankle injuries', fontsize=16)
plt.show()

In [None]:
# Injury PlayKeys on natural turf, plotted under the "Knee injuries" title.
idlist = ['46074-7-26',
          '41145-2-60',
          '44860-5-52',
          '41094-1-55',
          '46119-3-16']
number_of_subplots = len(idlist)
fig, ax = plt.subplots(1, number_of_subplots, figsize=(30, 10), sharey=True)
for j, pk in enumerate(idlist):
    # Filter the tracking table once per play and draw each curve once.
    play = trk[trk.PlayKey == pk]
    sns.lineplot(x=play.time, y=play.s, ax=ax[j], color='blue', label='Speed {:}'.format(pk))
    sns.lineplot(x=play.time, y=play.a, ax=ax[j], color='orange', label='Acceleration {:}'.format(pk))
    # Jerk is smoothed with a 5-sample moving average.
    sns.lineplot(x=play.time, y=movingaverage(play.j, 5), ax=ax[j], color='red', label='Jerk {:}'.format(pk))
plt.suptitle('Speed, Acceleration, Jerk for different PlayKeys of Knee injuries', fontsize=16)
plt.show()

In [None]:
# Injury PlayKeys on natural turf, plotted under the "Foot injuries" title.
idlist = ['36621-13-58', '43505-2-49', '44434-10-31', '33337-8-15']
number_of_subplots = len(idlist)
fig, ax = plt.subplots(1, number_of_subplots, figsize=(30, 10), sharey=True)
for j, pk in enumerate(idlist):
    # Filter the tracking table once per play and draw each curve once.
    play = trk[trk.PlayKey == pk]
    sns.lineplot(x=play.time, y=play.s, ax=ax[j], color='blue', label='Speed {:}'.format(pk))
    sns.lineplot(x=play.time, y=play.a, ax=ax[j], color='orange', label='Acceleration {:}'.format(pk))
    # Jerk is smoothed with a 5-sample moving average.
    sns.lineplot(x=play.time, y=movingaverage(play.j, 5), ax=ax[j], color='red', label='Jerk {:}'.format(pk))
plt.suptitle('Speed, Acceleration, Jerk for different PlayKeys of Foot injuries', fontsize=16)
plt.show()

Note that the heel and toes injury records show NaN for both the synthetic and the natural turf data and can therefore not be used for this analysis.

# Conclusion

We have seen a lot of plots and data insights. This notebook shows that in most cases the PlayKeys on which a player was injured exhibit short, high spikes in jerk magnitude compared to non-injury PlayKeys.
Furthermore, a difference between synthetic and natural turf is visible: players on natural turf show a more stationary jerk magnitude (amplitude spikes do not pass -1 or +1 for long periods of time).

What can you do as an NFL team with the insights of this notebook?
Determine muscle fatigue, and thus the risk of injuries, from your players' speed, acceleration and jerk magnitude data.
See following research: Amir Baghdadi, Fadel M. Megahed, Ehsan T. Esfahani & Lora A. Cavuoto
(2018): A machine learning approach to detect changes in gait parameters following a fatiguing
occupational task, Ergonomics, DOI: 10.1080/00140139.2018.1442936

More advanced research needed, however plausible:
Calculate probability of injury using acceleration and jerk magnitude data to determine play quality of natural or synthetic turfs in different weather conditions.