# **Variety in PlayType can be a factor for injury**

## **Objective**

Characterize any differences in player movement between the playing surfaces and to identify specific variables (e.g., field surface, weather, position, play type, etc.) that may influence player movement and the risk of injury.

Evaluation tab:
https://www.kaggle.com/c/nfl-playing-surface-analytics/overview/evaluation

## **Libraries**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import seaborn as sns
import gc
import matplotlib.patches as patches
import time
pd.options.mode.chained_assignment = None

## **Load Data**

In [None]:
# Reduce memory usage and read CSV files
def reduce_mem_usage(df, allow_float16=True):
    """Downcast numeric columns of ``df`` in place to shrink its memory footprint.

    Only NumPy signed-integer and floating-point columns are touched. Every
    other column (object/string, bool, unsigned, datetime, categorical,
    extension dtypes) is left exactly as it is, instead of being pushed
    through the float branch where it would be converted or raise.

    Args:
        df: DataFrame to optimise. It is modified in place.
        allow_float16: When True (the default, matching the historical
            behaviour) float columns whose range fits are stored as float16.
            The choice is made on the value *range* only, so precision can be
            lost; pass False to use float32 as the smallest float type.

    Returns:
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    if allow_float16:
        float_candidates = (np.float16, np.float32)
    else:
        float_candidates = (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (categorical, nullable ints, strings, ...) are skipped.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'i':
            # No fallback: an integer column that fits nowhere keeps its dtype.
            candidates, limits_of, fallback = int_candidates, np.iinfo, None
        elif col_type.kind == 'f':
            candidates, limits_of, fallback = float_candidates, np.finfo, np.float64
        else:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        # Strict comparisons are kept from the original implementation; an
        # all-NaN column fails every comparison and ends up as float64.
        target = fallback
        for candidate in candidates:
            limits = limits_of(candidate)
            if c_min > limits.min and c_max < limits.max:
                target = candidate
                break

        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


def import_data(file):
    """Read a CSV into a DataFrame and shrink its numeric dtypes.

    Args:
        file: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        The loaded DataFrame after it has been passed through
        ``reduce_mem_usage``.
    """
    # `keep_date_col` only matters when several columns are combined into one
    # date column, which is not requested here, and it is deprecated/removed in
    # recent pandas releases, so it is no longer passed.
    df = pd.read_csv(file, parse_dates=True)
    return reduce_mem_usage(df)

In [None]:
# Load the three competition tables. A divider line and the table name are
# printed before each read so the memory report that follows can be attributed.
print('-' * 80, 'injury_record', sep='\n')
injury_record = import_data('../input/nfl-playing-surface-analytics/InjuryRecord.csv')

print('-' * 80, 'player_data', sep='\n')
player_data = import_data('../input/nfl-playing-surface-analytics/PlayerTrackData.csv')

print('-' * 80, 'play_list', sep='\n')
play_list = import_data('../input/nfl-playing-surface-analytics/PlayList.csv')

## **Data cleaning**

45 play keys in the play list don't have player track data

In [None]:
# Count play_list rows whose PlayKey never appears in player_data
# (count() only counts non-null PlayKey values).
play_list[~play_list.PlayKey.isin(player_data.PlayKey.unique())].PlayKey.count()

In [None]:
# Count injury_record rows whose PlayKey is present in player_data.
injury_record[injury_record.PlayKey.isin(player_data.PlayKey.unique())].PlayKey.count()

### **Injury record**

Analysis is based on player track data. So **drop 28 rows in which play key is empty**

In [None]:
#injury_record.dropna(subset = ['PlayKey'],inplace =True)

### **Convert one-hot injury duration to categories for easier visualizations**

In [None]:
def catagorize_injury_duration(DM_M1, DM_M7, DM_M28, DM_M42):
    """Turn the four DM_* indicator flags into a single duration label.

    The flags are checked from the longest duration to the shortest, and the
    label of the first flag equal to 1 is returned. If no flag equals 1 the
    function returns None.
    """
    tiers = (
        (DM_M42, '6 or more weeks'),
        (DM_M28, '4 - 6 weeks'),
        (DM_M7, '1 - 3 weeks'),
        (DM_M1, 'less than a week'),
    )
    for flag, label in tiers:
        if flag == 1:
            return label
    return None

In [None]:
# Collapse the four DM_* indicator columns into one 'InjuryDuration' label per
# injury, then remove the indicator columns.
duration_flags = ['DM_M1', 'DM_M7', 'DM_M28', 'DM_M42']

injury_record['InjuryDuration'] = [
    catagorize_injury_duration(*flags)
    for flags in injury_record[duration_flags].itertuples(index=False, name=None)
]

injury_record.drop(columns=duration_flags, inplace=True)

injury_record.head()

## **Weather**

In [None]:
# Raw Weather strings grouped into buckets. The entries are matched exactly
# against play_list.Weather with isin(), so misspellings are kept as they are.

# Descriptions treated as rain.
rain = ['30% Chance of Rain', 'Rainy', 'Rain Chance 40%', 'Showers', 'Cloudy, 50% change of rain', 'Rain likely, temps in low 40s.',
          'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
          'Scattered Showers', 'Cloudy, Rain', 'Rain shower', 'Light Rain', 'Rain']

# Descriptions treated as overcast.
overcast = ['Party Cloudy', 'Cloudy, chance of rain',
              'Coudy', 
              'Cloudy and cold', 'Cloudy, fog started developing in 2nd quarter',
              'Partly Clouidy', 'Mostly Coudy', 'Cloudy and Cool',
              'cloudy', 'Partly cloudy', 'Overcast', 'Hazy', 'Mostly cloudy', 'Mostly Cloudy',
              'Partly Cloudy', 'Cloudy']

# Descriptions treated as clear.
clear = ['Partly clear', 'Sunny and clear', 'Sun & clouds', 'Clear and Sunny',
           'Sunny and cold', 'Sunny Skies', 'Clear and Cool', 'Clear and sunny',
           'Sunny, highs to upper 80s', 'Mostly Sunny Skies', 'Cold',
           'Clear and warm', 'Sunny and warm', 'Clear and cold', 'Mostly sunny',
           'T: 51; H: 55; W: NW 10 mph', 'Clear Skies', 'Clear skies', 'Partly sunny',
           'Fair', 'Partly Sunny', 'Mostly Sunny', 'Clear', 'Sunny']

# Descriptions treated as snow.
snow = ['Cloudy, light snow accumulating 1-3"', 'Heavy lake effect snow', 'Snow']

# Descriptions that indicate an indoor game (mapped to 'indoors' below).
none = ['N/A Indoor', 'Indoors', 'Indoor', 'N/A (Indoors)', 'Controlled Climate']


In [None]:
# Replace the free-text Weather column with one of five buckets; anything not
# listed in a bucket becomes 'unknown'. The passes run from lowest to highest
# priority, so a later pass overrides an earlier one (rain wins over overcast,
# overcast over clear, and so on).
weather_raw = play_list.Weather
weather_bucket = np.where(weather_raw.isin(none), 'indoors', 'unknown')
weather_bucket = np.where(weather_raw.isin(snow), 'snow', weather_bucket)
weather_bucket = np.where(weather_raw.isin(clear), 'clear', weather_bucket)
weather_bucket = np.where(weather_raw.isin(overcast), 'overcast', weather_bucket)
weather_bucket = np.where(weather_raw.isin(rain), 'rain', weather_bucket)
play_list['Weather'] = weather_bucket

## **Roof**

In [None]:
# Raw StadiumType strings grouped by roof state. They are matched exactly with
# isin() below, so misspelled variants are listed on purpose.

# Values mapped to roof == 'open'.
opened = ['Outdoor', 'Outdoors', 'Cloudy', 'Heinz Field', 
              'Outdor', 'Ourdoor', 'Outside', 'Outddors', 
              'Outdoor Retr Roof-Open', 'Oudoor', 'Bowl','Indoor, Open Roof', 'Open', 'Retr. Roof-Open', 'Retr. Roof - Open','Domed, Open', 'Domed, open']

# Values mapped to roof == 'closed' ('Indoor, Roof Closed' is listed twice; harmless for isin()).
closed = ['Indoors', 'Indoor', 'Indoor, Roof Closed', 'Indoor, Roof Closed',
                   'Retr. Roof-Closed', 'Retr. Roof - Closed', 'Retr. Roof Closed','Dome', 'Domed, closed', 'Closed Dome', 'Domed', 'Dome, closed']

In [None]:
# Derive a three-valued 'roof' column from StadiumType, then drop the raw column.
# 'open' is applied last so it takes precedence over 'closed'.
stadium_raw = play_list.StadiumType
roof_state = np.where(stadium_raw.isin(closed), 'closed', 'unknown')
roof_state = np.where(stadium_raw.isin(opened), 'open', roof_state)
play_list['roof'] = roof_state

play_list.drop(columns=['StadiumType'], inplace=True)

## **Feature engineering**

## **Distance**

### Distance per play

In [None]:
# Total distance per play: sum the per-sample 'dis' values of each PlayKey.
# A grouped single column has no second axis and GroupBy.sum() takes no `axis`
# argument, so the stray `axis=1` (a TypeError on current pandas) is removed.
play_distance = (
    player_data.groupby('PlayKey')['dis']
    .sum()
    .reset_index()
    .rename(columns={'dis': 'play_dis'})
)

In [None]:
# Preview the first rows of the per-play distance table.
play_distance.head()

In [None]:
# Left join keeps every play_list row; plays with no match in play_distance get NaN for play_dis.
play_list = play_list.merge( play_distance , on = 'PlayKey' , how ='left')

In [None]:
# Drop the intermediate frame and run a garbage-collection pass to release its memory.
del play_distance
gc.collect()

### Distance per game

In [None]:
# Total distance per game: sum play_dis over all plays sharing a GameID.
# GroupBy.sum() takes no `axis` argument, so the stray `axis=1` is removed.
game_distance = (
    play_list.groupby('GameID')['play_dis']
    .sum()
    .reset_index()
    .rename(columns={'play_dis': 'game_dis'})
)

In [None]:
# Preview the first rows of the per-game distance table.
game_distance.head()

In [None]:
# Left join on GameID: every play row receives its game's total distance as game_dis.
play_list = play_list.merge( game_distance , on = 'GameID' , how ='left')

In [None]:
# Drop the intermediate frame and run a garbage-collection pass to release its memory.
del game_distance
gc.collect()

### Distance per player

In [None]:
# Total distance per player: sum play_dis over all plays sharing a PlayerKey.
# GroupBy.sum() takes no `axis` argument, so the stray `axis=1` is removed.
player_distance = (
    play_list.groupby('PlayerKey')['play_dis']
    .sum()
    .reset_index()
    .rename(columns={'play_dis': 'player_dis'})
)

In [None]:
# Preview the first rows of the per-player distance table.
player_distance.head()

In [None]:
# Left join on PlayerKey: every play row receives its player's total distance as player_dis.
play_list = play_list.merge( player_distance , on = 'PlayerKey' , how ='left')

In [None]:
# Drop the intermediate frame and run a garbage-collection pass to release its memory.
del player_distance
gc.collect()

## **Time**

*Time per play*

In [None]:
# For each PlayKey take the 'time' value of the group's last row (in player_data's row order).
# NOTE(review): this equals the play duration only if rows are chronological within a play — confirm.
time_data = player_data.groupby('PlayKey')['time'].apply(lambda x:x.iloc[-1]).reset_index()

In [None]:
# Left join on PlayKey: adds a 'time' column to play_list; plays without a match get NaN.
play_list = play_list.merge( time_data , on = 'PlayKey' , how ='left')

In [None]:
# Drop the intermediate frame and run a garbage-collection pass to release its memory.
del time_data
gc.collect()

In [None]:
# Number of missing values in each play_list column.
play_list.isna().sum()

Time data is not available for the 45 play keys that don't have player track data

**Acceleration**

In [None]:
# Per-sample acceleration: change in speed divided by change in time, computed
# within each play.
#
# The values are produced in player_data's own row order. Iterating
# player_data.groupby('PlayKey') yields groups sorted by key, so concatenating
# per-group results only lines up with the frame's rows if the frame happens to
# be sorted the same way -- and the result is assigned back by row index below.
start_time = time.time()

play_keys = player_data['PlayKey']

# Upcast first so the differences and the division are not carried out in a
# narrow float type (the loader may have stored these columns as float16).
delta_speed = player_data['s'].astype(np.float32).groupby(play_keys, sort=False).diff()
delta_time = player_data['time'].astype(np.float32).groupby(play_keys, sort=False).diff()
dists = delta_speed / delta_time

# The first sample of each play has no predecessor: its acceleration is 0.
dists[~play_keys.duplicated()] = 0

# Downstream cells expect a plain list named rows_list.
rows_list = dists.tolist()

del play_keys, delta_speed, delta_time, dists
print(time.time() - start_time)

In [None]:
# Wrap the collected values in a Series; it gets a default 0..n-1 integer index.
acceleration = pd.Series(rows_list)

In [None]:
# The list is no longer needed once the Series exists; free it and collect garbage.
del rows_list
gc.collect()

In [None]:
# Number of NaN acceleration values before cleaning (infinities are not counted by isna()).
acceleration.isna().sum()

Some acceleration values are 'infinity' or 'NaN'. They arise in the situations below and are substituted with '0':



*   infinity (1/0)(Change in speed / no change in time)
*   NaN (0/0) (No change in speed / no change in time)







In [None]:
# Turn +/- infinity into NaN so one fillna() call can handle both cases.
acceleration = acceleration.replace([np.inf, -np.inf], np.nan)

In [None]:
# Replace every NaN (including the former infinities) with 0, in place.
acceleration.fillna(0 , inplace = True)

In [None]:
# Sanity check: no NaN values should remain after the fill.
acceleration.isna().sum()

In [None]:
# Store acceleration as column 'a'. Assigning a Series aligns on index labels,
# so this relies on player_data having the same 0..n-1 index as `acceleration`.
player_data['a'] = acceleration

In [None]:
# The standalone Series is now redundant with player_data['a']; free it and collect garbage.
del acceleration
gc.collect()

In [None]:
# Preview the tracking data, now including the new 'a' column.
player_data.head()

## **Pitch location**


Divide the pitch into middle and wide regions along the y-axis to identify differences between middle and wide player movements



*   *Lower Wide* (0-17.76 yards)
*   *Middle* (17.77-35.6 yards)
*   *Upper wide* (35.7 -53.5 yards)





In [None]:
#start = time.time()
#player_data['PlayLoc'] = np.where((player_data.y >= 17.7) & (player_data.y <= 35.6) ,'Middle' , 'Wide' )
#print('time',time.time() - start)

In [None]:
#start = time.time()
#player_data['PlayLoc'] = np.where((player_data.x < 40) ,'Left' , np.where(player_data.x > 80 , 'Right' ,'Middle') )
#print('time',time.time() - start)

## **Merge datasets**

In [None]:
# Attach play-level attributes to every injury record, then attach every
# tracking sample of the injured play. Both joins are left joins on PlayKey,
# so injuries without a matching play or tracking data are kept.
inj_merge = (
    injury_record
    .merge(play_list, how='left', on='PlayKey')
    .merge(player_data, how='left', on='PlayKey')
)

# **EDA**

## **Player movements in injured Plays**

In [None]:
# Two stacked density maps of tracked (x, y) positions during injured plays,
# one per playing surface.
fig, axes = plt.subplots(2, 1)
fig.set_size_inches(12, 19)

fig.suptitle(' Player movement in injured Plays', fontsize=16)

for ax, surface in zip(axes, ('Natural', 'Synthetic')):
    # Drop incomplete rows jointly so x and y stay paired; dropping each column
    # on its own can leave the two vectors with different lengths. The cast
    # avoids handing narrow float types to the density estimator.
    coords = (
        inj_merge.loc[inj_merge.Surface == surface, ['x', 'y']]
        .dropna()
        .astype(float)
    )
    # Current seaborn requires x/y as keywords and uses `fill` (a real boolean)
    # in place of the old `shade` flag.
    sns.kdeplot(x=coords['x'], y=coords['y'], ax=ax, fill=True, color='red').set_title(surface)

# To clamp a panel to the field, use ax.set_xlim(0, 120) and ax.set_ylim(0, 53.3).

plt.show()

## **Number of plays per surface**

In [None]:
# Share of plays on each field type, as a percentage of all counted plays.
play_list.FieldType.value_counts().pipe(lambda counts: counts / counts.sum() * 100)

In [None]:
# Bar chart of the number of plays on each field type.
# `sort_columns` is no longer accepted by pandas plotting (and never affected a
# Series plot), so it is dropped. The axis labels previously described the
# wrong quantities: x holds the surface, y the number of plays.
play_list.FieldType.value_counts().plot(
    kind='bar',
    figsize=(15, 7),
    rot=0,
    title='Number of Plays per surface',
).set(xlabel="Surface", ylabel="Number of plays")
plt.show()

## **Number of injuries per surface**

In [None]:
# Number of injuries per surface.
# `sort_columns` is no longer accepted by pandas plotting (and never affected a
# Series plot), so it is dropped.
injury_record.Surface.value_counts().plot(
    kind='bar',
    rot=0,
    figsize=(15, 7),
    title='Number of injuries per Surface',
).set(xlabel="Surface", ylabel="Number of injuries")

plt.show()

In [None]:
# Injury counts per surface, as numbers rather than bars.
injury_record.Surface.value_counts()

### *46,799 (17.52 %) more plays on natural, but 8 more injuries on synthetic, which indicates that an injury is more likely on synthetic*

### Number of injuries per body part

In [None]:
# Number of injuries per body part.
# `sort_columns` is no longer accepted by pandas plotting (and never affected a
# Series plot), so it is dropped.
injury_record.BodyPart.value_counts().plot(
    kind='bar',
    rot=0,
    figsize=(15, 7),
    title='Number of injuries per body part',
).set(xlabel="Body part", ylabel="Number of injuries")

plt.show()

## Body part by surface

In [None]:
# Grouped bar chart: injuries per body part, one group of bars per surface.
# The PlayerKey column of the grouped count is used as the tally.
body_part_counts = (
    injury_record.groupby(['Surface', 'BodyPart'])
    .count()
    .unstack('BodyPart')['PlayerKey']
    # Explicit column ordering replaces the removed `sort_columns=True` plot option.
    .sort_index(axis=1)
)
ax = body_part_counts.plot(kind='bar', rot=0, figsize=(15, 7), alpha=0.8, title='Body Part by Surface')

# Write each bar's height just above the bar.
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() * 1.0, p.get_height() * 1.015))
plt.show()

### **Ankle** - *8 more in synthetic. Why?*
### **Knee** - *Although both surfaces have 24 injuries, the 17.52% more plays on natural indicate that knee injuries are more likely on synthetic as well*

### **Toes** - *5 out of 6 in synthetic!*

## **Length of injury**:

It's difficult to draw conclusions about the length of injury, as that requires context such as


*   how injury prone a player is?
*   previous records
*   how exactly the injury occurred?



In [None]:
# Injury duration categories per surface, as grouped bars.
duration_counts = (
    injury_record.groupby('Surface')['InjuryDuration']
    .value_counts()
    .unstack('InjuryDuration')
    # Explicit column ordering replaces the removed `sort_columns=True` plot option.
    .sort_index(axis=1)
)
duration_counts.plot(
    kind='bar',
    rot=0,
    figsize=(15, 7),
    alpha=0.8,
    title='Injury duration per surface',
).set(ylabel='Count')
plt.show()

In [None]:
# One row per injury with its play-level attributes. Joining on all three keys avoids
# suffixed duplicate PlayerKey/GameID columns; unmatched injuries keep NaN play columns.
inj_play = injury_record.merge(play_list , on = ['PlayerKey','GameID','PlayKey'],how = 'left')

In [None]:
def fill_unkown(game_id, column):
    """Look up a game-level value of ``column`` for ``game_id`` in ``play_list``.

    Used to back-fill injury rows that have no play attached: the value is
    taken from any play of the same game.

    Args:
        game_id: GameID to look up.
        column: Name of the ``play_list`` column to read.

    Returns:
        The first value for that game that is neither missing nor an
        "unknown" placeholder (compared case-insensitively, since this file
        writes the placeholder in lowercase). If the game only has
        missing/unknown values, its first value is returned as stored. If the
        game does not occur in ``play_list`` at all, the string 'Unknown' is
        returned.
    """
    game_values = play_list.loc[play_list.GameID == game_id, column]
    if game_values.empty:
        return 'Unknown'

    is_known = game_values.notna() & (game_values.astype(str).str.lower() != 'unknown')
    known_values = game_values[is_known]
    if known_values.empty:
        return game_values.iloc[0]
    return known_values.iloc[0]

In [None]:
# Back-fill play attributes for injury rows that had no matching play, using
# another play from the same game.
handle_cols = ['roof', 'Temperature', 'Weather', 'RosterPosition', 'PlayerDay', 'PlayerGame', 'Position', 'PositionGroup']
for col in handle_cols:
    missing = inj_play[col].isna()
    if not missing.any():
        continue
    # Write through a single .loc call: the chained form `df[col].loc[mask] = ...`
    # assigns into a temporary and is not guaranteed to reach the frame.
    # NOTE(review): a 'Unknown' fallback written into a numeric column would
    # change that column's dtype — confirm every injury GameID is in play_list.
    inj_play.loc[missing, col] = [
        fill_unkown(game_id, col) for game_id in inj_play.loc[missing, 'GameID']
    ]

In [None]:
# Horizontal bar charts of play-type frequency: plays with an injury (top)
# versus plays with no recorded injury (bottom).
fig, axes = plt.subplots(2, 1, figsize=(15, 17))
fig.suptitle('Play type - injury vs non injury Plays', fontsize=16)

injured_counts = inj_play.PlayType.value_counts().sort_values()
injured_counts.plot(kind='barh', ax=axes[0], title='Injury').set(
    xlabel="Number of Plays", ylabel="Play type")

no_injury_plays = play_list[~play_list.PlayKey.isin(injury_record.PlayKey)]
no_injury_counts = no_injury_plays.PlayType.value_counts().sort_values()
no_injury_counts.plot(kind='barh', ax=axes[1], title='No injury').set(
    xlabel="Number of plays", ylabel="Play Type")

plt.subplots_adjust(hspace=0.35)
plt.show()

## **Players who play at quarterback and kicker did not get any injury in 2 seasons**

In [None]:
# Horizontal bar charts of roster-position frequency: plays with an injury
# (top) versus plays with no recorded injury (bottom).
fig, axes = plt.subplots(2, 1, figsize=(15, 17))
fig.suptitle('Roster position - injury vs non injury Plays', fontsize=16)

# The y axis lists roster positions; the labels previously said "Play type".
inj_play.RosterPosition.value_counts().sort_values().plot(
    kind='barh', ax=axes[0], title='Injury',
).set(xlabel="Number of Plays", ylabel="Roster position")

no_injury_plays = play_list[~play_list.PlayKey.isin(injury_record.PlayKey)]
no_injury_plays.RosterPosition.value_counts().sort_values().plot(
    kind='barh', ax=axes[1], title='No injury',
).set(xlabel="Number of plays", ylabel="Roster position")

plt.subplots_adjust(hspace=0.35)
plt.show()

## **Cornerback is one of the least injured positions on natural, while it's the 2nd most injured position on synthetic with about 4000 fewer plays**

Natural - 2 injuries out of 16800 plays at cornerback

Synthetic - 11 injuries out of 12187 plays at cornerback

In [None]:
# Roster-position counts per surface: injured plays (top) versus plays with no
# recorded injury (bottom). Both panels share one legend.
fig, axes = plt.subplots(2, 1, figsize=(15, 17))
fig.suptitle('Roster position difference between surface - injury vs non injury Plays', fontsize=16)

# sort_index(axis=1) replaces the removed `sort_columns=True` plot option.
injured_by_surface = (
    inj_play.groupby('Surface')['RosterPosition']
    .value_counts()
    .unstack()
    .sort_index(axis=1)
)
injured_by_surface.plot(kind='bar', rot=0, ax=axes[0], title='Injured', legend=False)

# One combined mask instead of chained boolean indexing, which applied a mask
# built from the full frame to an already-filtered frame.
no_injury_mask = (
    ~play_list.PlayKey.isin(injury_record.PlayKey)
    & ~play_list.RosterPosition.isin(['Quarterback', 'Kicker'])
)
no_injury_by_surface = (
    play_list[no_injury_mask]
    .groupby('FieldType')['RosterPosition']
    .value_counts()
    .unstack()
    .sort_index(axis=1)
)
no_injury_by_surface.plot(kind='bar', rot=0, ax=axes[1], title='No injury', legend=False)

plt.subplots_adjust(hspace=0.35)
handles, labels = axes[1].get_legend_handles_labels()
fig.legend(handles, labels, loc='right')
plt.show()

## **Cornerbacks with 7 ankle injuries in synthetic and none in Natural**

In [None]:
# Injured body part per roster position, one panel per surface. Heel injuries
# are excluded from both panels. Both panels share one legend.
fig, axes = plt.subplots(2, 1, figsize=(20, 14))
fig.suptitle('Roster Position', fontsize=16)

for ax, surface in zip(axes, ('Natural', 'Synthetic')):
    selected = inj_play[(inj_play.Surface == surface) & (inj_play.BodyPart != 'Heel')]
    body_part_counts = (
        selected.groupby('RosterPosition')['BodyPart']
        .value_counts()
        .unstack()
        # Explicit column ordering replaces the removed `sort_columns=True` plot option.
        .sort_index(axis=1)
    )
    body_part_counts.plot(kind='bar', rot=0, ax=ax, title=surface, legend=False)

plt.subplots_adjust(hspace=0.35)
handles, labels = axes[1].get_legend_handles_labels()
fig.legend(handles, labels, loc='right')
plt.show()

## **The higher the variety in play, the greater the chance of injury. For example, quarterbacks, who are only involved in pass and rush plays, never got any injury**

In [None]:
# Stacked horizontal bars: how many plays of each type every roster position
# took part in, across all surfaces.
play_type_counts = (
    play_list.groupby('RosterPosition')['PlayType']
    .value_counts()
    .unstack()
    # Explicit column ordering replaces the removed `sort_columns=True` plot option.
    .sort_index(axis=1)
)
# The data is not filtered by surface, so the old title 'Synthetic' was wrong.
play_type_counts.plot(
    kind='barh',
    rot=0,
    stacked=True,
    figsize=(20, 14),
    title='Play type by roster position',
)

## **Events**

## **Cornerbacks on synthetic are involved in a higher number of events than on natural, which supports the logic of 'the higher the variety of plays and events involved, the higher the chance of injury'**

In [None]:
# Event frequencies in the tracking samples of injured cornerback plays on synthetic turf.
inj_merge[(inj_merge.RosterPosition == 'Cornerback') & (inj_merge.Surface == 'Synthetic')]['event'].value_counts()

In [None]:
# Event frequencies in the tracking samples of injured cornerback plays on natural turf.
inj_merge[(inj_merge.RosterPosition == 'Cornerback') & (inj_merge.Surface == 'Natural')]['event'].value_counts()

## Weather

In [None]:
# Weather bucket frequencies among injury records.
inj_play.Weather.value_counts().plot(kind ='bar' ,rot =0,figsize =(15,7))

In [None]:
# Weather bucket frequencies among all plays, for comparison with the injury chart.
play_list.Weather.value_counts().plot(kind ='bar' ,rot =0,figsize =(15,7))

### Clear weather has less number of plays but more injuries

### **Cornerbacks have been the anomaly so far. Let's check the difference in cornerback player movement between synthetic and natural**

In [None]:
#Utility functions

def plt_var(var, label, nat_playkey, syn_playkey):
    """Plot the distribution of one tracking column for two sets of plays.

    Draws two side-by-side panels titled 'Natural' and 'Synthetic'. Each panel
    shows the distribution of ``player_data[var]`` restricted to the samples
    whose PlayKey is in the corresponding key collection.

    Args:
        var: Name of the ``player_data`` column to plot.
        label: Used as the figure title and as the x-axis label of both panels.
        nat_playkey: PlayKeys shown in the left ('Natural') panel.
        syn_playkey: PlayKeys shown in the right ('Synthetic') panel.
    """
    fig, axes = plt.subplots(1, 2, figsize=(20, 7))
    fig.suptitle(label, fontsize=16)

    panels = (('Natural', nat_playkey), ('Synthetic', syn_playkey))
    for ax, (surface, keys) in zip(axes, panels):
        samples = player_data.loc[player_data.PlayKey.isin(keys), var]
        sns.distplot(samples, ax=ax).set_title(surface)
        ax.set(xlabel=label, ylabel='density')

    plt.subplots_adjust(wspace=0.35)
    plt.show()

def plt_std_var(var, title, label, nat_playkey, syn_playkey):
    """Plot the distribution of a column's per-play standard deviation.

    Draws two side-by-side panels titled 'Natural' and 'Synthetic'. For each
    panel the samples whose PlayKey is in the corresponding key collection are
    grouped by PlayKey, and the distribution of the per-play standard
    deviation of ``player_data[var]`` is plotted.

    Args:
        var: Name of the ``player_data`` column to analyse.
        title: Figure title.
        label: x-axis label of both panels.
        nat_playkey: PlayKeys shown in the left ('Natural') panel.
        syn_playkey: PlayKeys shown in the right ('Synthetic') panel.
    """
    fig, axes = plt.subplots(1, 2, figsize=(20, 7))
    fig.suptitle(title, fontsize=16)

    panels = (('Natural', nat_playkey), ('Synthetic', syn_playkey))
    for ax, (surface, keys) in zip(axes, panels):
        selected = player_data[player_data.PlayKey.isin(keys)]
        per_play_std = selected.groupby('PlayKey')[var].std()
        sns.distplot(per_play_std, ax=ax).set_title(surface)
        ax.set(xlabel=label, ylabel='density')

    plt.subplots_adjust(wspace=0.35)
    plt.show()


In [None]:
# PlayKeys of all cornerback plays, split by field type.
cornerback_plays = play_list[play_list.RosterPosition == 'Cornerback']
cb_nat_playkey = cornerback_plays[cornerback_plays.FieldType == 'Natural'].PlayKey
cb_syn_playkey = cornerback_plays[cornerback_plays.FieldType == 'Synthetic'].PlayKey

## Speed

In [None]:
# Distribution of column 's' over all cornerback tracking samples, per surface.
plt_var('s' , 'Speed',cb_nat_playkey , cb_syn_playkey)

## Variation in speed( Measure using Standard deviation)

In [None]:
# Distribution of the per-play standard deviation of 's' for cornerbacks, per surface.
plt_std_var('s' , 'Variation in speed','Speed',cb_nat_playkey , cb_syn_playkey)

## *Acceleration*

In [None]:
# Distribution of the derived column 'a' over all cornerback tracking samples, per surface.
plt_var('a' , 'Acceleration',cb_nat_playkey , cb_syn_playkey)

## Variation in acceleration

In [None]:
# Distribution of the per-play standard deviation of 'a' for cornerbacks, per surface.
plt_std_var('a' , 'Variation in acceleration','Acceleration',cb_nat_playkey , cb_syn_playkey)

## Orientation

In [None]:
# Distribution of column 'o' over all cornerback tracking samples, per surface.
plt_var('o' , 'Orientation',cb_nat_playkey , cb_syn_playkey)

## Variation in Orientation

In [None]:
# Distribution of the per-play standard deviation of 'o' for cornerbacks, per surface.
plt_std_var('o' , 'Variation in Orientation','Orientation',cb_nat_playkey , cb_syn_playkey)

### There isn't any noticeable difference in player movement between natural and synthetic for cornerbacks. We could conclude that, in the case of the cornerback injury difference, it's the variation of play types and events that led to injury