# NFL Punt Analytics Competition - Starter EDA

In this competition we are tasked with analyzing punt plays for player safety and proposing rule changes.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pylab as plt
import os
import seaborn as sns
sns.set_style('whitegrid')

# For interactive plots
from plotly import offline
import plotly.graph_objs as go


# Show every column when a DataFrame is displayed. The full option key is
# used because pandas resolves abbreviated keys by pattern matching, which
# fails once more than one option matches the pattern.
pd.set_option('display.max_columns', None)
# Initialise plotly's offline mode for notebook rendering.
offline.init_notebook_mode()
# Plotly config dict with showLink disabled (its use is not shown in this section).
config = dict(showLink=False)

In [None]:
# Load every input CSV into its own DataFrame.
# The names on the left line up, in order, with the paths listed below.
ppd, gd, pprd, vr, vfi, pi = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/player_punt_data.csv',
        '../input/game_data.csv',
        '../input/play_player_role_data.csv',
        '../input/video_review.csv',
        '../input/video_footage-injury.csv',
        '../input/play_information.csv',
    )
)

In [None]:
# Build on a separate variable, concussive_games, so that vr stays untouched
# for later use. An explicit copy is taken because plain assignment would only
# bind a second name to the same DataFrame object.
concussive_games = vr.copy()
concussive_games.head()

There are only 37 games with a concussion, but a lot of extra data. We still need to add more information about each game.

In [None]:
# Attach game-level details to every concussion record.
# The game data specifies the type of season (pre, reg, post), week, and
# hosting city and team; each game is uniquely identified across all seasons
# by GameKey.
# No join keys are passed, so pandas joins on the columns both frames share.
conGD = concussive_games.merge(gd)
conGD.head()

In [None]:
# Let's verify (via the summary's "count" row) that we still have only
# 37 games to deal with after the merge.
conGD.describe()

In [None]:
# Concussions by season year.
# catplot is used instead of the deprecated (and later removed) factorplot,
# and x is passed by keyword because recent seaborn releases no longer accept
# plot variables positionally.
sns.catplot(x='Season_Year', data=conGD, kind='count')

In [None]:
# Concussions by season year, split by season type:
# preseason, regular season and post season.
# catplot is used instead of the deprecated (and later removed) factorplot,
# and x is passed by keyword because recent seaborn releases no longer accept
# plot variables positionally.
sns.catplot(x='Season_Year', data=conGD, hue='Season_Type', kind='count')

There are no Post Season concussions during punts in 2016 or 2017. 

In [None]:
# Let's look at the play information table,
# which gives specific detail about each play.
pi.tail()

In [None]:
# Summary statistics for the play information table.
pi.describe()

In [None]:
# Punts by season year and season type: preseason, regular season and
# post season.
# catplot is used instead of the deprecated (and later removed) factorplot,
# and x is passed by keyword because recent seaborn releases no longer accept
# plot variables positionally.
sns.catplot(x='Season_Year', data=pi, hue='Season_Type', kind='count')

Interestingly, there were more punts in 2017, yet there were fewer concussions.

In [None]:
# After testing different combinations I found duplicated GSISID values in
# the player data; the player number differs between the duplicates, so I'm
# looking at it here.
# NOTE(review): head(20) runs before the sort, so only the first 20 rows of
# ppd are shown (ordered by GSISID), not the 20 lowest GSISIDs overall.
ppd.head(20).sort_values('GSISID')

In [None]:
# Show the rows whose GSISID already appeared earlier in the table,
# ordered by GSISID.
ppd.loc[ppd.duplicated(subset=['GSISID'])].sort_values(by='GSISID')

In [None]:
# Pull every row for GSISID 33941 to see why this player is duplicated
# when brought into the concussive plays.
ppd[ppd['GSISID'].eq(33941)]

What is going on? There are 817 rows of duplicated players.
On closer inspection, some players have letters in their numbers.
But in the case of 33941, the player has two different numbers.

In [None]:
# Remove the letters from the Number column, keeping only the digit groups.
# One pass over the column is enough, so no outer loop over the DataFrame's
# column labels is needed. The pattern is a raw string so its backslashes are
# read as regex escapes rather than string escapes.
import re
ppd['Number'] = [''.join(re.findall(r"\d*\.?\d+", item)) for item in ppd['Number']]

In [None]:
# Re-inspect GSISID 33941 to see whether the characters were removed
# from the Number column.
ppd[ppd['GSISID'].eq(33941)]

In [None]:
# After the cleanup, list the rows whose GSISID already appeared earlier.
ppd.loc[ppd.duplicated(subset=['GSISID'])]

Great. But now we need to merge the duplicated data so we can focus on solving the actual problem.

In [None]:
# There are still issues, such as a 0 before a 3, that need to be removed.
# Only leading zeros are stripped (e.g. "03" -> "3"): replacing every '0'
# would also corrupt numbers such as "10" or "80". The lookahead requires a
# following digit, so a lone "0" is left intact.
ppd['Number'] = ppd['Number'].str.replace(r'^0+(?=\d)', '', regex=True)

In [None]:
# Re-inspect GSISID 33941 after the zero clean-up of the Number column.
ppd[ppd['GSISID'].eq(33941)]

In [None]:
# After all that cleanup it is clear that the number is meaningless, so
# remove the jersey number column.
# `columns=` is used because recent pandas releases no longer accept the axis
# as a positional argument to drop().
ppdDrop = ppd.drop(columns='Number')

In [None]:
# Preview the player table now that the Number column is gone.
ppdDrop.head()

In [None]:
# Add the player data to the concussion + game table.
# No join keys are passed, so the join uses the columns both frames share.
# The hope is that we still get 38 rows; check the summary below.
conPlayer = conGD.merge(ppdDrop)
conPlayer.describe()

In [None]:
# Rows whose (PlayID, GSISID) pair already appeared earlier in conPlayer.
conPlayer.loc[conPlayer.duplicated(subset=['PlayID', 'GSISID'])]

We have 57 duplications. What I need to do is remove the duplicates that share the same PlayID, because there cannot be duplicate plays.

In [None]:
# Discard rows without a PlayID, then keep the first row for each
# (PlayID, GSISID) pair.
# NOTE(review): the key does not include GameKey; if PlayID values can repeat
# across games this could merge distinct plays -- confirm against the data.
conPlayer = conPlayer.dropna(subset=['PlayID'])
conPlayer = conPlayer.drop_duplicates(subset=['PlayID', 'GSISID'])

In [None]:
# Summary statistics after de-duplication; the "count" row shows how many
# rows remain.
conPlayer.describe()

In my cleanup I found that there was one occasion where the same play had two players concussed. 

In [None]:
# Number of concussed players per position.
# catplot is used instead of the deprecated (and later removed) factorplot,
# and x is passed by keyword because recent seaborn releases no longer accept
# plot variables positionally.
sns.catplot(x='Position', data=conPlayer, kind='count')

The numbers are small, so no single position stands out as the one that gets injured.
It would be interesting to see whether these players were special-teams-only players or played full time.
For example: was the player otherwise only on defense?
Reason: they may be "cold", i.e. not warmed up and coming right off the bench to perform.




In [None]:
# Count of concussed players per position, one panel per value of
# Player_Activity_Derived.
sns.catplot(
    x="Position",
    col="Player_Activity_Derived",
    kind="count",
    data=conPlayer,
)

**Player_Activity_Derived** - player activity during the primary injury-causing event:

- Blocked: player was blocked
- Blocking: player was blocking
- Tackled: player was tackled
- Tackling: player was tackling
- Diving/Leaping: player was diving or leaping
- Other: other activity

In [None]:
# Number of concussions per primary-partner activity.
# x is passed by keyword, matching the other catplot calls in this notebook;
# recent seaborn releases no longer accept plot variables positionally.
sns.catplot(x='Primary_Partner_Activity_Derived', data=conPlayer, kind='count')

**Primary_Partner_Activity_Derived** - categorical variable describing the primary partner's activity at the time of causing the concussion:

- Blocked: partner was blocked
- Blocking: partner was blocking
- Tackled: partner was tackled
- Tackling: partner was tackling
- Diving/Leaping: partner was diving or leaping
- Other: other activity

In [None]:
# Count of concussed players per position, one panel per value of
# Primary_Partner_Activity_Derived.
sns.catplot(
    x="Position",
    col="Primary_Partner_Activity_Derived",
    kind="count",
    data=conPlayer,
)

Player GSISID vs. primary partner GSISID
- Can I see the position of each?
- Compare GSISID vs. Primary_Partner_GSISID
- Together with Primary_Partner_Activity_Derived
- (Position is held in player_punt_data)


In [None]:
# conPlayer has the primary partner's GSISID, while ppdDrop has the player
# positions. Relabel the existing Position column as Concussed_Position.
conPlayer = conPlayer.rename({'Position': 'Concussed_Position'}, axis='columns')
conPlayer.head()

In [None]:
# Naming scheme: "Concussed_*" columns describe the concussed player,
# "Enemy_*" columns describe the partner.
# Here the concussed player's GSISID column is relabelled.
conPlayer1 = conPlayer.rename({'GSISID': 'Concussed_GSISID'}, axis='columns')
conPlayer1.head()

In [None]:
# Relabel Primary_Partner_GSISID as GSISID in a new frame, conPlayer2.
conPlayer2 = conPlayer1.rename({'Primary_Partner_GSISID': 'GSISID'}, axis='columns')
conPlayer2.head()

In [None]:
# Drop rows with no usable partner GSISID: some are NaN because the ground
# can cause a concussion. Values that cannot be parsed as numbers are turned
# into NaN first, then the remaining IDs are converted to integers.
conPlayer2 = conPlayer2.assign(
    GSISID=pd.to_numeric(conPlayer2['GSISID'], errors='coerce')
).dropna(subset=['GSISID'])
conPlayer2 = conPlayer2.assign(GSISID=conPlayer2['GSISID'].map(int))
conPlayer2.head()

In [None]:
# Summary statistics after removing rows without a partner GSISID.
conPlayer2.describe()

In [None]:
# Bring in the enemy player's details (ID and position) from ppdDrop;
# this is the player who caused the concussion.
# No join keys are passed, so the join uses the columns both frames share.
conPlayer3 = conPlayer2.merge(ppdDrop)
conPlayer3.head(20)

In [None]:
# Summary statistics after merging in the enemy player's data.
conPlayer3.describe()

In [None]:
# Discard rows without a PlayID, then keep the first row for each
# (PlayID, Concussed_GSISID, GSISID) combination.
conPlayer4 = conPlayer3.dropna(subset=['PlayID'])
conPlayer4 = conPlayer4.drop_duplicates(subset=['PlayID', 'Concussed_GSISID', 'GSISID'])

In [None]:
# Summary statistics after de-duplicating the concussed/enemy pairs.
conPlayer4.describe()

In [None]:
# Relabel the partner's GSISID column as Enemy_GSISID.
conPlayer5 = conPlayer4.rename({'GSISID': 'Enemy_GSISID'}, axis='columns')
conPlayer5.head()

In [None]:
# Label the partner ("enemy") columns: GSISID -> Enemy_GSISID and
# Position -> Enemy_Position. Both renames are applied to conPlayer4 in one
# call; renaming only Position from conPlayer4 would discard the Enemy_GSISID
# rename. The jersey number needs no handling here because ppdDrop no longer
# contains it.
conPlayer5 = conPlayer4.rename(columns={'GSISID': 'Enemy_GSISID', 'Position': 'Enemy_Position'})
conPlayer5.head()

In [None]:
# Summary statistics for the final concussed/enemy table.
conPlayer5.describe()

View Concussed Position vs. Enemy Position


In [None]:
# Count of concussed positions, one panel per enemy position,
# wrapped at four panels per row.
sns.catplot(
    x="Concussed_Position",
    col="Enemy_Position",
    col_wrap=4,
    kind="count",
    data=conPlayer5,
)

In [None]:
# Count of primary-partner activities, one panel per enemy position,
# wrapped at four panels per row.
sns.catplot(
    x="Primary_Partner_Activity_Derived",
    col="Enemy_Position",
    col_wrap=4,
    kind="count",
    data=conPlayer5,
)

In [None]:
# Number of concussions per primary impact type.
# x is passed by keyword, matching the other catplot calls in this notebook;
# recent seaborn releases no longer accept plot variables positionally.
sns.catplot(x="Primary_Impact_Type", data=conPlayer5, kind="count")

The conversion from 1 yard/second to miles/hour is 1 * 2.04545. Since each row covers a tenth of a second, the per-row distance also needs to be multiplied by 10 to get yards per second.

The main function below reads in one of the CSVs and filters for one of the specific punt plays where an injury occurred. It then converts the distance field to MPH and returns the max and mean speeds for the injured player [1st row] and the primary partner [2nd row].

In [None]:
conPlayer5.head()