In [2]:
import numpy as np
import pandas as pd
from scipy.stats import norm
from sklearn.neighbors import KernelDensity
import pickle
import seaborn as sns

from matplotlib import pyplot as plt
PATH = '../data/'
%matplotlib inline

# Functions

# Parse data

In [3]:
df = pd.read_csv(PATH + 'train.csv', low_memory=False)
df.head()

Unnamed: 0,GameId,PlayId,Team,X,Y,S,A,Dis,Orientation,Dir,...,Week,Stadium,Location,StadiumType,Turf,GameWeather,Temperature,Humidity,WindSpeed,WindDirection
0,2017090700,20170907000118,away,73.91,34.84,1.69,1.13,0.4,81.99,177.18,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
1,2017090700,20170907000118,away,74.67,32.64,0.42,1.35,0.01,27.61,198.7,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
2,2017090700,20170907000118,away,74.0,33.2,1.22,0.59,0.31,3.01,202.73,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
3,2017090700,20170907000118,away,71.46,27.7,0.42,0.54,0.02,359.77,105.64,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
4,2017090700,20170907000118,away,69.32,35.42,1.82,2.43,0.16,12.63,164.31,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509762 entries, 0 to 509761
Data columns (total 49 columns):
GameId                    509762 non-null int64
PlayId                    509762 non-null int64
Team                      509762 non-null object
X                         509762 non-null float64
Y                         509762 non-null float64
S                         509762 non-null float64
A                         509762 non-null float64
Dis                       509762 non-null float64
Orientation               509744 non-null float64
Dir                       509748 non-null float64
NflId                     509762 non-null int64
DisplayName               509762 non-null object
JerseyNumber              509762 non-null int64
Season                    509762 non-null int64
YardLine                  509762 non-null int64
Quarter                   509762 non-null int64
GameClock                 509762 non-null object
PossessionTeam            509762 non-null object
Down   

# Single feature analysis

## Objects

In [34]:
df_play = df.loc[df.NflId == df.NflIdRusher]
object_cols = np.array(df.dtypes[df.dtypes == 'object'].index)
object_cols

array(['Team', 'DisplayName', 'GameClock', 'PossessionTeam',
       'FieldPosition', 'OffenseFormation', 'OffensePersonnel',
       'DefensePersonnel', 'PlayDirection', 'TimeHandoff', 'TimeSnap',
       'PlayerHeight', 'PlayerBirthDate', 'PlayerCollegeName', 'Position',
       'HomeTeamAbbr', 'VisitorTeamAbbr', 'Stadium', 'Location',
       'StadiumType', 'Turf', 'GameWeather', 'WindSpeed', 'WindDirection'],
      dtype=object)

### Teams features

In [37]:
df.Team.unique()

array(['away', 'home'], dtype=object)

In [40]:
df.FieldPosition.unique()

array(['NE', 'KC', nan, 'BUF', 'NYJ', 'ATL', 'CHI', 'CIN', 'BLT', 'CLV',
       'PIT', 'ARZ', 'DET', 'JAX', 'HST', 'TEN', 'OAK', 'WAS', 'PHI',
       'LA', 'IND', 'GB', 'SEA', 'CAR', 'SF', 'DAL', 'NYG', 'NO', 'MIN',
       'DEN', 'LAC', 'TB', 'MIA'], dtype=object)

In [14]:
df.OffenseFormation.unique()

array(['SHOTGUN', 'SINGLEBACK', 'JUMBO', 'PISTOL', 'I_FORM', 'ACE',
       'WILDCAT', nan, 'EMPTY'], dtype=object)

In [15]:
df.OffensePersonnel.unique()

array(['1 RB, 1 TE, 3 WR', '6 OL, 2 RB, 2 TE, 0 WR', '1 RB, 3 TE, 1 WR',
       '1 RB, 2 TE, 2 WR', '6 OL, 1 RB, 2 TE, 1 WR', '2 RB, 1 TE, 2 WR',
       '2 RB, 2 TE, 1 WR', '0 RB, 3 TE, 2 WR', '0 RB, 1 TE, 4 WR',
       '6 OL, 1 RB, 0 TE, 3 WR', '6 OL, 1 RB, 1 TE, 2 WR',
       '1 RB, 2 TE, 1 WR,1 DL', '1 RB, 3 TE, 0 WR,1 DL',
       '1 RB, 0 TE, 4 WR', '1 RB, 1 TE, 2 WR,1 DL',
       '6 OL, 2 RB, 0 TE, 2 WR', '2 RB, 0 TE, 3 WR',
       '6 OL, 2 RB, 1 TE, 1 WR', '7 OL, 1 RB, 0 TE, 2 WR',
       '7 OL, 2 RB, 0 TE, 1 WR', '7 OL, 1 RB, 2 TE, 0 WR',
       '2 RB, 3 TE, 0 WR', '3 RB, 1 TE, 1 WR', '6 OL, 1 RB, 3 TE, 0 WR',
       '6 OL, 1 RB, 2 TE, 0 WR,1 DL', '2 RB, 3 TE, 1 WR',
       '6 OL, 1 RB, 1 TE, 1 WR,1 DL', '1 RB, 4 TE, 0 WR',
       '1 RB, 2 TE, 1 WR,1 LB', '1 RB, 3 TE, 0 WR,1 LB',
       '7 OL, 2 RB, 1 TE, 0 WR', '0 RB, 2 TE, 3 WR',
       '1 RB, 0 TE, 3 WR,1 DB', '6 OL, 1 RB, 2 TE, 0 WR,1 LB',
       '1 RB, 1 TE, 2 WR,1 DB', '0 RB, 0 TE, 5 WR', '1 RB, 2 TE, 3 WR',
       '1 RB, 

In [16]:
df.PlayDirection.unique()

array(['left', 'right'], dtype=object)

In [39]:
df.Position.unique(), df_play.Position.unique()

(array(['SS', 'DE', 'ILB', 'FS', 'CB', 'DT', 'WR', 'TE', 'T', 'QB', 'RB',
        'G', 'C', 'OLB', 'NT', 'FB', 'MLB', 'LB', 'OT', 'OG', 'HB', 'DB',
        'S', 'DL', 'SAF'], dtype=object),
 array(['RB', 'WR', 'FB', 'HB', 'QB', 'TE', 'CB', 'DT', 'DE'], dtype=object))

In [50]:
team_enc = df_play.Team.apply(lambda x: 0 if x == 'home' else 1)
direction_enc = df_play.PlayDirection.apply(lambda x: 0 if x == 'left' else 1)
(direction_enc - team_enc).value_counts()

 0    11576
 1     5869
-1     5726
dtype: int64

### Time

In [22]:
(pd.to_datetime(df.TimeHandoff) - pd.to_datetime(df.TimeSnap)).sort_values().unique()/1e9

array([0, 1, 2, 3, 4, 5, 7], dtype='timedelta64[ns]')

### Conditions

In [24]:
df.Turf.unique()

array(['Field Turf', 'A-Turf Titan', 'Grass', 'UBU Sports Speed S5-M',
       'Artificial', 'DD GrassMaster', 'Natural Grass',
       'UBU Speed Series-S5-M', 'FieldTurf', 'FieldTurf 360',
       'Natural grass', 'grass', 'Natural', 'Artifical', 'FieldTurf360',
       'Naturall Grass', 'Field turf', 'SISGrass',
       'Twenty-Four/Seven Turf', 'natural grass'], dtype=object)

In [25]:
df.GameWeather.unique()

array(['Clear and warm', 'Sun & clouds', 'Sunny', 'Controlled Climate',
       'Mostly Sunny', 'Clear', nan, 'Indoor', 'Mostly Cloudy',
       'Mostly Coudy', 'Partly sunny', 'Partly Cloudy', 'Cloudy',
       'Sunny, highs to upper 80s', 'Indoors', 'Light Rain', 'Showers',
       'Partly cloudy', 'Partly Sunny', '30% Chance of Rain',
       'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
       'Rain', 'Cloudy, fog started developing in 2nd quarter', 'Coudy',
       'Rain likely, temps in low 40s.', 'Cold', 'N/A (Indoors)',
       'Clear skies', 'cloudy', 'Fair', 'Mostly cloudy',
       'Cloudy, chance of rain', 'Heavy lake effect snow', 'Party Cloudy',
       'Cloudy, light snow accumulating 1-3"', 'Cloudy and cold', 'Snow',
       'Hazy', 'Scattered Showers', 'Cloudy and Cool', 'N/A Indoor',
       'Rain Chance 40%', 'Clear and sunny', 'Mostly sunny',
       'Sunny and warm', 'Partly clear', 'Cloudy, 50% change of rain',
       'Clear and Sunny', '

In [26]:
df.WindSpeed.unique()

array(['8', '6', '10', '9', '11', nan, '7', '5', '2', '12', '1', '3', '4',
       '13', '0', 'SSW', '14', '15', '17', '18', '16', '11-17', '23',
       '14-23', '13 MPH', '24', '12-22', '4 MPh', '15 gusts up to 25',
       '10MPH', '10mph', '22', 'E', '7 MPH', 'Calm', '6 mph', '19', 'SE',
       '20', '10-20', '12mph'], dtype=object)

In [27]:
df.WindDirection.unique()

array(['SW', 'NNE', 'SE', 'East', nan, 'NE', 'North', 'S', 'Northwest',
       'SouthWest', 'ENE', 'ESE', 'SSW', 'NW', 'Northeast', 'From S', 'W',
       'South', 'West-Southwest', 'E', '13', 'N', 'NNW',
       'South Southeast', 'SSE', 'West', 'WSW', 'From SW', 'WNW', 's',
       'NorthEast', 'from W', 'W-NW', 'South Southwest', 'Southeast',
       'From WSW', 'West Northwest', 'Calm', 'From SSE', 'From W',
       'East North East', 'From ESE', 'EAST', 'East Southeast',
       'From SSW', '8', 'North East', 'Southwest', 'North/Northwest',
       'From NNE', '1', 'N-NE', 'W-SW', 'From NNW'], dtype=object)