##### Objective: Make NFL Playcall Predictions
* Will the team on offense choose a rushing or passing play?
* Will the team convert a 1st down?
* On 4th down, will the team punt, attempt a field goal, or go for it?
* Percentage predictions for e.g. Sack, Interception, Touchdown, Safety

##### Needed data cleaning
* Convert plays mis-categorized as clock stops (spikes) because a defender is named "Spikes"
* Pull in another dataset for offensive coordinators and coaches
* Make sure no null RushDirection on any PlayType == RUSH
* Make sure no null PassType on any PlayType == PASS

##### Add columns
* Time remaining in game
* Time remaining in half
* Coach
* Offensive coordinator
* Formation (parse from description)
* Prev PlayType

In [8]:
import pandas as pd

# Location of the 2015 play-by-play CSV, relative to the notebook.
CSV_NAME = 'data/NFLPlaybyPlay2015.csv'

# PlayType values whose rows are removed during cleaning.
DROP_PLAYTYPES = {
    'No Play', 'Kickoff', 'Timeout',
    'Quarter End', 'Two Minute Warning', 'End of Game',
    'Onside Kick', 'Half End', 'Extra Point',
}

# Raw (pre-rename) column names that are dropped during cleaning.
DROP_COLUMNS = [
    'Unnamed: 0', 'Returner', 'BlockingPlayer', 'Tackler1', 'Tackler2',
    'PlayTimeDiff', 'Challenge.Replay', 'ChalReplayResult',
    'Accepted.Penalty', 'PenalizedTeam', 'PenaltyType', 'PenalizedPlayer',
    'Penalty.Yards', 'Season', 'Interceptor', 'Date', 'ExPointResult',
    'Receiver', 'ReturnResult', 'Fumble', 'RecFumbTeam', 'RecFumbPlayer',
    'GoalToGo', 'Drive', 'TwoPointConv', 'DefTwoPoint',
    'FieldGoalResult', 'FieldGoalDistance', 'PuntResult',
]

# Raw column name -> readable CamelCase name.
COL_RENAME_MAP = dict([
    ('sp', 'ScoringPlay'),
    ('desc', 'Description'),
    ('qtr', 'Quarter'),
    ('down', 'Down'),
    ('time', 'Time'),
    ('posteam', 'OffensiveTeam'),
    ('Yards.Gained', 'YardsGained'),
    ('yrdln', 'YardLine'),
    ('yrdline100', 'YardLineOf100'),
    ('ydstogo', 'YardsToGo'),
    ('ydsnet', 'YardsNet'),
])

# Post-rename columns that must be non-null for a row to be kept.
DROPNA_COLS = ['ScoreDiff', 'Time', 'Down', 'FirstDown']

# Predictor columns (post-rename names) handed to the model.
# TODO: add "GameTimeRemaining", "Coach" and "Offensive Coordinator" columns.
# NOTE(review): in the sample output below, Passer is NaN on Run plays and
# filled on most Pass plays, so it may leak the response -- confirm before
# fitting a model.
FEATURE_COLS = [
    'Down', 'Time', 'YardsToGo',
    'OffensiveTeam', 'DefensiveTeam',
    'Passer', 'ScoreDiff',
]

# Target column the model predicts.
RESPONSE_COL = 'PlayType'

In [3]:

def clean_nfl_plays_dataframe(unclean_df):
    """Return a cleaned copy of the raw play-by-play dataframe.

    Steps, in order:
      1. drop the columns listed in DROP_COLUMNS,
      2. rename columns according to COL_RENAME_MAP,
      3. drop rows with a null in any of DROPNA_COLS,
      4. drop rows whose PlayType is in DROP_PLAYTYPES,
      5. relabel 'Sack' plays as 'Pass'.

    Parameters
    ----------
    unclean_df : pandas.DataFrame
        Raw plays, using the original (pre-rename) column names.

    Returns
    -------
    pandas.DataFrame
        A new dataframe; ``unclean_df`` itself is not modified. Surviving
        rows keep their original index labels.

    Raises
    ------
    KeyError
        If a column named in DROP_COLUMNS or DROPNA_COLS is missing.
    """
    # Every step returns a new frame, so the caller's data is never mutated.
    df = (
        unclean_df
        .drop(DROP_COLUMNS, axis=1)         # irrelevant columns
        .rename(columns=COL_RENAME_MAP)     # poorly named columns
        .dropna(subset=DROPNA_COLS)         # rows with nulls in key columns (weird data)
    )

    # Keep only plays with a relevant PlayType. The explicit copy lets the
    # assignment below write to `df` itself rather than to a slice view.
    df = df[~df['PlayType'].isin(DROP_PLAYTYPES)].copy()

    # Change SACK PlayType to PASS -- assuming sack was made during pass attempt.
    # Assign the result back instead of `df.PlayType.replace(..., inplace=True)`:
    # the in-place form mutates a temporary Series and is not guaranteed to
    # reach the frame (it is a no-op under pandas Copy-on-Write).
    df['PlayType'] = df['PlayType'].replace('Sack', 'Pass')

    return df


In [11]:
# Load the raw plays; the first CSV row supplies the column names (the default).
uncleaned_plays = pd.read_csv(CSV_NAME)

In [13]:
# Inspect the raw column names, before any dropping or renaming.
uncleaned_plays.columns

Index([u'Unnamed: 0', u'Date', u'GameID', u'Drive', u'qtr', u'down', u'time',
       u'TimeUnder', u'TimeSecs', u'PlayTimeDiff', u'SideofField', u'yrdln',
       u'yrdline100', u'ydstogo', u'ydsnet', u'GoalToGo', u'FirstDown',
       u'posteam', u'DefensiveTeam', u'desc', u'PlayAttempted',
       u'Yards.Gained', u'sp', u'Touchdown', u'ExPointResult', u'TwoPointConv',
       u'DefTwoPoint', u'Safety', u'PuntResult', u'PlayType', u'Passer',
       u'PassAttempt', u'PassOutcome', u'PassLength', u'PassLocation',
       u'InterceptionThrown', u'Interceptor', u'Rusher', u'RushAttempt',
       u'RunLocation', u'RunGap', u'Receiver', u'Reception', u'ReturnResult',
       u'Returner', u'BlockingPlayer', u'Tackler1', u'Tackler2',
       u'FieldGoalResult', u'FieldGoalDistance', u'Fumble', u'RecFumbTeam',
       u'RecFumbPlayer', u'Sack', u'Challenge.Replay', u'ChalReplayResult',
       u'Accepted.Penalty', u'PenalizedTeam', u'PenaltyType',
       u'PenalizedPlayer', u'Penalty.Yards', u'PosTeamS

In [14]:
# Preview the first 30 raw plays.
uncleaned_plays.head(30)

Unnamed: 0.1,Unnamed: 0,Date,GameID,Drive,qtr,down,time,TimeUnder,TimeSecs,PlayTimeDiff,...,Accepted.Penalty,PenalizedTeam,PenaltyType,PenalizedPlayer,Penalty.Yards,PosTeamScore,DefTeamScore,ScoreDiff,AbsScoreDiff,Season
0,36,2015-09-10,2015091000,1,1,,15:00,15,3600.0,0.0,...,0,,,,0,0.0,0.0,0.0,0.0,2015
1,51,2015-09-10,2015091000,1,1,1.0,15:00,15,3600.0,0.0,...,0,,,,0,0.0,0.0,0.0,0.0,2015
2,72,2015-09-10,2015091000,1,1,1.0,14:21,15,3561.0,39.0,...,0,,,,0,0.0,0.0,0.0,0.0,2015
3,101,2015-09-10,2015091000,1,1,2.0,14:04,15,3544.0,17.0,...,0,,,,0,0.0,0.0,0.0,0.0,2015
4,122,2015-09-10,2015091000,1,1,1.0,13:26,14,3506.0,38.0,...,0,,,,0,0.0,0.0,0.0,0.0,2015
5,159,2015-09-10,2015091000,1,1,1.0,12:42,13,3462.0,44.0,...,0,,,,0,0.0,0.0,0.0,0.0,2015
6,180,2015-09-10,2015091000,1,1,1.0,12:05,13,3425.0,37.0,...,0,,,,0,0.0,0.0,0.0,0.0,2015
7,199,2015-09-10,2015091000,1,1,2.0,11:20,12,3380.0,45.0,...,1,PIT,,M.Gilbert,10,0.0,0.0,0.0,0.0,2015
8,236,2015-09-10,2015091000,1,1,2.0,10:53,11,3353.0,27.0,...,0,,,,0,0.0,0.0,0.0,0.0,2015
9,261,2015-09-10,2015091000,1,1,3.0,10:28,11,3328.0,25.0,...,0,,,,0,0.0,0.0,0.0,0.0,2015


In [25]:
# Tally raw plays per PlayType (most frequent first).
uncleaned_plays['PlayType'].value_counts()

Pass                  18323
Run                   13109
No Play                2608
Kickoff                2565
Punt                   2443
Timeout                1859
Sack                   1191
Extra Point            1126
Field Goal              988
Quarter End             675
QB Kneel                425
End of Game             211
Onside Kick              67
Spike                    52
Half End                  6
Name: PlayType, dtype: int64

In [18]:
# Count missing values in each raw column.
uncleaned_plays.isnull().sum()

Unnamed: 0               0
Date                     0
GameID                   0
Drive                    0
qtr                      0
down                  7123
time                    27
TimeUnder                0
TimeSecs                27
PlayTimeDiff            54
SideofField             66
yrdln                  108
yrdline100             108
ydstogo                  0
ydsnet                   0
GoalToGo               108
FirstDown             3318
posteam               3251
DefensiveTeam         3251
desc                     0
PlayAttempted            0
Yards.Gained             0
sp                       0
Touchdown                0
ExPointResult        44998
TwoPointConv         46040
DefTwoPoint          46124
Safety                   0
PuntResult           43644
PlayType                 0
                     ...  
Interceptor          45662
Rusher               33063
RushAttempt              0
RunLocation          33163
RunGap               36542
Receiver             27671
R

In [20]:
# Run the cleaning steps; the result is a new frame and uncleaned_plays is left unmodified.
plays = clean_nfl_plays_dataframe(uncleaned_plays)

In [21]:
# Check which columns remain after cleaning and that the renames were applied.
plays.columns

Index([u'GameID', u'Quarter', u'Down', u'Time', u'TimeUnder', u'TimeSecs',
       u'SideofField', u'YardLine', u'YardLineOf100', u'YardsToGo',
       u'YardsNet', u'FirstDown', u'OffensiveTeam', u'DefensiveTeam',
       u'Description', u'PlayAttempted', u'YardsGained', u'ScoringPlay',
       u'Touchdown', u'Safety', u'PlayType', u'Passer', u'PassAttempt',
       u'PassOutcome', u'PassLength', u'PassLocation', u'InterceptionThrown',
       u'Rusher', u'RushAttempt', u'RunLocation', u'RunGap', u'Reception',
       u'Sack', u'PosTeamScore', u'DefTeamScore', u'ScoreDiff',
       u'AbsScoreDiff'],
      dtype='object')

In [22]:
# Preview the first 30 cleaned plays.
plays.head(30)

Unnamed: 0,GameID,Quarter,Down,Time,TimeUnder,TimeSecs,SideofField,YardLine,YardLineOf100,YardsToGo,...,Rusher,RushAttempt,RunLocation,RunGap,Reception,Sack,PosTeamScore,DefTeamScore,ScoreDiff,AbsScoreDiff
1,2015091000,1,1.0,15:00,15,3600.0,PIT,20.0,80.0,10,...,D.Hightower,1,right,tackle,0,0,0.0,0.0,0.0,0.0
2,2015091000,1,1.0,14:21,15,3561.0,PIT,38.0,62.0,10,...,,0,,,1,0,0.0,0.0,0.0,0.0
3,2015091000,1,2.0,14:04,15,3544.0,PIT,47.0,53.0,1,...,J.Collins,1,right,guard,0,0,0.0,0.0,0.0,0.0
4,2015091000,1,1.0,13:26,14,3506.0,NE,49.0,49.0,10,...,,0,,,1,0,0.0,0.0,0.0,0.0
5,2015091000,1,1.0,12:42,13,3462.0,NE,35.0,35.0,10,...,J.Collins,1,right,guard,0,0,0.0,0.0,0.0,0.0
6,2015091000,1,1.0,12:05,13,3425.0,NE,24.0,24.0,10,...,,0,,,0,1,0.0,0.0,0.0,0.0
8,2015091000,1,2.0,10:53,11,3353.0,NE,42.0,42.0,28,...,W.Johnson,1,right,guard,0,0,0.0,0.0,0.0,0.0
9,2015091000,1,3.0,10:28,11,3328.0,NE,36.0,36.0,22,...,,0,,,1,0,0.0,0.0,0.0,0.0
10,2015091000,1,4.0,09:44,10,3284.0,NE,26.0,26.0,12,...,,0,,,0,0,0.0,0.0,0.0,0.0
11,2015091000,1,1.0,09:40,10,3280.0,NE,34.0,66.0,10,...,,0,,,1,0,0.0,0.0,0.0,0.0


In [23]:
# Keep only the predictor columns plus the response column, in that order.
model_ready_plays = plays.loc[:, FEATURE_COLS + [RESPONSE_COL]]

In [24]:
# Preview the first 30 rows of the feature/response subset.
model_ready_plays.head(30)

Unnamed: 0,Down,Time,YardsToGo,OffensiveTeam,DefensiveTeam,Passer,ScoreDiff,PlayType
1,1.0,15:00,10,PIT,NE,,0.0,Run
2,1.0,14:21,10,PIT,NE,B.Roethlisberger,0.0,Pass
3,2.0,14:04,1,PIT,NE,,0.0,Run
4,1.0,13:26,10,PIT,NE,B.Roethlisberger,0.0,Pass
5,1.0,12:42,10,PIT,NE,,0.0,Run
6,1.0,12:05,10,PIT,NE,,0.0,Pass
8,2.0,10:53,28,PIT,NE,,0.0,Run
9,3.0,10:28,22,PIT,NE,B.Roethlisberger,0.0,Pass
10,4.0,09:44,12,PIT,NE,,0.0,Field Goal
11,1.0,09:40,10,NE,PIT,T.Brady,0.0,Pass


In [27]:
# Share of each PlayType among the modelled plays (fractions, not counts).
model_ready_plays['PlayType'].value_counts(normalize=True)

Pass          0.533950
Run           0.358777
Punt          0.067077
Field Goal    0.027127
QB Kneel      0.011642
Spike         0.001428
Name: PlayType, dtype: float64