In [1]:
import re, math
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import nfldb
import sys
import matplotlib as mpl
import seaborn as sns
from __future__ import division
import scipy.stats as st
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.libqsturng import psturng
from statsmodels.formula.api import ols
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
# configure matplotlib plots: render figures inline and set the default
# font size, figure size, and on-screen / saved-figure resolution
%matplotlib inline
mpl.rcParams['font.size'] = 14
mpl.rcParams['figure.figsize'] = [12, 9]# [10, 7.5] is an alternate size left commented out
mpl.rcParams['figure.dpi'] = 90
mpl.rcParams['savefig.dpi'] = 100
# configure seaborn plots: white-grid theme with seaborn's short color codes enabled
sns.set(style="whitegrid", color_codes=True)

# label for the data set being analysed (presumably used in titles or output; not referenced in the cells shown here)
yrType = '2016 Regular Season'

In [3]:
# functions
def icol(df, cols):
    """Return the positional indices of the columns of ``df`` named in ``cols``.

    Indices are reported in ``df.columns`` order. A column is reported once
    for every entry of ``cols`` that equals its label, so a name repeated in
    ``cols`` yields its index repeatedly. Names absent from ``df`` are
    silently skipped.
    """
    positions = []
    for position, label in enumerate(df.columns):
        for wanted in cols:
            if label == wanted:
                positions.append(position)
    return positions

def appStrs(cols, string):
    """Return a new list in which ``string`` is appended to each item of ``cols``."""
    suffixed = []
    for name in cols:
        suffixed.append(name + string)
    return suffixed

In [4]:
# load processed plays data (path is relative to the notebook's working directory)
plays = pd.read_csv('data/2016_reg_plays_gd.csv')

In [5]:
# play labels
playType = ['pass', 'rush']
playDir = ['left', 'up the middle', 'right']
# d[-6:] keeps at most the last six characters, so 'up the middle' -> 'middle'
playCols = [t + '_' + d[-6:] for t in playType for d in playDir]

# Feature column names: primarily cum sum stats columns - initially.
# Base stat columns start with one of these prefixes ...
statPrefixes = ('type', 'dir', 'first', 'third', 'fourth', 'offense',
                'passing', 'receiving', 'rushing')
# ... and are neither '*downs' columns nor already-derived shifted/cumulative columns.
derivedSuffixes = ('downs', '_sh1', 'cum_dr', 'cum_gm')
col2cum = [c for c in plays.columns
           if c.startswith(statPrefixes) and not c.endswith(derivedSuffixes)]
gcumCols = appStrs(col2cum, '_cum_gm')   # per-game cumulative stats
dcumCols = appStrs(col2cum, '_cum_dr')   # per-drive cumulative stats
cumCols = gcumCols + dcumCols
prevPlay = appStrs(col2cum, '_sh1')      # previous-play (shifted by one) stats

plays['pos_team'] = plays['pos_team'].astype('category')
plays['pos_team_code'] = plays['pos_team'].cat.codes

### - INCORPORATE THESE INTO plays_setup in future feature engineering
# DRIVE cumulative stats: fill NaN in columns with 0
plays[dcumCols] = plays[dcumCols].fillna(0)
# GAME cumulative stats: forward fill within each (game, possession team) group.
# GroupBy.ffill() replaces fillna(method='ffill'), which is deprecated in
# current pandas. The result is re-selected by column because some pandas
# releases also return the grouping keys from ffill().
gcumFilled = plays.groupby(['gsis_id', 'pos_team'])[gcumCols].ffill()
plays[gcumCols] = gcumFilled[gcumCols]
# GAME: cumulative stats: fill first play of game with 0
plays[gcumCols] = plays[gcumCols].fillna(0)
# Noncumulative, shifted stats: fill first play of game with 0
plays[prevPlay] = plays[prevPlay].fillna(0)
###

# Cast each target column to categorical and store its integer codes in '<col>_code'
for catCol in ['play', 'play_type', 'play_dir']:
    plays[catCol] = plays[catCol].astype('category')
    plays[catCol + '_code'] = plays[catCol].cat.codes

In [6]:
# Drop the 'Unnamed: 0' column read in from the CSV (presumably a saved index).
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because the column has already been removed from `plays`.
# (other candidates seen before: 'Unnamed: 0.1', 'Unnamed: 0.1.1')
plays = plays.drop(['Unnamed: 0'], axis=1, errors='ignore')
plays.head()

Unnamed: 0,gsis_id,pos_team,drive_id,description,down,drive,first_down,fourth_down_att,fourth_down_conv,fourth_down_failed,...,def_SF,def_TB,def_TEN,def_WAS,day_Monday,day_Saturday,day_Sunday,day_Thursday,play_type_code,play_dir_code
0,2016090800,CAR,2,(11:37) J.Stewart up the middle to CAR 35 for ...,1.0,[Touchdown ] CAR from OWN 29 to OPP 14 (last...,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
1,2016090800,CAR,2,(11:01) (Shotgun) J.Stewart left end to CAR 46...,2.0,[Touchdown ] CAR from OWN 29 to OPP 14 (last...,1,0,0,0,...,0,0,0,0,0,0,0,1,1,0
2,2016090800,CAR,2,(10:21) (Shotgun) M.Tolbert right guard to CAR...,1.0,[Touchdown ] CAR from OWN 29 to OPP 14 (last...,0,0,0,0,...,0,0,0,0,0,0,0,1,1,2
3,2016090800,CAR,2,(9:39) (Shotgun) C.Newton pass incomplete shor...,2.0,[Touchdown ] CAR from OWN 29 to OPP 14 (last...,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
4,2016090800,CAR,2,(9:35) (Shotgun) C.Newton pass deep middle to ...,3.0,[Touchdown ] CAR from OWN 29 to OPP 14 (last...,1,0,0,0,...,0,0,0,0,0,0,0,1,0,1


# Potential feature predictors

In [53]:
# incorporate new features and dummy cols in features array
adfeat = ['quarter_code', 'qtr_timef', 'down', 'yards_to_go', 'score_diff', 'week', 'home', 'yardfield']
dumPrefix = ['off_', 'def_', 'day_']
catExc = ['pos_team', 'def_team', 'day_of_week']
# dummy columns: anything carrying one of the dummy prefixes, minus the
# original categorical columns listed in catExc
dumCols = [c for c in plays.columns
           if c not in catExc and c.startswith(tuple(dumPrefix))]
features = cumCols + prevPlay + adfeat + dumCols
# NOTE(review): the list assembled above is replaced immediately by the saved
# array loaded here - confirm which of the two is meant to be authoritative.
features = np.load('data/features_dum.npy')
# discard every 'rushing_loss*' feature as well as 'passing_int_cum_dr'
features = [f for f in features
            if not (f.startswith('rushing_loss') or f in ['passing_int_cum_dr'])]

In [28]:
# features = [f for f in features if not f.startswith('type') and not f.startswith('dir')]
# hold the feature names in an ndarray so boolean masks can index them
features = np.asarray(features)
features

array(['first_down_cum_gm', 'fourth_down_att_cum_gm',
       'fourth_down_failed_cum_gm', 'offense_tds_cum_gm',
       'offense_yds_cum_gm', 'passing_att_cum_gm', 'passing_cmp_cum_gm',
       'passing_first_down_cum_gm', 'passing_incmp_cum_gm',
       'passing_incmp_air_yds_cum_gm', 'passing_int_cum_gm',
       'passing_sk_cum_gm', 'passing_sk_yds_cum_gm', 'passing_tds_cum_gm',
       'passing_yds_cum_gm', 'receiving_rec_cum_gm',
       'receiving_tds_cum_gm', 'receiving_yac_yds_cum_gm',
       'rushing_att_cum_gm', 'rushing_first_down_cum_gm',
       'rushing_tds_cum_gm', 'rushing_yds_cum_gm', 'third_down_att_cum_gm',
       'third_down_conv_cum_gm', 'third_down_failed_cum_gm',
       'type_pass_cum_gm', 'type_rush_cum_gm', 'dir_left_cum_gm',
       'dir_middle_cum_gm', 'dir_right_cum_gm', 'first_down_cum_dr',
       'fourth_down_att_cum_dr', 'fourth_down_failed_cum_dr',
       'offense_tds_cum_dr', 'offense_yds_cum_dr', 'passing_att_cum_dr',
       'passing_cmp_cum_dr', 'passing_cmp_

# Greedy Feature Selection Based on Logistic Regression Coefficients 
with 3-Fold Cross-Validation

## Target: Play (play type and play direction combined)

In [None]:
# 6 class play
# greedy recursive feature elimination by 3-fold cross-validation
cv_n = 3
seed = 9  # not passed to KFold: it only matters when the folds are shuffled
# KFold consults random_state only when shuffle=True, and current scikit-learn
# raises ValueError if random_state is given while shuffle is False. Leaving it
# out produces the same contiguous, unshuffled folds as before.
kf = KFold(n_splits=cv_n)
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', class_weight='balanced')
# step=1: eliminate one feature per round; n_jobs=-1: use all available cores
featSelect = RFECV(lr, step=1, cv=kf, n_jobs=-1)
featSelect.fit(plays[features], plays.play)
# np.asarray lets the boolean support mask index the names even if
# `features` is still a plain list at this point
np.asarray(features)[featSelect.support_]

In [16]:
# names of the features kept by the RFECV fit stored in featSelect
features[featSelect.support_]

array(['offense_tds_cum_gm', 'rushing_att_cum_gm',
       'third_down_conv_cum_gm', 'type_rush_cum_gm', 'dir_middle_cum_gm',
       'dir_right_cum_gm', 'passing_att_cum_dr', 'passing_cmp_cum_dr',
       'passing_incmp_cum_dr', 'receiving_rec_cum_dr',
       'third_down_att_cum_dr', 'third_down_conv_cum_dr',
       'type_pass_cum_dr', 'type_rush_cum_dr', 'dir_left_cum_dr',
       'dir_middle_cum_dr', 'dir_right_cum_dr', 'first_down_sh1',
       'offense_yds_sh1', 'passing_att_sh1', 'passing_cmp_sh1',
       'passing_first_down_sh1', 'passing_yds_sh1', 'receiving_rec_sh1',
       'rushing_att_sh1', 'rushing_yds_sh1', 'type_pass_sh1',
       'type_rush_sh1', 'qtr_timef', 'down', 'yards_to_go', 'score_diff'], 
      dtype='|S28')

## Target: Play Type

In [29]:
# Binary play type (pass vs rush)
# greedy recursive feature elimination by 3-fold cross-validation
cv_n = 3
seed = 9  # not passed to KFold: it only matters when the folds are shuffled
# KFold consults random_state only when shuffle=True, and current scikit-learn
# raises ValueError if random_state is given while shuffle is False. Leaving it
# out produces the same contiguous, unshuffled folds as before.
kf = KFold(n_splits=cv_n)
lr = LogisticRegression(multi_class='ovr', solver='lbfgs', class_weight='balanced')
# step=1: eliminate one feature per round; n_jobs=-1: use all available cores
fSelect_type = RFECV(lr, step=1, cv=kf, n_jobs=-1)
fSelect_type.fit(plays[features], plays.play_type_code)
# np.asarray lets the boolean support mask index the names even if
# `features` is still a plain list at this point
np.asarray(features)[fSelect_type.support_]

array(['fourth_down_att_cum_gm', 'fourth_down_failed_cum_gm',
       'passing_int_cum_gm', 'passing_sk_yds_cum_gm',
       'receiving_rec_cum_gm', 'type_rush_cum_gm', 'first_down_cum_dr',
       'passing_att_cum_dr', 'passing_cmp_cum_dr',
       'passing_first_down_cum_dr', 'passing_incmp_cum_dr',
       'passing_sk_yds_cum_dr', 'receiving_rec_cum_dr',
       'third_down_att_cum_dr', 'third_down_conv_cum_dr',
       'type_pass_cum_dr', 'first_down_sh1', 'fourth_down_att_sh1',
       'offense_yds_sh1', 'passing_att_sh1', 'passing_cmp_sh1',
       'passing_cmp_air_yds_sh1', 'passing_first_down_sh1',
       'passing_incmp_sh1', 'passing_int_sh1', 'passing_sk_yds_sh1',
       'passing_yds_sh1', 'receiving_rec_sh1', 'receiving_yac_yds_sh1',
       'rushing_tds_sh1', 'rushing_yds_sh1', 'third_down_conv_sh1',
       'type_pass_sh1', 'type_rush_sh1', 'dir_middle_sh1', 'dir_right_sh1',
       'quarter_code', 'qtr_timef', 'down', 'yards_to_go', 'score_diff',
       'home', 'off_ARI', 'off_ATL', 

In [22]:
# Binary play type (pass vs rush)
# greedy recursive feature elimination by 3-fold cross-validation
cv_n = 3
seed = 9  # not passed to KFold: it only matters when the folds are shuffled
# KFold consults random_state only when shuffle=True, and current scikit-learn
# raises ValueError if random_state is given while shuffle is False. Leaving it
# out produces the same contiguous, unshuffled folds as before.
kf = KFold(n_splits=cv_n)
lr = LogisticRegression(multi_class='ovr', solver='lbfgs', class_weight='balanced')
# step=1: eliminate one feature per round; n_jobs=-1: use all available cores
fSelect_type = RFECV(lr, step=1, cv=kf, n_jobs=-1)
fSelect_type.fit(plays[features], plays.play_type_code)
# np.asarray lets the boolean support mask index the names even if
# `features` is still a plain list at this point
np.asarray(features)[fSelect_type.support_]

array(['fourth_down_att_cum_gm', 'passing_att_cum_gm',
       'passing_sk_yds_cum_gm', 'receiving_tds_cum_gm',
       'rushing_att_cum_gm', 'first_down_cum_dr', 'passing_att_cum_dr',
       'passing_cmp_cum_dr', 'passing_incmp_cum_dr',
       'passing_sk_yds_cum_dr', 'receiving_rec_cum_dr',
       'third_down_att_cum_dr', 'offense_yds_sh1', 'passing_att_sh1',
       'passing_cmp_sh1', 'passing_cmp_air_yds_sh1',
       'passing_first_down_sh1', 'passing_incmp_sh1', 'passing_int_sh1',
       'passing_sk_yds_sh1', 'passing_yds_sh1', 'receiving_rec_sh1',
       'receiving_yac_yds_sh1', 'rushing_first_down_sh1',
       'rushing_tds_sh1', 'rushing_yds_sh1', 'third_down_conv_sh1',
       'quarter_code', 'qtr_timef', 'down', 'yards_to_go', 'score_diff',
       'home', 'off_ARI', 'off_ATL', 'off_BAL', 'off_BUF', 'off_CAR',
       'off_CHI', 'off_CIN', 'off_CLE', 'off_DAL', 'off_DEN', 'off_DET',
       'off_GB', 'off_HOU', 'off_IND', 'off_JAC', 'off_KC', 'off_LA',
       'off_MIA', 'off_MIN', 'o

In [32]:
# number of features selected by the RFECV fit stored in fSelect_type
fSelect_type.n_features_

93

## Target: Play Direction

In [30]:
# 3 class play direction (left, middle, right)
# greedy recursive feature elimination by 3-fold cross-validation
cv_n = 3
seed = 9  # not passed to KFold: it only matters when the folds are shuffled
# Build the splitter in this cell. Previously `kf` was never defined here, so
# the cell only ran if another cell had already left a `kf` in the kernel.
# random_state is omitted because KFold ignores it unless shuffle=True (and
# current scikit-learn raises ValueError for that combination).
kf = KFold(n_splits=cv_n)
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', class_weight='balanced')
# step=1: eliminate one feature per round; n_jobs=-1: use all available cores
fSelect_dir = RFECV(lr, step=1, cv=kf, n_jobs=-1)
fSelect_dir.fit(plays[features], plays.play_dir_code)
# np.asarray lets the boolean support mask index the names even if
# `features` is still a plain list at this point
np.asarray(features)[fSelect_dir.support_]

array(['first_down_cum_gm', 'fourth_down_att_cum_gm', 'offense_tds_cum_gm',
       'offense_yds_cum_gm', 'passing_cmp_cum_gm',
       'passing_first_down_cum_gm', 'passing_incmp_cum_gm',
       'passing_int_cum_gm', 'passing_sk_yds_cum_gm', 'passing_yds_cum_gm',
       'receiving_rec_cum_gm', 'rushing_att_cum_gm',
       'rushing_first_down_cum_gm', 'rushing_tds_cum_gm',
       'rushing_yds_cum_gm', 'third_down_att_cum_gm',
       'third_down_failed_cum_gm', 'type_pass_cum_gm', 'dir_left_cum_gm',
       'dir_middle_cum_gm', 'dir_right_cum_gm', 'first_down_cum_dr',
       'passing_first_down_cum_dr', 'passing_incmp_cum_dr',
       'type_pass_cum_dr', 'dir_left_cum_dr', 'dir_middle_cum_dr',
       'dir_right_cum_dr', 'first_down_sh1', 'passing_att_sh1',
       'passing_first_down_sh1', 'rushing_att_sh1',
       'rushing_first_down_sh1', 'third_down_conv_sh1', 'type_pass_sh1',
       'type_rush_sh1', 'dir_left_sh1', 'dir_middle_sh1', 'dir_right_sh1',
       'quarter_code', 'qtr_timef', 'd

In [31]:
# number of features selected by the RFECV fit stored in fSelect_dir
fSelect_dir.n_features_

68