In [1]:
import numpy as np
import pandas as pd

# Read in data

In [2]:
# Load only the identifier and yardage columns, then collapse rows that are
# identical across those columns and renumber the index from 0.
outcome_columns = ['GameId', 'PlayId', 'Yards', 'YardsToTouchdown', 'YardsToSafety']
yards = (
    pd.read_csv('../data/cleaned_train.csv', usecols=outcome_columns)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [3]:
# (rows, columns) of the de-duplicated frame.
yards.shape

(31007, 5)

In [4]:
# Peek at the first rows to sanity-check the loaded columns.
yards.head()

Unnamed: 0,GameId,PlayId,Yards,YardsToTouchdown,YardsToSafety
0,2017090700,20170907000118,8,65,35
1,2017090700,20170907000139,3,57,43
2,2017090700,20170907000189,5,35,65
3,2017090700,20170907000345,2,2,98
4,2017090700,20170907000395,7,75,25


# Explore Capping Range of Potential Gains/Losses

In [5]:
# Summary statistics for the three yardage columns.
yards.loc[:, ['Yards', 'YardsToTouchdown', 'YardsToSafety']].describe()

Unnamed: 0,Yards,YardsToTouchdown,YardsToSafety
count,31007.0,31007.0,31007.0
mean,4.227626,51.671945,48.328055
std,6.449966,25.242847,25.242847
min,-15.0,1.0,1.0
25%,1.0,32.0,26.0
50%,3.0,56.0,44.0
75%,6.0,74.0,68.0
max,99.0,99.0,99.0


In [6]:
# Percentiles of the observed Yards values, used to judge how tightly the
# predicted range can be capped.
percentiles_of_interest = [1, 5, 10, 25, 50, 75, 90, 95, 99]
# np.percentile accepts a sequence of q values, so all percentiles come from a
# single call; .tolist() yields plain Python floats so the list displays
# cleanly regardless of how the NumPy version prints scalar types.
percentiles = np.percentile(yards.Yards, percentiles_of_interest).tolist()
percentiles

[-4.0, -2.0, -1.0, 1.0, 3.0, 6.0, 10.0, 14.0, 28.0]

It seems to me we should be able to cap our predicted range from a 5-yard loss to a 30-yard gain. That is, we only have to predict 36 possible outcomes (-5 to +30) instead of 199 (-99 to +99). On top of that, we have the potential to cap based on where on the field the line of scrimmage (LOS) is. If the LOS is at the offense's own 1-yard line, then we know they can lose at most 1 yard on the rush and gain at most the capped 30 yards instead of 99.

Now we have to change our outcome range from indices 0 - 198 to indices 0 - 35. Index 0 will represent a 5-yard loss and index 35 will represent a 30-yard gain.

In [7]:
# Clipping bounds in YardIndex units, where YardIndex = Yards + 99.
# NOTE(review): these bounds correspond to -28 .. +51 yards, whereas the
# narrative above proposes capping at -5 .. +30 yards (indices 94 .. 129).
# Confirm which range is intended before relying on YardIndexClipped.
min_idx = 99 - 28  # 71: a 28-yard loss
max_idx = 99 + 51  # 150: a 51-yard gain

In [8]:
# Shift Yards by 99 so that a 99-yard loss maps to index 0 and a 99-yard gain
# maps to index 198. Vectorised arithmetic replaces the row-wise .apply.
yards['YardIndex'] = yards['Yards'] + 99
# Clamp the index into [min_idx, max_idx]: values below min_idx become min_idx,
# values above max_idx become max_idx, everything else is left unchanged.
yards['YardIndexClipped'] = yards['YardIndex'].clip(lower=min_idx, upper=max_idx)
# Same values as YardIndex; the column is kept under this name because it is
# part of the frame written to disk later in the notebook.
yards['UnclippedYardIndex'] = yards['Yards'] + 99

In [9]:
# Preview the frame, now including the three index columns added above.
yards.head()

Unnamed: 0,GameId,PlayId,Yards,YardsToTouchdown,YardsToSafety,YardIndex,YardIndexClipped,UnclippedYardIndex
0,2017090700,20170907000118,8,65,35,107,107,107
1,2017090700,20170907000139,3,57,43,102,102,102
2,2017090700,20170907000189,5,35,65,104,104,104
3,2017090700,20170907000345,2,2,98,101,101,101
4,2017090700,20170907000395,7,75,25,106,106,106


In [10]:
# Report the extremes of the raw and clipped index columns to verify the clip.
index_extremes = (
    ('max yardIndex: ', yards['YardIndex'].max()),
    ('max yardIndexClipped: ', yards['YardIndexClipped'].max()),
    ('min yardIndex: ', yards['YardIndex'].min()),
    ('min yardIndexClipped: ', yards['YardIndexClipped'].min()),
)
for label, extreme in index_extremes:
    print(label, extreme)

max yardIndex:  198
max yardIndexClipped:  150
min yardIndex:  84
min yardIndexClipped:  84


# Write Play Outcomes to Disk

In [11]:
# Persist the outcomes with a header row (the to_csv default) and without the
# positional index.
yards.to_csv('../data/yard_outcomes.csv', index=False)