# NFL Big Data Bowl - Feature Engineering v1
-------------------
TheNerdyCat <br>
27 Nov 2019 Deadline


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.patches as patches
import seaborn as sns

import datetime
import kaggle
import tqdm

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
import keras
import math

from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')

sns.set_style('darkgrid')
mpl.rcParams['figure.figsize'] = [15,10]
pd.options.display.max_columns = 100


  from numpy.core.umath_tests import inner1d
Using TensorFlow backend.


In [2]:
# Read the competition training file into a single frame
df = pd.read_csv("../input/nfl-big-data-bowl-2020/train.csv", low_memory=False)

# True for rows whose PlayDirection is "left"
df['ToLeft'] = df.PlayDirection.eq("left")
# (PlayDirection is a candidate for the rm_cols drop list)

# A row is the ball carrier's when its NflId equals that play's NflIdRusher
df['IsBallCarrier'] = df.NflId.eq(df.NflIdRusher)

# Correct differences in team-name abbreviations on both the home and visitor columns
abbr_fixes = {"ARI": "ARZ", "BAL": "BLT", "CLE": "CLV", "HOU": "HST"}
for abbr_col in ('VisitorTeamAbbr', 'HomeTeamAbbr'):
    df[abbr_col] = df[abbr_col].replace(abbr_fixes)

# Dir re-expressed as mod(90 - Dir, 360) degrees, then converted to radians
df['Dir_rad'] = np.mod(90 - df.Dir, 360) * math.pi / 180.0

It's really hard to tell which team is on offense from the data! Even though the ball carrier is highlighted in black, the inconsistency from one play to the next is less than ideal. Sometimes the away team is on offense, other times the home team is on offense. And they're both potentially moving left or moving right.

Our ultimate goal will be to ensure that the offensive team (PossessionTeam) is moving left to right, even if in the raw data, the offense is moving right to left.

The following set of code will get us there.

In [3]:
# "home" when PossessionTeam matches HomeTeamAbbr, otherwise "away"
df['TeamOnOffense'] = np.where(df.PossessionTeam != df.HomeTeamAbbr, "away", "home")
# Is player on offense?
df['IsOnOffense'] = df.Team.eq(df.TeamOnOffense)
# (Team is a candidate for the rm_cols drop list)

# YardLine is kept when FieldPosition names the possession team, otherwise mirrored to 100 - YardLine
ball_on_own_side = df.FieldPosition.fillna('') == df.PossessionTeam
df['YardLine_std'] = (100 - df.YardLine).where(~ball_on_own_side, df.YardLine)

# Mirror both coordinates for plays flagged ToLeft (120 long, 160/3 wide)
moving_right = ~df.ToLeft
df['X_std'] = df.X.where(moving_right, 120 - df.X)
df['Y_std'] = df.Y.where(moving_right, 160/3 - df.Y)

# NOTE(review): Orientation_std stays in degrees and is not mirrored for ToLeft plays,
# unlike Dir_std below (radians, rotated by pi) — confirm this asymmetry is intended.
df['Orientation_std'] = df.Orientation - 90

# Direction in radians, rotated by pi (mod 2*pi) for plays flagged ToLeft
df['Dir_std'] = df['Dir_rad'].where(moving_right, np.mod(np.pi + df['Dir_rad'], 2*np.pi))
# Clean Position feature
def clean_position(pos):
    """Collapse position aliases: 'SAF' and 'S' become 'DB', 'OG' becomes 'G',
    'OT' becomes 'T'; any other value is returned unchanged."""
    aliases = {'SAF': 'DB', 'S': 'DB', 'OG': 'G', 'OT': 'T'}
    return aliases.get(pos, pos)
df['Position'] = df['Position'].apply(clean_position)

In [4]:
df.head(2)

Unnamed: 0,GameId,PlayId,Team,X,Y,S,A,Dis,Orientation,Dir,NflId,DisplayName,JerseyNumber,Season,YardLine,Quarter,GameClock,PossessionTeam,Down,Distance,FieldPosition,HomeScoreBeforePlay,VisitorScoreBeforePlay,NflIdRusher,OffenseFormation,OffensePersonnel,DefendersInTheBox,DefensePersonnel,PlayDirection,TimeHandoff,TimeSnap,Yards,PlayerHeight,PlayerWeight,PlayerBirthDate,PlayerCollegeName,Position,HomeTeamAbbr,VisitorTeamAbbr,Week,Stadium,Location,StadiumType,Turf,GameWeather,Temperature,Humidity,WindSpeed,WindDirection,ToLeft,IsBallCarrier,Dir_rad,TeamOnOffense,IsOnOffense,YardLine_std,X_std,Y_std,Orientation_std,Dir_std
0,2017090700,20170907000118,away,73.91,34.84,1.69,1.13,0.4,81.99,177.18,496723,Eric Berry,29,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-0,212,12/29/1988,Tennessee,SS,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,True,False,4.761607,home,False,35,46.09,18.493333,-8.01,1.620015
1,2017090700,20170907000118,away,74.67,32.64,0.42,1.35,0.01,27.61,198.7,2495116,Allen Bailey,97,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-3,288,03/25/1989,Miami,DE,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,True,False,4.386012,home,False,35,45.33,20.693333,-62.39,1.24442


In [5]:
# OffenseTeam: PossessionTeam under a new name
df = df.rename(columns={'PossessionTeam': 'OffenseTeam'})

# One mask drives every home/away swap below
away_on_offense = df.TeamOnOffense == 'away'

# DefenseTeam: the visitor, unless the away side has the ball (then the home team)
df['DefenseTeam'] = df['VisitorTeamAbbr'].where(~away_on_offense, df['HomeTeamAbbr'])
# (HomeTeamAbbr / VisitorTeamAbbr are candidates for the rm_cols drop list)

# IsOffenseAtHome
df['IsOffenseAtHome'] = ~away_on_offense

# OffenseScore / DefenseScore: home and visitor scores, swapped when the away side has the ball
df['OffenseScore'] = df['HomeScoreBeforePlay'].where(~away_on_offense, df['VisitorScoreBeforePlay'])
df['DefenseScore'] = df['VisitorScoreBeforePlay'].where(~away_on_offense, df['HomeScoreBeforePlay'])
# (HomeScoreBeforePlay / VisitorScoreBeforePlay are candidates for the rm_cols drop list)

# IsOffenseWinning: strictly ahead (a tie is False)
df['IsOffenseWinning'] = df.OffenseScore > df.DefenseScore

# OffenseInOwnTerritory: FieldPosition names the offense's own team
df['OffenseInOwnTerritory'] = df.FieldPosition == df.OffenseTeam
# (FieldPosition is a candidate for the rm_cols drop list)

# OffenseRushingPosition: the Position found on each play's rusher row, joined back onto every row of that play
play_rushers = (
    df.loc[df.NflIdRusher == df.NflId, ['PlayId', 'Position']]
      .rename(columns={'Position': 'OffenseRushingPosition'})
)
df = df.merge(play_rushers, how='left', on='PlayId')

# OffenceFormation
def clean_offenceformation(of):
    """Encode an offense formation name as an integer code.

    Known names map to fixed codes (SHOTGUN=9 ... EMPTY=1); every other
    value, including a missing one, maps to 7.
    """
    formation_codes = {
        "SHOTGUN": 9,
        "SINGLEBACK": 8,
        "JUMBO": 6,
        "PISTOL": 5,
        "I_FORM": 4,
        "ACE": 3,
        "WILDCAT": 2,
        "EMPTY": 1,
    }
    return formation_codes.get(of, 7)
df['OffenseFormation'] = df['OffenseFormation'].apply(clean_offenceformation)

# NumberOfTEsOnPlay, NumberOfWRsOnPlay, NumberOfBacksOnPlay, ....
def create_generalposition(pos):
    """Map a detailed position code onto a coarse position group.

    Groups: 'DB' (defensive backs), 'DL' (defensive line), 'LB'
    (linebackers), 'WR', 'TE', 'OL' (offensive line) and 'OB' (offensive
    backs). Any code not listed below returns 'Other'.

    'NT' (nose tackle) is a defensive-line position, so it is grouped with
    'DL' rather than with the offensive line.
    """
    if pos in ('SS', 'FS', 'CB', 'DB'):
        return 'DB'
    if pos in ('DE', 'DT', 'NT', 'DL'):
        return 'DL'
    if pos in ('ILB', 'OLB', 'MLB', 'LB'):
        return 'LB'
    if pos == 'WR':
        return 'WR'
    if pos == 'TE':
        return 'TE'
    if pos in ('T', 'G', 'C', 'OL'):
        return 'OL'
    if pos in ('QB', 'RB', 'FB', 'HB', 'TB', 'WB'):
        return 'OB'
    return 'Other'
df['GeneralPosition'] = df['Position'].apply(create_generalposition)
# Pivot to find counts of each general position
# aggfunc=len counts the rows for every (PlayId, GeneralPosition) pair; pairs that
# never occur are filled with 0 via fill_value.
gen_pos_counts = df[['PlayId','GeneralPosition']].pivot_table(index='PlayId', columns='GeneralPosition', 
                                                              aggfunc=len, fill_value=0)
# Give each group's count column a descriptive name. 'Other' is not in the mapping,
# so if that column is present it keeps its original label.
gen_pos_counts = gen_pos_counts.rename(columns = 
                      {'DB':'NumberOfDBsOnPlay', 'DL':'NumberOfDLinemenOnPlay', 
                       'LB':'NumberOfLBsOnPlay', 'OB':'NumberOfBacksOnPlay',
                       'OL':'NumberOfOLinemenOnPlay', 'TE':'NumberOfTEsOnPlay',
                       'WR':'NumberOfWRsOnPlay'})
# Left-join the per-play counts back onto every row of that play
df = df.merge(gen_pos_counts, how='left', left_on='PlayId', right_on='PlayId')
#rm_cols += ['DefensePersonnel', 'OffensePersonnel']
# TimeBetweenSnapHandoff, Month, ...
def utc2sec(x):
    """Convert a timestamp such as '2017-09-08T00:44:06.000Z' to whole seconds
    since 1970-01-01T00:00:00.

    The whole timestamp is used, not only its seconds field, so the difference
    between two results is the real elapsed time even when the two instants
    fall in different minutes, hours or days.

    Args:
        x (str): timestamp formatted as '%Y-%m-%dT%H:%M:%S.%fZ'.

    Returns:
        int: seconds elapsed since the epoch (fractional part truncated).

    Raises:
        ValueError: if x does not match the expected format.
    """
    stamp = datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ")
    return int((stamp - datetime.datetime(1970, 1, 1)).total_seconds())
def timesnap2day(x):
    """Return the part of a timestamp string that precedes the first 'T' (its date portion)."""
    return x.split("T")[0]
        
# Seconds between the snap and the handoff
handoff_secs = df['TimeHandoff'].apply(utc2sec)
snap_secs = df['TimeSnap'].apply(utc2sec)
df['TimeBetweenSnapHandoff'] = handoff_secs - snap_secs

# Calendar features taken from the date portion of TimeSnap
snap_day = pd.to_datetime(df['TimeSnap'].apply(timesnap2day))
df['DayOfYear'] = snap_day.dt.dayofyear
df['DayOfWeek'] = snap_day.dt.dayofweek
df['MonthOfYear'] = df['TimeSnap'].str[5:7].astype(int)

# Part-of-day flags from the hour digits (characters 11-12) of TimeSnap
# NOTE(review): the hour is read straight from the timestamp text; the sample values end
# in 'Z', so these flags presumably follow UTC rather than stadium-local time — confirm.
snap_hour = df['TimeSnap'].str[11:13].astype(int)
df['Morning'] = ((snap_hour >= 0) & (snap_hour < 12)).astype(int)
df['Afternoon'] = ((snap_hour >= 12) & (snap_hour < 18)).astype(int)
df['Evening'] = ((snap_hour >= 18) & (snap_hour < 24)).astype(int)
# (TimeHandoff / TimeSnap are candidates for the rm_cols drop list)

# QuarterGameSecs, TotalGameSecsPlayed, HalfGameSecs
def gameclock2secs(x):
    """Convert a colon-separated clock string to seconds: 60 * first field + second field.

    Fields after the second one are ignored.
    """
    fields = x.split(":")
    minutes = int(fields[0])
    seconds = int(fields[1])
    return minutes * 60 + seconds

# Seconds left in the current quarter, read from GameClock
df['QuarterGameSecs'] = df['GameClock'].apply(gameclock2secs)

# Seconds elapsed so far: 900 per completed quarter plus what has run off this one
completed_quarter_secs = (df['Quarter'] - 1) * 900
df['TotalGameSecsPlayed'] = (900 - df['QuarterGameSecs']) + completed_quarter_secs

# Seconds left in the half: quarters 1 and 3 still have a full 900-second quarter to follow
first_quarter_of_half = df['Quarter'].isin([1, 3])
df['HalfGameSecsLeft'] = df['QuarterGameSecs'].where(~first_quarter_of_half,
                                                     900 + df['QuarterGameSecs'])
# (GameClock is a candidate for the rm_cols drop list)

# IsInEngland
# True when Location contains "london" in any letter case. A missing Location counts
# as False instead of raising inside a per-row lambda.
df['IsInEngland'] = df["Location"].str.lower().str.contains("london", regex=False, na=False)
#rm_cols += ['Location']

# StadiumType
# from https://www.kaggle.com/code1110/optimizing-lightgbm-hyperparameters
def group_stadium_types(stadium):
    """Encode a raw StadiumType label as an integer group code.

    Codes: 0 outdoor, 1 dome open, 2 indoor open, 3 indoor closed,
    4 dome closed, 5 anything not listed (unknown).
    """
    coded_groups = (
        # outdoor
        (0, ('Outdoor', 'Outdoors', 'Cloudy', 'Heinz Field', 'Outdor', 'Ourdoor',
             'Outside', 'Outddors', 'Outdoor Retr Roof-Open', 'Oudoor', 'Bowl')),
        # indoor closed
        (3, ('Indoors', 'Indoor', 'Indoor, Roof Closed', 'Indoor, Roof Closed',
             'Retractable Roof', 'Retr. Roof-Closed', 'Retr. Roof - Closed',
             'Retr. Roof Closed')),
        # indoor open
        (2, ('Indoor, Open Roof', 'Open', 'Retr. Roof-Open', 'Retr. Roof - Open')),
        # dome closed
        (4, ('Dome', 'Domed, closed', 'Closed Dome', 'Domed', 'Dome, closed')),
        # dome open
        (1, ('Domed, Open', 'Domed, open')),
    )
    for code, labels in coded_groups:
        if stadium in labels:
            return code
    return 5  # unknown
    
df['StadiumType'] = df['StadiumType'].apply(group_stadium_types)

# Turf
# from https://www.kaggle.com/c/nfl-big-data-bowl-2020/discussion/112681#latest-649087
# Raw surface labels grouped by kind; a label missing from both groups ends up as NaN.
natural_labels = (
    'Grass', 'Natural Grass', 'Natural grass', 'grass', 'Natural',
    'Naturall Grass', 'natural grass',
)
artificial_labels = (
    'Field Turf', 'A-Turf Titan', 'UBU Sports Speed S5-M', 'Artificial',
    'DD GrassMaster', 'UBU Speed Series-S5-M', 'FieldTurf', 'FieldTurf 360',
    'Artifical', 'FieldTurf360', 'Field turf', 'SISGrass', 'Twenty-Four/Seven Turf',
)
Turf = {label: 'Natural' for label in natural_labels}
Turf.update({label: 'Artificial' for label in artificial_labels})

# Label -> kind -> numeric flag (0 natural, 1 artificial)
df['Turf'] = df['Turf'].map(Turf).map({"Natural": 0, "Artificial": 1})

# GameWeather
# https://www.kaggle.com/code1110/optimizing-lightgbm-hyperparameters
def group_game_weather(weather):
    """Encode a raw GameWeather label as an integer group code.

    Codes: -2 snow, -1 rain, 0 none (indoor), 1 overcast, 2 clear.
    A label that is not listed in any group returns None.
    """
    coded_groups = (
        # rain
        (-1, (
            'Rainy', 'Rain Chance 40%', 'Showers',
            'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
            'Scattered Showers', 'Cloudy, Rain', 'Rain shower', 'Light Rain', 'Rain',
        )),
        # overcast
        (1, (
            'Cloudy, light snow accumulating 1-3"', 'Party Cloudy', 'Cloudy, chance of rain',
            'Coudy', 'Cloudy, 50% change of rain', 'Rain likely, temps in low 40s.',
            'Cloudy and cold', 'Cloudy, fog started developing in 2nd quarter',
            'Partly Clouidy', '30% Chance of Rain', 'Mostly Coudy', 'Cloudy and Cool',
            'cloudy', 'Partly cloudy', 'Overcast', 'Hazy', 'Mostly cloudy', 'Mostly Cloudy',
            'Partly Cloudy', 'Cloudy',
        )),
        # clear
        (2, (
            'Partly clear', 'Sunny and clear', 'Sun & clouds', 'Clear and Sunny',
            'Sunny and cold', 'Sunny Skies', 'Clear and Cool', 'Clear and sunny',
            'Sunny, highs to upper 80s', 'Mostly Sunny Skies', 'Cold',
            'Clear and warm', 'Sunny and warm', 'Clear and cold', 'Mostly sunny',
            'T: 51; H: 55; W: NW 10 mph', 'Clear Skies', 'Clear skies', 'Partly sunny',
            'Fair', 'Partly Sunny', 'Mostly Sunny', 'Clear', 'Sunny',
        )),
        # snow
        (-2, ('Heavy lake effect snow', 'Snow')),
        # none
        (0, ('N/A Indoor', 'Indoors', 'Indoor', 'N/A (Indoors)', 'Controlled Climate')),
    )
    for code, labels in coded_groups:
        if weather in labels:
            return code
    return None
df['GameWeather'] = df['GameWeather'].apply(group_game_weather)
    
# Temperature and Humidity: replace missing values with the column's own median
for climate_col in ('Temperature', 'Humidity'):
    column_median = df[climate_col].median()
    df[climate_col] = df[climate_col].fillna(column_median)

# WindSpeed
def clean_wind_speed(windspeed):
    """Parse a raw WindSpeed value into a number.

    Handled forms, tried in this order:
      * a number or numeric text ('8', 8, 8.0, '8.0') -> that value
        (returned as int when it is whole); NaN and infinities -> 0
      * a range, optionally followed by text ('10-20', '11-17 mph')
        -> the mean of the two bounds (float)
      * a leading integer followed by whitespace ('15 gusts up to 25') -> 15
      * an integer directly before 'mph' in any letter case ('12mph', '4 MPh')
        -> that integer
      * anything else ('Calm', 'SSW', missing) -> 0

    Args:
        windspeed: raw cell value, a string or a number.

    Returns:
        int or float: the parsed wind speed, 0 when nothing can be parsed.
    """
    # Local imports keep the helper self-contained
    import math
    import re

    ws = str(windspeed).strip()

    # Numeric input. Going through float() lets 8.0 / '8.0' parse, which a
    # digits-only test on the text rejects.
    try:
        value = float(ws)
    except ValueError:
        value = None
    if value is not None:
        if math.isnan(value) or math.isinf(value):
            return 0
        return int(value) if value.is_integer() else value

    # Range: take the mean of the two bounds; trailing units are ignored
    span = re.match(r'(\d+)\s*-\s*(\d+)', ws)
    if span:
        return (int(span.group(1)) + int(span.group(2))) / 2

    # Leading integer followed by free text
    leading = re.match(r'(\d+)\s', ws)
    if leading:
        return int(leading.group(1))

    # Integer attached to an mph unit, e.g. '10MPH' or '12mph'
    with_unit = re.search(r'(\d+)\s*mph', ws, flags=re.IGNORECASE)
    if with_unit:
        return int(with_unit.group(1))

    return 0
df['WindSpeed'] = df['WindSpeed'].apply(clean_wind_speed)

# WindDirection
# from https://www.kaggle.com/code1110/optimizing-lightgbm-hyperparameters
def clean_wind_direction(wind_direction):
    """Encode a raw WindDirection label as an angle in degrees.

    Encoding used by the return values: 0 east, 45 north east, 90 north,
    135 north west, 180 west, 225 south west, 270 south, 315 south east.
    A label written as "FROM <direction>" is encoded as the opposite
    direction (for example 'From S' -> 90, north).

    Args:
        wind_direction: raw cell value; it is upper-cased before matching.

    Returns:
        int or None: the encoded angle, or None when no rule matches
        (for example 'Calm', a bare number, or a missing value).
    """
    wd = str(wind_direction).upper()

    # Compound "FROM ..." labels go first: 'FROM S' is a substring of 'FROM SW',
    # 'FROM SSW' and 'FROM SE', so testing the single-letter form first would
    # swallow them and these branches would never run.
    if 'FROM SW' in wd or 'FROM SSW' in wd or 'FROM WSW' in wd:
        return 45 #'north east'
    if 'FROM SE' in wd or 'FROM SSE' in wd or 'FROM ESE' in wd:
        return 135 #'north west'
    if 'FROM NW' in wd or 'FROM NNW' in wd or 'FROM WNW' in wd:
        return 315 #'south east'
    if 'FROM NE' in wd or 'FROM NNE' in wd or 'FROM ENE' in wd:
        return 225 #'south west'

    # Single cardinal letters and the remaining "FROM <cardinal>" labels
    if wd == 'N' or 'FROM S' in wd:
        return 90 #'north'
    if wd == 'S' or 'FROM N' in wd:
        return 270 #'south'
    if wd == 'W' or 'FROM E' in wd:
        return 180 #'west'
    if wd == 'E' or 'FROM W' in wd:
        return 0 #'east'

    # Abbreviated or spelled-out intercardinal labels
    if 'NW' in wd or 'NORTHWEST' in wd:
        return 135 #'north west'
    # 'NORTHEAST' is accepted alongside 'NORTH EAST', matching the one-word
    # spellings the other three intercardinals already accept.
    if 'NE' in wd or 'NORTH EAST' in wd or 'NORTHEAST' in wd:
        return 45 #'north east'
    if 'SW' in wd or 'SOUTHWEST' in wd:
        return 225 #'south west'
    if 'SE' in wd or 'SOUTHEAST' in wd:
        return 315 #'south east'

    return None
df['WindDirection'] = df['WindDirection'].apply(clean_wind_direction)

Quarter                    1
GameClock           03:03:00
QuarterGameSecs          183
HalfGameSecsLeft        1083
Name: 256, dtype: object

In [27]:
df

Unnamed: 0,GameId,PlayId,Team,X,Y,S,A,Dis,Orientation,Dir,NflId,DisplayName,JerseyNumber,Season,YardLine,Quarter,GameClock,OffenseTeam,Down,Distance,FieldPosition,HomeScoreBeforePlay,VisitorScoreBeforePlay,NflIdRusher,OffenseFormation,OffensePersonnel,DefendersInTheBox,DefensePersonnel,PlayDirection,TimeHandoff,TimeSnap,Yards,PlayerHeight,PlayerWeight,PlayerBirthDate,PlayerCollegeName,Position,HomeTeamAbbr,VisitorTeamAbbr,Week,Stadium,Location,StadiumType,Turf,GameWeather,Temperature,Humidity,WindSpeed,WindDirection,ToLeft,IsBallCarrier,Dir_rad,TeamOnOffense,IsOnOffense,YardLine_std,X_std,Y_std,Orientation_std,Dir_std,DefenseTeam,IsOffenseAtHome,OffenseScore,DefenseScore,IsOffenseWinning,OffenseInOwnTerritory,OffenseRushingPosition,GeneralPosition,NumberOfDBsOnPlay,NumberOfDLinemenOnPlay,NumberOfLBsOnPlay,NumberOfBacksOnPlay,NumberOfOLinemenOnPlay,NumberOfTEsOnPlay,NumberOfWRsOnPlay,TimeBetweenSnapHandoff,DayOfYear,DayOfWeek,MonthOfYear,Morning,Afternoon,Evening,QuarterGameSecs,TotalGameSecsPlayed,IsInEngland,HalfGameSecsLeft
0,2017090700,20170907000118,away,73.91,34.84,1.69,1.13,0.40,81.99,177.18,496723,Eric Berry,29,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,9,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-0,212,12/29/1988,Tennessee,SS,NE,KC,1,Gillette Stadium,"Foxborough, MA",0,1,2.0,63.0,77.0,8.0,225.0,True,False,4.761607,home,False,35,46.09,18.493333,-8.01,1.620015,KC,True,0,0,False,True,RB,DB,6,4,1,2,5,1,3,1,251,4,9,1,0,0,854,46,False,1754
1,2017090700,20170907000118,away,74.67,32.64,0.42,1.35,0.01,27.61,198.70,2495116,Allen Bailey,97,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,9,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-3,288,03/25/1989,Miami,DE,NE,KC,1,Gillette Stadium,"Foxborough, MA",0,1,2.0,63.0,77.0,8.0,225.0,True,False,4.386012,home,False,35,45.33,20.693333,-62.39,1.244420,KC,True,0,0,False,True,RB,DL,6,4,1,2,5,1,3,1,251,4,9,1,0,0,854,46,False,1754
2,2017090700,20170907000118,away,74.00,33.20,1.22,0.59,0.31,3.01,202.73,2495493,Justin Houston,50,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,9,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-3,270,01/21/1989,Georgia,DE,NE,KC,1,Gillette Stadium,"Foxborough, MA",0,1,2.0,63.0,77.0,8.0,225.0,True,False,4.315676,home,False,35,46.00,20.133333,-86.99,1.174083,KC,True,0,0,False,True,RB,DL,6,4,1,2,5,1,3,1,251,4,9,1,0,0,854,46,False,1754
3,2017090700,20170907000118,away,71.46,27.70,0.42,0.54,0.02,359.77,105.64,2506353,Derrick Johnson,56,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,9,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-3,245,11/22/1982,Texas,ILB,NE,KC,1,Gillette Stadium,"Foxborough, MA",0,1,2.0,63.0,77.0,8.0,225.0,True,False,6.010216,home,False,35,48.54,25.633333,269.77,2.868623,KC,True,0,0,False,True,RB,LB,6,4,1,2,5,1,3,1,251,4,9,1,0,0,854,46,False,1754
4,2017090700,20170907000118,away,69.32,35.42,1.82,2.43,0.16,12.63,164.31,2530794,Ron Parker,38,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,9,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-0,206,08/17/1987,Newberry,FS,NE,KC,1,Gillette Stadium,"Foxborough, MA",0,1,2.0,63.0,77.0,8.0,225.0,True,False,4.986231,home,False,35,50.68,17.913333,-77.37,1.844638,KC,True,0,0,False,True,RB,DB,6,4,1,2,5,1,3,1,251,4,9,1,0,0,854,46,False,1754
5,2017090700,20170907000118,away,75.06,24.00,1.01,0.32,0.18,308.34,95.01,2543494,Dee Ford,55,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,9,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-2,252,03/19/1991,Auburn,DE,NE,KC,1,Gillette Stadium,"Foxborough, MA",0,1,2.0,63.0,77.0,8.0,225.0,True,False,6.195744,home,False,35,44.94,29.333333,218.34,3.054152,KC,True,0,0,False,True,RB,DL,6,4,1,2,5,1,3,1,251,4,9,1,0,0,854,46,False,1754
6,2017090700,20170907000118,away,74.11,16.64,1.11,0.83,0.02,357.23,322.59,2543637,Terrance Mitchell,39,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,9,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,5-11,190,05/17/1992,Oregon,CB,NE,KC,1,Gillette Stadium,"Foxborough, MA",0,1,2.0,63.0,77.0,8.0,225.0,True,False,2.223724,home,False,35,45.89,36.693333,267.23,5.365317,KC,True,0,0,False,True,RB,DB,6,4,1,2,5,1,3,1,251,4,9,1,0,0,854,46,False,1754
7,2017090700,20170907000118,away,73.37,18.73,1.24,0.74,0.13,328.52,270.04,2543851,Phillip Gaines,23,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,9,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-0,193,04/04/1991,Rice,CB,NE,KC,1,Gillette Stadium,"Foxborough, MA",0,1,2.0,63.0,77.0,8.0,225.0,True,False,3.140895,home,False,35,46.63,34.603333,238.52,6.282487,KC,True,0,0,False,True,RB,DB,6,4,1,2,5,1,3,1,251,4,9,1,0,0,854,46,False,1754
8,2017090700,20170907000118,away,56.63,26.90,0.26,1.86,0.28,344.70,55.31,2550257,Daniel Sorensen,49,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,9,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-2,208,03/05/1990,Brigham Young,SS,NE,KC,1,Gillette Stadium,"Foxborough, MA",0,1,2.0,63.0,77.0,8.0,225.0,True,False,0.605455,home,False,35,63.37,26.433333,254.70,3.747047,KC,True,0,0,False,True,RB,DB,6,4,1,2,5,1,3,1,251,4,9,1,0,0,854,46,False,1754
9,2017090700,20170907000118,away,73.35,38.83,4.55,0.76,0.51,75.47,190.84,2552488,Marcus Peters,22,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,9,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-0,197,01/09/1993,Washington,CB,NE,KC,1,Gillette Stadium,"Foxborough, MA",0,1,2.0,63.0,77.0,8.0,225.0,True,False,4.523195,home,False,35,46.65,14.503333,-14.53,1.381603,KC,True,0,0,False,True,RB,DB,6,4,1,2,5,1,3,1,251,4,9,1,0,0,854,46,False,1754


In [None]:
# PlayerHeight_inches
def height2inch(x):
    """Convert a 'feet-inches' string such as '6-3' to a total number of inches."""
    parts = x.split("-")
    feet = int(parts[0])
    inches = int(parts[1])
    return feet * 12 + inches
# Store the converted height in its own column on df. No frame named X exists in this
# notebook, and keeping the raw 'PlayerHeight' strings makes the cell safe to re-run.
df['PlayerHeight_inches'] = df['PlayerHeight'].apply(height2inch)

In [None]:
### Flatten df to one row per play

# Each play is expected to occupy 22 consecutive rows, so the first row of every
# 22-row block supplies that play's values. The row count is taken from df
# instead of being hard-coded.
ROWS_PER_PLAY = 22
assert len(df) % ROWS_PER_PLAY == 0, "df length is not a multiple of 22 rows per play"

# Only numeric/boolean columns can live in a float matrix; text columns would
# raise when written into it.
features = list(df.select_dtypes(include=['number', 'bool']).columns)
train_df = df[features].iloc[::ROWS_PER_PLAY].reset_index(drop=True)
print(train_df.shape)

# One vectorized conversion replaces the per-cell Python loop
train_data = train_df.values.astype(float)
y_train_ = train_df["Yards"].values

X_train = pd.DataFrame(data=train_data, columns=features)

# The target must not remain among the model inputs
features = [f for f in features if f not in ["Yards"]]
X_train = X_train[features]
print(X_train.shape)

# Standardize the target. StandardScaler is imported by name at the top of the
# notebook, and the builtin float replaces the removed np.float alias.
y_train = y_train_.astype(float)
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train.reshape(-1, 1)).ravel()

# Count standardized targets in unit-wide bins labelled -99..99 (bin = int(y + 99))
yard_bin_counts = np.bincount((y_train + 99).astype(int), minlength=199)
plt.plot(np.arange(len(yard_bin_counts)) - 99, yard_bin_counts);

In [None]:
# Number the rows inside each play: 1 for the first row of a PlayId, 2 for the next, ...
df['row_number'] = df.groupby(['PlayId']).cumcount() + 1
# Extended dataframe: one row per PlayId with one 'X' column per row_number
# NOTE(review): pivot_table aggregates with its default aggfunc; presumably each
# (PlayId, row_number) pair holds exactly one row, so nothing is really averaged — confirm.
new_df = df.pivot_table(index=['PlayId'], columns=['row_number'], values=['X'])

In [None]:
# Join the first two levels of every column label into one string, e.g. ('X', 1) -> 'X1'
columns = [str(label[0]) + str(label[1]) for label in new_df.columns]
new_df.columns = columns

In [None]:
# If you want to add more variables just add them:
new_df = df.pivot_table(index=['PlayId'], columns=['row_number'], values=['X', 'Y', 'S'])

The below functions will plot plays onto a football field.

In [None]:
def create_football_field(linenumbers=True,
                          endzones=True,
                          highlight_line=False,
                          highlight_line_number=50,
                          highlighted_name='Line of Scrimmage',
                          fifty_is_los=False,
                          figsize=(12*2, 6.33*2)):
    """
    Function that plots the football field for viewing plays.
    Allows for showing or hiding endzones.

    The field is drawn as a 120 x 53.3 rectangle with a 10-unit end zone at
    each end, so yard line N sits at x = N + 10.

    Args:
        linenumbers (bool): draw the yard numbers along both sidelines.
        endzones (bool): shade the two end zones; also limits the hash marks
            to x = 11..109.
        highlight_line (bool): draw a yellow vertical line with a label.
        highlight_line_number (int): yard line to highlight (10 is added to
            obtain the x position).
        highlighted_name (str): label written next to the highlighted line.
        fifty_is_los (bool): draw a gold line at x = 60 labelled as the
            player yardline at snap.
        figsize (tuple): figure size passed to plt.subplots.

    Returns:
        tuple: the (fig, ax) pair the field was drawn on.
    """
    rect = patches.Rectangle((0, 0), 120, 53.3, linewidth=0.1,
                             edgecolor='r', facecolor='darkgreen', zorder=0,  alpha=0.5)

    fig, ax = plt.subplots(1, figsize=figsize)
    ax.add_patch(rect)
    # A single polyline that zig-zags along every 10-yard line and then closes
    # the outer boundary of the field.
    plt.plot([10, 10, 10, 20, 20, 30, 30, 40, 40, 50, 50, 60, 60, 70, 70, 80,
              80, 90, 90, 100, 100, 110, 110, 120, 0, 0, 120, 120],
             [0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3,
              53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 53.3, 0, 0, 53.3],
             color='white')
    if fifty_is_los:
        plt.plot([60, 60], [0, 53.3], color='gold')
        plt.text(62, 50, '<- Player Yardline at Snap', color='gold')
    # Endzones
    if endzones:
        ez1 = patches.Rectangle((0, 0), 10, 53.3,
                                linewidth=0.1,
                                edgecolor='r',
                                facecolor='blue',
                                alpha=0.2,
                                zorder=0)
        # Rectangle takes (xy, width, height): the right end zone starts at
        # x = 110 and is 10 wide, so it ends exactly at the field edge (120).
        ez2 = patches.Rectangle((110, 0), 10, 53.3,
                                linewidth=0.1,
                                edgecolor='r',
                                facecolor='blue',
                                alpha=0.2,
                                zorder=0)
        ax.add_patch(ez1)
        ax.add_patch(ez2)
    plt.xlim(0, 120)
    plt.ylim(-5, 58.3)
    plt.axis('off')
    if linenumbers:
        for x in range(20, 110, 10):
            # Past midfield the numbers count back down (60 -> 40, 70 -> 30, ...)
            numb = x
            if x > 50:
                numb = 120 - x
            plt.text(x, 5, str(numb - 10),
                     horizontalalignment='center',
                     fontsize=20,  # fontname='Arial',
                     color='white')
            # Same number on the far sideline, rotated to face the other way
            plt.text(x - 0.95, 53.3 - 5, str(numb - 10),
                     horizontalalignment='center',
                     fontsize=20,  # fontname='Arial',
                     color='white', rotation=180)
    if endzones:
        hash_range = range(11, 110)
    else:
        hash_range = range(1, 120)
    # Four short tick marks per yard: one at each sideline and two inner rows
    for x in hash_range:
        ax.plot([x, x], [0.4, 0.7], color='white')
        ax.plot([x, x], [53.0, 52.5], color='white')
        ax.plot([x, x], [22.91, 23.57], color='white')
        ax.plot([x, x], [29.73, 30.39], color='white')
    if highlight_line:
        hl = highlight_line_number + 10
        plt.plot([hl, hl], [0, 53.3], color='yellow')
        plt.text(hl + 2, 50, '<- {}'.format(highlighted_name),
                 color='yellow')
    return fig, ax
#create_football_field()

def get_dx_dy(radian_angle, dist):
    """Resolve a length `dist` pointing along `radian_angle` (radians) into (dx, dy)."""
    return dist * math.cos(radian_angle), dist * math.sin(radian_angle)
def show_play(play_id, df=df):
    """Plot every player of one play on a football field and mark the ball carrier.

    Players are coloured by team (home / away), the rusher is drawn in black
    with an arrow whose direction comes from Dir_rad and whose length is the
    rusher's speed S, and the title reports the play direction and the yards
    gained.

    Args:
        play_id: value of the PlayId column identifying the play.
        df (DataFrame): tracking data; defaults to the notebook's df as it
            was when this function was defined.

    Raises:
        IndexError: if df holds no rusher row for play_id.
    """
    play = df[df.PlayId == play_id]
    rusher_row = play[play.NflIdRusher == play.NflId]

    # Read the rusher's values before any drawing, so an unknown play_id fails
    # without leaving an empty field figure behind.
    yards_covered = rusher_row["Yards"].values[0]
    x = rusher_row["X"].values[0]
    y = rusher_row["Y"].values[0]
    rusher_dir = rusher_row["Dir_rad"].values[0]
    rusher_speed = rusher_row["S"].values[0]

    _, ax = create_football_field()

    # One labelled scatter per team so the legend has entries to show. The two
    # colours are the ends of the 'rainbow' colormap: home at 0, away at 1.
    cmap = plt.get_cmap('rainbow')
    is_away = play.Team != 'home'
    for away_flag, team_label in ((False, 'home'), (True, 'away')):
        team_rows = play[is_away == away_flag]
        ax.scatter(team_rows.X, team_rows.Y, color=[cmap(float(away_flag))],
                   s=100, label=team_label)
    ax.scatter(rusher_row.X, rusher_row.Y, color='black', label='ball carrier')

    dx, dy = get_dx_dy(rusher_dir, rusher_speed)
    ax.arrow(x, y, dx, dy, length_includes_head=True, width=0.3, color='black')

    left = 'left' if play.ToLeft.sum() > 0 else 'right'
    ax.set_title(f'Play # {play_id} moving to {left}, yard distance is {yards_covered}', fontsize=20)
    ax.legend()
    plt.show()

In [None]:
show_play(20171224060264)

In [None]:
# Input: the df DataFrame holding all rows from train.csv.
# Output: `summary`, indexed by PlayId, whose distance_to_runner column holds the
# smallest distance from a team's players to the runner.
# `data` keeps the distance from every player to the runner, a base for further features.

# Rusher rows only, then every row of a play is joined to its rusher's coordinates (X_r, Y_r)
plays = df[df['NflId'].eq(df['NflIdRusher'])]
data = df.merge(plays[['PlayId', 'X', 'Y']], on='PlayId', suffixes=('', '_r'))

# Straight-line distance between each player and the runner
delta_x = data['X'] - data['X_r']
delta_y = data['Y'] - data['Y_r']
data['distance_to_runner'] = (delta_x**2 + delta_y**2) ** 0.5

# Closest player per (play, team). A group whose minimum is exactly 0 is dropped:
# the runner is at distance 0 from itself.
summary = data.groupby(['PlayId', 'Team'])[['distance_to_runner']].min()
summary = summary[summary['distance_to_runner'] != 0]
summary.index = summary.index.get_level_values(0)

In [None]:
# I wrote the helper function below to combine all files within a 
# folder into a single file to submit in Kaggle.

def generate_submission(path=None, closing_file='_closing_submission.py',
                        submission_file='submissions/submission.py'):
    """This function combines all PY files into a single submission file, to
    be uploaded as a single script in Kaggle.

    Every '*.py' file directly inside `path` whose name does not start with
    '_' is appended in alphabetical order. Lines starting with 'from nfl.'
    are dropped, and copying of a file stops at its first line starting with
    'if __name__ == '. A separator comment follows each file, and the
    closing file is appended last.

    Args:
        path (str): The path from within combine PY files. If None (or if it
            does not exist), will use the current working directory
        closing_file (str): The last file to be added to the submission file,
            resolved relative to `path`. It contains the final function to
            be executed in the script.
        submission_file (str): The final submission file, resolved relative
            to `path`; its directory is created when missing.

    Raises:
        ValueError: if the closing file does not exist. This is checked
            before anything is written, so no partial output is left behind.

    """
    import os  # local import: the notebook's import cell does not bring in os

    if path is None or not os.path.exists(path):
        path = os.getcwd()

    # Validate the same location that is opened at the end
    closing_path = os.path.join(path, closing_file)
    if not os.path.exists(closing_path):
        raise ValueError('Closing file does not exist!')

    dest_path = os.path.join(path, submission_file)
    dest_dir = os.path.dirname(dest_path)
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)

    separator = '\n\n' + '#' + '*' * 79 + '\n\n'

    # Context managers close every handle even when a read or write fails
    with open(dest_path, "w") as dest:
        # Sorted so the combined script does not depend on directory listing order
        for filename in sorted(os.listdir(path)):
            if not filename.endswith(".py") or filename.startswith('_'):
                continue
            source_path = os.path.join(path, filename)
            # Never copy the output file into itself
            if os.path.abspath(source_path) == os.path.abspath(dest_path):
                continue

            with open(source_path, "r") as f:
                for line in f:
                    if line.startswith('from nfl.'):
                        continue
                    if line.startswith("if __name__ == "):
                        break

                    dest.write(line)

            dest.write(separator)

        with open(closing_path, "r") as f:
            dest.write(f.read())
    
    
    # Better organizing the code in a proper file structure helped 
    # me figure out faster & better ways to develop & improve my algorithms. It enabled my very final code becoming something very neat:
# NOTE(review): Dataset, KerasModel, XGBModel, LGBModel, CatBoostModel, Ensemble and
# nflrush are not defined or imported in the part of the notebook visible here, so this
# cell presumably relies on an external package — confirm before running.
# Four models, each built with the same number of splits, are combined into one ensemble
# that is trained and then used to make the submission.
n_splits = 5
dataset = Dataset('/kaggle/input/nfl-big-data-bowl-2020/train.csv')
model1 = KerasModel(n_splits=n_splits, input_dim=103)
model2 = XGBModel(n_splits=n_splits)
model3 = LGBModel(n_splits=n_splits)
model4 = CatBoostModel(n_splits=n_splits)
ensemble = Ensemble(models=[model1, model2, model3, model4], dataset=dataset)
ensemble.train()
env = nflrush.make_env()
ensemble.make_submission(env)

In [None]:
# I saw in another thread an implementation of CRPS score with 2 loops. 
# That's very slow, very inefficient. It follows a vectorized implementation, very fast:

def crps(y_pred, y_real):
    """This function takes as input the Y predictions (the distribution
    probability, shape N x 199) and the Y real (the actual completed yards,
    shape N) and outputs the Continuous Ranked Probability Score
    (CRPS). For more information, check:
    https://www.kaggle.com/c/nfl-big-data-bowl-2020/overview/evaluation

    Args:
        y_pred: Cumulative distribution probabilities predicted by the
            model, shape N x 199, where N is the number of samples. Column j
            corresponds to j - 99 yards. Any array-like is accepted.
        y_real: Real yards completed (target), shape N. Any array-like is
            accepted (list, ndarray, pandas Series).

    Returns:
        float: the Continuous Ranked Probability Score (CRPS)

    """
    y_pred = np.asarray(y_pred, dtype=float)
    yard_grid = np.arange(-99, 100)
    actual = np.asarray(y_real).reshape(y_pred.shape[0], 1)

    # Broadcasting (N, 1) against (199,) yields the N x 199 step matrix
    # without stacking N copies of the grid. The step is 1 wherever
    # grid >= actual yards (the second argument sets the value at equality).
    step = np.heaviside(yard_grid - actual, 1)

    return np.mean((y_pred - step) ** 2)