In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import datetime

from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 150)

sns.set_style('darkgrid')
mpl.rcParams['figure.figsize'] = [15,10]

In [2]:
from sklearn.ensemble import RandomForestRegressor,VotingClassifier, VotingRegressor
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold

In [3]:
# Model NN Keras
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.callbacks import Callback, EarlyStopping
from keras import backend as K
from keras import regularizers
import tensorflow as tf

Using TensorFlow backend.


In [4]:
#train = pd.read_csv('../input/nfl-big-data-bowl-2020/train.csv', dtype={'WindSpeed': 'object'})
train = pd.read_csv('../data/train.csv', dtype={'WindSpeed': 'object'})
#train = train[:2200]
print(train.shape)
train.head()

(509762, 49)


Unnamed: 0,GameId,PlayId,Team,X,Y,S,A,Dis,Orientation,Dir,NflId,DisplayName,JerseyNumber,Season,YardLine,Quarter,GameClock,PossessionTeam,Down,Distance,FieldPosition,HomeScoreBeforePlay,VisitorScoreBeforePlay,NflIdRusher,OffenseFormation,OffensePersonnel,DefendersInTheBox,DefensePersonnel,PlayDirection,TimeHandoff,TimeSnap,Yards,PlayerHeight,PlayerWeight,PlayerBirthDate,PlayerCollegeName,Position,HomeTeamAbbr,VisitorTeamAbbr,Week,Stadium,Location,StadiumType,Turf,GameWeather,Temperature,Humidity,WindSpeed,WindDirection
0,2017090700,20170907000118,away,73.91,34.84,1.69,1.13,0.4,81.99,177.18,496723,Eric Berry,29,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-0,212,12/29/1988,Tennessee,SS,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
1,2017090700,20170907000118,away,74.67,32.64,0.42,1.35,0.01,27.61,198.7,2495116,Allen Bailey,97,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-3,288,03/25/1989,Miami,DE,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
2,2017090700,20170907000118,away,74.0,33.2,1.22,0.59,0.31,3.01,202.73,2495493,Justin Houston,50,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-3,270,01/21/1989,Georgia,DE,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
3,2017090700,20170907000118,away,71.46,27.7,0.42,0.54,0.02,359.77,105.64,2506353,Derrick Johnson,56,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-3,245,11/22/1982,Texas,ILB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
4,2017090700,20170907000118,away,69.32,35.42,1.82,2.43,0.16,12.63,164.31,2530794,Ron Parker,38,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-0,206,08/17/1987,Newberry,FS,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW


## Feature engineering

In [5]:
#https://www.kaggle.com/rooshroosh/fork-of-neural-networks-different-architecture
def strtoseconds(txt):
    txt = txt.split(':')
    ans = int(txt[0])*60 + int(txt[1]) + int(txt[2])/60
    return ans

def strtofloat(x):
    try:
        return float(x)
    except:
        return -1

def map_weather(txt):
    ans = 1
    if pd.isna(txt):
        return 0
    if 'partly' in txt:
        ans*=0.5
    if 'climate controlled' in txt or 'indoor' in txt:
        return ans*3
    if 'sunny' in txt or 'sun' in txt:
        return ans*2
    if 'clear' in txt:
        return ans
    if 'cloudy' in txt:
        return -ans
    if 'rain' in txt or 'rainy' in txt:
        return -2*ans
    if 'snow' in txt:
        return -3*ans
    return 0

def OffensePersonnelSplit(x):
    dic = {'DB' : 0, 'DL' : 0, 'LB' : 0, 'OL' : 0, 'QB' : 0, 'RB' : 0, 'TE' : 0, 'WR' : 0}
    for xx in x.split(","):
        xxs = xx.split(" ")
        dic[xxs[-1]] = int(xxs[-2])
    return dic

def DefensePersonnelSplit(x):
    dic = {'DB' : 0, 'DL' : 0, 'LB' : 0, 'OL' : 0}
    for xx in x.split(","):
        xxs = xx.split(" ")
        dic[xxs[-1]] = int(xxs[-2])
    return dic

def orientation_to_cat(x):
    x = np.clip(x, 0, 360 - 1)
    try:
        return str(int(x/15))
    except:
        return "nan"

In [6]:
def preprocess(train):
    ## GameClock
    train['GameClock_sec'] = train['GameClock'].apply(strtoseconds)
    train["GameClock_minute"] = train["GameClock"].apply(lambda x : x.split(":")[0]).astype("object")

    ## Height
    train['PlayerHeight_dense'] = train['PlayerHeight'].apply(lambda x: 12*int(x.split('-')[0])+int(x.split('-')[1]))

    ## Time
    train['TimeHandoff'] = train['TimeHandoff'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))
    train['TimeSnap'] = train['TimeSnap'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))

    train['TimeDelta'] = train.apply(lambda row: (row['TimeHandoff'] - row['TimeSnap']).total_seconds(), axis=1)
    train['PlayerBirthDate'] = train['PlayerBirthDate'].apply(lambda x: datetime.datetime.strptime(x, "%m/%d/%Y"))

    ## Age
    seconds_in_year = 60*60*24*365.25
    train['PlayerAge'] = train.apply(lambda row: (row['TimeHandoff']-row['PlayerBirthDate']).total_seconds()/seconds_in_year, axis=1)
    train["PlayerAge_ob"] = train['PlayerAge'].astype(np.int).astype("object")

    ## WindSpeed
    train['WindSpeed_ob'] = train['WindSpeed'].apply(lambda x: x.lower().replace('mph', '').strip() if not pd.isna(x) else x)
    train['WindSpeed_ob'] = train['WindSpeed_ob'].apply(lambda x: (int(x.split('-')[0])+int(x.split('-')[1]))/2 if not pd.isna(x) and '-' in x else x)
    train['WindSpeed_ob'] = train['WindSpeed_ob'].apply(lambda x: (int(x.split()[0])+int(x.split()[-1]))/2 if not pd.isna(x) and type(x)!=float and 'gusts up to' in x else x)
    train['WindSpeed_dense'] = train['WindSpeed_ob'].apply(strtofloat)

    ## Weather
    train['GameWeather_process'] = train['GameWeather'].str.lower()
    train['GameWeather_process'] = train['GameWeather_process'].apply(lambda x: "indoor" if not pd.isna(x) and "indoor" in x else x)
    train['GameWeather_process'] = train['GameWeather_process'].apply(lambda x: x.replace('coudy', 'cloudy').replace('clouidy', 'cloudy').replace('party', 'partly') if not pd.isna(x) else x)
    train['GameWeather_process'] = train['GameWeather_process'].apply(lambda x: x.replace('clear and sunny', 'sunny and clear') if not pd.isna(x) else x)
    train['GameWeather_process'] = train['GameWeather_process'].apply(lambda x: x.replace('skies', '').replace("mostly", "").strip() if not pd.isna(x) else x)
    train['GameWeather_dense'] = train['GameWeather_process'].apply(map_weather)

    ## Rusher
    train['IsRusher'] = (train['NflId'] == train['NflIdRusher'])
    train['IsRusher_ob'] = (train['NflId'] == train['NflIdRusher']).astype("object")
    temp = train[train["IsRusher"]][["Team", "PlayId"]].rename(columns={"Team":"RusherTeam"})
    train = train.merge(temp, on = "PlayId")
    train["IsRusherTeam"] = train["Team"] == train["RusherTeam"]

    ## dense -> categorical
    train["Quarter_ob"] = train["Quarter"].astype("object")
    train["Down_ob"] = train["Down"].astype("object")
    train["JerseyNumber_ob"] = train["JerseyNumber"].astype("object")
    train["YardLine_ob"] = train["YardLine"].astype("object")
    # train["DefendersInTheBox_ob"] = train["DefendersInTheBox"].astype("object")
    # train["Week_ob"] = train["Week"].astype("object")
    # train["TimeDelta_ob"] = train["TimeDelta"].astype("object")


    ## Orientation and Dir
    train["Orientation_ob"] = train["Orientation"].apply(lambda x : orientation_to_cat(x)).astype("object")
    train["Dir_ob"] = train["Dir"].apply(lambda x : orientation_to_cat(x)).astype("object")

    train["Orientation_sin"] = train["Orientation"].apply(lambda x : np.sin(x/360 * 2 * np.pi))
    train["Orientation_cos"] = train["Orientation"].apply(lambda x : np.cos(x/360 * 2 * np.pi))
    train["Dir_sin"] = train["Dir"].apply(lambda x : np.sin(x/360 * 2 * np.pi))
    train["Dir_cos"] = train["Dir"].apply(lambda x : np.cos(x/360 * 2 * np.pi))

    ## diff Score
    train["diffScoreBeforePlay"] = train["HomeScoreBeforePlay"] - train["VisitorScoreBeforePlay"]
    train["diffScoreBeforePlay_binary_ob"] = (train["HomeScoreBeforePlay"] > train["VisitorScoreBeforePlay"]).astype("object")

    ## Turf
    Turf = {'Field Turf':'Artificial', 'A-Turf Titan':'Artificial', 'Grass':'Natural', 'UBU Sports Speed S5-M':'Artificial', 'Artificial':'Artificial', 'DD GrassMaster':'Artificial', 'Natural Grass':'Natural', 'UBU Speed Series-S5-M':'Artificial', 'FieldTurf':'Artificial', 'FieldTurf 360':'Artificial', 'Natural grass':'Natural', 'grass':'Natural', 'Natural':'Natural', 'Artifical':'Artificial', 'FieldTurf360':'Artificial', 'Naturall Grass':'Natural', 'Field turf':'Artificial', 'SISGrass':'Artificial', 'Twenty-Four/Seven Turf':'Artificial', 'natural grass':'Natural'} 
    train['Turf'] = train['Turf'].map(Turf)

    ## OffensePersonnel
    temp = train["OffensePersonnel"].iloc[np.arange(0, len(train), 22)].apply(lambda x : pd.Series(OffensePersonnelSplit(x)))
    temp.columns = ["Offense" + c for c in temp.columns]
    temp["PlayId"] = train["PlayId"].iloc[np.arange(0, len(train), 22)]
    train = train.merge(temp, on = "PlayId")

    ## DefensePersonnel
    temp = train["DefensePersonnel"].iloc[np.arange(0, len(train), 22)].apply(lambda x : pd.Series(DefensePersonnelSplit(x)))
    temp.columns = ["Defense" + c for c in temp.columns]
    temp["PlayId"] = train["PlayId"].iloc[np.arange(0, len(train), 22)]
    train = train.merge(temp, on = "PlayId")

    ## sort
#     train = train.sort_values(by = ['X']).sort_values(by = ['Dis']).sort_values(by=['PlayId', 'Team', 'IsRusher']).reset_index(drop = True)
    train = train.sort_values(by = ['X']).sort_values(by = ['Dis']).sort_values(by=['PlayId', 'IsRusherTeam', 'IsRusher']).reset_index(drop = True)
    return train

In [7]:
%%time
df_train = preprocess(train)

CPU times: user 2min 6s, sys: 4.37 s, total: 2min 10s
Wall time: 1min 55s


In [8]:
## DisplayName remove Outlier
v = train["DisplayName"].value_counts()
missing_values = list(v[v < 5].index)
train["DisplayName"] = train["DisplayName"].where(~train["DisplayName"].isin(missing_values), "nan")

## PlayerCollegeName remove Outlier
v = train["PlayerCollegeName"].value_counts()
missing_values = list(v[v < 10].index)
train["PlayerCollegeName"] = train["PlayerCollegeName"].where(~train["PlayerCollegeName"].isin(missing_values), "nan")

In [9]:
df_train.head()

Unnamed: 0,GameId,PlayId,Team,X,Y,S,A,Dis,Orientation,Dir,NflId,DisplayName,JerseyNumber,Season,YardLine,Quarter,GameClock,PossessionTeam,Down,Distance,FieldPosition,HomeScoreBeforePlay,VisitorScoreBeforePlay,NflIdRusher,OffenseFormation,OffensePersonnel,DefendersInTheBox,DefensePersonnel,PlayDirection,TimeHandoff,TimeSnap,Yards,PlayerHeight,PlayerWeight,PlayerBirthDate,PlayerCollegeName,Position,HomeTeamAbbr,VisitorTeamAbbr,Week,Stadium,Location,StadiumType,Turf,GameWeather,Temperature,Humidity,WindSpeed,WindDirection,GameClock_sec,GameClock_minute,PlayerHeight_dense,TimeDelta,PlayerAge,PlayerAge_ob,WindSpeed_ob,WindSpeed_dense,GameWeather_process,GameWeather_dense,IsRusher,IsRusher_ob,RusherTeam,IsRusherTeam,Quarter_ob,Down_ob,JerseyNumber_ob,YardLine_ob,Orientation_ob,Dir_ob,Orientation_sin,Orientation_cos,Dir_sin,Dir_cos,diffScoreBeforePlay,diffScoreBeforePlay_binary_ob,OffenseDB,OffenseDL,OffenseLB,OffenseOL,OffenseQB,OffenseRB,OffenseTE,OffenseWR,DefenseDB,DefenseDL,DefenseLB,DefenseOL
0,2017090700,20170907000118,away,74.15,28.9,0.72,0.73,0.01,342.58,274.14,2556369,Chris Jones,95,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08 00:44:06,2017-09-08 00:44:05,8,6-6,308,1994-07-03,Mississippi State,DT,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Artificial,Clear and warm,63.0,77.0,8,SW,854.0,14,78,1.0,23.184204,23,8,8.0,clear and warm,1.0,False,False,home,False,1,3,95,35,22,18,-0.299374,0.954136,-0.997391,0.072194,0,False,0,0,0,0,0,1,1,3,6,2,3,0
1,2017090700,20170907000118,away,74.67,32.64,0.42,1.35,0.01,27.61,198.7,2495116,Allen Bailey,97,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08 00:44:06,2017-09-08 00:44:05,8,6-3,288,1989-03-25,Miami,DE,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Artificial,Clear and warm,63.0,77.0,8,SW,854.0,14,75,1.0,28.457305,28,8,8.0,clear and warm,1.0,False,False,home,False,1,3,97,35,1,13,0.463451,0.886123,-0.320613,-0.94721,0,False,0,0,0,0,0,1,1,3,6,2,3,0
2,2017090700,20170907000118,away,71.46,27.7,0.42,0.54,0.02,359.77,105.64,2506353,Derrick Johnson,56,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08 00:44:06,2017-09-08 00:44:05,8,6-3,245,1982-11-22,Texas,ILB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Artificial,Clear and warm,63.0,77.0,8,SW,854.0,14,75,1.0,34.79543,34,8,8.0,clear and warm,1.0,False,False,home,False,1,3,56,35,23,7,-0.004014,0.999992,0.962975,-0.269592,0,False,0,0,0,0,0,1,1,3,6,2,3,0
3,2017090700,20170907000118,away,74.11,16.64,1.11,0.83,0.02,357.23,322.59,2543637,Terrance Mitchell,39,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08 00:44:06,2017-09-08 00:44:05,8,5-11,190,1992-05-17,Oregon,CB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Artificial,Clear and warm,63.0,77.0,8,SW,854.0,14,71,1.0,25.311514,25,8,8.0,clear and warm,1.0,False,False,home,False,1,3,39,35,23,21,-0.048327,0.998832,-0.607514,0.794309,0,False,0,0,0,0,0,1,1,3,6,2,3,0
4,2017090700,20170907000118,away,73.37,18.73,1.24,0.74,0.13,328.52,270.04,2543851,Phillip Gaines,23,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08 00:44:06,2017-09-08 00:44:05,8,6-0,193,1991-04-04,Rice,CB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Artificial,Clear and warm,63.0,77.0,8,SW,854.0,14,72,1.0,26.431295,26,8,8.0,clear and warm,1.0,False,False,home,False,1,3,23,35,21,18,-0.522201,0.852822,-1.0,0.000698,0,False,0,0,0,0,0,1,1,3,6,2,3,0


In [10]:
def create_features(df, deploy=False):
    def new_X(x_coordinate, play_direction):
        if play_direction == 'left':
            return 120.0 - x_coordinate
        else:
            return x_coordinate

    def new_Y(y_coordinate, play_direction):
        if play_direction == 'left':
            return (160 / 3) - y_coordinate
        else:
            return y_coordinate

    def new_line(rush_team, field_position, yardline):
        if rush_team == field_position:
            # offense starting at X = 0 plus the 10 yard endzone plus the line of scrimmage
            return 10.0 + yardline
        else:
            # half the field plus the yards between midfield and the line of scrimmage
            return 60.0 + (50 - yardline)

    def new_orientation(angle, play_direction):
        if play_direction == 'left':
            #new_angle = np.mod(180 + angle, 360)
            new_angle = 360.0 - angle
            if new_angle == 360.0:
                new_angle = 0.0
            return new_angle
        else:
            return angle

    def euclidean_distance(x1,y1,x2,y2):
        x_diff = (x1-x2)**2
        y_diff = (y1-y2)**2

        return np.sqrt(x_diff + y_diff)

    def back_direction(orientation):
        if orientation > 180.0:
            return 1
        else:
            return 0

    def update_yardline(df):
        new_yardline = df[df['NflId'] == df['NflIdRusher']]
        new_yardline['YardLine'] = new_yardline[['PossessionTeam','FieldPosition','YardLine']].apply(lambda x: new_line(x[0],x[1],x[2]), axis=1)
        new_yardline = new_yardline[['GameId','PlayId','YardLine']]

        return new_yardline

    def update_orientation(df, yardline):
        df['X'] = df[['X','PlayDirection']].apply(lambda x: new_X(x[0],x[1]), axis=1)
        df['Y'] = df[['Y','PlayDirection']].apply(lambda x: new_Y(x[0],x[1]), axis=1)
        df['Orientation'] = df[['Orientation','PlayDirection']].apply(lambda x: new_orientation(x[0],x[1]), axis=1)
        df['Dir'] = df[['Dir','PlayDirection']].apply(lambda x: new_orientation(x[0],x[1]), axis=1)

        df = df.drop('YardLine', axis=1)
        df = pd.merge(df, yardline, on=['GameId','PlayId'], how='inner')

        return df

    def back_features(df):
        carriers = df[df['NflId'] == df['NflIdRusher']][['GameId','PlayId','NflIdRusher','X','Y','Orientation','Dir','YardLine']]
        carriers['back_from_scrimmage'] = carriers['YardLine'] - carriers['X']
        carriers['back_oriented_down_field'] = carriers['Orientation'].apply(lambda x: back_direction(x))
        carriers['back_moving_down_field'] = carriers['Dir'].apply(lambda x: back_direction(x))
        carriers = carriers.rename(columns={'X':'back_X',
                                            'Y':'back_Y'})
        carriers = carriers[['GameId','PlayId','NflIdRusher','back_X','back_Y','back_from_scrimmage','back_oriented_down_field','back_moving_down_field']]

        return carriers

    def features_relative_to_back(df, carriers):
        player_distance = df[['GameId','PlayId','NflId','X','Y']]
        player_distance = pd.merge(player_distance, carriers, on=['GameId','PlayId'], how='inner')
        player_distance = player_distance[player_distance['NflId'] != player_distance['NflIdRusher']]
        player_distance['dist_to_back'] = player_distance[['X','Y','back_X','back_Y']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)

        player_distance = player_distance.groupby(['GameId','PlayId','back_from_scrimmage','back_oriented_down_field','back_moving_down_field'])\
                                         .agg({'dist_to_back':['min','max','mean','std']})\
                                         .reset_index()
        player_distance.columns = ['GameId','PlayId','back_from_scrimmage','back_oriented_down_field','back_moving_down_field',
                                   'min_dist','max_dist','mean_dist','std_dist']

        return player_distance

    def defense_features(df):
        rusher = df[df['NflId'] == df['NflIdRusher']][['GameId','PlayId','Team','X','Y']]
        rusher.columns = ['GameId','PlayId','RusherTeam','RusherX','RusherY']

        defense = pd.merge(df,rusher,on=['GameId','PlayId'],how='inner')
        defense = defense[defense['Team'] != defense['RusherTeam']][['GameId','PlayId','X','Y','RusherX','RusherY']]
        defense['def_dist_to_back'] = defense[['X','Y','RusherX','RusherY']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)

        defense = defense.groupby(['GameId','PlayId'])\
                         .agg({'def_dist_to_back':['min','max','mean','std']})\
                         .reset_index()
        defense.columns = ['GameId','PlayId','def_min_dist','def_max_dist','def_mean_dist','def_std_dist']

        return defense

    def static_features(df):
        static_features = df[df['NflId'] == df['NflIdRusher']][['GameId','PlayId','X','Y','S','A','Dis','Orientation','Dir',
                                                                'YardLine','Quarter','Down','Distance','DefendersInTheBox',
                                                                'Turf']].drop_duplicates()
        static_features['DefendersInTheBox'] = static_features['DefendersInTheBox'].fillna(np.mean(static_features['DefendersInTheBox']))

        return static_features


    def combine_features(relative_to_back, defense, static, deploy=deploy):
        df = pd.merge(relative_to_back,defense,on=['GameId','PlayId'],how='inner')
        df = pd.merge(df,static,on=['GameId','PlayId'],how='inner')

        if not deploy:
            df = pd.merge(df, outcomes, on=['GameId','PlayId'], how='inner')

        return df
    
    yardline = update_yardline(df)
    df = update_orientation(df, yardline)
    back_feats = back_features(df)
    rel_back = features_relative_to_back(df, back_feats)
    def_feats = defense_features(df)
    static_feats = static_features(df)
    basetable = combine_features(rel_back, def_feats, static_feats, deploy=deploy)
    
    
    return basetable

In [11]:
outcomes = train[['GameId','PlayId','Yards']].drop_duplicates()

In [12]:
%time train_basetable = create_features(train, False)

CPU times: user 1min 39s, sys: 2.33 s, total: 1min 41s
Wall time: 1min 35s


In [13]:
train_basetable.head()

Unnamed: 0,GameId,PlayId,back_from_scrimmage,back_oriented_down_field,back_moving_down_field,min_dist,max_dist,mean_dist,std_dist,def_min_dist,def_max_dist,def_mean_dist,def_std_dist,X,Y,S,A,Dis,Orientation,Dir,YardLine,Quarter,Down,Distance,DefendersInTheBox,Turf,Yards
0,2017090700,20170907000118,3.75,1,0,1.449724,22.415872,8.046559,4.873845,4.59331,22.415872,9.752491,5.327299,41.25,22.803333,3.63,3.35,0.38,198.02,114.26,45.0,1,3,2,6.0,Field Turf,8
1,2017090700,20170907000139,4.07,0,0,0.792023,23.025872,8.614623,5.598683,4.287773,23.025872,10.297028,5.833217,48.93,26.173333,3.06,2.41,0.34,149.3,47.8,53.0,1,1,10,6.0,Field Turf,3
2,2017090700,20170907000189,3.66,1,0,1.64639,20.726285,8.482583,4.642121,4.22167,20.726285,9.903689,5.07329,71.34,34.223333,5.77,2.42,0.6,219.18,138.04,75.0,1,1,10,7.0,Field Turf,5
3,2017090700,20170907000345,3.53,0,0,0.918096,9.791231,5.549379,1.983128,4.528002,9.791231,6.309354,1.834174,104.47,27.973333,4.45,3.2,0.46,173.78,84.56,108.0,1,2,2,9.0,Field Turf,2
4,2017090700,20170907000395,5.01,0,0,0.502892,21.214806,9.168819,5.611232,4.288088,21.214806,11.056456,5.900009,29.99,27.12,3.9,2.53,0.44,34.27,157.92,35.0,1,1,10,7.0,Field Turf,7


In [14]:
def drop_basetable(train):
    drop_cols = ['X','Y','S','A','Dis','Orientation','Dir','YardLine','Quarter','Down',
                  'Distance','DefendersInTheBox','Turf','Yards']
    train = train.drop(drop_cols, axis = 1)
    return train

In [15]:
train_basetable = drop_basetable(train_basetable)

In [16]:
df_train = pd.merge(df_train, train_basetable, on=['GameId','PlayId'], how='left')

In [17]:
df_train.head()

Unnamed: 0,GameId,PlayId,Team,X,Y,S,A,Dis,Orientation,Dir,NflId,DisplayName,JerseyNumber,Season,YardLine,Quarter,GameClock,PossessionTeam,Down,Distance,FieldPosition,HomeScoreBeforePlay,VisitorScoreBeforePlay,NflIdRusher,OffenseFormation,OffensePersonnel,DefendersInTheBox,DefensePersonnel,PlayDirection,TimeHandoff,TimeSnap,Yards,PlayerHeight,PlayerWeight,PlayerBirthDate,PlayerCollegeName,Position,HomeTeamAbbr,VisitorTeamAbbr,Week,Stadium,Location,StadiumType,Turf,GameWeather,Temperature,Humidity,WindSpeed,WindDirection,GameClock_sec,GameClock_minute,PlayerHeight_dense,TimeDelta,PlayerAge,PlayerAge_ob,WindSpeed_ob,WindSpeed_dense,GameWeather_process,GameWeather_dense,IsRusher,IsRusher_ob,RusherTeam,IsRusherTeam,Quarter_ob,Down_ob,JerseyNumber_ob,YardLine_ob,Orientation_ob,Dir_ob,Orientation_sin,Orientation_cos,Dir_sin,Dir_cos,diffScoreBeforePlay,diffScoreBeforePlay_binary_ob,OffenseDB,OffenseDL,OffenseLB,OffenseOL,OffenseQB,OffenseRB,OffenseTE,OffenseWR,DefenseDB,DefenseDL,DefenseLB,DefenseOL,back_from_scrimmage,back_oriented_down_field,back_moving_down_field,min_dist,max_dist,mean_dist,std_dist,def_min_dist,def_max_dist,def_mean_dist,def_std_dist
0,2017090700,20170907000118,away,74.15,28.9,0.72,0.73,0.01,342.58,274.14,2556369,Chris Jones,95,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08 00:44:06,2017-09-08 00:44:05,8,6-6,308,1994-07-03,Mississippi State,DT,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Artificial,Clear and warm,63.0,77.0,8,SW,854.0,14,78,1.0,23.184204,23,8,8.0,clear and warm,1.0,False,False,home,False,1,3,95,35,22,18,-0.299374,0.954136,-0.997391,0.072194,0,False,0,0,0,0,0,1,1,3,6,2,3,0,3.75,1,0,1.449724,22.415872,8.046559,4.873845,4.59331,22.415872,9.752491,5.327299
1,2017090700,20170907000118,away,74.67,32.64,0.42,1.35,0.01,27.61,198.7,2495116,Allen Bailey,97,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08 00:44:06,2017-09-08 00:44:05,8,6-3,288,1989-03-25,Miami,DE,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Artificial,Clear and warm,63.0,77.0,8,SW,854.0,14,75,1.0,28.457305,28,8,8.0,clear and warm,1.0,False,False,home,False,1,3,97,35,1,13,0.463451,0.886123,-0.320613,-0.94721,0,False,0,0,0,0,0,1,1,3,6,2,3,0,3.75,1,0,1.449724,22.415872,8.046559,4.873845,4.59331,22.415872,9.752491,5.327299
2,2017090700,20170907000118,away,71.46,27.7,0.42,0.54,0.02,359.77,105.64,2506353,Derrick Johnson,56,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08 00:44:06,2017-09-08 00:44:05,8,6-3,245,1982-11-22,Texas,ILB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Artificial,Clear and warm,63.0,77.0,8,SW,854.0,14,75,1.0,34.79543,34,8,8.0,clear and warm,1.0,False,False,home,False,1,3,56,35,23,7,-0.004014,0.999992,0.962975,-0.269592,0,False,0,0,0,0,0,1,1,3,6,2,3,0,3.75,1,0,1.449724,22.415872,8.046559,4.873845,4.59331,22.415872,9.752491,5.327299
3,2017090700,20170907000118,away,74.11,16.64,1.11,0.83,0.02,357.23,322.59,2543637,Terrance Mitchell,39,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08 00:44:06,2017-09-08 00:44:05,8,5-11,190,1992-05-17,Oregon,CB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Artificial,Clear and warm,63.0,77.0,8,SW,854.0,14,71,1.0,25.311514,25,8,8.0,clear and warm,1.0,False,False,home,False,1,3,39,35,23,21,-0.048327,0.998832,-0.607514,0.794309,0,False,0,0,0,0,0,1,1,3,6,2,3,0,3.75,1,0,1.449724,22.415872,8.046559,4.873845,4.59331,22.415872,9.752491,5.327299
4,2017090700,20170907000118,away,73.37,18.73,1.24,0.74,0.13,328.52,270.04,2543851,Phillip Gaines,23,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08 00:44:06,2017-09-08 00:44:05,8,6-0,193,1991-04-04,Rice,CB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Artificial,Clear and warm,63.0,77.0,8,SW,854.0,14,72,1.0,26.431295,26,8,8.0,clear and warm,1.0,False,False,home,False,1,3,23,35,21,18,-0.522201,0.852822,-1.0,0.000698,0,False,0,0,0,0,0,1,1,3,6,2,3,0,3.75,1,0,1.449724,22.415872,8.046559,4.873845,4.59331,22.415872,9.752491,5.327299


In [18]:
def drop(train):
    drop_cols = ["GameId", "GameWeather", "NflId", "Season", "NflIdRusher"] 
    drop_cols += ['TimeHandoff', 'TimeSnap', 'PlayerBirthDate']
    drop_cols += ["Orientation", "Dir", 'WindSpeed', "GameClock"]
    train = train.drop(drop_cols, axis = 1)
    return train

In [19]:
df_train = drop(df_train)

In [20]:
df_train.head()

Unnamed: 0,PlayId,Team,X,Y,S,A,Dis,DisplayName,JerseyNumber,YardLine,Quarter,PossessionTeam,Down,Distance,FieldPosition,HomeScoreBeforePlay,VisitorScoreBeforePlay,OffenseFormation,OffensePersonnel,DefendersInTheBox,DefensePersonnel,PlayDirection,Yards,PlayerHeight,PlayerWeight,PlayerCollegeName,Position,HomeTeamAbbr,VisitorTeamAbbr,Week,Stadium,Location,StadiumType,Turf,Temperature,Humidity,WindDirection,GameClock_sec,GameClock_minute,PlayerHeight_dense,TimeDelta,PlayerAge,PlayerAge_ob,WindSpeed_ob,WindSpeed_dense,GameWeather_process,GameWeather_dense,IsRusher,IsRusher_ob,RusherTeam,IsRusherTeam,Quarter_ob,Down_ob,JerseyNumber_ob,YardLine_ob,Orientation_ob,Dir_ob,Orientation_sin,Orientation_cos,Dir_sin,Dir_cos,diffScoreBeforePlay,diffScoreBeforePlay_binary_ob,OffenseDB,OffenseDL,OffenseLB,OffenseOL,OffenseQB,OffenseRB,OffenseTE,OffenseWR,DefenseDB,DefenseDL,DefenseLB,DefenseOL,back_from_scrimmage,back_oriented_down_field,back_moving_down_field,min_dist,max_dist,mean_dist,std_dist,def_min_dist,def_max_dist,def_mean_dist,def_std_dist
0,20170907000118,away,74.15,28.9,0.72,0.73,0.01,Chris Jones,95,35,1,NE,3,2,NE,0,0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,8,6-6,308,Mississippi State,DT,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Artificial,63.0,77.0,SW,854.0,14,78,1.0,23.184204,23,8,8.0,clear and warm,1.0,False,False,home,False,1,3,95,35,22,18,-0.299374,0.954136,-0.997391,0.072194,0,False,0,0,0,0,0,1,1,3,6,2,3,0,3.75,1,0,1.449724,22.415872,8.046559,4.873845,4.59331,22.415872,9.752491,5.327299
1,20170907000118,away,74.67,32.64,0.42,1.35,0.01,Allen Bailey,97,35,1,NE,3,2,NE,0,0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,8,6-3,288,Miami,DE,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Artificial,63.0,77.0,SW,854.0,14,75,1.0,28.457305,28,8,8.0,clear and warm,1.0,False,False,home,False,1,3,97,35,1,13,0.463451,0.886123,-0.320613,-0.94721,0,False,0,0,0,0,0,1,1,3,6,2,3,0,3.75,1,0,1.449724,22.415872,8.046559,4.873845,4.59331,22.415872,9.752491,5.327299
2,20170907000118,away,71.46,27.7,0.42,0.54,0.02,Derrick Johnson,56,35,1,NE,3,2,NE,0,0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,8,6-3,245,Texas,ILB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Artificial,63.0,77.0,SW,854.0,14,75,1.0,34.79543,34,8,8.0,clear and warm,1.0,False,False,home,False,1,3,56,35,23,7,-0.004014,0.999992,0.962975,-0.269592,0,False,0,0,0,0,0,1,1,3,6,2,3,0,3.75,1,0,1.449724,22.415872,8.046559,4.873845,4.59331,22.415872,9.752491,5.327299
3,20170907000118,away,74.11,16.64,1.11,0.83,0.02,Terrance Mitchell,39,35,1,NE,3,2,NE,0,0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,8,5-11,190,Oregon,CB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Artificial,63.0,77.0,SW,854.0,14,71,1.0,25.311514,25,8,8.0,clear and warm,1.0,False,False,home,False,1,3,39,35,23,21,-0.048327,0.998832,-0.607514,0.794309,0,False,0,0,0,0,0,1,1,3,6,2,3,0,3.75,1,0,1.449724,22.415872,8.046559,4.873845,4.59331,22.415872,9.752491,5.327299
4,20170907000118,away,73.37,18.73,1.24,0.74,0.13,Phillip Gaines,23,35,1,NE,3,2,NE,0,0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,8,6-0,193,Rice,CB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Artificial,63.0,77.0,SW,854.0,14,72,1.0,26.431295,26,8,8.0,clear and warm,1.0,False,False,home,False,1,3,23,35,21,18,-0.522201,0.852822,-1.0,0.000698,0,False,0,0,0,0,0,1,1,3,6,2,3,0,3.75,1,0,1.449724,22.415872,8.046559,4.873845,4.59331,22.415872,9.752491,5.327299


In [21]:
cat_features = []
dense_features = []
for col in df_train.columns:
    if df_train[col].dtype =='object':
        cat_features.append(col)
        print("*cat*", col, len(df_train[col].unique()))
    else:
        dense_features.append(col)
        print("!dense!", col, len(df_train[col].unique()))
dense_features.remove("PlayId")
dense_features.remove("Yards")

!dense! PlayId 23171
*cat* Team 2
!dense! X 10890
!dense! Y 4339
!dense! S 884
!dense! A 903
!dense! Dis 105
*cat* DisplayName 2230
!dense! JerseyNumber 99
!dense! YardLine 50
!dense! Quarter 5
*cat* PossessionTeam 32
!dense! Down 4
!dense! Distance 35
*cat* FieldPosition 33
!dense! HomeScoreBeforePlay 50
!dense! VisitorScoreBeforePlay 45
*cat* OffenseFormation 9
*cat* OffensePersonnel 56
!dense! DefendersInTheBox 12
*cat* DefensePersonnel 38
*cat* PlayDirection 2
!dense! Yards 94
*cat* PlayerHeight 16
!dense! PlayerWeight 182
*cat* PlayerCollegeName 301
*cat* Position 25
*cat* HomeTeamAbbr 32
*cat* VisitorTeamAbbr 32
!dense! Week 17
*cat* Stadium 55
*cat* Location 60
*cat* StadiumType 30
*cat* Turf 2
!dense! Temperature 79
!dense! Humidity 87
*cat* WindDirection 54
!dense! GameClock_sec 901
*cat* GameClock_minute 16
!dense! PlayerHeight_dense 16
!dense! TimeDelta 7
!dense! PlayerAge 506270
*cat* PlayerAge_ob 22
*cat* WindSpeed_ob 34
!dense! WindSpeed_dense 27
*cat* GameWeather_process

## categorical

In [22]:
train_cat = df_train[cat_features]
categories = []
most_appear_each_categories = {}
for col in tqdm_notebook(train_cat.columns):
    train_cat.loc[:,col] = train_cat[col].fillna("nan")
    train_cat.loc[:,col] = col + "__" + train_cat[col].astype(str)
    most_appear_each_categories[col] = list(train_cat[col].value_counts().index)[0]
    categories.append(train_cat[col].unique())
categories = np.hstack(categories)
print(len(categories))

HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


3337


In [23]:
le = LabelEncoder()
le.fit(categories)
for col in tqdm_notebook(train_cat.columns):
    train_cat.loc[:, col] = le.transform(train_cat[col])
num_classes = len(le.classes_)

HBox(children=(IntProgress(value=0, max=31), HTML(value='')))




## Dense

In [24]:
train_dense = df_train[dense_features]
sss = {}
medians = {}
for col in tqdm_notebook(train_dense.columns):
    print(col)
    medians[col] = np.nanmedian(train_dense[col])
    train_dense.loc[:, col] = train_dense[col].fillna(medians[col])
    ss = StandardScaler()
    train_dense.loc[:, col] = ss.fit_transform(train_dense[col].values[:,None])
    sss[col] = ss

HBox(children=(IntProgress(value=0, max=53), HTML(value='')))

X
Y
S
A
Dis
JerseyNumber
YardLine
Quarter
Down
Distance
HomeScoreBeforePlay
VisitorScoreBeforePlay
DefendersInTheBox
PlayerWeight
Week
Temperature
Humidity
GameClock_sec
PlayerHeight_dense
TimeDelta
PlayerAge
WindSpeed_dense
GameWeather_dense
IsRusher
IsRusherTeam
Orientation_sin
Orientation_cos
Dir_sin
Dir_cos
diffScoreBeforePlay
OffenseDB
OffenseDL
OffenseLB
OffenseOL
OffenseQB
OffenseRB
OffenseTE
OffenseWR
DefenseDB
DefenseDL
DefenseLB
DefenseOL
back_from_scrimmage
back_oriented_down_field
back_moving_down_field
min_dist
max_dist
mean_dist
std_dist
def_min_dist
def_max_dist
def_mean_dist
def_std_dist



## Divide features into groups

In [25]:
eps = 1e-8
## dense features for play
dense_game_features = train_dense.columns[train_dense[:22].std() <= eps]
## dense features for each player
dense_player_features = train_dense.columns[train_dense[:22].std() > eps]
## categorical features for play
cat_game_features = train_cat.columns[train_cat[:22].std() <= eps]
## categorical features for each player
cat_player_features = train_cat.columns[train_cat[:22].std() > eps]

In [26]:
dense_game_feature_names = ["G_" + cc for cc in dense_game_features]
dense_player_feature_names = list(np.hstack([["P_" + c for c in dense_player_features] for k in range(22)]))
cat_game_feature_names = ["G_" + cc for cc in cat_game_features]
cat_player_feature_names = list(np.hstack([["P_" + c for c in cat_player_features] for k in range(22)]))

In [27]:
train_dense_game = train_dense[dense_game_features].iloc[np.arange(0, len(train), 22)].reset_index(drop = True).values
## rusher player feature is included in train_dense_players, so skip this.
# train_dense_game = np.hstack([train_dense_game, train_dense[dense_player_features][train_dense["IsRusher"] > 0]]) ## with rusher player feature

train_dense_players = [train_dense[dense_player_features].iloc[np.arange(k, len(train), 22)].reset_index(drop = True) for k in range(22)]
train_dense_players = np.stack([t.values for t in train_dense_players]).transpose(1, 0, 2)

train_cat_game = train_cat[cat_game_features].iloc[np.arange(0, len(train), 22)].reset_index(drop = True).values
# train_cat_game = np.hstack([train_cat_game, train_cat[cat_player_features][train_dense["IsRusher"] > 0]]) ## with rusher player feature

train_cat_players = [train_cat[cat_player_features].iloc[np.arange(k, len(train), 22)].reset_index(drop = True) for k in range(22)]
train_cat_players = np.stack([t.values for t in train_cat_players]).transpose(1, 0, 2)

In [28]:
def return_step(x):
    temp = np.zeros(199)
    temp[x + 99:] = 1
    return temp

train_y_raw = train["Yards"].iloc[np.arange(0, len(train), 22)].reset_index(drop = True)
train_y = np.vstack(train_y_raw.apply(return_step).values)

In [29]:
train_dense_game.shape, train_dense_players.shape, train_cat_game.shape, train_cat_players.shape, train_y.shape

((23171, 38), (23171, 22, 15), (23171, 21), (23171, 22, 10), (23171, 199))

# Let's build models

### LGBM Model

In [69]:
## concat all features
train_dense_players_lgb = np.reshape(train_dense_players, (len(train_dense_players), -1))
train_dense = np.hstack([train_dense_players_lgb, train_dense_game])

train_cat_players_lgb = np.reshape(train_cat_players, (len(train_cat_players), -1))
train_cat = np.hstack([train_cat_players_lgb, train_cat_game])

train_x = np.hstack([train_dense, train_cat])

In [70]:
train_x.shape

(23171, 609)

In [71]:
class MultiLGBMClassifier():
    def __init__(self, resolution, params):
        ## smoothing size
        self.resolution = resolution
        ## initiarize models
        self.models = [LGBMClassifier(**params) for _ in range(resolution)]
        
    def fit(self, x, y):
        self.classes_list = []
        for k in tqdm_notebook(range(self.resolution)):
            ## train each model
            self.models[k].fit(x, (y + k) // self.resolution)
            ## (0,1,2,3,4,5,6,7,8,9) -> (0,0,0,0,0,1,1,1,1,1) -> (0,5)
            classes = np.sort(list(set((y + k) // self.resolution))) * self.resolution - k
            classes = np.append(classes, 999)
            self.classes_list.append(classes)
            
    def predict(self, x):
        pred199_list = []
        for k in range(self.resolution):
            preds = self.models[k].predict_proba(x)
            classes = self.classes_list[k]
            pred199s = self.get_pred199(preds, classes)
            pred199_list.append(pred199s)
        self.pred199_list = pred199_list
        pred199_ens = np.mean(np.stack(pred199_list), axis = 0)
        return pred199_ens
    
    def _get_pred199(self, p, classes):
        ## categorical prediction -> predicted distribution whose length is 199
        pred199 = np.zeros(199)
        for k in range(len(p)):
            pred199[classes[k] + 99 : classes[k+1] + 99] = p[k]
        return pred199

    def get_pred199(self, preds, classes):
        pred199s = []
        for p in preds:
            pred199 = np.cumsum(self._get_pred199(p, classes))
            pred199 = pred199/np.max(pred199)
            pred199s.append(pred199)
        return np.vstack(pred199s)

In [72]:
params = {'lambda_l1': 0.001,'lambda_l2': 0.001,'num_leaves': 40,'feature_fraction': 0.4,
          'subsample': 0.4, 'min_child_samples': 10,'learning_rate': 0.01,'num_iterations': 700, 
          'random_state': 42}

In [None]:
for k in range(1):
    kfold = KFold(5, random_state = 12345 + k, shuffle = True)
    for k_fold, (tr_inds, val_inds) in enumerate(kfold.split(train_y)):
        print("-----------")
        print("-----------")
        model_LGB = MultiLGBMClassifier(resolution = 5, params = params)
        model_LGB.fit(train_x[tr_inds], train_y_raw.values[tr_inds])
        preds = model_LGB.predict(train_x[val_inds])
        loss = np.mean((train_y[val_inds] - preds) ** 2)
        models.append(model_LGB)
        print(k_fold, loss)
        losses.append(loss)
print("-------")
print(losses)
print(np.mean(losses))

-----------
-----------


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


0 0.012661746331946428
-----------
-----------


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


1 0.013497105544069488
-----------
-----------


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


2 0.012692194353468697
-----------
-----------


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

### Neural Network Model

In [32]:
keras.backend.clear_session()
def crps(y_true, y_pred):
    loss = K.mean((K.cumsum(y_pred, axis = 1) - y_true)**2)
    return loss

def get_model(batch_size, epochs):
    ## model dense
    input_dense_game = keras.layers.Input(shape=(train_dense_game.shape[1],))
    x1 = keras.layers.Dense(32, activation="relu")(input_dense_game)
    x1 = keras.layers.Dropout(0.5)(x1)
    # x1 = keras.layers.Dropout(0.1)(x1)

    input_dense_players = keras.layers.Input(shape=(train_dense_players.shape[1],train_dense_players.shape[2]))
    x2 = keras.layers.Dense(32, activation="relu")(input_dense_players)
    x2 = keras.layers.Dropout(0.5)(x2)
    # x2 = keras.layers.Flatten()(x2)
    # x2 = keras.layers.Dropout(0.1)(x2)

    ## model categorical
    input_cat_game = keras.layers.Input(shape=(train_cat_game.shape[1], ))
    embedding = keras.layers.Embedding(num_classes, 8, embeddings_regularizer=regularizers.l2(1))

    x3 = embedding(input_cat_game)
    x3 = keras.layers.Flatten()(x3)
    x3 = keras.layers.Dense(8, activation="relu")(x3)
    x3 = keras.layers.Dropout(0.6)(x3)

    input_cat_players = keras.layers.Input(shape=(train_cat_players.shape[1], train_cat_players.shape[2]))
    x4 = embedding(input_cat_players)

    x4 = keras.layers.Reshape((int(x4.shape[1]), int(x4.shape[2]) * int(x4.shape[3])))(x4)
    x4 = keras.layers.Dense(16, activation="relu")(x4)
    x4 = keras.layers.Dropout(0.6)(x4)

    ### concat players
    x_concat_players = keras.layers.Concatenate()([x2,x4])
    x_concat_players = keras.layers.Dense(16, activation="relu")(x_concat_players)
    # x_concat_players = keras.layers.GlobalAveragePooling1D()(x_concat_players)

    ## flatten
    x2 = keras.layers.Flatten()(x2)
    x4 = keras.layers.Flatten()(x4)
    x_concat_players = keras.layers.Flatten()(x_concat_players)

    ### concat all
    x_concat = keras.layers.Concatenate()([x1,x3,x_concat_players] + [x2, x4])
    x_concats = []
    n_unit = 128
    decay_rate = 0.5
    for k in range(5):
        x_concat = keras.layers.Dense(n_unit, activation="relu")(x_concat)
        x_concats.append(x_concat)
        n_unit = int(n_unit * decay_rate)
    x_concat = keras.layers.Concatenate()(x_concats)
    x_concat = keras.layers.Dropout(0.5)(x_concat)

    ## concat
    x_concat = keras.layers.Concatenate()([x1,x3,x_concat_players,x_concat] + [x2, x4])
    out_soft = keras.layers.Dense(199, activation="softmax", name = "out_soft")(x_concat)
    out_reg = keras.layers.Dense(1, activation=None, name = "out_reg")(x_concat)
    modelNN = keras.models.Model(inputs = [input_dense_game, input_dense_players, input_cat_game, input_cat_players],
                               outputs = [out_soft, out_reg])

    ## compile
    er = EarlyStopping(patience=10, min_delta=1e-4, restore_best_weights=True, monitor='val_out_soft_loss')
    modelNN.compile(loss=[crps, keras.losses.mae],
                  loss_weights=[1.0, 0.01],
                  optimizer=keras.optimizers.Adam(learning_rate=0.001, decay = 1e-5))

    ## train
    tr_x = [train_dense_game[tr_inds], train_dense_players[tr_inds], train_cat_game[tr_inds], train_cat_players[tr_inds]]
    tr_y = [train_y[tr_inds], train_y_raw[tr_inds]/100]
    val_x = [train_dense_game[val_inds], train_dense_players[val_inds], train_cat_game[val_inds], train_cat_players[val_inds]]
    val_y = [train_y[val_inds], train_y_raw[val_inds]/100]
    modelNN.fit(tr_x,
              tr_y,
              batch_size=batch_size,
              epochs=epochs,
              verbose=1,
              validation_data=(val_x, val_y),
              callbacks=[er]
             )
    loss = modelNN.history.history["val_out_soft_loss"][-1]
    return modelNN, loss


In [35]:
models = []
losses = []

for k in range(2):
    kfold = KFold(5, random_state = 12345 + k, shuffle = True)
    for k_fold, (tr_inds, val_inds) in enumerate(kfold.split(train_y)):
        print("-----------")
        modelNN, loss = get_model(1024, 250)    
        if (loss < 0.013):
            print('Append KFold:',k_fold,'| Loss:', loss)
            models.append(modelNN)
            losses.append(loss)
            if (len(losses) > 10):
                break
        else:
            print('Ignore KFold:',k_fold, '| Loss:', loss)        
print("-------")
print('Loss Mean:', np.mean(losses), ' | List:', losses)

-----------
Train on 18536 samples, validate on 4635 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250


Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Append KFold: 0 | Loss: 0.012818822637200356
-----------
Train on 18537 samples, validate on 4634 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250


Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Ignore KFold: 1 | Loss: 0.01352053415030241
-----------
Train on 18537 samples, validate on 4634 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Append KFold: 2 | Loss: 0.01297709345817566
-----------
Train on 18537 samples, validate on 4634 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Ep

Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Append KFold: 4 | Loss: 0.01285842526704073
-----------
Train on 18536 samples, validate on 4635 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250


Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Ignore KFold: 0 | Loss: 0.01358469296246767
-----------
Train on 18537 samples, validate on 4634 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250


Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Ignore KFold: 1 | Loss: 0.013028393499553204
-----------
Train on 18537 samples, validate on 4634 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250


Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Ignore KFold: 2 | Loss: 0.01326210517436266
-----------
Train on 18537 samples, validate on 4634 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250


Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Ignore KFold: 3 | Loss: 0.013158516958355904
-----------
Train on 18537 samples, validate on 4634 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250


Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Ignore KFold: 4 | Loss: 0.013154258951544762
-------
Loss Mean: 0.012884780454138914  | List: [0.012818822637200356, 0.01297709345817566, 0.01285842526704073]


In [36]:
print(losses)
print(np.mean(losses))

[0.012818822637200356, 0.01297709345817566, 0.01285842526704073]
0.012884780454138914


## Prediction

In [None]:
def make_pred(test, sample, env, model):
    df_test = preprocess(test)
    
    test_basetable = create_features(test, True)
    test_basetable = drop_basetable(test_basetable)
    
    df_test = pd.merge(df_test, test_basetable, on=['GameId','PlayId'], how='left')
    
    df_test = drop(df_test)
    #test = test.drop(un_use_features, axis = 1)
    
    ### categorical
    test_cat = test[cat_features]
    for col in (test_cat.columns):
        test_cat.loc[:,col] = test_cat[col].fillna("nan")
        test_cat.loc[:,col] = col + "__" + test_cat[col].astype(str)
        isnan = ~test_cat.loc[:,col].isin(categories)
        if np.sum(isnan) > 0:
#             print("------")
#             print("test have unseen label : col")
            if not ((col + "__nan") in categories):
#                 print("not nan in train : ", col)
                test_cat.loc[isnan,col] = most_appear_each_categories[col]
            else:
#                 print("nan seen in train : ", col)
                test_cat.loc[isnan,col] = col + "__nan"
    for col in (test_cat.columns):
        test_cat.loc[:, col] = le.transform(test_cat[col])

    ### dense
    test_dense = test[dense_features]
    for col in (test_dense.columns):
        test_dense.loc[:, col] = test_dense[col].fillna(medians[col])
        test_dense.loc[:, col] = sss[col].transform(test_dense[col].values[:,None])

    ### divide
    test_dense_players = [test_dense[dense_player_features].iloc[np.arange(k, len(test), 22)].reset_index(drop = True) for k in range(22)]
    test_dense_players = np.stack([t.values for t in test_dense_players]).transpose(1,0, 2)

    test_dense_game = test_dense[dense_game_features].iloc[np.arange(0, len(test), 22)].reset_index(drop = True).values
    test_dense_game = np.hstack([test_dense_game, test_dense[dense_player_features][test_dense["IsRusher"] > 0]])
    
    test_cat_players = [test_cat[cat_player_features].iloc[np.arange(k, len(test), 22)].reset_index(drop = True) for k in range(22)]
    test_cat_players = np.stack([t.values for t in test_cat_players]).transpose(1,0, 2)

    test_cat_game = test_cat[cat_game_features].iloc[np.arange(0, len(test), 22)].reset_index(drop = True).values
    test_cat_game = np.hstack([test_cat_game, test_cat[cat_player_features][test_dense["IsRusher"] > 0]])

    test_inp = [test_dense_game, test_dense_players, test_cat_game, test_cat_players]
    
    ## pred
    pred = 0
    for model in models:
        _pred = model.predict(test_inp)[0]
        _pred = np.cumsum(_pred, axis = 1)
        pred += _pred
    pred /= len(models)
    pred = np.clip(pred, 0, 1)
    env.predict(pd.DataFrame(data=pred,columns=sample.columns))
    return pred

In [None]:
from kaggle.competitions import nflrush
env = nflrush.make_env()
preds = []
for test, sample in tqdm_notebook(env.iter_test()):
    pred = make_pred(test, sample, env, models)
    preds.append(pred)
env.write_submission_file()

In [None]:
preds = np.vstack(preds)
## check whether prediction is submittable
print(np.mean(np.diff(preds, axis = 1) >= 0) == 1.0)
print(np.mean(preds > 1) == 0)

In [None]:
print(losses)
print(np.mean(losses))