In [1]:
import numpy as np
import pandas as pd

import sklearn.metrics as mtr
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.callbacks import Callback, EarlyStopping
from keras.models import Model
from keras.layers import Input, Dense, Concatenate, Reshape, Dropout, merge, Add
from keras.layers.embeddings import Embedding

from sklearn.model_selection import KFold,GroupKFold

import warnings
import random as rn
import math
import datetime
import tensorflow as tf
from keras.models import load_model
import os
import tqdm

warnings.filterwarnings("ignore")
pd.options.display.max_columns = 200

from kaggle.competitions import nflrush
env = nflrush.make_env()
iter_test = env.iter_test()

Using TensorFlow backend.


In [2]:
# evaluation metric
def crps(y_true, y_pred):
    y_true = np.clip(np.cumsum(y_true, axis=1), 0, 1)
    y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1)
    return ((y_true - y_pred) ** 2).sum(axis=1).sum(axis=0) / (199 * y_true.shape[0]) 


# author : nlgn
# Link : https://www.kaggle.com/kingychiu/keras-nn-starter-crps-early-stopping
class Metric(Callback):
    def __init__(self, model, callbacks, data):
        super().__init__()
        self.model = model
        self.callbacks = callbacks
        self.data = data

    def on_train_begin(self, logs=None):
        for callback in self.callbacks:
            callback.on_train_begin(logs)

    def on_train_end(self, logs=None):
        for callback in self.callbacks:
            callback.on_train_end(logs)

    def on_epoch_end(self, batch, logs=None):
        X_train, y_train = self.data[0][0], self.data[0][1]
        y_pred = self.model.predict(X_train)
        y_true = np.clip(np.cumsum(y_train, axis=1), 0, 1)
        y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1)
        tr_s = ((y_true - y_pred) ** 2).sum(axis=1).sum(axis=0) / (199 * X_train[-1].shape[0])
        tr_s = np.round(tr_s, 6)
        logs['tr_CRPS'] = tr_s

        X_valid, y_valid = self.data[1][0], self.data[1][1]

        y_pred = self.model.predict(X_valid)
        y_true = np.clip(np.cumsum(y_valid, axis=1), 0, 1)
        y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1)
        val_s = ((y_true - y_pred) ** 2).sum(axis=1).sum(axis=0) / (199 * X_valid[-1].shape[0])
        val_s = np.round(val_s, 6)
        logs['val_CRPS'] = val_s
        print('tr CRPS', tr_s, 'val CRPS', val_s)

        for callback in self.callbacks:
            callback.on_epoch_end(batch, logs)

In [3]:
def create_features(df):
    
    def new_X(x_coordinate, play_direction):
        if play_direction == 'left':
            return 120.0 - x_coordinate
        else:
            return x_coordinate

    def new_line(rush_team, field_position, yardline):
        if rush_team == field_position:
            # offense starting at X = 0 plus the 10 yard endzone plus the line of scrimmage
            return 10.0 + yardline
        else:
            # half the field plus the yards between midfield and the line of scrimmage
            return 60.0 + (50 - yardline)

    def new_orientation(angle, play_direction):
        if play_direction == 'left':
            new_angle = 360.0 - angle
            if new_angle == 360.0:
                new_angle = 0.0
            return new_angle
        else:
            return angle

    def euclidean_distance(x1,y1,x2,y2):
        x_diff = (x1-x2)**2
        y_diff = (y1-y2)**2
        return np.sqrt(x_diff + y_diff)

    def back_direction(orientation):
        if orientation > 180.0:
            return 1
        else:
            return 0

    def map_team_name(df):
        map_abbr = {'ARI': 'ARZ', 'BAL': 'BLT', 'CLE': 'CLV', 'HOU': 'HST'}
        for abb in df['PossessionTeam'].unique():
            map_abbr[abb] = abb
        df['PossessionTeam'] = df['PossessionTeam'].map(map_abbr)
        df['HomeTeamAbbr'] = df['HomeTeamAbbr'].map(map_abbr)
        df['VisitorTeamAbbr'] = df['VisitorTeamAbbr'].map(map_abbr)
        df['FieldPosition'] = df['FieldPosition'].map(map_abbr)
        return df
    
    def clean_position(df):
        def get_position(pos):
            if pos == 'SAF':
                return 'DB'
            if pos == 'S':
                return 'DB'
            elif pos == 'OG':
                return 'G'
            elif pos == "OT":
                return 'T'
            else:
                return pos
        df['Position'] = df['Position'].apply(get_position)
        return df
            
    def update_yardline(df):
        new_yardline = df[df['NflId'] == df['NflIdRusher']]
        new_yardline['YardLine'] = new_yardline[['PossessionTeam','FieldPosition','YardLine']].apply(lambda x: new_line(x[0],x[1],x[2]), axis=1)
        new_yardline = new_yardline[['GameId','PlayId','YardLine']]
        return new_yardline

    def update_orientation(df, yardline):
        df['X'] = df[['X','PlayDirection']].apply(lambda x: new_X(x[0],x[1]), axis=1)
        df['Orientation'] = df[['Orientation','PlayDirection']].apply(lambda x: new_orientation(x[0],x[1]), axis=1)
        df['Dir'] = df[['Dir','PlayDirection']].apply(lambda x: new_orientation(x[0],x[1]), axis=1)
        df = df.drop('YardLine', axis=1)
        df = pd.merge(df, yardline, on=['GameId','PlayId'], how='inner')
        return df

    def back_features(df):
        carriers = df[df['NflId'] == df['NflIdRusher']][['GameId','PlayId','NflIdRusher','X','Y','Orientation','Dir','YardLine']]
        carriers['RusherDisYardLine'] = carriers['YardLine'] - carriers['X']
        carriers['back_oriented_down_field'] = carriers['Orientation'].apply(lambda x: back_direction(x))
        carriers['back_moving_down_field'] = carriers['Dir'].apply(lambda x: back_direction(x))
        carriers = carriers.rename(columns={'X':'back_X',
                                            'Y':'back_Y'})
        carriers = carriers[['GameId','PlayId','NflIdRusher','back_X','back_Y',
                             'RusherDisYardLine','back_oriented_down_field','back_moving_down_field']]
        return carriers

    def features_relative_to_back(df, carriers):
        player_distance = df[['GameId','PlayId','NflId','X','Y']]
        player_distance = pd.merge(player_distance, carriers, on=['GameId','PlayId'], how='inner')
        player_distance = player_distance[player_distance['NflId'] != player_distance['NflIdRusher']]
        player_distance['dist_to_back'] = player_distance[['X','Y','back_X','back_Y']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)
        player_distance = player_distance.groupby(['GameId','PlayId','RusherDisYardLine','back_oriented_down_field','back_moving_down_field'])\
                                         .agg({'dist_to_back':['min','max','mean','std']})\
                                         .reset_index()
        player_distance.columns = ['GameId','PlayId','RusherDisYardLine','back_oriented_down_field','back_moving_down_field',
                                   'min_dist','max_dist','mean_dist','std_dist']
        return player_distance
def create_features(df):
    
    def new_X(x_coordinate, play_direction):
        if play_direction == 'left':
            return 120.0 - x_coordinate
        else:
            return x_coordinate

    def new_line(rush_team, field_position, yardline):
        if rush_team == field_position:
            # offense starting at X = 0 plus the 10 yard endzone plus the line of scrimmage
            return 10.0 + yardline
        else:
            # half the field plus the yards between midfield and the line of scrimmage
            return 60.0 + (50 - yardline)

    def new_orientation(angle, play_direction):
        if play_direction == 'left':
            new_angle = 360.0 - angle
            if new_angle == 360.0:
                new_angle = 0.0
            return new_angle
        else:
            return angle

    def euclidean_distance(x1,y1,x2,y2):
        x_diff = (x1-x2)**2
        y_diff = (y1-y2)**2
        return np.sqrt(x_diff + y_diff)

    def back_direction(orientation):
        if orientation > 180.0:
            return 1
        else:
            return 0

    def map_team_name(df):
        map_abbr = {'ARI': 'ARZ', 'BAL': 'BLT', 'CLE': 'CLV', 'HOU': 'HST'}
        for abb in df['PossessionTeam'].unique():
            map_abbr[abb] = abb
        df['PossessionTeam'] = df['PossessionTeam'].map(map_abbr)
        for abb in df['HomeTeamAbbr'].unique():
            map_abbr[abb] = abb
        df['HomeTeamAbbr'] = df['HomeTeamAbbr'].map(map_abbr)
        for abb in df['VisitorTeamAbbr'].unique():
            map_abbr[abb] = abb
        df['VisitorTeamAbbr'] = df['VisitorTeamAbbr'].map(map_abbr)
        for abb in df['FieldPosition'].unique():
            map_abbr[abb] = abb
        df['FieldPosition'] = df['FieldPosition'].map(map_abbr)
        return df
    
    def clean_position(df):
        def get_position(pos):
            if pos == 'SAF':
                return 'DB'
            if pos == 'S':
                return 'DB'
            elif pos == 'OG':
                return 'G'
            elif pos == "OT":
                return 'T'
            else:
                return pos
        df['Position'] = df['Position'].apply(get_position)
        return df
            
    def update_yardline(df):
        new_yardline = df[df['NflId'] == df['NflIdRusher']]
        new_yardline['YardLine'] = new_yardline[['PossessionTeam','FieldPosition','YardLine']].apply(lambda x: new_line(x[0],x[1],x[2]), axis=1)
        new_yardline = new_yardline[['GameId','PlayId','YardLine']]
        return new_yardline

    def update_orientation(df, yardline):
        df['X'] = df[['X','PlayDirection']].apply(lambda x: new_X(x[0],x[1]), axis=1)
        df['Orientation'] = df[['Orientation','PlayDirection']].apply(lambda x: new_orientation(x[0],x[1]), axis=1)
        df['Dir'] = df[['Dir','PlayDirection']].apply(lambda x: new_orientation(x[0],x[1]), axis=1)
        df = df.drop('YardLine', axis=1)
        df = pd.merge(df, yardline, on=['GameId','PlayId'], how='inner')
        return df

    def back_features(df):
        carriers = df[df['NflId'] == df['NflIdRusher']][['GameId','PlayId','NflIdRusher','X','Y','Orientation','Dir','YardLine']]
        carriers['RusherDisYardLine'] = carriers['YardLine'] - carriers['X']
        carriers['back_oriented_down_field'] = carriers['Orientation'].apply(lambda x: back_direction(x))
        carriers['back_moving_down_field'] = carriers['Dir'].apply(lambda x: back_direction(x))
        carriers = carriers.rename(columns={'X':'back_X',
                                            'Y':'back_Y'})
        carriers = carriers[['GameId','PlayId','NflIdRusher','back_X','back_Y',
                             'RusherDisYardLine','back_oriented_down_field','back_moving_down_field']]
        return carriers

    def features_relative_to_back(df, carriers):
        player_distance = df[['GameId','PlayId','NflId','X','Y']]
        player_distance = pd.merge(player_distance, carriers, on=['GameId','PlayId'], how='inner')
        player_distance = player_distance[player_distance['NflId'] != player_distance['NflIdRusher']]
        player_distance['dist_to_back'] = player_distance[['X','Y','back_X','back_Y']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)
        player_distance = player_distance.groupby(['GameId','PlayId','RusherDisYardLine','back_oriented_down_field','back_moving_down_field'])\
                                         .agg({'dist_to_back':['min','max','mean','std']})\
                                         .reset_index()
        player_distance.columns = ['GameId','PlayId','RusherDisYardLine','back_oriented_down_field','back_moving_down_field',
                                   'min_dist','max_dist','mean_dist','std_dist']
        return player_distance
    
    def create_general_position(df):
        def get_general_position(pos):
            if pos == 'SS' or pos == 'FS' or pos == 'CB' or pos == 'DB':
                return 'DB'
            elif pos == 'DE' or pos == 'DT' or pos == 'DL':
                return 'DL'
            elif pos == 'ILB' or pos == 'OLB' or pos == 'MLB' or pos == 'LB':
                return 'LB'
            elif pos == 'WR':
                return 'WR'
            elif pos == 'TE':
                return 'TE'
            elif pos == 'T' or pos == 'G' or pos == 'C' or pos == 'NT' or pos == 'OL':
                return 'OL'
            elif pos == 'QB' or pos == 'RB' or pos == 'FB' or pos == 'HB' or pos == 'TB' or pos == 'WB':
                return 'OB'
            else:
                return 'Other'
        df['GeneralPosition'] = df['Position'].apply(get_general_position)
        return df
    
    def get_team_on_offense(df):
        df['TeamOnOffense'] = "home"
        df.loc[df.PossessionTeam != df.HomeTeamAbbr, 'TeamOnOffense'] = "away"
        df['IsOnOffense'] = df.Team == df.TeamOnOffense 
        return df
    
    def map_offense_defense_team(df):
        df['OffenseTeam'] = df['VisitorTeamAbbr']
        df.loc[df.TeamOnOffense == 'home', 'OffenseTeam'] = df['HomeTeamAbbr']    
        df['DefenseTeam'] = df['VisitorTeamAbbr']
        df.loc[df.TeamOnOffense == 'away', 'DefenseTeam'] = df['HomeTeamAbbr']
        df['IsOffenseAtHome'] = True
        df.loc[df.TeamOnOffense == 'away', 'IsOffenseAtHome'] = False
        return df
    
    def get_is_offense_winning(df):
        df['OffenseScore'] = df['HomeScoreBeforePlay']
        df.loc[df.TeamOnOffense == 'away', 'OffenseScore'] = df['VisitorScoreBeforePlay']
        df['DefenseScore'] = df['VisitorScoreBeforePlay']
        df.loc[df.TeamOnOffense == 'away', 'DefenseScore'] = df['HomeScoreBeforePlay']
        df['OffenseLessDefenseScore'] = df['OffenseScore'] - df['DefenseScore']
        df['OffenseInOwnTerritory'] = False
        df.loc[df.FieldPosition == df.OffenseTeam, 'OffenseInOwnTerritory'] = True
        df.drop(['OffenseScore','DefenseScore'], axis=1, inplace=True)
        return df

    def get_general_pos_counts(df):
        df['NumberOfBacksOnPlay'] = 0
        df['NumberOfOLinemenOnPlay'] = 0
        df['NumberOfWRsOnPlay'] = 0
        df['NumberOfTEsOnPlay'] = 0
        df['NumberOfDBsOnPlay'] = 0
        df['NumberOfDLinemenOnPlay'] = 0 
        df['NumberOfLBsOnPlay'] = 0
        # Pivot to find counts of each general position
        gen_pos_counts = df[['PlayId','GeneralPosition']].pivot_table(index='PlayId', columns='GeneralPosition', 
                                                                      aggfunc=len, fill_value=0)
        gen_pos_counts = gen_pos_counts.rename(columns = 
                              {'DB':'NumberOfDBsOnPlay', 'DL':'NumberOfDLinemenOnPlay', 
                               'LB':'NumberOfLBsOnPlay', 'OB':'NumberOfBacksOnPlay',
                               'OL':'NumberOfOLinemenOnPlay', 'TE':'NumberOfTEsOnPlay',
                               'WR':'NumberOfWRsOnPlay'})
        gen_pos_counts = gen_pos_counts.reset_index(drop=False)
        del gen_pos_counts.columns.name
        gen_pos_counts_cols = gen_pos_counts.columns.values.tolist()
        gen_pos_counts = gen_pos_counts.loc[gen_pos_counts.index.repeat(22)].reset_index(drop=True)
        df.update(gen_pos_counts)
        return df

    def utc2sec(x):
        return int(x.split("-")[2].split(":")[2].split(".")[0])
    def gameclock2secs(x):
        clock = x.split(":")
        return (60 * int(clock[0])) + int(clock[1])        
    
    def str_to_float(txt):
        try:
            return float(txt)
        except:
            return -1

    def get_time_features(df):
        df['TimeBetweenSnapHandoff'] = df['TimeHandoff'].apply(utc2sec) - df['TimeSnap'].apply(utc2sec)
        df['QuarterGameSecs'] = df['GameClock'].apply(gameclock2secs)
        df['TotalGameSecsPlayed'] = (900 - df['QuarterGameSecs']) + ((df['Quarter'] - 1) * 900)
        df['HalfGameSecsLeft'] = df['QuarterGameSecs']
        df.loc[(df['Quarter'].isin([1,3])), 'HalfGameSecsLeft'] = (900 + df['QuarterGameSecs'])
        return(df)
    
    def get_player_age(df):
        def timesnap2day(x):
            days = x.split("-")
            return 365 * int(days[0]) + 30 * int(days[1]) + int(days[2][:2])    
        def birthday2day(x):
            days = x.split("/")
            return 30 * int(days[0]) + int(days[1]) + 365 * int(days[2])        
        df['PlayerAge'] = df['TimeSnap'].apply(timesnap2day) - df['PlayerBirthDate'].apply(birthday2day)
        df.drop('PlayerBirthDate', axis=1, inplace=True)
        return df
        
    def get_player_weights_bmi(df):
        def height2inch(x):
            height = x.split("-")
            return 12 * int(height[0]) + int(height[1])
        df['PlayerHeight'] = df['PlayerHeight'].apply(height2inch)
        df = df.rename(columns={'PlayerWeight':'PlayerMass'})
        df['PlayerBMI'] = df['PlayerMass'] / df['PlayerHeight']
        return df
    def get_is_rusher(df):
        df['IsRusher'] = df.NflId == df.NflIdRusher 
        return df

    def get_redzone(df):
        df['InOffenseRedzone'] = False
        df.loc[df.YardLine <= 30, 'InOffenseRedzone'] = True
        df['InDefenseRedzone'] = False
        df.loc[df.YardLine >= 90, 'InDefenseRedzone'] = True    
        return df
    
    def get_qb_kneel(df):
        df['QBKneel'] = False
        df.loc[
            ((df.Quarter == 2) | (df.Quarter == 4)) &
            (df.GameClock <= '02:00') &
            (df.OffenseLessDefenseScore > 0) &
            (df.NumberOfBacksOnPlay >= 3) &
            (df.NumberOfTEsOnPlay >= 2), 'QBKneel'
        ] = True
        return df

    def get_dis_yardline(df):
        """
        For defender use only
        """
        df['DisYardLine'] = 0
        df.loc[df.IsOnOffense == True, 'DisYardLine'] = df['YardLine'] - df['X']
        df.loc[df.IsOnOffense == False, 'DisYardLine'] = df['X'] - df['YardLine']
        return df
    
    def get_no_defenders_yl(df):
        df['NoDefenderYL'] = 'NaN'
        df.loc[(df.IsOnOffense == False) & (df.DisYardLine < 0), 'NoDefenderYL'] = 'NoDefendersBelow0YL'
        df.loc[(df.IsOnOffense == False) & ((df.DisYardLine >= 0) & (df.DisYardLine < 3)), 'NoDefenderYL'] = 'NoDefenders0_2YL'
        df.loc[(df.IsOnOffense == False) & ((df.DisYardLine >= 3) & (df.DisYardLine < 6)), 'NoDefenderYL'] = 'NoDefenders3_5YL'
        df.loc[(df.IsOnOffense == False) & ((df.DisYardLine >= 6) & (df.DisYardLine < 9)), 'NoDefenderYL'] = 'NoDefenders6_8YL'
        df.loc[(df.IsOnOffense == False) & (df.DisYardLine >= 9), 'NoDefenderYL'] = 'NoDefendersAbove9YL'
        df['NoDefendersBelow0YL'] = 0
        df['NoDefenders0_2YL'] = 0
        df['NoDefenders3_5YL'] = 0
        df['NoDefenders6_8YL'] = 0
        df['NoDefendersAbove9YL'] = 0
        # Pivot to find counts of each general position
        no_defenders = df[['PlayId','NoDefenderYL']].pivot_table(index='PlayId', columns='NoDefenderYL', 
                                                                      aggfunc=len, fill_value=0)
        no_defenders = no_defenders.reset_index(drop=False).drop('NaN', axis=1)
        del no_defenders.columns.name
        no_defenders_cols = no_defenders.columns.values.tolist()
        no_defenders = no_defenders.loc[no_defenders.index.repeat(22)].reset_index(drop=True)
        df.update(no_defenders)
        return df
    
    def get_inside_runs(df):
        df['IsInside'] = 0
        inside1 = df[ # Outside seams and running in
            (((df.RusherY > -2.00) & (df.RusherY <= 23.55)) & ((df.RusherDir > 270) | (df.RusherDir <= 90))) |
            (((df.RusherY > 29.75) & (df.RusherY <= 55.00)) & ((df.RusherDir > 90) & (df.RusherDir <= 270)))
        ]['PlayId']
        inside2 = df[ # Inside the seams and running in
            (((df.RusherY > 23.55) & (df.RusherY <= 29.75)) & ((df.RusherDir > 40) & (df.RusherDir <= 140)))
        ]['PlayId']
        inside = inside1.tolist() + inside2.tolist()
        df.loc[df.PlayId.isin(inside), 'IsInside'] = 1
        return df
    
    def get_dis_from_yl(df):
        """
        For both off and def
        """
        df['DisFromYL'] = abs(df['YardLine'] - df['X'])
        return df
    
    def get_dis_rusher(df):
        rusher_xy = df.loc[df.IsRusher == True, ['GameId','PlayId','X','Y']].rename(columns={'X':'RusherX','Y':'RusherY'})
        df = df.merge(rusher_xy, on=['GameId','PlayId'])
        df['DisRusher'] = df[['X','Y','RusherX','RusherY']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)
        df.drop(['RusherX','RusherY'], axis=1,inplace=True)
        return df

    def get_dis_features(df):
        """
        Returns DisRusherNearestYardLine, RusherDisQB, RusherDisC and RusherDisMLB, DisC, DisQB
        """
        def get_rusher_dis_mlb(df):
            lb_xy = df.loc[(df.Position == 'MLB') | (df.Position == 'ILB'), 
                                   ['PlayId','X','Y']].rename(columns={'X':'MLBX', 'Y':'MLBY'})
            rusher_lb_xy = lb_xy.merge(rusher_xy, on=['PlayId'], how='left')
            rusher_lb_xy['RusherDisMLB'] = rusher_lb_xy[
                ['RusherX','RusherY','MLBX','MLBY']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)
            rusher_lb_xy.drop(['RusherX','RusherY','MLBX','MLBY'],axis=1, inplace=True)
            rusher_lb_dis = rusher_lb_xy.groupby(['PlayId']).agg({'RusherDisMLB':['min'],}).reset_index()
            rusher_lb_dis.columns = ['PlayId','RusherDisMLB']
            return rusher_lb_dis
        
        rusher_xy = df.loc[df.IsRusher == True, ['PlayId','X','Y']].rename(columns={'X':'RusherX','Y':'RusherY'})
        qb_xy = df.loc[df.Position == 'QB', ['PlayId','X','Y']].rename(columns={'X':'QBX','Y':'QBY'})
        c_xy = df.loc[df.Position == 'C', ['PlayId','X','Y']].rename(columns={'X':'CX','Y':'CY'})
        try:
            rusher_lb_dis = get_rusher_dis_mlb(df)
        except:
            rusher_lb_dis = np.nan
        rusherxy_qbxy = rusher_xy.merge(qb_xy, on=['PlayId'])
        rusherxy_qbxy_cxy = rusherxy_qbxy.merge(c_xy, on=['PlayId'])
        try:
            dis_total_xy = rusherxy_qbxy_cxy.merge(rusher_lb_dis, on=['PlayId'])
        except:
            dis_total_xy = rusherxy_qbxy_cxy
            dis_total_xy['RusherDisMLB'] = np.nan
        dis_total_xy = dis_total_xy.loc[dis_total_xy.index.repeat(22)].reset_index(drop=True)
        dis_total_xy.drop(['PlayId'], axis=1, inplace=True)
        df['RusherX'] = 0
        df['RusherY'] = 0
        df['QBX'] = 0
        df['QBY'] = 0
        df['CX'] = 0
        df['CY'] = 0
        df['RusherDisMLB'] = 0
        df.update(dis_total_xy) 

        df['DisRusherNearestYardLine'] = df[['YardLine','RusherY','X','Y']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)    
        df['RusherDisQB'] = df[['RusherX','RusherY','QBX','QBY']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)    
        df['RusherDisC'] = df[['RusherX','RusherY','CX','CY']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)
        df['DisC'] = df[['X','Y','CX','CY']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)    
        df['DisQB'] = df[['X','Y','QBX','QBY']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)    

        df.drop(['RusherX','RusherY','QBX','QBY','CX','CY'], axis=1,inplace=True)
        return df 

    
    def get_team_aggs(df, col, for_offense=True):
        aggs = ['Avg','Min','Max','Std']
        if for_offense == True:
            team_agg = df[df.IsOnOffense == True][['PlayId'] + [col]]
            team_agg = df[['PlayId'] + [col]]
            team_agg = team_agg.groupby(['PlayId']).agg({col:['mean','min','max','std']}).reset_index()
            avg_col = 'AvgOffense' + col
            min_col = 'MinOffense' + col
            max_col = 'MaxOffense' + col
            std_col = 'StdOffense' + col
        if for_offense == False:
            team_agg = df[df.IsOnOffense == False][['PlayId'] + [col]]
            team_agg = team_agg.groupby(['PlayId']).agg({col:['mean','min','max','std']}).reset_index()
            avg_col = 'AvgDefense' + col
            min_col = 'MinDefense' + col
            max_col = 'MaxDefense' + col
            std_col = 'StdDefense' + col
                
        team_agg.drop(['PlayId'], axis=1, inplace=True)
        team_agg_cols = [avg_col,min_col,max_col,std_col]
        team_agg.columns = team_agg_cols
        team_agg = team_agg.loc[team_agg.index.repeat(22)].reset_index(drop=True)
        for col in team_agg_cols:
            df[col] = 0
        df.update(team_agg)
        return df
    
    def get_rusher_dis_mlb_inside(df):
        try:
            df['RusherDisMLBByIsInside'] = (1 / df['RusherDisMLB']) * df['IsInside']
            df['RusherDisMLBByIsInside'] = df['RusherDisMLBByIsInside'].replace([np.inf, -np.inf], np.nan)
            return df
        except:
            df['RusherDisMLBByIsInside'] = np.nan
            return df
        
    def get_yards_by_down(df):
        df['YardsByDownSqrt'] = (df['Distance'] * df['Down']) **(1/2)
        return df
    
    def get_diff_rusher_dir_otation(df):
        df['DiffRusherDirOtation'] = df['RusherDir'] - df['RusherOrientation']
        return df
    
    def get_mech_feats(df):
        df['Weight'] = df['PlayerMass'] * 9.806 # acceleration gravity
        df['ChangeTime'] = df['Dis'] / df['S']
        df['Force'] = df['PlayerMass'] * df['A']
        df['Momentum'] = df['PlayerMass'] * df['S']
        df['KE'] = 0.5 * df['PlayerMass'] * (df['S']**2)
        df['Work'] = df['Force'] * df['Dis']
        df['Power'] = df['Work'] / df['ChangeTime']
        df['Impulse'] = df['Force'] * df['ChangeTime']
        angle = 90 - df['Dir']
        df['SX'] = np.abs(df['S'] * np.cos(angle))
        df['SY'] = np.abs(df['S'] * np.sin(angle))    
        df['ForceX'] = np.abs(df['Force'] * np.cos(angle))
        df['ForceY'] = np.abs(df['Force'] * np.sin(angle))
        df['MomentumX'] = np.abs(df['Momentum'] * np.cos(angle))
        df['MomentumY'] = np.abs(df['Momentum'] * np.sin(angle))
        df['WorkX'] = np.abs(df['Work'] * np.cos(angle))
        df['WorkY'] = np.abs(df['Work'] * np.sin(angle))
        df['PowerX'] = np.abs(df['Power'] * np.cos(angle))
        df['PowerY'] = np.abs(df['Power'] * np.sin(angle))
        df['ImpulseX'] = np.abs(df['Impulse'] * np.cos(angle))
        df['ImpulseY'] = np.abs(df['Impulse'] * np.sin(angle)) 
        return df
    
    def get_gen_position_feats(df, position): 
        pos_feat = df.loc[df.GeneralPosition == position, ['PlayId','A','S','Dir',
                                                    'Orientation','Dis',
                                                    'PlayerMass','PlayerHeight']]
        pos_feat = pos_feat.rename(columns={'A':position+'A','S':position+'S','Dir':position+'Dir',
                                            'Orientation':position+'Orientation',
                                            'Dis':position+'Dis','PlayerMass':position+'Weight',
                                            'PlayerHeight':position+'Height'})
        pos_feat = pos_feat.groupby(['PlayId']).agg(
            {position+'A':['mean','min','max'], 
             position+'S':['mean','min','max'], 
             position+'Dir':['mean','min','max'], 
             position+'Orientation':['mean','min','max'], 
             position+'Dis':['mean','min','max'], 
             position+'Weight':['mean','min','max'], 
             position+'Height':['mean','min','max']}).reset_index()
        pos_feat.columns = [''.join(col) for col in pos_feat.columns.values]
        pos_feat_columns = pos_feat.columns.tolist()
        pos_feat_columns.remove('PlayId')
        pos_feat.drop('PlayId',axis=1,inplace=True)
        pos_feat = pos_feat.loc[pos_feat.index.repeat(22)].reset_index(drop=True)
        for feat in pos_feat_columns:
            df[feat] = 0
        df.update(pos_feat)
        return df

    def get_off_less_def_feats(df, feat):
        off_feat = df.loc[df.IsOnOffense == True, ['PlayId',feat]]
        off_feat = off_feat.groupby(['PlayId']).agg({feat:['sum']}).reset_index()
        off_feat.drop('PlayId', axis=1,inplace=True)
        off_feat.columns = ['Off'+feat]
        def_feat = df.loc[df.IsOnOffense == False, ['PlayId',feat]]
        def_feat = def_feat.groupby(['PlayId']).agg({feat:['sum']}).reset_index()
        def_feat.drop('PlayId', axis=1,inplace=True)
        def_feat.columns = ['Def'+feat]
        off_def_feat = pd.DataFrame(off_feat['Off'+feat] - def_feat['Def'+feat], columns=['OffLessDef'+feat])

        df['OffLessDef'+feat] = 0
        off_def_feat = off_def_feat.loc[off_def_feat.index.repeat(22)].reset_index(drop=True)
        df.update(off_def_feat)
        return df
    
    def get_rusher_feats(df):
        rusher_feats = df.loc[df.IsRusher == True,['X','Y','S','A','Dis',
                                                   'Orientation','Dir','DisFromYL',
                                                   'PlayerMass','PlayerHeight']]
        rusher_feats = rusher_feats.loc[rusher_feats.index.repeat(22)].reset_index(drop=True)
        rusher_feats = rusher_feats.rename(columns={'X':'RusherX','Y':'RusherY',})
        df['RusherX'] = 0
        df['RusherY'] = 0        
        df.update(rusher_feats)
        df = df.rename(columns={'S':'RusherS',
                                'A':'RusherA','Dis':'RusherDis',
                                'Orientation':'RusherOrientation',
                                'Dir':'RusherDir','DisFromYL':'RusherDisYL',
                                'PlayerMass':'RusherMass',
                                'PlayerHeight':'RusherHeight'})
        df['RusherWeight'] = df['RusherMass'] * 9.806 # acceleration gravity
        df['ChangeTime'] = df['RusherDis'] / df['RusherS']
        df['RusherForce'] = df['RusherMass'] * df['RusherA']
        df['RusherMomentum'] = df['RusherMass'] * df['RusherS']
        df['RusherKE'] = 0.5 * df['RusherMass'] * (df['RusherS']**2)
        df['RusherWork'] = df['RusherForce'] * df['RusherDis']
        df['RusherPower'] = df['RusherWork'] / df['ChangeTime']
        df['RusherImpulse'] = df['RusherForce'] * df['ChangeTime']
        angle = 90 - df['RusherDir']
        df['RusherSX'] = np.abs(df['RusherS'] * np.cos(angle))
        df['RusherSY'] = np.abs(df['RusherS'] * np.sin(angle))    
        df['RusherForceX'] = np.abs(df['RusherForce'] * np.cos(angle))
        df['RusherForceY'] = np.abs(df['RusherForce'] * np.sin(angle))
        df['RusherMomentumX'] = np.abs(df['RusherMomentum'] * np.cos(angle))
        df['RusherMomentumY'] = np.abs(df['RusherMomentum'] * np.sin(angle))
        df['RusherWorkX'] = np.abs(df['RusherWork'] * np.cos(angle))
        df['RusherWorkY'] = np.abs(df['RusherWork'] * np.sin(angle))
        df.drop(['ChangeTime'],axis=1,inplace=True)
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna(0)
        return df
    
    def get_gap_feats(df):
        df['X_gapmedian'] = 0
        df['X_gapmax'] = 0
        df['Y_gapmedian'] = 0
        df['Y_gapmax'] = 0
        plays = df.loc[df.IsOnOffense == False, ['PlayId','X','Y','RusherX']]
        gaps_df = pd.DataFrame(columns=['PlayId','X_gap','Y_gap'])
        for play in plays['PlayId'].unique():
            RusherX_val = df.loc[df.PlayId == play, 'RusherX'].unique()[0]
            X_vals = plays.loc[plays.PlayId == play, 'X']
            X_vals = X_vals.append(pd.Series([RusherX_val,120]), ignore_index=True).sort_values().reset_index(drop=True)
            X_vals = np.diff(X_vals)
            Y_vals = plays.loc[plays.PlayId == play, 'Y']
            Y_vals = Y_vals.append(pd.Series([0,53.3]), ignore_index=True).sort_values().reset_index(drop=True)
            Y_vals = np.diff(Y_vals)
            gaps_play = pd.DataFrame()
            gaps_play['X_gap'] = X_vals
            gaps_play['Y_gap'] = Y_vals
            gaps_play['PlayId'] = play
            gaps_df = pd.concat([gaps_df, gaps_play], axis=0, ignore_index=True)
        gaps_agg_x = gaps_df.groupby('PlayId').agg({'X_gap':['median','max']}).reset_index()
        gaps_agg_x.columns = [''.join(col) for col in gaps_agg_x.columns.values]
        gaps_agg_x = gaps_agg_x.loc[gaps_agg_x.index.repeat(22)].reset_index(drop=True)
        gaps_agg_y = gaps_df.groupby('PlayId').agg({'Y_gap':['median','max']}).reset_index()
        gaps_agg_y.columns = [''.join(col) for col in gaps_agg_y.columns.values]
        gaps_agg_y = gaps_agg_y.loc[gaps_agg_y.index.repeat(22)].reset_index(drop=True)
        df.update(gaps_agg_x)
        df.update(gaps_agg_y)
        df['XY_gap_area'] = df['X_gapmax'] * df['Y_gapmax']
        df.drop(['X','Y'], axis=1, inplace=True)
        return df
    
    def combine_features(df): 
        df = map_team_name(df)
        df = get_team_on_offense(df)
        df = map_offense_defense_team(df)
        df = clean_position(df)
        df = get_is_rusher(df)
        df = create_general_position(df)
        df = get_player_age(df)
        df = get_player_weights_bmi(df)
        yardline = update_yardline(df)
        df = update_orientation(df, yardline)         
        df = get_redzone(df)
        df = get_dis_yardline(df) # use for defender distance only
        df = get_dis_from_yl(df) # absolute distance for both off and def
        df = get_dis_rusher(df)
        df = get_dis_features(df)
        df = get_mech_feats(df)
        agg_cols = ['X','Y','A','Dir','DisFromYL','DisRusher','Force','Momentum','ForceX','Dis'
                   ]
        
        for agg_col in agg_cols:
            df = get_team_aggs(df, col=agg_col, for_offense=True)
            df = get_team_aggs(df, col=agg_col, for_offense=False)
            
        del agg_cols
        df.drop(['DisQB','DisC','MinOffenseDisRusher'],axis=1,inplace=True)

        off_less_def_feats = ['X']
        
        for feat in off_less_def_feats:
            df = get_off_less_def_feats(df, feat)
        df = get_rusher_feats(df)
        df = get_gap_feats(df)
        return df
    
    df = combine_features(df)
    df = df.fillna(-999)
    df = df.select_dtypes(exclude=['object'])
    df.drop(['RusherMass','PlayerAge','PlayerBMI','DisYardLine',
             'DisRusher','NflIdRusher','IsOnOffense',
             'NflId','JerseyNumber','IsRusher','DisRusherNearestYardLine',
             'Weight','Force','Momentum','KE','Work','Power','Impulse',
             'SX','SY','ForceX','ForceY','MomentumX','MomentumY','WorkX',
             'WorkY','PowerX','PowerY','ImpulseX','ImpulseY'], axis=1, inplace=True)

    df = df.drop_duplicates().reset_index(drop=True)
    return df

In [4]:
train = pd.read_csv('../input/nfl-big-data-bowl-2020/train.csv')
outcomes = train[['GameId','PlayId','Yards']].drop_duplicates()

train_basetable = create_features(train)

X = train_basetable.copy()
X = X.sample(frac=1).reset_index(drop=True)

yards = X.Yards

y = np.zeros((yards.shape[0], 199))
for idx, target in enumerate(list(yards)):
    y[idx][99 + target] = 1
print(train_basetable.shape)
train_basetable.head()

(23171, 129)


Unnamed: 0,GameId,PlayId,RusherS,RusherA,RusherDis,RusherOrientation,RusherDir,Season,Quarter,Down,Distance,HomeScoreBeforePlay,VisitorScoreBeforePlay,DefendersInTheBox,Yards,RusherHeight,Week,Temperature,Humidity,IsOffenseAtHome,YardLine,InOffenseRedzone,InDefenseRedzone,RusherDisYL,RusherDisMLB,RusherDisQB,RusherDisC,AvgOffenseX,MinOffenseX,MaxOffenseX,StdOffenseX,AvgDefenseX,MinDefenseX,MaxDefenseX,StdDefenseX,AvgOffenseY,MinOffenseY,MaxOffenseY,StdOffenseY,AvgDefenseY,MinDefenseY,MaxDefenseY,StdDefenseY,AvgOffenseA,MinOffenseA,MaxOffenseA,StdOffenseA,AvgDefenseA,MinDefenseA,MaxDefenseA,StdDefenseA,AvgOffenseDir,MinOffenseDir,MaxOffenseDir,StdOffenseDir,AvgDefenseDir,MinDefenseDir,MaxDefenseDir,StdDefenseDir,AvgOffenseDisFromYL,MinOffenseDisFromYL,MaxOffenseDisFromYL,StdOffenseDisFromYL,AvgDefenseDisFromYL,MinDefenseDisFromYL,MaxDefenseDisFromYL,StdDefenseDisFromYL,AvgOffenseDisRusher,MaxOffenseDisRusher,StdOffenseDisRusher,AvgDefenseDisRusher,MinDefenseDisRusher,MaxDefenseDisRusher,StdDefenseDisRusher,AvgOffenseForce,MinOffenseForce,MaxOffenseForce,StdOffenseForce,AvgDefenseForce,MinDefenseForce,MaxDefenseForce,StdDefenseForce,AvgOffenseMomentum,MinOffenseMomentum,MaxOffenseMomentum,StdOffenseMomentum,AvgDefenseMomentum,MinDefenseMomentum,MaxDefenseMomentum,StdDefenseMomentum,AvgOffenseForceX,MinOffenseForceX,MaxOffenseForceX,StdOffenseForceX,AvgDefenseForceX,MinDefenseForceX,MaxDefenseForceX,StdDefenseForceX,AvgOffenseDis,MinOffenseDis,MaxOffenseDis,StdOffenseDis,AvgDefenseDis,MinDefenseDis,MaxDefenseDis,StdDefenseDis,OffLessDefX,RusherX,RusherY,RusherWeight,RusherForce,RusherMomentum,RusherKE,RusherWork,RusherPower,RusherImpulse,RusherSX,RusherSY,RusherForceX,RusherForceY,RusherMomentumX,RusherMomentumY,RusherWorkX,RusherWorkY,X_gapmedian,X_gapmax,Y_gapmedian,Y_gapmax,XY_gap_area
0,2017090700,20170907000118,3.63,3.35,0.38,198.02,114.26,2017,1,3,2,0,0,6.0,8,70,1,63.0,77.0,True,45.0,False,False,3.75,7.820038,1.449724,4.401931,46.118182,40.24,63.37,4.392297,48.179091,44.94,63.37,5.294079,28.713182,16.64,38.83,6.308114,28.890909,16.64,38.83,7.014714,1.191818,0.32,3.35,0.691807,1.025455,0.32,2.43,0.631338,152.131818,37.41,304.69,69.10972,173.046364,37.41,304.69,81.429631,2.226364,0.06,18.37,3.925483,3.19,0.06,18.37,5.286855,7.680806,22.415872,5.056308,9.752491,4.59331,22.415872,5.327299,286.34,80.64,686.75,153.918266,233.012727,80.64,500.58,133.860076,399.645,54.08,1081.5,253.965203,287.580909,54.08,896.35,226.936523,168.642671,3.299931,452.532345,124.686249,119.980292,34.14441,224.056325,68.04185,0.245,0.0,0.59,0.184462,0.184545,0.01,0.51,0.17201,-45.34,41.25,30.53,2010.23,686.75,744.15,1350.63225,260.965,2492.9025,71.891185,2.333106,2.78092,441.39409,526.114835,478.286731,570.088612,167.729754,199.923637,0.53,56.63,2.495,16.64,942.3232
1,2017090700,20170907000139,3.06,2.41,0.34,149.3,47.8,2017,1,1,10,0,0,6.0,3,70,1,63.0,77.0,True,53.0,False,False,4.07,7.760058,0.792023,3.787941,54.214545,48.21,71.95,4.647315,56.561818,53.2,71.95,5.406292,24.827727,9.43,33.88,6.892898,24.892727,9.43,33.88,7.190716,1.567273,0.55,2.67,0.617554,1.592727,0.55,2.67,0.660425,166.613182,12.72,355.85,110.152689,207.282727,12.72,355.85,118.90526,2.5,0.08,18.95,4.073731,3.561818,0.2,18.95,5.406292,8.223049,23.025872,5.76419,10.297028,4.287773,23.025872,5.833217,365.933182,169.4,556.1,114.967154,351.486364,169.4,550.02,113.172284,440.061364,144.0,806.96,178.123475,367.016364,197.1,574.74,107.84506,235.685102,23.636195,539.523303,148.461654,195.11654,23.636195,412.77662,138.441026,0.236364,0.01,0.5,0.121088,0.197273,0.01,0.34,0.101793,-51.64,48.93,27.16,2010.23,494.05,627.3,959.769,167.977,1511.793,54.894444,0.642378,2.991814,103.714689,483.04106,131.687531,613.321844,35.262994,164.23396,0.61,48.05,2.395,19.42,933.131
2,2017090700,20170907000189,5.77,2.42,0.6,219.18,138.04,2017,1,1,10,0,0,7.0,5,70,1,63.0,77.0,True,75.0,False,False,3.66,6.69003,1.64639,3.329625,76.618636,70.49,91.76,4.003402,78.389091,75.25,91.76,4.720893,22.026818,9.08,32.89,6.192438,22.602727,9.08,32.89,6.57762,1.755909,0.46,3.63,0.935733,2.092727,1.04,3.11,0.749134,157.578636,74.03,232.83,36.419174,164.201818,74.03,193.63,34.4427,2.435909,0.12,16.76,3.543412,3.389091,0.25,16.76,4.720893,8.097011,20.726285,4.877886,9.903689,4.22167,20.726285,5.07329,410.418636,84.18,708.48,179.398905,477.791818,197.6,708.48,156.435161,845.866364,248.9,1252.4,244.104383,754.953636,248.9,1028.7,208.588768,286.411988,12.878255,646.294291,189.327712,348.773481,61.996408,646.294291,198.175616,0.380455,0.12,0.6,0.103853,0.360909,0.12,0.45,0.094282,-38.95,71.34,19.11,2010.23,496.1,1182.85,3412.52225,297.66,2862.497,51.587522,3.513412,4.57699,302.0804,393.52591,720.249548,938.282852,181.24824,236.115546,0.88,28.24,2.215,20.41,576.3784
3,2017090700,20170907000345,4.45,3.2,0.46,173.78,84.56,2017,1,2,2,0,0,9.0,2,71,1,63.0,77.0,True,108.0,False,True,3.53,5.704849,0.918096,5.24187,108.515,103.7,110.8,1.679696,109.310909,107.79,110.8,0.962418,25.766818,18.48,33.65,3.795418,25.733636,18.48,33.65,4.505029,1.087273,0.07,3.2,0.825499,1.293636,0.27,2.62,0.762119,119.335909,5.41,348.87,125.543391,146.282727,5.41,348.87,150.729268,1.304091,0.05,4.3,1.147866,1.349091,0.09,2.8,0.902513,5.297134,9.791231,2.26833,6.309354,4.528002,9.791231,1.834174,273.020909,23.45,672.0,181.174812,318.250909,72.9,641.9,174.044609,474.225,46.2,1136.2,265.785962,389.686364,167.4,695.36,163.680695,157.044079,3.760057,446.938818,120.598496,186.888238,63.196787,409.595361,109.732661,0.226818,0.09,0.5,0.116062,0.199091,0.13,0.32,0.064102,-17.51,104.47,25.36,2059.26,672.0,934.5,2079.2625,309.12,2990.4,69.465169,2.959639,3.323106,446.938818,501.826357,621.524294,697.852278,205.591856,230.840124,0.3,9.2,1.54,19.65,180.78
4,2017090700,20170907000395,3.9,2.53,0.44,34.27,157.92,2017,1,1,10,7,0,7.0,7,71,1,63.0,77.0,False,35.0,False,False,5.01,7.188992,0.502892,4.92062,35.774545,29.51,49.41,4.811638,38.413636,33.95,49.41,5.391251,26.824091,10.91,39.18,7.466987,26.586364,12.4,39.18,7.864325,1.870909,0.48,3.26,0.627489,2.121818,1.26,3.26,0.607055,170.335,30.63,341.78,76.029425,198.156364,141.22,322.72,46.7747,3.026364,0.07,14.41,3.766343,3.959091,0.53,14.41,4.964066,8.752055,21.214806,5.81445,11.056456,4.288088,21.214806,5.900009,475.914091,110.4,941.5,179.104215,518.136364,313.9,941.5,202.298265,654.256818,206.4,1165.5,286.321443,580.593636,206.4,1165.5,266.876529,269.689326,27.256068,623.148226,151.393581,250.074409,47.160716,623.148226,159.821839,0.288182,0.0,0.49,0.125381,0.245455,0.0,0.37,0.121767,-58.06,29.99,27.12,2118.096,546.48,842.4,1642.68,240.4512,2131.272,61.654154,1.431213,3.627896,200.545974,508.351948,309.142015,783.625532,88.240228,223.674857,1.14,70.59,3.17,14.12,996.7308


In [5]:
cat = ['InDefenseRedzone']

num = list(set(X.columns.values.tolist()) - set(cat))
num.remove('GameId')
num.remove('PlayId')
print(len(cat))
print(len(num))

1
126


In [6]:
features = ['GameId','PlayId',
            'RusherX','RusherA',
            'RusherDir',
            'RusherDis',
            'YardLine',
            'RusherDisYL', 
            'StdDefenseX', 
            'StdDefenseY', 
            'AvgOffenseA', 
            'AvgDefenseA', 
            'StdOffenseDir',
            'StdDefenseDir',
            'MaxDefenseDisFromYL',
            'AvgDefenseDisRusher',
            'MinDefenseDisRusher',
            'AvgOffenseForce',
            'AvgDefenseForce',
            'AvgOffenseMomentum',
            'AvgDefenseMomentum',
            'AvgDefenseForceX',
            'OffLessDefX', 
            'RusherForce',
            'RusherMomentum',
            'InDefenseRedzone',
            'AvgOffenseDis',
            'AvgDefenseDis',
            'AvgOffenseDisFromYL',
            'AvgDefenseDisFromYL',
            'RusherKE',
            'RusherWork',
            'Y_gapmax'
           ]
X = X[features]

In [7]:
scaler = StandardScaler()
num = list(set(features) & set(num)) # update num to only show intersection with features selected
X[num] = scaler.fit_transform(X[num])

In [8]:
def model_396_1():
    inputs = []
    embeddings = []
    for i in cat:
        input_ = Input(shape=(1,))
        embedding = Embedding(int(np.absolute(X[i]).max() + 1), 10, input_length=1)(input_)
        embedding = Reshape(target_shape=(10,))(embedding)
        inputs.append(input_)
        embeddings.append(embedding)
    input_numeric = Input(shape=(len(num),))
    embedding_numeric = Dense(512, activation='relu')(input_numeric) 
    inputs.append(input_numeric)
    embeddings.append(embedding_numeric)
    x = Concatenate()(embeddings)
    x = Dense(256, activation='relu')(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    output = Dense(199, activation='softmax')(x)
    model = Model(inputs, output)
    return model


n_splits = 5
kf = GroupKFold(n_splits=n_splits)
score = []
for i_369, (tdx, vdx) in enumerate(kf.split(X, y, X['GameId'])):
    print(f'Fold : {i_369}')
    X_train, X_val, y_train, y_val = X.iloc[tdx], X.iloc[vdx], y[tdx], y[vdx]
    X_train = [np.absolute(X_train[i]) for i in cat] + [X_train[num]]
    X_val = [np.absolute(X_val[i]) for i in cat] + [X_val[num]]
    model = model_396_1()
    model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=[])
    es = EarlyStopping(monitor='val_CRPS', 
                   mode='min',
                   restore_best_weights=True, 
                   verbose=2, 
                   patience=5)
    es.set_model(model)
    metric = Metric(model, [es], [(X_train,y_train), (X_val,y_val)])
    for i in range(1):
        model.fit(X_train, y_train, verbose=False)
    for i in range(1):
        model.fit(X_train, y_train, batch_size=64, verbose=False)
    for i in range(1):
        model.fit(X_train, y_train, batch_size=128, verbose=False)
    for i in range(1):
        model.fit(X_train, y_train, batch_size=256, verbose=False)
    model.fit(X_train, y_train, callbacks=[metric], epochs=100, batch_size=1024, verbose=False)
    score_ = crps(y_val, model.predict(X_val))
    model.save(f'keras_369_{i_369}.h5')
    print(score_)
    score.append(score_)

Fold : 0
tr CRPS 0.012648 val CRPS 0.012592
tr CRPS 0.012627 val CRPS 0.01257
tr CRPS 0.012609 val CRPS 0.012581
tr CRPS 0.012599 val CRPS 0.012579
tr CRPS 0.012572 val CRPS 0.012568
tr CRPS 0.012555 val CRPS 0.01257
tr CRPS 0.012532 val CRPS 0.012557
tr CRPS 0.012523 val CRPS 0.012555
tr CRPS 0.012494 val CRPS 0.012541
tr CRPS 0.012504 val CRPS 0.012557
tr CRPS 0.012478 val CRPS 0.012539
tr CRPS 0.012448 val CRPS 0.012534
tr CRPS 0.012424 val CRPS 0.012522
tr CRPS 0.012412 val CRPS 0.012511
tr CRPS 0.0124 val CRPS 0.012512
tr CRPS 0.012363 val CRPS 0.012516
tr CRPS 0.012366 val CRPS 0.012521
tr CRPS 0.012329 val CRPS 0.012484
tr CRPS 0.01232 val CRPS 0.012486
tr CRPS 0.012313 val CRPS 0.012495
tr CRPS 0.012316 val CRPS 0.012511
tr CRPS 0.012297 val CRPS 0.012509
tr CRPS 0.012229 val CRPS 0.012483
tr CRPS 0.012212 val CRPS 0.012466
tr CRPS 0.012197 val CRPS 0.012464
tr CRPS 0.012179 val CRPS 0.012516
tr CRPS 0.012137 val CRPS 0.012498
tr CRPS 0.012123 val CRPS 0.01248
tr CRPS 0.012132 

In [9]:
print(np.mean(score))

0.012699222746859662


In [13]:
models = []
for i in range(n_splits):
    models.append(load_model(f'keras_369_{i}.h5'))

In [14]:
for (test_df, sample_prediction_df) in tqdm.tqdm(iter_test):
    basetable = create_features(test_df)
    
    basetable = basetable[features]
    basetable[num] = scaler.transform(basetable[num])
    test_ = [np.absolute(basetable[i]) for i in cat] + [basetable[num]]
    
    y_pred = np.mean([model.predict(test_) for model in models], axis=0)
    y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1).tolist()[0]
    
    preds_df = pd.DataFrame(data=[y_pred], columns=sample_prediction_df.columns)
    env.predict(preds_df)
env.write_submission_file()

3438it [1:28:00,  1.54s/it]


Your submission file has been saved!  Once you `Commit` your Notebook and it finishes running, you can submit the file to the competition from the Notebook Viewer `Output` tab.
