## 1. Importa os pacotes e o dataset de treino

In [9]:
# Importar os principais pacotes
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import re
import codecs
import time
import datetime
import gc

# Evitar que aparece os warnings
import warnings
warnings.filterwarnings("ignore")

# Seta algumas opções no Jupyter para exibição dos datasets
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 150)

# Variavel para controlar o treinamento no Kaggle
TRAIN_OFFLINE = True

In [10]:
# Importa os pacotes de algoritmos de redes neurais (Keras)
from keras.losses import binary_crossentropy, categorical_crossentropy
from keras.utils import to_categorical
from keras.layers import Dense,Input,Flatten,concatenate,Dropout,Lambda,BatchNormalization
from keras.layers import Activation
from keras.models import Sequential, Model
from keras.callbacks import Callback,EarlyStopping,ModelCheckpoint
import keras.backend as K
from keras.optimizers import Adam, Nadam, RMSprop
from keras import optimizers
#from keras_lookahead import Lookahead
#from keras_radam import RAdam

# Importa pacotes do sklearn
from sklearn import preprocessing
import sklearn.metrics as mtr
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import scale, MinMaxScaler, StandardScaler
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder



In [11]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [12]:
if TRAIN_OFFLINE:
    train = pd.read_csv('../data/train.csv', dtype={'WindSpeed': 'object'})
else:
    train  = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2020/train.csv', dtype={'WindSpeed': 'object'})

In [13]:
outcomes = train[['GameId','PlayId','Yards']].drop_duplicates()

## 2. Feature Engineering

In [14]:
def add_centroid(tmp):
    '''
    Calculates centroid coordinates of offense and defense teams
    Takes into account rusher coordinates
    '''
    
    df = pd.DataFrame()
    
    for col in tmp.columns:
        if col in ['X','Y']:
            df[col+'_mean']= tmp.groupby(['GameId','PlayId','IsOnOffense'])[col].mean()
            df[col+'_std']= tmp.groupby(['GameId','PlayId','IsOnOffense'])[col].std()
    temp1= df.unstack()
    temp1.columns = ['Xc_def_mean','Xc_off_mean', 'Xc_def_std','Xc_off_std',
                    'Yc_def_mean','Yc_off_mean','Yc_def_std','Yc_off_std']
    #col = ['Xc_def_std','Yc_def_std','Xc_off_std','Yc_off_std']
    #temp1.drop(col, axis=1,inplace=True)
    
    temp2 = tmp.loc[tmp['IsRusher'] == True, ['GameId','PlayId','X','Y','Sx','Sy','Ax','Ay','F','Fx','Fy','p','px','py','pf_max','px_max','py_max']]
    temp2.columns = ['GameId','PlayId','X_rusher','Y_rusher','Sx_rusher','Sy_rusher','Ax_rusher','Ay_rusher','F_rusher','Fx_rusher','Fy_rusher','p_rusher','px_rusher','py_rusher','p_max_rusher','px_max_rusher','py_max_rusher']
    temp1 = temp1.merge(temp2, on=['GameId','PlayId'], how='left')
    new_tmp = tmp.merge(temp1, on=['GameId','PlayId'])

    return new_tmp

def add_x_y_components(df):
    '''
    Splits S and A into their x and y components
    Calculates max speed at given acceleration
    Calculates F and p, absolute and relatives
    '''
    df['S_'] = df['S'].apply(convert_yard_to_m)
    df['A_'] = df['A'].apply(convert_yard_to_m)
    
    df['Sx'] = df['S']*np.cos(df['Dir'])
    
    df['Sy'] = df['S']*np.sin(df['Dir'])
    df['Sy_'] = np.abs(df['S']*np.sin(df['Dir'])) 
    
    df['Ax'] = df['A']*np.cos(df['Dir'])
    df['Ay'] = df['A']*np.sin(df['Dir'])
    
    df['Sfx'] = df['Sx']+df['Ax']*3.5
    df['Sfy'] = df['Sy']+df['Ay']*3.5
    
    df['F'] = df['PlayerWeight_kg']*df['A_']
    df['Fx'] = df['F']*np.cos(df['Dir'])
    df['Fy'] = df['F']*np.sin(df['Dir'])

    df['p'] = df['PlayerWeight_kg']*df['S_']
    df['px'] = df['p']*np.cos(df['Dir'])
    df['py'] = df['p']*np.sin(df['Dir'])
    
    df['pf_max'] = df['PlayerWeight_kg']*(np.sqrt(df['Sfx']**2+df['Sfy']**2))
    #df['pf_max'] = df['PlayerWeight']*9
    
    df['px_max'] = df['pf_max']*np.cos(df['Dir'])
    df['py_max'] = df['pf_max']*np.sin(df['Dir'])
    
    df['KE'] = (1/2) * df['PlayerWeight_kg'] * (df['S_'] ** 2)

    
    del df['A_']
    del df['S_']
    
    return df
    
def l2_norm_mat(mat_v, mat_u):
    '''
    Calculates euclidean distance
    '''
    return np.linalg.norm(mat_v - mat_u, axis=1)    

def calculate_distance(tmp): 
    '''
    Calculates euclidean distance for below components
    '''

    mat_u = np.array(tmp[['X','Y']])
    mat_v = np.array(tmp[['Xc_def_mean','Yc_def_mean']])
    mat_w = np.array(tmp[['Xc_off_mean','Yc_off_mean']])
    mat_z = np.array(tmp[['X_rusher','Y_rusher']])
    
    tmp['dist_cdef_vs_coff'] = l2_norm_mat(mat_v, mat_w)
    tmp['dist_cdef_vs_player'] = l2_norm_mat(mat_u, mat_v)
    tmp['dist_coff_vs_player'] = l2_norm_mat(mat_u, mat_w)
    tmp['dist_cdef_vs_rusher'] = l2_norm_mat(mat_z, mat_v)
    tmp['dist_coff_vs_rusher'] = l2_norm_mat(mat_z, mat_w)
    tmp['dist_player_vs_rusher'] = l2_norm_mat(mat_u, mat_z)
    
    return tmp

def add_rushertimeto1stdown(data):
    '''
    Calculates time needed for 1st down (based on Distance)
    '''
    tmp = data.copy()
    tmp = tmp.loc[tmp['IsRusher']==True,['GameId','PlayId','X','Sx_rusher','Ax_rusher','Dis','Distance','YardLine']]
    # below corrected
    tmp['S0x_rusher'] = np.sqrt(np.square(tmp['Sx_rusher'])-2*(tmp['Ax_rusher'])*(tmp['Dis']))
    tmp['S0x_rusher'].fillna(np.sqrt(-np.square(tmp['Sx_rusher'])-2*(-tmp['Ax_rusher'])*(tmp['Dis'])), inplace = True)
    tmp['rusher_dist_to_yardline'] = tmp['YardLine']-tmp['X']
    tmp['rusher_dist_to_1stdown'] = tmp['rusher_dist_to_yardline'] + tmp['Distance']
    # tmp['Disc_1'] = (tmp['Sx_rusher'])**2-(4*(tmp['Ax_rusher']/2)*(-tmp['Distance']))
    tmp['Disc_1'] = (tmp['S0x_rusher'])**2-(4*(tmp['Ax_rusher']/2)*(-tmp['rusher_dist_to_1stdown']))
    tmp['t_1stDown'] = (-tmp['S0x_rusher'] + np.sqrt(tmp['Disc_1'])/(tmp['Ax_rusher']))
    S_max_rusher = tmp.loc[tmp['Sx_rusher']>0,'Sx_rusher'].median()
    tmp['t_1stDown'].fillna(2*tmp['rusher_dist_to_1stdown']/S_max_rusher, inplace=True)
    tmp.loc[np.isneginf(tmp['t_1stDown'])==True,['t_1stDown']] = -1.0
    tmp.loc[np.isposinf(tmp['t_1stDown'])==True,['t_1stDown']] = -1.0


    col = ['Sx_rusher','Ax_rusher','Dis','Distance','Disc_1','X','YardLine','rusher_dist_to_yardline']
    tmp.drop(col,axis= 1, inplace= True)
    
    data = data.merge(tmp, on=['GameId','PlayId'], how='left')
    
    return data

def calc_timetotackle(tmp): # it doesnt get affected 
    '''
    Calculates time needed to tackle rusher
    '''
    df = pd.DataFrame()
    tmp = tmp.loc[tmp['IsOnOffense']==False,['GameId','PlayId','Sx','Sx_rusher','X','X_rusher','Ax','Ax_rusher','Sy','Sy_rusher','Y','Y_rusher','Ay','Ay_rusher','IsOnOffense']]
    tmp['Dx'] = np.square(tmp['Sx']-tmp['Sx_rusher'])-4*(tmp['X']-tmp['X_rusher'])*(tmp['Ax']-tmp['Ax_rusher'])/2 
    tmp['Dy'] = np.square(tmp['Sy']-tmp['Sy_rusher'])-4*(tmp['Y']-tmp['Y_rusher'])*(tmp['Ay']-tmp['Ay_rusher'])/2 


    #tmp['t1'] = (-(tmp['Sx']-tmp['Sx_rusher'])+np.sqrt(tmp['D']))/(2*(tmp['Ax']-tmp['Ax_rusher'])/2)
    tmp['tx'] = (-(tmp['Sx']-tmp['Sx_rusher'])-np.sqrt(tmp['Dx']))/(2*(tmp['Ax']-tmp['Ax_rusher'])/2)
    tmp['ty'] = (-(tmp['Sy']-tmp['Sy_rusher'])-np.sqrt(tmp['Dy']))/(2*(tmp['Ay']-tmp['Ay_rusher'])/2)
    
    tmp['tx'].fillna((-(tmp['Sx']-tmp['Sx_rusher']))/(2*(tmp['Ax']-tmp['Ax_rusher'])/2), inplace = True)
    tmp['ty'].fillna((-(tmp['Sy']-tmp['Sy_rusher']))/(2*(tmp['Ay']-tmp['Ay_rusher'])/2), inplace = True)
    
    tmp['time_to_tackle'] = np.sqrt(np.square(tmp['tx']) + np.square(tmp['ty']))
    
    for col in tmp.columns:
        if col in ['time_to_tackle']:
            df[col+'_mean']= tmp.groupby(['GameId','PlayId','IsOnOffense'])[col].mean()
            df[col+'_min']= tmp.groupby(['GameId','PlayId','IsOnOffense'])[col].min()
            df[col+'_max']= tmp.groupby(['GameId','PlayId','IsOnOffense'])[col].max()
            
    tmp2 = df.unstack()
    tmp2.columns = ['time_to_tackle_mean','time_to_tackle_min','time_to_tackle_max']#,'time_to_tackle_cnt']
    
    tmp2.loc[np.isposinf(tmp2['time_to_tackle_max'])==True,['time_to_tackle_mean','time_to_tackle_max']] = tmp2['time_to_tackle_min']
    tmp2.loc[np.isposinf(tmp2['time_to_tackle_min'])==True,['time_to_tackle_min']] = 40
    tmp2.loc[np.isposinf(tmp2['time_to_tackle_max'])==True,['time_to_tackle_max']] = 40
    tmp2.loc[np.isposinf(tmp2['time_to_tackle_mean'])==True,['time_to_tackle_mean']] = 40
    
    return tmp2

def add_distance_ratios(data,encoder,dataset='train'):
    
    tmp = data.copy()
    
    tmp1 = tmp.loc[:,['GameId','PlayId','IsOnOffense','IsRusher','dist_player_vs_rusher']]
    tmp1['dist_from_rusher_cat'] = tmp1['dist_player_vs_rusher'].apply(lambda x: map_distance_from_rusher(x))

    enc = encoder #OneHotEncoder(handle_unknown='ignore', sparse = False)
    if dataset == 'train':
        new_column = pd.DataFrame(enc.fit_transform(np.array(tmp1['dist_from_rusher_cat']).reshape(-1,1)))
    else:
        new_column = pd.DataFrame(enc.transform(np.array(tmp1['dist_from_rusher_cat']).reshape(-1,1)))  
    new_column.columns = ['D1','D2','D3','D4','D5','D6','D7']

    tmp = pd.merge(tmp, new_column, left_index=True, right_index=True,how ='left')

    tmp2 = tmp.loc[(tmp1['IsRusher']!=1)&(tmp['IsOnOffense']==1),['GameId','PlayId','NflId','IsOnOffense','dist_player_vs_rusher','dist_from_rusher_cat','D1','D2','D3','D4','D5','D6','D7']]

    df = pd.DataFrame()
    for col in ['D1','D2','D3','D4','D5','D6','D7']:
        df[col+'_cnt_off'] = tmp2.groupby(['GameId','PlayId'])[col].sum()

    tmp2 = tmp.loc[(tmp1['IsOnOffense']==0),['GameId','PlayId','NflId','IsOnOffense','dist_player_vs_rusher','dist_from_rusher_cat','D1','D2','D3','D4','D5','D6','D7']]

    for col in ['D1','D2','D3','D4','D5','D6','D7']:
        df[col+'_cnt_def'] = tmp2.groupby(['GameId','PlayId'])[col].sum()


    weights = [2, 0.9,0.8,0.7,0.6,0.5,0.4]
    df['dist_from_rusher_DEF'] = np.dot(df[['D1_cnt_def','D2_cnt_def','D3_cnt_def','D4_cnt_def','D5_cnt_def','D6_cnt_def','D7_cnt_def']],weights)/np.sum(weights)
    df['dist_from_rusher_OFF'] = np.dot(df[['D1_cnt_off','D2_cnt_off','D3_cnt_off','D4_cnt_off','D5_cnt_off','D6_cnt_off','D7_cnt_off']],weights)/np.sum(weights)
    df['dist_from_rusher_ratio'] = df['dist_from_rusher_OFF']/df['dist_from_rusher_DEF']

    
    cols = ['D1_cnt_def','D2_cnt_def','D3_cnt_def','D4_cnt_def','D5_cnt_def','D6_cnt_def','D7_cnt_def','D1_cnt_off','D2_cnt_off','D3_cnt_off','D4_cnt_off','D5_cnt_off','D6_cnt_off','D7_cnt_off']
    df.drop(cols, axis = 1, inplace = True)


    data = pd.merge(data, df, on=['GameId','PlayId'], how='left')
    
    return data, enc

# Funcao para converter peso em lbs para kg
def convert_to_kg(lbs):
    kg = lbs * 0.45359237
    #print("The weight is", kg, "in kilograms")
    
    return kg

# Funcao para converter yard to m
def convert_yard_to_m(yard):
    m = yard * 0.9144    
    return m

def map_distance_from_rusher(distance):
    if np.square(distance)<= np.square(1):
        return '1'
    elif np.square(distance)<= np.square(2):
        return '2'
    elif np.square(distance)<= np.square(3):
        return '3'
    elif np.square(distance)<= np.square(4):
        return '4'
    elif np.square(distance)<= np.square(5):
        return '5'
    elif np.square(distance)<= np.square(10):
        return '6'
    else:
        return '7'

def s_ratio_runner_vs_1stdef(data):
    tmp = data.loc[((data['dist_player_vs_rusher'])==(data['dist_player_vs_rusher_min_NOnOff']))&(data['IsOnOffense']==False),['GameId','PlayId','Sx','Sx_rusher']]
    tmp['rusher_1stdef_sx_ratio'] = tmp['Sx_rusher']/tmp['Sx']
    drop = ['Sx','Sx_rusher']
    tmp.drop(drop, axis = 1, inplace = True)
    
    return tmp

def add_players_data(tmp):
    '''
    Calculates mean and minimum distances
    '''
    
    df = pd.DataFrame()
    
    for col in tmp.columns:
        if col in ['dist_cdef_vs_player','dist_coff_vs_player','dist_player_vs_rusher']:
            df[col+'_mean']= tmp.groupby(['GameId','PlayId','IsOnOffense'])[col].mean()
            df[col+'_min']= tmp.groupby(['GameId','PlayId','IsOnOffense'])[col].min()
            
    temp1= df.unstack()
    temp1.columns = ['dist_cdef_vs_player_mean_NOnOff','dist_cdef_vs_player_mean_OnOff','dist_cdef_vs_player_min_NOnOff','dist_cdef_vs_player_min_OnOff',
    'dist_coff_vs_player_mean_NOnOff','dist_coff_vs_player_mean_OnOff','dist_coff_vs_player_min_NOnOff','dist_coff_vs_player_min_OnOff',
    'dist_player_vs_rusher_mean_NOnOff','dist_player_vs_rusher_mean_OnOff','dist_player_vs_rusher_min_NOnOff','dist_player_vs_rusher_min_OnOff']
    #tmp = tmp.merge(temp1, on=['GameId','PlayId'], how='left')

    return temp1

def clf_yards(x):

    y = 2
    if x<= -7:
        y=0
    elif x<=-2:
        y=1
    elif x<=0:
        y=2
    elif x<=2:
        y=3
    elif x<=5:
        y=4
    elif x<=10:
        y=5
    elif x<=20:
        y=6
    elif x<=40:
        y=7
    else:
        y=8
    return y


    


def fill_defendersinabox(x, y): # x=DefensePersonnel y=Defenderinthebox
    
    if x =='4 DL, 2 LB, 5 DB':
        y = 6
    elif x =='4 DL, 3 LB, 4 DB':
        y = 7
    elif x == '3 DL, 4 LB, 4 DB':
        y = 7
    elif x == '2 DL, 4 LB, 5 DB':
        y = 6
    elif x == '3 DL, 3 LB, 5 DB':
        y = 6

    return y


def map_DefensePersonnel(x):

    if x =='4 DL, 2 LB, 5 DB':
        y = 0
    elif x =='4 DL, 3 LB, 4 DB':
        y = 1
    elif x == '3 DL, 4 LB, 4 DB':
        y = 2
    elif x == '2 DL, 4 LB, 5 DB':
        y = 3
    elif x == '3 DL, 3 LB, 5 DB':
        y = 4
    else:
        y = 5
    return y

In [15]:
def strtoseconds(txt):
    txt = txt.split(':')
    ans = int(txt[0])*60 + int(txt[1]) + int(txt[2])/60
    return ans

def strtofloat(x):
    try:
        return float(x)
    except:
        return -1
    
def get_time(x):
    x = x.split(":")
    return int(x[0])*60 + int(x[1])

def map_weather(txt):
    ans = 1
    if pd.isna(txt):
        return 0
    if 'partly' in txt:
        ans*=0.5
    if 'climate controlled' in txt or 'indoor' in txt:
        return ans*3
    if 'sunny' in txt or 'sun' in txt:
        return ans*2
    if 'clear' in txt:
        return ans
    if 'cloudy' in txt:
        return -ans
    if 'rain' in txt or 'rainy' in txt:
        return -2*ans
    if 'snow' in txt:
        return -3*ans
    return 0

def OffensePersonnelSplit(x):
    dic = {'DB' : 0, 'DL' : 0, 'LB' : 0, 'OL' : 0, 'QB' : 0, 'RB' : 0, 'TE' : 0, 'WR' : 0}
    for xx in x.split(","):
        xxs = xx.split(" ")
        dic[xxs[-1]] = int(xxs[-2])
    return dic

def DefensePersonnelSplit(x):
    dic = {'DB' : 0, 'DL' : 0, 'LB' : 0, 'OL' : 0}
    for xx in x.split(","):
        xxs = xx.split(" ")
        dic[xxs[-1]] = int(xxs[-2])
    return dic

def orientation_to_cat(x):
    x = np.clip(x, 0, 360 - 1)
    try:
        return str(int(x/15))
    except:
        return "nan"    
    
def uid_aggregation(comb, main_columns, uids, aggregations):
    X = pd.DataFrame()
    for main_column in main_columns:  
        for col in uids:
            for agg_type in aggregations:
                new_col_name = col+'_'+main_column+'_'+agg_type
                temp_df = comb[[col, main_column]]
                temp_df = temp_df.groupby([col])[main_column].agg([agg_type]).reset_index().rename(
                                                        columns={agg_type: new_col_name})

                temp_df.index = list(temp_df[col])
                temp_df = temp_df[new_col_name].to_dict()   

                X[new_col_name] = comb[col].map(temp_df)
                del temp_df
                gc.collect()
    return X

In [16]:
def create_features_01(df, enc_in, dataset, deploy=False):
    def new_X(x_coordinate, play_direction):
        if play_direction == 'left':
            return 120.0 - x_coordinate
        else:
            return x_coordinate

    def new_line(rush_team, field_position, yardline):
        if rush_team == field_position:
            # offense starting at X = 0 plus the 10 yard endzone plus the line of scrimmage
            return 10.0 + yardline
        else:
            # half the field plus the yards between midfield and the line of scrimmage
            return 60.0 + (50 - yardline)

    def new_orientation(angle, play_direction):
        if play_direction == 'left':
            new_angle = 360.0 - angle
            if new_angle == 360.0:
                new_angle = 0.0
            return new_angle
        else:
            return angle

    def euclidean_distance(x1,y1,x2,y2):
        x_diff = (x1-x2)**2
        y_diff = (y1-y2)**2
        return np.sqrt(x_diff + y_diff)

    def back_direction(orientation):
        if orientation > 180.0:
            return 1
        else:
            return 0

    def velocity(x2, x1, sec) :
        return (x2 - x1) / sec
    
    def diff_x(b, c, theta) :
        if 90.0 < theta < 270.0 :
            return np.sqrt(((b ** 2) + (c ** 2)) - 2 * b * c * np.cos(theta))
        else :
            return 0
        
    def diff_y(b, c, theta) :
        if theta <= 90.0 and theta >= 270.0 :
            return - np.sqrt(((b ** 2) + (c ** 2)) - 2 * b * c * np.cos(theta))
        else :
            return 0
        
    def stop_period(speed, acc) :
        return speed / acc   

    def new_roll_velocity(x1, y1, x2, y2) :  
        x_diff = np.sqrt((x1 - x2) ** 2)
        y_diff = np.sqrt((y1 - y2) ** 2)
        return np.sqrt(x_diff + y_diff) 
    
    def update_yardline(df):
        new_yardline = df[df['NflId'] == df['NflIdRusher']]
        new_yardline['YardLine'] = new_yardline[['PossessionTeam','FieldPosition','YardLine']].apply(lambda x: new_line(x[0],x[1],x[2]), axis=1)
        new_yardline = new_yardline[['GameId','PlayId','YardLine']]
        return new_yardline

    def update_orientation(df, yardline):
        df['X'] = df[['X','PlayDirection']].apply(lambda x: new_X(x[0],x[1]), axis=1)
        df['Orientation'] = df[['Orientation','PlayDirection']].apply(lambda x: new_orientation(x[0],x[1]), axis=1)
        df['Dir'] = df[['Dir','PlayDirection']].apply(lambda x: new_orientation(x[0],x[1]), axis=1)
        df = df.drop('YardLine', axis=1)
        df = pd.merge(df, yardline, on=['GameId','PlayId'], how='inner')
        return df

    def back_features(df):
        carriers = df[df['NflId'] == df['NflIdRusher']][['GameId','PlayId','NflIdRusher','X','Y','Orientation','Dir','YardLine']]
        carriers['back_from_scrimmage'] = carriers['YardLine'] - carriers['X']
        carriers['back_oriented_down_field'] = carriers['Orientation'].apply(lambda x: back_direction(x))
        carriers['back_moving_down_field'] = carriers['Dir'].apply(lambda x: back_direction(x))
        carriers = carriers.rename(columns={'X':'back_X','Y':'back_Y'})
        carriers = carriers[['GameId','PlayId','NflIdRusher','back_X','back_Y','back_from_scrimmage','back_oriented_down_field','back_moving_down_field']]
        return carriers

    def features_relative_to_back(df, carriers):
        player_distance = df[['GameId','PlayId','NflId','X','Y']]
        player_distance = pd.merge(player_distance, carriers, on=['GameId','PlayId'], how='inner')
        player_distance = player_distance[player_distance['NflId'] != player_distance['NflIdRusher']]
        player_distance['dist_to_back'] = player_distance[['X','Y','back_X','back_Y']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)

        player_distance = player_distance.groupby(['GameId','PlayId','back_from_scrimmage','back_oriented_down_field','back_moving_down_field'])\
                                         .agg({'dist_to_back':['min','max','mean','std']})\
                                         .reset_index()
        player_distance.columns = ['GameId','PlayId','back_from_scrimmage','back_oriented_down_field','back_moving_down_field',
                                   'min_dist','max_dist','mean_dist','std_dist']

        return player_distance

    def defense_features(df):
        rusher = df[df['NflId'] == df['NflIdRusher']][['GameId','PlayId','Team','X','Y']]
        rusher.columns = ['GameId','PlayId','RusherTeam','RusherX','RusherY']

        defense = pd.merge(df,rusher,on=['GameId','PlayId'],how='inner')
        defense = defense[defense['Team'] != defense['RusherTeam']][['GameId','PlayId','X','Y','RusherX','RusherY']]
        defense['def_dist_to_back'] = defense[['X','Y','RusherX','RusherY']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)

        defense = defense.groupby(['GameId','PlayId'])\
                         .agg({'def_dist_to_back':['min','max','mean','std']})\
                         .reset_index()
        defense.columns = ['GameId','PlayId','def_min_dist','def_max_dist','def_mean_dist','def_std_dist']

        return defense
            
    
    def static_features(df):
        
        
        add_new_feas = []

        ## Height
        df['PlayerHeight_dense'] = df['PlayerHeight'].apply(lambda x: 12*int(x.split('-')[0])+int(x.split('-')[1]))
        
        
        add_new_feas.append('PlayerHeight_dense')

        ## Time
        df['TimeHandoff'] = df['TimeHandoff'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))
        df['TimeSnap'] = df['TimeSnap'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))

        df['TimeDelta'] = df.apply(lambda row: (row['TimeHandoff'] - row['TimeSnap']).total_seconds(), axis=1)
        df['PlayerBirthDate'] =df['PlayerBirthDate'].apply(lambda x: datetime.datetime.strptime(x, "%m/%d/%Y"))

        ## Age
        seconds_in_year = 60*60*24*365.25
        df['PlayerAge'] = df.apply(lambda row: (row['TimeHandoff']-row['PlayerBirthDate']).total_seconds()/seconds_in_year, axis=1)
        add_new_feas.append('PlayerAge')

        ## WindSpeed
        df['WindSpeed_ob'] = df['WindSpeed'].apply(lambda x: x.lower().replace('mph', '').strip() if not pd.isna(x) else x)
        df['WindSpeed_ob'] = df['WindSpeed_ob'].apply(lambda x: (int(x.split('-')[0])+int(x.split('-')[1]))/2 if not pd.isna(x) and '-' in x else x)
        df['WindSpeed_ob'] = df['WindSpeed_ob'].apply(lambda x: (int(x.split()[0])+int(x.split()[-1]))/2 if not pd.isna(x) and type(x)!=float and 'gusts up to' in x else x)
        df['WindSpeed_dense'] = df['WindSpeed_ob'].apply(strtofloat)
        add_new_feas.append('WindSpeed_dense')

        ## Weather
        df['GameWeather_process'] = df['GameWeather'].str.lower()
        df['GameWeather_process'] = df['GameWeather_process'].apply(lambda x: "indoor" if not pd.isna(x) and "indoor" in x else x)
        df['GameWeather_process'] = df['GameWeather_process'].apply(lambda x: x.replace('coudy', 'cloudy').replace('clouidy', 'cloudy').replace('party', 'partly') if not pd.isna(x) else x)
        df['GameWeather_process'] = df['GameWeather_process'].apply(lambda x: x.replace('clear and sunny', 'sunny and clear') if not pd.isna(x) else x)
        df['GameWeather_process'] = df['GameWeather_process'].apply(lambda x: x.replace('skies', '').replace("mostly", "").strip() if not pd.isna(x) else x)
        df['GameWeather_dense'] = df['GameWeather_process'].apply(map_weather)
        add_new_feas.append('GameWeather_dense')

        ## Orientation and Dir
        df["Orientation_ob"] = df["Orientation"].apply(lambda x : orientation_to_cat(x)).astype("object")
        df["Dir_ob"] = df["Dir"].apply(lambda x : orientation_to_cat(x)).astype("object")

        df["Orientation_sin"] = df["Orientation"].apply(lambda x : np.sin(x/360 * 2 * np.pi))
        df["Orientation_cos"] = df["Orientation"].apply(lambda x : np.cos(x/360 * 2 * np.pi))
        
        df["Dir_sin"] = df["Dir"].apply(lambda x : np.sin(x/360 * 2 * np.pi))
        df["Dir_cos"] = df["Dir"].apply(lambda x : np.cos(x/360 * 2 * np.pi))
        
        add_new_feas.append("Dir_sin")
        add_new_feas.append("Dir_cos")

        ## diff Score
        df["diffScoreBeforePlay"] = df["HomeScoreBeforePlay"] - df["VisitorScoreBeforePlay"]
        add_new_feas.append("diffScoreBeforePlay")
    
        static_features = df[df['NflId'] == df['NflIdRusher']][add_new_feas+['GameId','PlayId','X','Y','S','A','Dis','Orientation','Dir',
                                                                             'YardLine','Quarter','Down','Distance',
                                                                             'NflId','NflIdRusher','PossessionTeam','HomeTeamAbbr','Turf',
                                                                             'VisitorTeamAbbr','PlayDirection','GameClock','Season','Team',
                                                                             'FieldPosition']].drop_duplicates()
        #static_features.fillna(-999,inplace=True)

        return static_features


    def combine_features(relative_to_back, defense, static, df2, deploy=deploy):
        df = pd.merge(relative_to_back,defense,on=['GameId','PlayId'],how='inner')
        df = pd.merge(df,static,on=['GameId','PlayId'],how='inner')
        df = pd.merge(df,df2,on=['GameId','PlayId'],how='inner')
        
        if not deploy:
            df = pd.merge(df, outcomes, on=['GameId','PlayId'], how='inner')

        return df
    
    yardline = update_yardline(df)
    df = update_orientation(df, yardline)
    
    df2, enc_out = create_features_03(df, enc_in,dataset)
    

    
    
    
    back_feats = back_features(df)
    rel_back = features_relative_to_back(df, back_feats)
    def_feats = defense_features(df)
    static_feats = static_features(df)
    basetable = combine_features(rel_back, def_feats, static_feats,df2,  deploy = deploy)
    
    return basetable, enc_out

In [17]:
def create_features_02(t_):
    t_['fe1'] = pd.Series(np.sqrt(np.absolute(np.square(t_.X.values) - np.square(t_.Y.values))))
    t_['fe5'] = np.square(t_['S'].values) + 2 * t_['A'].values * t_['Dis'].values  # N
    t_['fe7'] = np.arccos(np.clip(t_['X'].values / t_['Y'].values, -1, 1))  # N
    t_['fe8'] = t_['S'].values / np.clip(t_['fe1'].values, 0.6, None)
    radian_angle = (90 - t_['Dir']) * np.pi / 180.0
    t_['fe10'] = np.abs(t_['S'] * np.cos(radian_angle))
    t_['fe11'] = np.abs(t_['S'] * np.sin(radian_angle))

    t_['IsRusher'] = (t_['NflId'] == t_['NflIdRusher'])
    temp = t_[t_["IsRusher"]][["Team", "PlayId"]].rename(columns={"Team":"RusherTeam"})
    t_ = t_.merge(temp, on = "PlayId")
    t_["IsRusherTeam"] = t_["Team"] == t_["RusherTeam"]    

    t_["is_left"]            = t_["PlayDirection"] == "left"
    t_["old_data"]    = t_["Season"] == 2017
    t_['YardLine_std'] = 100 - t_['YardLine']
    
    t_['Orientation_rad'] = np.mod(t_.Orientation, 360) * np.pi/180.0
    t_.loc[t_.Season >= 2018, 'Orientation_rad'] = np.mod(t_.loc[t_.Season >= 2018, 'Orientation'] - 90, 360) * np.pi/180.0
    t_['Orientation_std'] = t_.Orientation_rad
    t_.loc[t_.is_left, 'Orientation_std'] = np.mod(np.pi + t_.loc[t_.is_left, 'Orientation_rad'], 2*np.pi)
    
    t_['norm_quat'] = (t_['X']**2 + t_['Y']**2 + t_['A']**2 + t_['S']**2)
    t_['mod_quat'] = (t_['norm_quat'])**0.5
    t_['norm_X'] = t_['X'] / t_['mod_quat']
    t_['norm_Y'] = t_['Y'] / t_['mod_quat']
    t_['norm_A'] = t_['A'] / t_['mod_quat']
    t_['norm_S'] = t_['S'] / t_['mod_quat']    

    t_ = t_.sort_values(by = ['X']).sort_values(by = ['Dis']).sort_values(by=['PlayId', 'IsRusherTeam', 'IsRusher']).reset_index(drop = True)
    
    return t_

In [18]:
map_offense_formation = {'SINGLEBACK':0,
                         'SHOTGUN':1,
                         'I_FORM':2,
                         'PISTOL':3,
                         'JUMBO':4,
                         'WILDCAT':5,
                         'ACE':6,
                         'EMPTY':7,
                        }

def create_features_03(df, enc_in, dataset):

    
    df['IsRusher'] = df.NflId == df.NflIdRusher
    df['TeamOnOffense'] = "home"
    df.loc[df.PossessionTeam != df.HomeTeamAbbr, 'TeamOnOffense'] = "away"
    df['IsOnOffense'] = df.Team == df.TeamOnOffense # Is player on offense?
    df['PlayerWeight_kg'] = df['PlayerWeight'].apply(lambda x: convert_to_kg(x))
    df['OffenseFormation'] = df['OffenseFormation'].map(map_offense_formation)
    df['DefendersInTheBox'] = df[['DefensePersonnel','DefendersInTheBox']].apply(lambda x: fill_defendersinabox(x[0],x[1]), axis=1)
    df['DefensePersonnel'] = df['DefensePersonnel'].apply(map_DefensePersonnel)
    
    
    df = add_x_y_components(df)
    df = add_centroid(df)
    # Calculates time needed to make 1st down
    df = add_rushertimeto1stdown(df)
    
    # Calculate distances between offense, defense, rusher and players
    df = calculate_distance(df)

    df, enc_out = add_distance_ratios(df, encoder=enc_in, dataset=dataset)

    # Calculates time to tackle
    timetotackle = calc_timetotackle(df)
    df = pd.merge(df, timetotackle, on=['GameId','PlayId'], how= 'left')

    play_data = add_players_data(df)
    df = pd.merge(df, play_data, on=['GameId','PlayId'], how= 'left')
    # Calculates ratio between speeds from rusher and closest defenser
    to_merge_S = s_ratio_runner_vs_1stdef(df)
    df = pd.merge(df, to_merge_S, on=['GameId','PlayId'], how='left')
    
    df = df.loc[df['IsRusher']==1,:]
    cols = [ 'Team', 'X', 'Y', 'S', 'A', 'Dis', 'Orientation', 'Dir', 'NflId', 'DisplayName',
            'JerseyNumber', 'Season', 'Quarter', 'GameClock', 'PossessionTeam', 'Down', 'Distance', 
            'FieldPosition', 'HomeScoreBeforePlay', 'VisitorScoreBeforePlay', 'NflIdRusher', 
            'OffensePersonnel', 
            'PlayDirection', 'TimeHandoff', 'TimeSnap', 'PlayerHeight', 'PlayerWeight', 
            'PlayerBirthDate', 'PlayerCollegeName', 'Position', 'HomeTeamAbbr', 'VisitorTeamAbbr', 
            'Week', 'Stadium', 'Location', 'StadiumType', 'Turf', 'GameWeather', 'Temperature',
            'Humidity', 'WindSpeed', 'WindDirection', 'YardLine', 'IsRusher', 'TeamOnOffense', 
            'IsOnOffense', 'PlayerWeight_kg',
            'X_rusher','Y_rusher','Sx_rusher','Sy_rusher','Ax_rusher','Ay_rusher','F_rusher','Fx_rusher',
            'Fy_rusher','p_rusher','px_rusher','py_rusher','p_max_rusher','px_max_rusher','py_max_rusher',
            'dist_player_vs_rusher']
    df.drop(cols, axis=1, inplace=True)
    feat = [col for col in df.columns if col not in ['Yards']]
    df = df[feat]
    
    return df, enc_out

In [19]:
def check_missing(dataset, display = 5):
    temp_df = dataset.copy()
    df_nan = (temp_df.isnull().sum() / len(temp_df)) * 100
    missing_data = pd.DataFrame({'Missing n': temp_df.isnull().sum(),'% Missing' :df_nan})
    if missing_data['Missing n'].sum() == 0:
        return print('Great! There are no missing values in this dataset.')
    else:
        return missing_data.sort_values('% Missing', ascending = False).head(display)

In [20]:
%time train_basetable, enc_out = create_features_01(train, OneHotEncoder(handle_unknown='ignore', sparse = False),'train',  False)

CPU times: user 4min, sys: 18.6 s, total: 4min 19s
Wall time: 3min 47s


In [21]:
%time train_basetable = create_features_02(train_basetable)

CPU times: user 833 ms, sys: 227 ms, total: 1.06 s
Wall time: 277 ms


In [22]:
# Cria uma copia do dataset para backup
X = train_basetable.copy()
X.shape

(23171, 118)

In [23]:
check_missing(X, 10)

Unnamed: 0,Missing n,% Missing
WindSpeed_dense,3065,13.227742
FieldPosition,292,1.260196
OffenseFormation,5,0.021579
rusher_1stdef_sx_ratio,4,0.017263
dist_cdef_vs_rusher,0,0.0
dist_cdef_vs_player_min_NOnOff,0,0.0
dist_cdef_vs_player_mean_OnOff,0,0.0
dist_cdef_vs_player_mean_NOnOff,0,0.0
time_to_tackle_max,0,0.0
time_to_tackle_min,0,0.0


In [24]:
pd.to_pickle(X, "X_NN_fork_v97.pkl")

In [25]:
X['OffenseFormation'].fillna(0, inplace=True)
X['OffenseFormation'] = X['OffenseFormation'].astype(int)
X['DefensePersonnel'] = X['DefensePersonnel'].astype(int)
OHE = OneHotEncoder(handle_unknown='ignore', sparse = False)
columns = ['OffenseFormation' ,'DefensePersonnel']
new_column = pd.DataFrame(OHE.fit_transform(X[columns]))
new_column.columns = ['OffenseFormation_0',
                      'OffenseFormation_1',
                      'OffenseFormation_2',
                      'OffenseFormation_3',
                      'OffenseFormation_4',
                      'OffenseFormation_5',
                      'OffenseFormation_6',
                      'OffenseFormation_7',
                      'DefensePersonnel_0',
                      'DefensePersonnel_1',
                      'DefensePersonnel_2',
                      'DefensePersonnel_3',
                      'DefensePersonnel_4',
                      'DefensePersonnel_5']
X = pd.concat([X, new_column], axis=1)
delcol = ['OffenseFormation','DefensePersonnel']
X.drop(delcol, axis=1,inplace=True)

In [26]:
X['count'] = X.groupby('Yards')['Yards'].transform('count')

In [27]:
X = X[X['count'] >= 10]
X.shape

(23010, 131)

In [28]:
best_features = ['A','S','back_from_scrimmage','back_oriented_down_field','back_moving_down_field',
                 'def_mean_dist','def_std_dist','def_min_dist','def_max_dist','min_dist','max_dist','mean_dist',
                 'PlayerAge','PlayerHeight_dense','Dis','DefendersInTheBox',
                 'Distance','Dir','Dir_sin','Dir_cos','YardLine_std','Orientation_std',
                 'WindSpeed_dense','GameWeather_dense',
                 'fe1','fe5','fe8','fe10','fe11',
                 'norm_quat','mod_quat','norm_X','norm_Y','norm_A','norm_S']


best_features += ['t_1stDown','rusher_1stdef_sx_ratio','time_to_tackle_min', 'KE',
                  'dist_from_rusher_OFF','dist_from_rusher_ratio',
                  'OffenseFormation_0','OffenseFormation_1','OffenseFormation_2','OffenseFormation_3',
                  'OffenseFormation_4','OffenseFormation_5','OffenseFormation_6','OffenseFormation_7',
                  'DefensePersonnel_0','DefensePersonnel_1','DefensePersonnel_2','DefensePersonnel_3',
                  'DefensePersonnel_4','DefensePersonnel_5']

features_str = str(best_features)

In [32]:
best_features_revised = ['A','S','back_from_scrimmage','back_moving_down_field',
                 'def_mean_dist','def_std_dist','def_min_dist','def_max_dist','min_dist',
                 'PlayerAge','PlayerHeight_dense','Dis','DefendersInTheBox',
                 'Distance','Dir','Dir_sin','Dir_cos','Orientation_std',
                 'WindSpeed_dense','GameWeather_dense',
                 'fe1','fe5','fe8','fe10','fe11']


best_features_revised += ['rusher_1stdef_sx_ratio','time_to_tackle_min',
                  'dist_from_rusher_ratio',
                  'OffenseFormation_0','OffenseFormation_1','OffenseFormation_2',
                  'OffenseFormation_4',
                  'DefensePersonnel_0','DefensePersonnel_2','DefensePersonnel_3',
                  'DefensePersonnel_4','DefensePersonnel_5']


## 4. Criar e avaliar alguns algoritmos de Machine Learning

### 4.1. Split Treino e Validação

In [33]:
# Criar um dataset somente com as colunas mais importantes conforme visto anteriormente
new_X = X.loc[:,best_features_revised]
#new_X.replace(-np.inf,0,inplace=True)
#new_X.replace(np.inf,0,inplace=True)
new_X.fillna(0,inplace=True)

target = X.Yards

y = np.zeros((target.shape[0], 199))
for idx, target in enumerate(list(target)):
    y[idx][99 + target] = 1
    
# Normalizando as variaveis do dataset de treino
scaler = StandardScaler()
new_X = scaler.fit_transform(new_X)
new_X.shape, y.shape

((23010, 37), (23010, 199))

### 4.3. Teste com Keras (New NN Struct)

In [34]:
# Calculate CRPS score
def crps_score(y_prediction, y_valid, shape=X.shape[0]):
    y_true = np.clip(np.cumsum(y_valid, axis=1), 0, 1)
    y_pred = np.clip(np.cumsum(y_prediction, axis=1), 0, 1)
    val_s = ((y_true - y_pred) ** 2).sum(axis=1).sum(axis=0) / (199 * shape)
    crps = np.round(val_s, 6)
    
    return crps

In [35]:
class CRPSCallback(Callback):
    
    def __init__(self,validation, predict_batch_size=20, include_on_batch=False):
        super(CRPSCallback, self).__init__()
        self.validation = validation
        self.predict_batch_size = predict_batch_size
        self.include_on_batch = include_on_batch
        
        print('validation shape',len(self.validation))

    def on_batch_begin(self, batch, logs={}):
        pass

    def on_train_begin(self, logs={}):
        if not ('CRPS_score_val' in self.params['metrics']):
            self.params['metrics'].append('CRPS_score_val')

    def on_batch_end(self, batch, logs={}):
        if (self.include_on_batch):
            logs['CRPS_score_val'] = float('-inf')

    def on_epoch_end(self, epoch, logs={}):
        logs['CRPS_score_val'] = float('-inf')
            
        if (self.validation):
            X_valid, y_valid = self.validation[0], self.validation[1]
            y_pred = self.model.predict(X_valid)
            y_true = np.clip(np.cumsum(y_valid, axis=1), 0, 1)
            y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1)
            val_s = ((y_true - y_pred) ** 2).sum(axis=1).sum(axis=0) / (199 * X_valid.shape[0])
            val_s = np.round(val_s, 6)
            logs['CRPS_score_val'] = val_s

In [36]:
from keras.regularizers import l2,l1, l1_l2

def get_nn(x_tr,y_tr,x_val,y_val,shape):
    K.clear_session()
    inp = Input(shape = (x_tr.shape[1],))
    
    x = Dense(1024, input_dim=X.shape[1], activation='relu', bias_regularizer=l1_l2(l1=0.002,l2=0.002))(inp)
    x = Dropout(0.5)(x)
    x = BatchNormalization()(x)
    
    x = Dense(512, activation='relu', bias_regularizer=l1_l2(l1=0.002,l2=0.002))(x)
    x = Dropout(0.5)(x)
    x = BatchNormalization()(x)
    
    x = Dense(256, activation='relu', bias_regularizer=l1_l2(l1=0.002,l2=0.002))(x)
    x = Dropout(0.5)(x)
    x = BatchNormalization()(x)
    
    out = Dense(199, activation='softmax')(x)
    model = Model(inp,out)
    
    model.compile(optimizer = optimizers.adam(lr = 0.05, decay = 0.05),
                  loss='categorical_crossentropy', 
                  metrics=[])
     
    es = EarlyStopping(monitor='CRPS_score_val', 
                       mode='min',
                       restore_best_weights=True, 
                       verbose=1, 
                       patience=15)

    mc = ModelCheckpoint('best_model.h5',monitor='CRPS_score_val',mode='min',save_best_only=True, 
                         verbose=1, save_weights_only=True)
    
    bsz = 1024
    steps = x_tr.shape[0]/bsz
    
    model.fit(x_tr, y_tr,
              callbacks=[CRPSCallback(validation = (x_val,y_val)),es,mc], 
              epochs=200, 
              #steps_per_epoch = steps,
              batch_size=bsz,
              verbose=1)
    
    model.load_weights("best_model.h5")
    
    y_pred = model.predict(x_val)
    y_valid = y_val
    crps = crps_score(y_pred, y_valid, shape=shape)

    return model,crps

In [37]:
def predict(x_te):
    model_num = len(models)
    for k,m in enumerate(models):
        if k==0:
            y_pred = m.predict(x_te,batch_size=1024)
        else:
            y_pred+=m.predict(x_te,batch_size=1024)
            
    y_pred = y_pred / model_num
    
    return y_pred

In [39]:
%%time

loop = 2
fold = 5

oof_nn  = np.zeros([loop, y.shape[0], y.shape[1]])

models_nn = []
crps_csv_nn = []

for k in range(loop):
    kfold = KFold(fold, random_state = 42 + k, shuffle = True)
    for k_fold, (tr_inds, val_inds) in enumerate(kfold.split(y)):
        print("-----------")
        print(f'Loop {k+1}/{loop}' + f' Fold {k_fold+1}/{fold}')
        print("-----------")
        
        tr_x, tr_y = new_X[tr_inds], y[tr_inds]
        val_x, val_y = new_X[val_inds], y[val_inds]
        
        # Train NN
        nn, crps_nn = get_nn(tr_x, tr_y, val_x, val_y, shape=val_x.shape[0])
        models_nn.append(nn)
        print("the %d fold crps (NN) is %f"%((k_fold+1), crps_nn))
        crps_csv_nn.append(crps_nn)
        
        #Predict OOF
        oof_nn[k, val_inds, :] = nn.predict(val_x)

-----------
Loop 1/2 Fold 1/5
-----------
validation shape 2
Epoch 1/200

Epoch 00001: CRPS_score_val improved from inf to 0.02214, saving model to best_model.h5
Epoch 2/200

Epoch 00002: CRPS_score_val improved from 0.02214 to 0.01928, saving model to best_model.h5
Epoch 3/200

Epoch 00003: CRPS_score_val improved from 0.01928 to 0.01328, saving model to best_model.h5
Epoch 4/200

Epoch 00004: CRPS_score_val improved from 0.01328 to 0.01242, saving model to best_model.h5
Epoch 5/200

Epoch 00005: CRPS_score_val improved from 0.01242 to 0.01201, saving model to best_model.h5
Epoch 6/200

Epoch 00006: CRPS_score_val improved from 0.01201 to 0.01183, saving model to best_model.h5
Epoch 7/200

Epoch 00007: CRPS_score_val improved from 0.01183 to 0.01175, saving model to best_model.h5
Epoch 8/200

Epoch 00008: CRPS_score_val did not improve from 0.01175
Epoch 9/200

Epoch 00009: CRPS_score_val improved from 0.01175 to 0.01169, saving model to best_model.h5
Epoch 10/200

Epoch 00010: CRPS_s


Epoch 00050: CRPS_score_val did not improve from 0.01156
Epoch 51/200

Epoch 00051: CRPS_score_val improved from 0.01156 to 0.01155, saving model to best_model.h5
Epoch 52/200

Epoch 00052: CRPS_score_val did not improve from 0.01155
Epoch 53/200

Epoch 00053: CRPS_score_val did not improve from 0.01155
Epoch 54/200

Epoch 00054: CRPS_score_val did not improve from 0.01155
Epoch 55/200

Epoch 00055: CRPS_score_val improved from 0.01155 to 0.01155, saving model to best_model.h5
Epoch 56/200

Epoch 00056: CRPS_score_val did not improve from 0.01155
Epoch 57/200

Epoch 00057: CRPS_score_val did not improve from 0.01155
Epoch 58/200

Epoch 00058: CRPS_score_val improved from 0.01155 to 0.01155, saving model to best_model.h5
Epoch 59/200

Epoch 00059: CRPS_score_val did not improve from 0.01155
Epoch 60/200

Epoch 00060: CRPS_score_val did not improve from 0.01155
Epoch 61/200

Epoch 00061: CRPS_score_val improved from 0.01155 to 0.01154, saving model to best_model.h5
Epoch 62/200

Epoch 0


Epoch 00103: CRPS_score_val did not improve from 0.01152
Epoch 104/200

Epoch 00104: CRPS_score_val did not improve from 0.01152
Epoch 105/200

Epoch 00105: CRPS_score_val did not improve from 0.01152
Epoch 106/200

Epoch 00106: CRPS_score_val did not improve from 0.01152
Epoch 107/200

Epoch 00107: CRPS_score_val did not improve from 0.01152
Epoch 108/200

Epoch 00108: CRPS_score_val did not improve from 0.01152
Epoch 109/200

Epoch 00109: CRPS_score_val improved from 0.01152 to 0.01152, saving model to best_model.h5
Epoch 110/200

Epoch 00110: CRPS_score_val improved from 0.01152 to 0.01151, saving model to best_model.h5
Epoch 111/200

Epoch 00111: CRPS_score_val improved from 0.01151 to 0.01151, saving model to best_model.h5
Epoch 112/200

Epoch 00112: CRPS_score_val did not improve from 0.01151
Epoch 113/200

Epoch 00113: CRPS_score_val did not improve from 0.01151
Epoch 114/200

Epoch 00114: CRPS_score_val did not improve from 0.01151
Epoch 115/200

Epoch 00115: CRPS_score_val di


Epoch 00027: CRPS_score_val improved from 0.01161 to 0.01161, saving model to best_model.h5
Epoch 28/200

Epoch 00028: CRPS_score_val did not improve from 0.01161
Epoch 29/200

Epoch 00029: CRPS_score_val did not improve from 0.01161
Epoch 30/200

Epoch 00030: CRPS_score_val did not improve from 0.01161
Epoch 31/200

Epoch 00031: CRPS_score_val improved from 0.01161 to 0.01160, saving model to best_model.h5
Epoch 32/200

Epoch 00032: CRPS_score_val did not improve from 0.01160
Epoch 33/200

Epoch 00033: CRPS_score_val improved from 0.01160 to 0.01160, saving model to best_model.h5
Epoch 34/200

Epoch 00034: CRPS_score_val improved from 0.01160 to 0.01159, saving model to best_model.h5
Epoch 35/200

Epoch 00035: CRPS_score_val did not improve from 0.01159
Epoch 36/200

Epoch 00036: CRPS_score_val did not improve from 0.01159
Epoch 37/200

Epoch 00037: CRPS_score_val improved from 0.01159 to 0.01158, saving model to best_model.h5
Epoch 38/200

Epoch 00038: CRPS_score_val did not improve


Epoch 00079: CRPS_score_val did not improve from 0.01155
Epoch 80/200

Epoch 00080: CRPS_score_val did not improve from 0.01155
Epoch 81/200

Epoch 00081: CRPS_score_val did not improve from 0.01155
Epoch 82/200

Epoch 00082: CRPS_score_val did not improve from 0.01155
Epoch 83/200

Epoch 00083: CRPS_score_val did not improve from 0.01155
Epoch 84/200

Epoch 00084: CRPS_score_val did not improve from 0.01155
Epoch 85/200

Epoch 00085: CRPS_score_val did not improve from 0.01155
Epoch 86/200

Epoch 00086: CRPS_score_val did not improve from 0.01155
Epoch 87/200

Epoch 00087: CRPS_score_val did not improve from 0.01155
Epoch 88/200

Epoch 00088: CRPS_score_val did not improve from 0.01155
Epoch 89/200

Epoch 00089: CRPS_score_val did not improve from 0.01155
Epoch 90/200

Epoch 00090: CRPS_score_val improved from 0.01155 to 0.01155, saving model to best_model.h5
Epoch 91/200

Epoch 00091: CRPS_score_val did not improve from 0.01155
Epoch 92/200

Epoch 00092: CRPS_score_val did not impro


Epoch 00133: CRPS_score_val did not improve from 0.01154
Epoch 134/200

Epoch 00134: CRPS_score_val did not improve from 0.01154
Epoch 135/200

Epoch 00135: CRPS_score_val did not improve from 0.01154
Epoch 136/200

Epoch 00136: CRPS_score_val did not improve from 0.01154
Epoch 137/200

Epoch 00137: CRPS_score_val did not improve from 0.01154
Epoch 138/200

Epoch 00138: CRPS_score_val did not improve from 0.01154
Epoch 139/200
Restoring model weights from the end of the best epoch

Epoch 00139: CRPS_score_val did not improve from 0.01154
Epoch 00139: early stopping
the 2 fold crps (NN) is 0.011538
-----------
Loop 1/2 Fold 3/5
-----------
validation shape 2
Epoch 1/200

Epoch 00001: CRPS_score_val improved from inf to 0.02811, saving model to best_model.h5
Epoch 2/200

Epoch 00002: CRPS_score_val improved from 0.02811 to 0.01665, saving model to best_model.h5
Epoch 3/200

Epoch 00003: CRPS_score_val improved from 0.01665 to 0.01348, saving model to best_model.h5
Epoch 4/200

Epoch 000


Epoch 00042: CRPS_score_val did not improve from 0.01129
Epoch 43/200

Epoch 00043: CRPS_score_val improved from 0.01129 to 0.01128, saving model to best_model.h5
Epoch 44/200

Epoch 00044: CRPS_score_val did not improve from 0.01128
Epoch 45/200

Epoch 00045: CRPS_score_val did not improve from 0.01128
Epoch 46/200

Epoch 00046: CRPS_score_val improved from 0.01128 to 0.01128, saving model to best_model.h5
Epoch 47/200

Epoch 00047: CRPS_score_val improved from 0.01128 to 0.01128, saving model to best_model.h5
Epoch 48/200

Epoch 00048: CRPS_score_val did not improve from 0.01128
Epoch 49/200

Epoch 00049: CRPS_score_val did not improve from 0.01128
Epoch 50/200

Epoch 00050: CRPS_score_val improved from 0.01128 to 0.01127, saving model to best_model.h5
Epoch 51/200

Epoch 00051: CRPS_score_val improved from 0.01127 to 0.01127, saving model to best_model.h5
Epoch 52/200

Epoch 00052: CRPS_score_val did not improve from 0.01127
Epoch 53/200

Epoch 00053: CRPS_score_val improved from 0


Epoch 00092: CRPS_score_val did not improve from 0.01122
Epoch 93/200

Epoch 00093: CRPS_score_val improved from 0.01122 to 0.01122, saving model to best_model.h5
Epoch 94/200

Epoch 00094: CRPS_score_val improved from 0.01122 to 0.01121, saving model to best_model.h5
Epoch 95/200

Epoch 00095: CRPS_score_val improved from 0.01121 to 0.01121, saving model to best_model.h5
Epoch 96/200

Epoch 00096: CRPS_score_val did not improve from 0.01121
Epoch 97/200

Epoch 00097: CRPS_score_val did not improve from 0.01121
Epoch 98/200

Epoch 00098: CRPS_score_val did not improve from 0.01121
Epoch 99/200

Epoch 00099: CRPS_score_val did not improve from 0.01121
Epoch 100/200

Epoch 00100: CRPS_score_val improved from 0.01121 to 0.01121, saving model to best_model.h5
Epoch 101/200

Epoch 00101: CRPS_score_val did not improve from 0.01121
Epoch 102/200

Epoch 00102: CRPS_score_val did not improve from 0.01121
Epoch 103/200

Epoch 00103: CRPS_score_val did not improve from 0.01121
Epoch 104/200

Ep


Epoch 00144: CRPS_score_val did not improve from 0.01120
Epoch 145/200

Epoch 00145: CRPS_score_val did not improve from 0.01120
Epoch 146/200

Epoch 00146: CRPS_score_val did not improve from 0.01120
Epoch 147/200

Epoch 00147: CRPS_score_val improved from 0.01120 to 0.01120, saving model to best_model.h5
Epoch 148/200

Epoch 00148: CRPS_score_val improved from 0.01120 to 0.01119, saving model to best_model.h5
Epoch 149/200

Epoch 00149: CRPS_score_val did not improve from 0.01119
Epoch 150/200

Epoch 00150: CRPS_score_val did not improve from 0.01119
Epoch 151/200

Epoch 00151: CRPS_score_val did not improve from 0.01119
Epoch 152/200

Epoch 00152: CRPS_score_val improved from 0.01119 to 0.01119, saving model to best_model.h5
Epoch 153/200

Epoch 00153: CRPS_score_val did not improve from 0.01119
Epoch 154/200

Epoch 00154: CRPS_score_val improved from 0.01119 to 0.01119, saving model to best_model.h5
Epoch 155/200

Epoch 00155: CRPS_score_val did not improve from 0.01119
Epoch 156/


Epoch 00021: CRPS_score_val did not improve from 0.01176
Epoch 22/200

Epoch 00022: CRPS_score_val did not improve from 0.01176
Epoch 23/200

Epoch 00023: CRPS_score_val did not improve from 0.01176
Epoch 24/200

Epoch 00024: CRPS_score_val improved from 0.01176 to 0.01175, saving model to best_model.h5
Epoch 25/200

Epoch 00025: CRPS_score_val improved from 0.01175 to 0.01175, saving model to best_model.h5
Epoch 26/200

Epoch 00026: CRPS_score_val did not improve from 0.01175
Epoch 27/200

Epoch 00027: CRPS_score_val improved from 0.01175 to 0.01174, saving model to best_model.h5
Epoch 28/200

Epoch 00028: CRPS_score_val did not improve from 0.01174
Epoch 29/200

Epoch 00029: CRPS_score_val did not improve from 0.01174
Epoch 30/200

Epoch 00030: CRPS_score_val did not improve from 0.01174
Epoch 31/200

Epoch 00031: CRPS_score_val improved from 0.01174 to 0.01173, saving model to best_model.h5
Epoch 32/200

Epoch 00032: CRPS_score_val improved from 0.01173 to 0.01173, saving model to 


Epoch 00072: CRPS_score_val improved from 0.01167 to 0.01167, saving model to best_model.h5
Epoch 73/200

Epoch 00073: CRPS_score_val did not improve from 0.01167
Epoch 74/200

Epoch 00074: CRPS_score_val did not improve from 0.01167
Epoch 75/200

Epoch 00075: CRPS_score_val did not improve from 0.01167
Epoch 76/200

Epoch 00076: CRPS_score_val did not improve from 0.01167
Epoch 77/200

Epoch 00077: CRPS_score_val did not improve from 0.01167
Epoch 78/200

Epoch 00078: CRPS_score_val improved from 0.01167 to 0.01166, saving model to best_model.h5
Epoch 79/200

Epoch 00079: CRPS_score_val did not improve from 0.01166
Epoch 80/200

Epoch 00080: CRPS_score_val did not improve from 0.01166
Epoch 81/200

Epoch 00081: CRPS_score_val did not improve from 0.01166
Epoch 82/200

Epoch 00082: CRPS_score_val improved from 0.01166 to 0.01166, saving model to best_model.h5
Epoch 83/200

Epoch 00083: CRPS_score_val did not improve from 0.01166
Epoch 84/200

Epoch 00084: CRPS_score_val improved from 


Epoch 00124: CRPS_score_val improved from 0.01164 to 0.01164, saving model to best_model.h5
Epoch 125/200

Epoch 00125: CRPS_score_val did not improve from 0.01164
Epoch 126/200

Epoch 00126: CRPS_score_val improved from 0.01164 to 0.01163, saving model to best_model.h5
Epoch 127/200

Epoch 00127: CRPS_score_val did not improve from 0.01163
Epoch 128/200

Epoch 00128: CRPS_score_val did not improve from 0.01163
Epoch 129/200

Epoch 00129: CRPS_score_val improved from 0.01163 to 0.01163, saving model to best_model.h5
Epoch 130/200

Epoch 00130: CRPS_score_val did not improve from 0.01163
Epoch 131/200

Epoch 00131: CRPS_score_val improved from 0.01163 to 0.01163, saving model to best_model.h5
Epoch 132/200

Epoch 00132: CRPS_score_val did not improve from 0.01163
Epoch 133/200

Epoch 00133: CRPS_score_val did not improve from 0.01163
Epoch 134/200

Epoch 00134: CRPS_score_val did not improve from 0.01163
Epoch 135/200

Epoch 00135: CRPS_score_val did not improve from 0.01163
Epoch 136/

KeyboardInterrupt: 

In [40]:
crps_oof_nn  = []

for k in range(loop):
    crps_oof_nn.append(crps_score(oof_nn[k,...], y))

In [41]:
print("mean crps (NN) is %f"%np.mean(crps_csv_nn))
print("mean OOF crps (NN) is %f"%np.mean(crps_oof_nn))

mean crps (NN) is 0.011413
mean OOF crps (NN) is 0.341232


## 5. Realizar a submissão para o Kaggle

In [None]:
%%time

if  TRAIN_OFFLINE==False:
    
    from kaggle.competitions import nflrush
    env = nflrush.make_env()
    iter_test = env.iter_test()
    df_prev = pd.DataFrame()
    df_test = pd.DataFrame()
    n = 0
    for (test_df, sample_prediction_df) in tqdm_notebook(iter_test):
        #print('\n#################### ITERATION {} ####################'.format(n))
        basetable, _ = create_features_01(test_df, enc_out,'test', deploy=True)
        basetable = create_features_02(basetable)

        basetable['OffenseFormation'].fillna(0, inplace=True)
        basetable['OffenseFormation'] = basetable['OffenseFormation'].astype(int)
        basetable['DefensePersonnel'] = basetable['DefensePersonnel'].astype(int)
        new_column = pd.DataFrame(OHE.transform(basetable[columns]))
        new_column.columns = ['OffenseFormation_0',
                              'OffenseFormation_1',
                              'OffenseFormation_2',
                              'OffenseFormation_3',
                              'OffenseFormation_4',
                              'OffenseFormation_5',
                              'OffenseFormation_6',
                              'OffenseFormation_7',
                              'DefensePersonnel_0',
                              'DefensePersonnel_1',
                              'DefensePersonnel_2',
                              'DefensePersonnel_3',
                              'DefensePersonnel_4',
                              'DefensePersonnel_5']
        X = pd.concat([basetable, new_column], axis=1)
        delcol = ['OffenseFormation','DefensePersonnel']
        X.drop(delcol, axis=1,inplace=True)
        # Considerar somente as colunas do Feature Selection
        basetable = basetable.loc[:,best_features_revised]
        basetable.fillna(0,inplace=True)
    
        scaled_basetable = scaler.transform(basetable)

        y_pred = predict(scaled_basetable)
        y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1).tolist()[0]

        preds_df = pd.DataFrame(data=[y_pred], columns=sample_prediction_df.columns)
        
        df_test = df_test.append(basetable)
        df_prev = df_prev.append(preds_df)
    
        env.predict(preds_df)
        n+=1

    env.write_submission_file()