# NFL Competition

# Feature Engineering e Modelo de Machine Learning

- **Version: 2.1:** included feature selection, loss validation on kFold and new struct for NN
- **Version: 2.2:** included some new static features
- **Version: 2.3:** included LightGBM Model

## 1. Importa os pacotes e o dataset de treino

In [1]:
# Importar os principais pacotes
import numpy as np
import pandas as pd
import sklearn.metrics as mtr
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import re
import codecs
import time
import datetime
import tsfresh
import pandasql as ps

# Evitar que aparece os warnings
import warnings
warnings.filterwarnings("ignore")

# Seta algumas opções no Jupyter para exibição dos datasets
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 150)

In [2]:
# Importa os pacotes de algoritmos de regressão
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor

# Importa os pacotes de algoritmos de redes neurais (Keras)
from keras.losses import binary_crossentropy
from keras.utils import to_categorical
from keras.layers import Dense,Input,Flatten,concatenate,Dropout,Lambda,BatchNormalization
from keras.models import Sequential, Model
from keras.callbacks import Callback,EarlyStopping,ModelCheckpoint
import keras.backend as K
#from keras_lookahead import Lookahead
#from keras_radam import RAdam

# Importa pacotes do sklearn
from sklearn import preprocessing
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import scale, MinMaxScaler, StandardScaler

Using TensorFlow backend.


In [3]:
# Define o caminho do arquivo
path_local  = '../data/train.csv'
path_kernel = '/kaggle/input/nfl-big-data-bowl-2020/train.csv'

# Carrega o dataset de treino
train = pd.read_csv(path_local, dtype={'WindSpeed': 'object'})

# Usado somente para teste (somente as primeiras 2200 linhas)
#train = train[:2200]

## 2. Feature Engineering

In [4]:
def map_weather(txt):
    ans = 1
    if pd.isna(txt):
        return 0
    if 'partly' in txt:
        ans*=0.5
    if 'climate controlled' in txt or 'indoor' in txt:
        return ans*3
    if 'sunny' in txt or 'sun' in txt:
        return ans*2
    if 'clear' in txt:
        return ans
    if 'cloudy' in txt:
        return -ans
    if 'rain' in txt or 'rainy' in txt:
        return -2*ans
    if 'snow' in txt:
        return -3*ans
    return 0

def strtoseconds(txt):
    txt = txt.split(':')
    ans = int(txt[0])*60 + int(txt[1]) + int(txt[2])/60
    return ans

def strtofloat(x):
    try:
        return float(x)
    except:
        return -1

def get_time(x):
    x = x.split(":")
    return int(x[0])*60 + int(x[1])

def get_height(x):
    x = x.split("-")
    return int(x[0])*12 + int(x[1])

def process_windspeed(txt):
    txt = str(txt).lower().replace('mph', '').strip()
    if '-' in txt:
        txt = (int(txt.split('-')[0]) + int(txt.split('-')[1])) / 2
    try:
        return float(txt)
    except:
        return -1.0  
    
def orientation_to_cat(x):
    x = np.clip(x, 0, 360 - 1)
    try:
        return str(int(x/15))
    except:
        return "nan"

In [5]:
def create_features_01(df, deploy=False):

    def new_X(x_coordinate, play_direction):
        if play_direction == 'left':
            return 120.0 - x_coordinate
        else:
            return x_coordinate

    def new_line(rush_team, field_position, yardline):
        if rush_team == field_position:
            # offense starting at X = 0 plus the 10 yard endzone plus the line of scrimmage
            return 10.0 + yardline
        else:
            # half the field plus the yards between midfield and the line of scrimmage
            return 60.0 + (50 - yardline)

    def new_orientation(angle, play_direction):
        if play_direction == 'left':
            new_angle = 360.0 - angle
            if new_angle == 360.0:
                new_angle = 0.0
            return new_angle
        else:
            return angle

    def euclidean_distance(x1,y1,x2,y2):
        x_diff = (x1-x2)**2
        y_diff = (y1-y2)**2

        return np.sqrt(x_diff + y_diff)

    def back_direction(orientation):
        if orientation > 180.0:
            return 1
        else:
            return 0

    def update_yardline(df):
        new_yardline = df[df['NflId'] == df['NflIdRusher']]
        new_yardline['YardLine'] = new_yardline[['PossessionTeam','FieldPosition','YardLine']].apply(lambda x: new_line(x[0],x[1],x[2]), axis=1)
        new_yardline = new_yardline[['GameId','PlayId','YardLine']]

        return new_yardline

    def update_orientation(df, yardline):
        df['X'] = df[['X','PlayDirection']].apply(lambda x: new_X(x[0],x[1]), axis=1)
        df['Orientation'] = df[['Orientation','PlayDirection']].apply(lambda x: new_orientation(x[0],x[1]), axis=1)
        df['Dir'] = df[['Dir','PlayDirection']].apply(lambda x: new_orientation(x[0],x[1]), axis=1)

        df = df.drop('YardLine', axis=1)
        df = pd.merge(df, yardline, on=['GameId','PlayId'], how='inner')

        return df

    def back_features(df):
        carriers = df[df['NflId'] == df['NflIdRusher']][['GameId','PlayId','NflIdRusher','X','Y','Orientation','Dir','YardLine']]
        carriers['back_from_scrimmage'] = carriers['YardLine'] - carriers['X']
        carriers['back_oriented_down_field'] = carriers['Orientation'].apply(lambda x: back_direction(x))
        carriers['back_moving_down_field'] = carriers['Dir'].apply(lambda x: back_direction(x))
        carriers = carriers.rename(columns={'X':'back_X',
                                            'Y':'back_Y'})
        carriers = carriers[['GameId','PlayId','NflIdRusher','back_X','back_Y','back_from_scrimmage','back_oriented_down_field','back_moving_down_field']]

        return carriers

    def features_relative_to_back(df, carriers):
        player_distance = df[['GameId','PlayId','NflId','X','Y']]
        player_distance = pd.merge(player_distance, carriers, on=['GameId','PlayId'], how='inner')
        player_distance = player_distance[player_distance['NflId'] != player_distance['NflIdRusher']]
        player_distance['dist_to_back'] = player_distance[['X','Y','back_X','back_Y']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)

        player_distance = player_distance.groupby(['GameId','PlayId','back_from_scrimmage','back_oriented_down_field','back_moving_down_field'])\
                                         .agg({'dist_to_back':['min','max','mean','std']})\
                                         .reset_index()
        player_distance.columns = ['GameId','PlayId','back_from_scrimmage','back_oriented_down_field','back_moving_down_field',
                                   'min_dist','max_dist','mean_dist','std_dist']

        return player_distance

    def defense_features(df):
        rusher = df[df['NflId'] == df['NflIdRusher']][['GameId','PlayId','Team','X','Y']]
        rusher.columns = ['GameId','PlayId','RusherTeam','RusherX','RusherY']

        defense = pd.merge(df,rusher,on=['GameId','PlayId'],how='inner')
        defense = defense[defense['Team'] != defense['RusherTeam']][['GameId','PlayId','X','Y','RusherX','RusherY']]
        defense['def_dist_to_back'] = defense[['X','Y','RusherX','RusherY']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)

        defense = defense.groupby(['GameId','PlayId'])\
                         .agg({'def_dist_to_back':['min','max','mean','std']})\
                         .reset_index()
        defense.columns = ['GameId','PlayId','def_min_dist','def_max_dist','def_mean_dist','def_std_dist']

        return defense

    def static_features(df):

        add_new_feas = []

        ## Height
        df['PlayerHeight_dense'] = df['PlayerHeight'].apply(lambda x: 12*int(x.split('-')[0])+int(x.split('-')[1]))
    
        add_new_feas.append('PlayerHeight_dense')

        ## Time
        df['TimeHandoff'] = df['TimeHandoff'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))
        df['TimeSnap'] = df['TimeSnap'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))

        df['TimeDelta'] = df.apply(lambda row: (row['TimeHandoff'] - row['TimeSnap']).total_seconds(), axis=1)
        df['PlayerBirthDate'] =df['PlayerBirthDate'].apply(lambda x: datetime.datetime.strptime(x, "%m/%d/%Y"))

        ## Age
        seconds_in_year = 60*60*24*365.25
        df['PlayerAge'] = df.apply(lambda row: (row['TimeHandoff']-row['PlayerBirthDate']).total_seconds()/seconds_in_year, axis=1)
        add_new_feas.append('PlayerAge')

        ## WindSpeed
        df['WindSpeed_ob'] = df['WindSpeed'].apply(lambda x: x.lower().replace('mph', '').strip() if not pd.isna(x) else x)
        df['WindSpeed_ob'] = df['WindSpeed_ob'].apply(lambda x: (int(x.split('-')[0])+int(x.split('-')[1]))/2 if not pd.isna(x) and '-' in x else x)
        df['WindSpeed_ob'] = df['WindSpeed_ob'].apply(lambda x: (int(x.split()[0])+int(x.split()[-1]))/2 if not pd.isna(x) and type(x)!=float and 'gusts up to' in x else x)
        df['WindSpeed_dense'] = df['WindSpeed_ob'].apply(strtofloat)
        add_new_feas.append('WindSpeed_dense')

        ## Weather
        df['GameWeather_process'] = df['GameWeather'].str.lower()
        df['GameWeather_process'] = df['GameWeather_process'].apply(lambda x: "indoor" if not pd.isna(x) and "indoor" in x else x)
        df['GameWeather_process'] = df['GameWeather_process'].apply(lambda x: x.replace('coudy', 'cloudy').replace('clouidy', 'cloudy').replace('party', 'partly') if not pd.isna(x) else x)
        df['GameWeather_process'] = df['GameWeather_process'].apply(lambda x: x.replace('clear and sunny', 'sunny and clear') if not pd.isna(x) else x)
        df['GameWeather_process'] = df['GameWeather_process'].apply(lambda x: x.replace('skies', '').replace("mostly", "").strip() if not pd.isna(x) else x)
        df['GameWeather_dense'] = df['GameWeather_process'].apply(map_weather)
        add_new_feas.append('GameWeather_dense')

        ## Orientation and Dir
        df["Orientation_ob"] = df["Orientation"].apply(lambda x : orientation_to_cat(x)).astype("object")
        df["Dir_ob"] = df["Dir"].apply(lambda x : orientation_to_cat(x)).astype("object")

        df["Orientation_sin"] = df["Orientation"].apply(lambda x : np.sin(x/360 * 2 * np.pi))
        df["Orientation_cos"] = df["Orientation"].apply(lambda x : np.cos(x/360 * 2 * np.pi))
        df["Dir_sin"] = df["Dir"].apply(lambda x : np.sin(x/360 * 2 * np.pi))
        df["Dir_cos"] = df["Dir"].apply(lambda x : np.cos(x/360 * 2 * np.pi))
        add_new_feas.append("Dir_sin")
        add_new_feas.append("Dir_cos")

        ## diff Score
        df["diffScoreBeforePlay"] = df["HomeScoreBeforePlay"] - df["VisitorScoreBeforePlay"]
        add_new_feas.append("diffScoreBeforePlay")
    
        df['DefendersInTheBox'] = df['DefendersInTheBox'].fillna(np.mean(df['DefendersInTheBox']))
        add_new_feas.append("DefendersInTheBox")
        
        static_features = df[df['NflId'] == df['NflIdRusher']][add_new_feas+['GameId','PlayId','X','Y','S','A','Dis','Orientation','Dir',
                                                                             'YardLine','Quarter','Down','Distance','DefendersInTheBox','Team',
                                                                             'PossessionTeam','HomeTeamAbbr','VisitorTeamAbbr','FieldPosition',
                                                                             'PlayDirection','PlayerHeight','WindSpeed','NflId','NflIdRusher',
                                                                             'TimeHandoff','TimeSnap','PlayerBirthDate','GameClock','Season']].drop_duplicates()
        static_features.fillna(-999,inplace=True)
        
        return static_features

    def split_personnel(s):
        splits = s.split(',')
        for i in range(len(splits)):
            splits[i] = splits[i].strip()

        return splits

    def defense_formation(l):
        dl = 0
        lb = 0
        db = 0
        other = 0

        for position in l:
            sub_string = position.split(' ')
            if sub_string[1] == 'DL':
                dl += int(sub_string[0])
            elif sub_string[1] in ['LB','OL']:
                lb += int(sub_string[0])
            else:
                db += int(sub_string[0])

        counts = (dl,lb,db,other)

        return counts

    def offense_formation(l):
        qb = 0
        rb = 0
        wr = 0
        te = 0
        ol = 0

        sub_total = 0
        qb_listed = False
        for position in l:
            sub_string = position.split(' ')
            pos = sub_string[1]
            cnt = int(sub_string[0])

            if pos == 'QB':
                qb += cnt
                sub_total += cnt
                qb_listed = True
            # Assuming LB is a line backer lined up as full back
            elif pos in ['RB','LB']:
                rb += cnt
                sub_total += cnt
            # Assuming DB is a defensive back and lined up as WR
            elif pos in ['WR','DB']:
                wr += cnt
                sub_total += cnt
            elif pos == 'TE':
                te += cnt
                sub_total += cnt
            # Assuming DL is a defensive lineman lined up as an additional line man
            else:
                ol += cnt
                sub_total += cnt

        # If not all 11 players were noted at given positions we need to make some assumptions
        # I will assume if a QB is not listed then there was 1 QB on the play
        # If a QB is listed then I'm going to assume the rest of the positions are at OL
        # This might be flawed but it looks like RB, TE and WR are always listed in the personnel
        if sub_total < 11:
            diff = 11 - sub_total
            if not qb_listed:
                qb += 1
                diff -= 1
            ol += diff

        counts = (qb,rb,wr,te,ol)

        return counts    

    def personnel_features(df):
        personnel = df[['GameId','PlayId','OffensePersonnel','DefensePersonnel']].drop_duplicates()
        personnel['DefensePersonnel'] = personnel['DefensePersonnel'].apply(lambda x: split_personnel(x))
        personnel['DefensePersonnel'] = personnel['DefensePersonnel'].apply(lambda x: defense_formation(x))
        personnel['num_DL'] = personnel['DefensePersonnel'].apply(lambda x: x[0])
        personnel['num_LB'] = personnel['DefensePersonnel'].apply(lambda x: x[1])
        personnel['num_DB'] = personnel['DefensePersonnel'].apply(lambda x: x[2])

        personnel['OffensePersonnel'] = personnel['OffensePersonnel'].apply(lambda x: split_personnel(x))
        personnel['OffensePersonnel'] = personnel['OffensePersonnel'].apply(lambda x: offense_formation(x))
        personnel['num_QB'] = personnel['OffensePersonnel'].apply(lambda x: x[0])
        personnel['num_RB'] = personnel['OffensePersonnel'].apply(lambda x: x[1])
        personnel['num_WR'] = personnel['OffensePersonnel'].apply(lambda x: x[2])
        personnel['num_TE'] = personnel['OffensePersonnel'].apply(lambda x: x[3])
        personnel['num_OL'] = personnel['OffensePersonnel'].apply(lambda x: x[4])

        # Let's create some features to specify if the OL is covered
        personnel['OL_diff'] = personnel['num_OL'] - personnel['num_DL']
        personnel['OL_TE_diff'] = (personnel['num_OL'] + personnel['num_TE']) - personnel['num_DL']
        # Let's create a feature to specify if the defense is preventing the run
        # Let's just assume 7 or more DL and LB is run prevention
        personnel['run_def'] = (personnel['num_DL'] + personnel['num_LB'] > 6).astype(int)

        personnel.drop(['OffensePersonnel','DefensePersonnel'], axis=1, inplace=True)
        
        return personnel

    def combine_features(relative_to_back, defense, static, personnel, deploy=deploy):
        df = pd.merge(relative_to_back,defense,on=['GameId','PlayId'],how='inner')
        df = pd.merge(df,static,on=['GameId','PlayId'],how='inner')
        df = pd.merge(df,personnel,on=['GameId','PlayId'],how='inner')

        if not deploy:
            df = pd.merge(df, outcomes, on=['GameId','PlayId'], how='inner')

        return df

    yardline = update_yardline(df)
    df = update_orientation(df, yardline)
    back_feats = back_features(df)
    rel_back = features_relative_to_back(df, back_feats)
    def_feats = defense_features(df)
    static_feats = static_features(df)
    personnel = personnel_features(df)
    
    basetable = combine_features(rel_back, def_feats, static_feats, personnel, deploy=deploy)
    
    return basetable

def create_features_02(t_):
    t_['fe1'] = pd.Series(np.sqrt(np.absolute(np.square(t_.X.values) - np.square(t_.Y.values))))
    t_['fe5'] = np.square(t_['S'].values) + 2 * t_['A'].values * t_['Dis'].values  # N
    t_['fe7'] = np.arccos(np.clip(t_['X'].values / t_['Y'].values, -1, 1))  # N
    t_['fe8'] = t_['S'].values / np.clip(t_['fe1'].values, 0.6, None)
    radian_angle = (90 - t_['Dir']) * np.pi / 180.0
    t_['fe10'] = np.abs(t_['S'] * np.cos(radian_angle))
    t_['fe11'] = np.abs(t_['S'] * np.sin(radian_angle))
    
    t_["is_rusher"]          = 1.0*(t_["NflId"] == t_["NflIdRusher"])
    t_["is_home"]            = t_["Team"] == "home"
    t_["is_possession_team"] = 1.0*(t_["PossessionTeam"] == t_["HomeTeamAbbr"]) - 1.0*(t_["PossessionTeam"] == t_["VisitorTeamAbbr"])
    t_["is_field_team"]      = 1.0*(t_["FieldPosition"] == t_["HomeTeamAbbr"]) - 1.0*(t_["FieldPosition"] == t_["VisitorTeamAbbr"])
    t_["is_left"]            = t_["PlayDirection"] == "left"
    
    #t_["player_height"]      = t_["PlayerHeight"].apply(get_height)
    #t_["WindSpeed"]   = t_["WindSpeed"].apply(process_windspeed)
    #t_["TimeHandoff"] = pd.to_datetime(t_["TimeHandoff"])
    #t_["TimeSnap"]    = pd.to_datetime(t_["TimeSnap"])
    #t_["duration"]    = (t_["TimeHandoff"] - t_["TimeSnap"]).dt.total_seconds()

    #t_["player_age"]  = (t_["TimeSnap"].dt.date - pd.to_datetime(t_["PlayerBirthDate"]).dt.date)/np.timedelta64(1, 'D') / 365

    t_["game_time"]   = t_["GameClock"].apply(get_time)
    t_["old_data"]    = t_["Season"] == 2017
    return t_


def logs(res, ls):
    m = res.shape[1]
    for l in ls:
        res = res.assign(newcol=pd.Series(np.log(1.01+res[l])).values)   
        res.columns.values[m] = l + '_log'
        m += 1
    return res

def squares(res, ls):
    m = res.shape[1]
    for l in ls:
        res = res.assign(newcol=pd.Series(res[l]*res[l]).values)   
        res.columns.values[m] = l + '_sq'
        m += 1
    return res

In [6]:
# Define as colunas ID que serão agrupadas e posteriormente removidas
outcomes = train[['GameId','PlayId','Yards']].drop_duplicates()

In [8]:
# Cria as novas features (etapa 01)
%time train_basetable = create_features_01(train, False)

CPU times: user 3min 4s, sys: 4.16 s, total: 3min 9s
Wall time: 2min 58s


In [9]:
# Cria as novas features (etapa 02)
%time train_basetable = create_features_02(train_basetable)

CPU times: user 486 ms, sys: 114 ms, total: 600 ms
Wall time: 154 ms


In [10]:
# Cria as novas features (etapa 03)
log_features = ['X','Y','S','A','Dis','Orientation','Dir','YardLine','PlayerHeight_dense','Dir_sin','Dir_cos']
train_basetable = logs(train_basetable, log_features)

In [11]:
# Cria as novas features (etapa 04)
squared_features = ['X','Y','S','A','Dis','Orientation','Dir','YardLine','PlayerHeight_dense','Dir_sin','Dir_cos']
train_basetable = squares(train_basetable, squared_features)

In [12]:
# Remove algumas features do dataset e preenche valores NaN com 0 (zero)
train_basetable.drop(['TimeSnap','TimeHandoff','PlayerBirthDate','GameClock','PlayerHeight','NflId','NflIdRusher','Season','WindSpeed'], axis=1, inplace=True)
#train_basetable = train_basetable.fillna(0)

In [19]:
# Cria uma copia do dataset para backup
X = train_basetable.copy()
X.shape

(23171, 86)

In [20]:
# Transformação de variaveis categoricas para numericas usando LabelEncoder
le = preprocessing.LabelEncoder()

le_dict = {}
categoricals = ['Team_le','PossessionTeam_le','HomeTeamAbbr_le','VisitorTeamAbbr_le',
                'FieldPosition_le','PlayDirection_le']

for cat in categoricals:
    le_dict[cat] = LabelEncoder()
    X[cat] = le_dict[cat].fit_transform(X[cat[:-3]].apply(str))  

# Remove as features originais que foram transformadas
X.drop(['Team','PossessionTeam','HomeTeamAbbr','VisitorTeamAbbr','FieldPosition','PlayDirection'], axis=1, inplace=True)
X.shape

(23171, 86)

In [21]:
X.head()

Unnamed: 0,GameId,PlayId,back_from_scrimmage,back_oriented_down_field,back_moving_down_field,min_dist,max_dist,mean_dist,std_dist,def_min_dist,def_max_dist,def_mean_dist,def_std_dist,PlayerHeight_dense,PlayerAge,WindSpeed_dense,GameWeather_dense,Dir_sin,Dir_cos,diffScoreBeforePlay,DefendersInTheBox,X,Y,S,A,Dis,Orientation,Dir,YardLine,Quarter,Down,Distance,DefendersInTheBox.1,num_DL,num_LB,num_DB,num_QB,num_RB,num_WR,num_TE,num_OL,OL_diff,OL_TE_diff,run_def,Yards,fe1,fe5,fe7,fe8,fe10,fe11,is_rusher,is_home,is_possession_team,is_field_team,is_left,game_time,old_data,X_log,Y_log,S_log,A_log,Dis_log,Orientation_log,Dir_log,YardLine_log,PlayerHeight_dense_log,Dir_sin_log,Dir_cos_log,X_sq,Y_sq,S_sq,A_sq,Dis_sq,Orientation_sq,Dir_sq,YardLine_sq,PlayerHeight_dense_sq,Dir_sin_sq,Dir_cos_sq,Team_le,PossessionTeam_le,HomeTeamAbbr_le,VisitorTeamAbbr_le,FieldPosition_le,PlayDirection_le
0,2017090700,20170907000118,-33.75,0,1,1.449724,22.415872,8.046559,4.873845,4.59331,22.415872,9.752491,5.327299,70,25.596251,8.0,1.0,-0.91169,-0.410878,0,6.0,78.75,30.53,3.63,3.35,0.38,161.98,245.74,45.0,1,3,2,6.0,2,3,6,1,1,3,1,5,3,4,0,8,72.591195,15.7229,0.0,0.050006,3.309436,1.491487,1.0,True,1.0,1.0,True,854,True,4.379022,3.451257,1.534714,1.472472,0.329304,5.093689,5.508376,3.828859,4.262821,-2.319633,-0.51229,6201.5625,932.0809,13.1769,11.2225,0.1444,26237.5204,60388.1476,2025.0,4900,0.831179,0.168821,1,20,20,15,21,0
1,2017090700,20170907000139,-18.07,1,1,0.792023,23.025872,8.614623,5.598683,4.287773,23.025872,10.297028,5.833217,70,25.596252,8.0,1.0,-0.740805,0.671721,0,6.0,71.07,27.16,3.06,2.41,0.34,210.7,312.2,53.0,1,1,10,6.0,2,3,6,1,1,3,1,5,3,4,0,3,65.675561,11.0024,0.0,0.046593,2.266862,2.055465,1.0,True,1.0,1.0,True,832,True,4.277777,3.338258,1.403643,1.229641,0.300105,5.355217,5.746874,3.989169,4.262821,-1.312318,0.519817,5050.9449,737.6656,9.3636,5.8081,0.1156,44394.49,97468.84,2809.0,4900,0.548791,0.451209,1,20,20,15,21,0
2,2017090700,20170907000189,26.34,0,1,1.64639,20.726285,8.482583,4.642121,4.22167,20.726285,9.903689,5.07329,70,25.596253,8.0,1.0,-0.668612,-0.743612,0,7.0,48.66,19.11,5.77,2.42,0.6,140.82,221.96,75.0,1,1,10,7.0,2,3,6,1,1,3,1,5,3,4,0,5,44.750458,36.1969,0.0,0.128937,3.857889,4.29064,1.0,True,1.0,-1.0,True,782,True,3.905401,3.001714,1.913977,1.23256,0.476234,4.954629,5.407037,4.330865,4.262821,-1.074735,-1.322801,2367.7956,365.1921,33.2929,5.8564,0.36,19830.2724,49266.2416,5625.0,4900,0.447042,0.552958,1,20,20,15,16,0
3,2017090700,20170907000345,92.47,1,1,0.918096,9.791231,5.549379,1.983128,4.528002,9.791231,6.309354,1.834174,71,26.852933,8.0,1.0,-0.995496,0.094803,0,9.0,15.53,25.36,4.45,3.2,0.46,186.22,275.44,108.0,1,2,2,9.0,4,4,3,1,2,0,2,6,2,4,1,2,20.048658,22.7465,0.911727,0.22196,4.429957,0.421875,1.0,True,1.0,-1.0,True,732,True,2.805782,3.272227,1.697449,1.437463,0.385262,5.232338,5.62203,4.69144,4.276805,-4.233332,0.099667,241.1809,643.1296,19.8025,10.24,0.2116,34677.8884,75867.1936,11664.0,5041,0.991012,0.008988,1,20,20,15,16,0
4,2017090700,20170907000395,5.01,0,0,0.502892,21.214806,9.168819,5.611232,4.288088,21.214806,11.056456,5.900009,71,22.091819,8.0,1.0,0.375901,-0.92666,7,7.0,29.99,27.12,3.9,2.53,0.44,34.27,157.92,35.0,1,1,10,7.0,3,2,6,1,1,1,3,5,2,5,0,7,12.802566,17.4364,0.0,0.304626,1.466013,3.613974,1.0,False,-1.0,-1.0,False,728,True,3.433987,3.336837,1.591274,1.264127,0.371564,3.563316,5.068464,3.583797,4.276805,0.32635,-2.484825,899.4001,735.4944,15.21,6.4009,0.1936,1174.4329,24938.7264,1225.0,5041,0.141301,0.858699,0,15,20,15,16,1


## 3. Feature Selection

In [None]:
import scipy.sparse as ss

class Dataset:
    """
    Dataset for LOFO
    Parameters
    ----------
    df: pandas dataframe
    target: string
        Column name for target within df
    features: list of strings
        List of column names within df
    feature_groups: dict, optional
        Name, value dictionary of feature groups as numpy.darray or scipy.csr.scr_matrix
    """

    def __init__(self, df, target, features, feature_groups=None):
        self.df = df.copy()
        self.features = list(features)
        self.feature_groups = feature_groups if feature_groups else dict()

        self.num_rows = df.shape[0]
        self.y = df[target].values

        for feature_name, feature_matrix in self.feature_groups.items():
            if not (isinstance(feature_matrix, np.ndarray) or isinstance(feature_matrix, ss.csr.csr_matrix)):
                raise Exception("Data type {dtype} is not a valid type!".format(dtype=type(feature_matrix)))

            if feature_matrix.shape[0] != self.num_rows:
                raise Exception("Expected {expected} rows but got {n} rows!".format(expected=self.num_rows,
                                                                                    n=feature_matrix.shape[0]))

            if feature_name in self.features:
                raise Exception("Feature group name '{name}' is the same with one of the features!")

    def getX(self, feature_to_remove, fit_params):
        """Get feature matrix and fit_params after removing a feature
        Parameters
        ----------
        feature_to_remove : string
            feature name to remove
        fit_params : dict
            fit parameters for the model
        Returns
        -------
        X : numpy.darray or scipy.csr.scr_matrix
            Feature matrix
        fit_params: dict
            Updated fit_params after feature removal
        """
        feature_list = [feature for feature in self.features if feature != feature_to_remove]
        concat_list = [self.df[feature_list].values]

        for feature_name, feature_matrix in self.feature_groups.items():
            if feature_name != feature_to_remove:
                concat_list.append(feature_matrix)

        fit_params = fit_params.copy()
        if "categorical_feature" in fit_params:
            cat_features = [f for f in fit_params["categorical_feature"] if f != feature_to_remove]
            fit_params["categorical_feature"] = [ix for ix, f in enumerate(feature_list) if (f in cat_features)]

        has_sparse = False
        for feature_name, feature_matrix in self.feature_groups.items():
            if feature_name != feature_to_remove and isinstance(feature_matrix, ss.csr.csr_matrix):
                has_sparse = True

        concat = np.hstack
        if has_sparse:
            concat = ss.hstack

        return concat(concat_list), fit_params

In [None]:
from lightgbm import LGBMClassifier, LGBMRegressor

def infer_model(df, features, y, n_jobs):
    model_class = LGBMRegressor
    if len(np.unique(y)) == 2:
        y = LabelEncoder().fit_transform(y)
        model_class = LGBMClassifier

    categoricals = df[features].select_dtypes(exclude=[np.number]).columns.tolist()
    for f in categoricals:
        df[f] = LabelEncoder().fit_transform(df[f].apply(str))

    min_child_samples = int(0.01*df.shape[0])

    model = model_class(min_child_samples=min_child_samples, n_jobs=n_jobs)

    return model, df, categoricals, y

In [None]:
def plot_importance(importance_df, figsize=(8, 8)):
    """Plot feature importance
    Parameters
    ----------
    importance_df : pandas dataframe
        Output dataframe from LOFO/FLOFO get_importance
    figsize : tuple
    """
    importance_df = importance_df.copy()
    importance_df["color"] = (importance_df["importance_mean"] > 0).map({True: 'g', False: 'r'})
    importance_df.sort_values("importance_mean", inplace=True)

    importance_df.plot(x="feature", y="importance_mean", xerr="importance_std",
                       kind='barh', color=importance_df["color"], figsize=figsize)

In [None]:
from sklearn.model_selection import cross_validate
import multiprocessing
#from lofo.infer_defaults import infer_model


class LOFOImportance:
    """
    Leave One Feature Out Importance
    Given a model and cross-validation scheme, calculates the feature importances.
    Parameters
    ----------
    dataset: LOFO Dataset object
    scoring: string or callable
        Same as scoring in sklearn API
    model: model (sklearn API), optional
        Not trained model object
    fit_params : dict, optional
        fit parameters for the model
    cv: int or iterable
        Same as cv in sklearn API
    n_jobs: int, optional
        Number of jobs for parallel computation
    """

    def __init__(self, dataset, scoring, model=None, fit_params=None, cv=4, n_jobs=None):

        self.fit_params = fit_params if fit_params else dict()
        if model is None:
            model, dataset.df, categoricals, dataset.y = infer_model(dataset.df, dataset.features, dataset.y, n_jobs)
            self.fit_params["categorical_feature"] = categoricals
            n_jobs = 1

        self.model = model
        self.dataset = dataset
        self.scoring = scoring
        self.cv = cv
        self.n_jobs = n_jobs
        if self.n_jobs is not None and self.n_jobs > 1:
            warning_str = ("Warning: If your model is multithreaded, please initialise the number"
                           "of jobs of LOFO to be equal to 1, otherwise you may experience performance issues.")
            warnings.warn(warning_str)

    def _get_cv_score(self, feature_to_remove):
        X, fit_params = self.dataset.getX(feature_to_remove=feature_to_remove, fit_params=self.fit_params)
        y = self.dataset.y

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            cv_results = cross_validate(self.model, X, y, cv=self.cv, scoring=self.scoring, fit_params=fit_params)
        return cv_results['test_score']

    def _get_cv_score_parallel(self, feature, result_queue):
        test_score = self._get_cv_score(feature_to_remove=feature)
        result_queue.put((feature, test_score))
        return test_score

    def get_importance(self):
        """Run LOFO to get feature importances
        Returns
        -------
        importance_df : pandas dataframe
            Dataframe with feature names and corresponding importance mean and std (sorted by importance)
        """
        base_cv_score = self._get_cv_score(feature_to_remove=None)
        feature_list = self.dataset.features + list(self.dataset.feature_groups.keys())

        if self.n_jobs is not None and self.n_jobs > 1:

            pool = multiprocessing.Pool(self.n_jobs)
            manager = multiprocessing.Manager()
            result_queue = manager.Queue()

            for f in feature_list:
                pool.apply_async(self._get_cv_score_parallel, (f, result_queue))

            pool.close()
            pool.join()

            lofo_cv_result = [result_queue.get() for _ in range(len(feature_list))]
            lofo_cv_scores_normalized = np.array([base_cv_score - lofo_cv_score for _, lofo_cv_score in lofo_cv_result])
            feature_list = [feature for feature, _ in lofo_cv_result]
        else:
            lofo_cv_scores = []
            for f in tqdm_notebook(feature_list):
                lofo_cv_scores.append(self._get_cv_score(feature_to_remove=f))

            lofo_cv_scores_normalized = np.array([base_cv_score - lofo_cv_score for lofo_cv_score in lofo_cv_scores])

        importance_df = pd.DataFrame()
        importance_df["feature"] = feature_list
        importance_df["importance_mean"] = lofo_cv_scores_normalized.mean(axis=1)
        importance_df["importance_std"] = lofo_cv_scores_normalized.std(axis=1)

        return importance_df.sort_values("importance_mean", ascending=False)

In [None]:
# Procedimento para verificar as features mais importantes
# Usando LightGBM para treinamento
#from lofo import LOFOImportance, Dataset, plot_importance

n_folds = 5
kfold_lgb = KFold(n_folds, shuffle=True)

features = [x for x in X.columns if x not in ['Yards','GameId','PlayId']]

params2 = {'num_leaves': 15,
          'objective': 'mae',
          #'learning_rate': 0.1,
          "boosting": "gbdt",
          "num_rounds": 150
          }

model_lgb = lgb.LGBMRegressor(**params2)
dataset = Dataset(df=X, target="Yards", features=features)
lofo_imp = LOFOImportance(dataset, model=model_lgb, cv=kfold_lgb, scoring="neg_mean_absolute_error", fit_params={"categorical_feature": categoricals})

importance_df = lofo_imp.get_importance()

In [None]:
# Exibindo grafico com as features
plot_importance(importance_df, figsize=(12, 38))

In [None]:
best_features = importance_df.loc[importance_df['importance_mean'] > 0].feature
best_features

## 4. Criar e avaliar alguns algoritmos de Machine Learning

In [18]:
X.head()

Unnamed: 0,back_from_scrimmage,back_oriented_down_field,back_moving_down_field,min_dist,max_dist,mean_dist,std_dist,def_min_dist,def_max_dist,def_mean_dist,def_std_dist,PlayerHeight_dense,PlayerAge,WindSpeed_dense,GameWeather_dense,Dir_sin,Dir_cos,diffScoreBeforePlay,DefendersInTheBox,X,Y,S,A,Dis,Orientation,Dir,YardLine,Quarter,Down,Distance,DefendersInTheBox.1,num_DL,num_LB,num_DB,num_QB,num_RB,num_WR,num_TE,num_OL,OL_diff,OL_TE_diff,run_def,fe1,fe5,fe7,fe8,fe10,fe11,is_rusher,is_home,is_possession_team,is_field_team,is_left,game_time,old_data,X_log,Y_log,S_log,A_log,Dis_log,Orientation_log,Dir_log,YardLine_log,PlayerHeight_dense_log,Dir_sin_log,Dir_cos_log,X_sq,Y_sq,S_sq,A_sq,Dis_sq,Orientation_sq,Dir_sq,YardLine_sq,PlayerHeight_dense_sq,Dir_sin_sq,Dir_cos_sq,Team_le,PossessionTeam_le,HomeTeamAbbr_le,VisitorTeamAbbr_le,FieldPosition_le,PlayDirection_le
0,-33.75,0,1,1.449724,22.415872,8.046559,4.873845,4.59331,22.415872,9.752491,5.327299,70,25.596251,8.0,1.0,-0.91169,-0.410878,0,6.0,78.75,30.53,3.63,3.35,0.38,161.98,245.74,45.0,1,3,2,6.0,2,3,6,1,1,3,1,5,3,4,0,72.591195,15.7229,0.0,0.050006,3.309436,1.491487,1.0,True,1.0,1.0,True,854,True,4.379022,3.451257,1.534714,1.472472,0.329304,5.093689,5.508376,3.828859,4.262821,-2.319633,-0.51229,6201.5625,932.0809,13.1769,11.2225,0.1444,26237.5204,60388.1476,2025.0,4900,0.831179,0.168821,1,20,20,15,21,0
1,-18.07,1,1,0.792023,23.025872,8.614623,5.598683,4.287773,23.025872,10.297028,5.833217,70,25.596252,8.0,1.0,-0.740805,0.671721,0,6.0,71.07,27.16,3.06,2.41,0.34,210.7,312.2,53.0,1,1,10,6.0,2,3,6,1,1,3,1,5,3,4,0,65.675561,11.0024,0.0,0.046593,2.266862,2.055465,1.0,True,1.0,1.0,True,832,True,4.277777,3.338258,1.403643,1.229641,0.300105,5.355217,5.746874,3.989169,4.262821,-1.312318,0.519817,5050.9449,737.6656,9.3636,5.8081,0.1156,44394.49,97468.84,2809.0,4900,0.548791,0.451209,1,20,20,15,21,0
2,26.34,0,1,1.64639,20.726285,8.482583,4.642121,4.22167,20.726285,9.903689,5.07329,70,25.596253,8.0,1.0,-0.668612,-0.743612,0,7.0,48.66,19.11,5.77,2.42,0.6,140.82,221.96,75.0,1,1,10,7.0,2,3,6,1,1,3,1,5,3,4,0,44.750458,36.1969,0.0,0.128937,3.857889,4.29064,1.0,True,1.0,-1.0,True,782,True,3.905401,3.001714,1.913977,1.23256,0.476234,4.954629,5.407037,4.330865,4.262821,-1.074735,-1.322801,2367.7956,365.1921,33.2929,5.8564,0.36,19830.2724,49266.2416,5625.0,4900,0.447042,0.552958,1,20,20,15,16,0
3,92.47,1,1,0.918096,9.791231,5.549379,1.983128,4.528002,9.791231,6.309354,1.834174,71,26.852933,8.0,1.0,-0.995496,0.094803,0,9.0,15.53,25.36,4.45,3.2,0.46,186.22,275.44,108.0,1,2,2,9.0,4,4,3,1,2,0,2,6,2,4,1,20.048658,22.7465,0.911727,0.22196,4.429957,0.421875,1.0,True,1.0,-1.0,True,732,True,2.805782,3.272227,1.697449,1.437463,0.385262,5.232338,5.62203,4.69144,4.276805,-4.233332,0.099667,241.1809,643.1296,19.8025,10.24,0.2116,34677.8884,75867.1936,11664.0,5041,0.991012,0.008988,1,20,20,15,16,0
4,5.01,0,0,0.502892,21.214806,9.168819,5.611232,4.288088,21.214806,11.056456,5.900009,71,22.091819,8.0,1.0,0.375901,-0.92666,7,7.0,29.99,27.12,3.9,2.53,0.44,34.27,157.92,35.0,1,1,10,7.0,3,2,6,1,1,1,3,5,2,5,0,12.802566,17.4364,0.0,0.304626,1.466013,3.613974,1.0,False,-1.0,-1.0,False,728,True,3.433987,3.336837,1.591274,1.264127,0.371564,3.563316,5.068464,3.583797,4.276805,0.32635,-2.484825,899.4001,735.4944,15.21,6.4009,0.1936,1174.4329,24938.7264,1225.0,5041,0.141301,0.858699,0,15,20,15,16,1


In [56]:
# Criar um dataset somente com as colunas mais importantes conforme visto anteriormente
#new_X = X.loc[:,best_features]
new_X = X.drop(['GameId','PlayId','Yards'], axis=1)
target = X.Yards

y = np.zeros((target.shape[0], 199))
for idx, target in enumerate(list(target)):
    y[idx][99 + target] = 1
    
# Normalizando as variaveis do dataset de treino
scaler = StandardScaler()
new_X = scaler.fit_transform(new_X)
new_X.shape, y.shape

((23171, 83), (23171, 199))

## 4.1. Teste com LightGBM

In [61]:
# evaluation metric
def crps(y_true, y_pred):
    y_true = np.clip(np.cumsum(y_true, axis=1), 0, 1)
    y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1)
    val_s = ((y_true - y_pred) ** 2).sum(axis=1).sum(axis=0) / (199 * y_true.shape[0])
    val_s = np.round(val_s, 6)
    return  val_s

In [62]:
metric = "multi_logloss"
param = {'num_leaves': 50, 
         'min_data_in_leaf': 30,
         'objective':'multiclass',
         'num_class': 199,
         'max_depth': -1,
         'learning_rate': 0.001,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.7, #0.9
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
         "metric": metric,
         "lambda_l1": 0.1,
         "verbosity": -1,
         "seed":1234}

In [67]:
models = []
score = []
best_validation_scores = []
y_ = np.argmax(y, axis=1)

for k in range(2):
    kfold = KFold(5, random_state = 42 + k, shuffle = True)

    for k_fold, (tr_inds, val_inds) in enumerate(kfold.split(new_X, y_)):
        print("-----------")
        print(f'Fold : {k_fold}')
        print("-----------")        
        X_train, X_val, y_train, y_val = new_X[tr_inds], new_X[val_inds], y_[tr_inds], y_[val_inds]
        trn_data = lgb.Dataset(X_train, y_train)
        val_data = lgb.Dataset(X_val, y_val, reference=trn_data)

        num_round = 1000
        model = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
        score_ = crps(np.expand_dims(y_val, axis=1), model.predict(X_val, num_iteration=model.best_iteration))
        print(score_)

        best_validation_scores.append(model.best_score['valid_1'][metric])
        score.append(score_)
        models.append(model)

print(np.mean(score))

-----------
Fold : 0
-----------
Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 2.75028	valid_1's multi_logloss: 2.89109
[200]	training's multi_logloss: 2.65333	valid_1's multi_logloss: 2.87787
[300]	training's multi_logloss: 2.57378	valid_1's multi_logloss: 2.86682
[400]	training's multi_logloss: 2.50549	valid_1's multi_logloss: 2.85734
[500]	training's multi_logloss: 2.44527	valid_1's multi_logloss: 2.84915
[600]	training's multi_logloss: 2.39094	valid_1's multi_logloss: 2.84174
[700]	training's multi_logloss: 2.3413	valid_1's multi_logloss: 2.83514
[800]	training's multi_logloss: 2.29551	valid_1's multi_logloss: 2.82914
[900]	training's multi_logloss: 2.25295	valid_1's multi_logloss: 2.82389
[1000]	training's multi_logloss: 2.21295	valid_1's multi_logloss: 2.8193
Did not meet early stopping. Best iteration is:
[1000]	training's multi_logloss: 2.21295	valid_1's multi_logloss: 2.8193
0.504594
-----------
Fold : 1
-----------
Training unti

[800]	training's multi_logloss: 2.28833	valid_1's multi_logloss: 2.89188
[900]	training's multi_logloss: 2.24615	valid_1's multi_logloss: 2.88705
[1000]	training's multi_logloss: 2.20643	valid_1's multi_logloss: 2.88278
Did not meet early stopping. Best iteration is:
[1000]	training's multi_logloss: 2.20643	valid_1's multi_logloss: 2.88278
0.504555
-----------
Fold : 4
-----------
Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 2.74516	valid_1's multi_logloss: 2.881
[200]	training's multi_logloss: 2.64819	valid_1's multi_logloss: 2.86889
[300]	training's multi_logloss: 2.56881	valid_1's multi_logloss: 2.85848
[400]	training's multi_logloss: 2.50063	valid_1's multi_logloss: 2.84951
[500]	training's multi_logloss: 2.44033	valid_1's multi_logloss: 2.84168
[600]	training's multi_logloss: 2.38615	valid_1's multi_logloss: 2.83492
[700]	training's multi_logloss: 2.33665	valid_1's multi_logloss: 2.82894
[800]	training's multi_logloss: 2.29096	valid

## 4.3. Teste com Keras (New Struct)

In [None]:
class CRPSCallback(Callback):
    
    def __init__(self,validation, predict_batch_size=20, include_on_batch=False):
        super(CRPSCallback, self).__init__()
        self.validation = validation
        self.predict_batch_size = predict_batch_size
        self.include_on_batch = include_on_batch
        
        print('validation shape',len(self.validation))

    def on_batch_begin(self, batch, logs={}):
        pass

    def on_train_begin(self, logs={}):
        if not ('CRPS_score_val' in self.params['metrics']):
            self.params['metrics'].append('CRPS_score_val')

    def on_batch_end(self, batch, logs={}):
        if (self.include_on_batch):
            logs['CRPS_score_val'] = float('-inf')

    def on_epoch_end(self, epoch, logs={}):
        logs['CRPS_score_val'] = float('-inf')
            
        if (self.validation):
            X_valid, y_valid = self.validation[0], self.validation[1]
            y_pred = self.model.predict(X_valid)
            y_true = np.clip(np.cumsum(y_valid, axis=1), 0, 1)
            y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1)
            val_s = ((y_true - y_pred) ** 2).sum(axis=1).sum(axis=0) / (199 * X_valid.shape[0])
            val_s = np.round(val_s, 6)
            logs['CRPS_score_val'] = val_s

In [None]:
def get_model(x_tr,y_tr,x_val,y_val):
    inp = Input(shape = (x_tr.shape[1],))
    x = Dense(1024, input_dim=X.shape[1], activation='relu')(inp)
    x = Dropout(0.6)(x)
    x = BatchNormalization()(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.4)(x)
    x = BatchNormalization()(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)
    
    out = Dense(199, activation='softmax')(x)
    model = Model(inp,out)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=[])
    #add lookahead
#     lookahead = Lookahead(k=5, alpha=0.5) # Initialize Lookahead
#     lookahead.inject(model) # add into model

    
    es = EarlyStopping(monitor='CRPS_score_val', 
                       mode='min',
                       restore_best_weights=True, 
                       verbose=1, 
                       patience=15)

    mc = ModelCheckpoint('best_model.h5',monitor='CRPS_score_val',mode='min',
                                   save_best_only=True, verbose=1, save_weights_only=True)
    
    bsz = 1024
    steps = x_tr.shape[0]/bsz
    


    model.fit(x_tr, y_tr,callbacks=[CRPSCallback(validation = (x_val,y_val)),es,mc], epochs=250, batch_size=bsz,verbose=1)
    model.load_weights("best_model.h5")
    
    y_pred = model.predict(x_val)
    y_valid = y_val
    y_true = np.clip(np.cumsum(y_valid, axis=1), 0, 1)
    y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1)
    val_s = ((y_true - y_pred) ** 2).sum(axis=1).sum(axis=0) / (199 * x_val.shape[0])
    crps = np.round(val_s, 6)

    return model,crps

In [None]:
losses = []
models = []
crps_csv = []

s_time = time.time()

for k in range(2):
    kfold = KFold(10, random_state = 42 + k, shuffle = True)
    for k_fold, (tr_inds, val_inds) in enumerate(kfold.split(y)):
        print("-----------")
        print("-----------")
        tr_x,tr_y = new_X[tr_inds],y[tr_inds]
        val_x,val_y = new_X[val_inds],y[val_inds]
        model,crps = get_model(tr_x,tr_y,val_x,val_y)

        models.append(model)
        print("the %d fold crps is %f"%((k_fold+1),crps))
        crps_csv.append(crps)
        
        '''if (crps <= 0.013):
            print("the %d fold crps is %f"%((k_fold),crps))
            models.append(model)
            crps_csv.append(crps)
            if (len(crps_csv) > 10):
                break
        else:
            print('Ignore KFold:',k_fold, '| CRPS:', crps)'''

print("mean crps is %f"%np.mean(crps_csv))


def predict(x_te):
    model_num = len(models)
    for k,m in enumerate(models):
        if k==0:
            y_pred = m.predict(x_te,batch_size=1024)
        else:
            y_pred+=m.predict(x_te,batch_size=1024)
            
    y_pred = y_pred / model_num
    
    return y_pred

In [None]:
print("mean crps is %f"%np.mean(crps_csv))

## 5. Realizar a submissão para o Kaggle

### 5.1. Submissao para LightGBM

In [None]:
from kaggle.competitions import nflrush
pd.options.mode.chained_assignment = None

env = nflrush.make_env()
iter_test = env.iter_test()

df_prev = pd.DataFrame()
df_test = pd.DataFrame()

for (test_df, sample_prediction_df) in tqdm_notebook(iter_test):
    # Feature Engineering
    basetable = create_features_01(test_df, True)
    basetable = create_features_02(basetable)
    basetable = logs(basetable, log_features)
    basetable = squares(basetable, squared_features)
    
    # Remove algumas colunas
    basetable.drop(['TimeHandoff','PlayerBirthDate','GameClock','PlayerHeight','NflId','NflIdRusher','Season'], axis=1, inplace=True)
    
    # Label Encoder para variaveis categoricas
    for cat in categoricals:
        le_dict[cat] = LabelEncoder()
        basetable[cat] = le_dict[cat].fit_transform(basetable[cat[:-3]].apply(str))  
        
    # Remove as colunas categoricas originais
    basetable.drop(['GameId','TimeSnap','Team','PossessionTeam','HomeTeamAbbr','VisitorTeamAbbr','FieldPosition','PlayDirection'], axis=1, inplace=True)
    
    # Considerar somente as colunas do Feature Selection
    #basetable = basetable.loc[:,best_features]
    
    # Normalizacao
    scaled_basetable = scaler.transform(basetable)
    
    # Make predictions
    y_pred = np.mean([model.predict(scaled_basetable, num_iteration=model.best_iteration) for model in models],axis=0)
    y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1).tolist()[0]
    
    preds_df = pd.DataFrame(data=[y_pred], columns=sample_prediction_df.columns)
    
    df_test = df_test.append(basetable)
    df_prev = df_prev.append(preds_df)
    
    env.predict(preds_df)
    
env.write_submission_file()

### 5.2. Submissao para Keras

In [None]:
from kaggle.competitions import nflrush
pd.options.mode.chained_assignment = None

env = nflrush.make_env()
iter_test = env.iter_test()

df_prev = pd.DataFrame()
df_test = pd.DataFrame()

for (test_df, sample_prediction_df) in tqdm_notebook(iter_test):
    # Feature Engineering
    basetable = create_features_01(test_df, True)
    basetable = create_features_02(basetable)
    basetable = logs(basetable, log_features)
    basetable = squares(basetable, squared_features)
    
    # Remove algumas colunas
    basetable.drop(['TimeHandoff','PlayerBirthDate','GameClock','PlayerHeight','NflId','NflIdRusher','Season'], axis=1, inplace=True)
    
    # Label Encoder para variaveis categoricas
    for cat in categoricals:
        le_dict[cat] = LabelEncoder()
        basetable[cat] = le_dict[cat].fit_transform(basetable[cat[:-3]].apply(str))  
        
    # Remove as colunas categoricas originais
    basetable.drop(['GameId','TimeSnap','Team','PossessionTeam','HomeTeamAbbr','VisitorTeamAbbr','FieldPosition','PlayDirection'], axis=1, inplace=True)
    
    # Considerar somente as colunas do Feature Selection
    #basetable = basetable.loc[:,best_features]
    
    # Normalizacao
    scaled_basetable = scaler.transform(basetable)
    
    # Make predictions
    y_pred = predict(scaled_basetable)
    
    y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1).tolist()[0]
    preds_df = pd.DataFrame(data=[y_pred], columns=sample_prediction_df.columns)
    
    df_test = df_test.append(basetable)
    df_prev = df_prev.append(preds_df)
    
    env.predict(preds_df)
    
env.write_submission_file()