# NFL Competition

# Feature Engineering e Modelo de Machine Learning

- Version: 1.0: usando o padrão do fork: https://www.kaggle.com/bestpredict/location-eda-8eb410
        *    Resultado: 0.012744
        *    LB: 0.01363
   
- <font color='red'>Version: 2.0: adicionado Feature Selection com LOFO Importance</font>
        *    Resultado: 0.012780
        *    LB: 0.01365
        
- <font color='blue'>Version: 3.0: adicionadas novas features (apenas feature fxx + old_data + YardLine_std)</font>
        *    Resultado: 0.012614
        *    LB: 0.01361
        
- <font color='blue'>Version: 4.0: adicionadas novas features (turf)</font>
        *    Resultado: 0.012624
        *    LB: 0.01361
        
- <font color='red'>Version: 5.0: adicionadas novas features (game_time)</font>
        *    Resultado: 0.012635
        *    LB: 0.01362

- <font color='blue'>Version: 6.0: adicionadas novas features (feat1, feat2, feat3, feat4) e removidas (Turf + game_time)</font>
        *    Resultado: 0.012536
        *    LB: Não é permitido
    
- <font color='red'>Version: 7.0: alteração do modelo de bagging</font>
        *    Resultado: 0.012474
        *    LB: 0.01362

    
- <font color='blue'>Version: 8.0: adicionadas novas features:</font>
    norm_quat,mod_quat,norm_X,norm_Y,norm_A,norm_S, X_YardLine_std,X_YardLine_median,X_YardLine_max,X_YardLine_min
    
        *    Resultado: 0.012604
        *    LB:
    
- <font color='green'>Version: 9.0: adicionadas novas features:</font>
    
        *    Resultado:
        *    LB:   

## 1. Importa os pacotes e o dataset de treino

In [None]:
# Importar os principais pacotes
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import re
import codecs
import time
import datetime
import tsfresh
import pandasql as ps
import gc

# Silence all warnings so they do not clutter the notebook output
import warnings
warnings.filterwarnings("ignore")

# Jupyter display options: show up to 150 columns / rows when printing frames
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 150)

# Flag selecting where the training data is read from:
# True -> local '../data' copy, False -> the Kaggle input directory
TRAIN_OFFLINE = True

In [None]:
# Importa os pacotes de algoritmos de regressão
import lightgbm as lgb
from lightgbm import LGBMRegressor

# Importa os pacotes de algoritmos de redes neurais (Keras)
from keras.losses import binary_crossentropy, categorical_crossentropy
from keras.utils import to_categorical
from keras.layers import Dense,Input,Flatten,concatenate,Dropout,Lambda,BatchNormalization
from keras.layers import Activation
from keras.models import Sequential, Model
from keras.callbacks import Callback,EarlyStopping,ModelCheckpoint
import keras.backend as K
from keras.optimizers import Adam, Nadam, RMSprop
from keras import optimizers
#from keras_lookahead import Lookahead
#from keras_radam import RAdam

# Importa pacotes do sklearn
from sklearn import preprocessing
import sklearn.metrics as mtr
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import scale, MinMaxScaler, StandardScaler
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# train = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2020/train.csv', dtype={'WindSpeed': 'object'})
# Load the training set from the local copy or from the Kaggle input folder.
# WindSpeed is forced to object dtype so its raw text can be parsed later.
if TRAIN_OFFLINE:
    train = pd.read_csv('../data/train.csv', dtype={'WindSpeed': 'object'})
    #new_ft = pd.read_csv('../data/nfl-sample-features.csv')
else:
    train = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2020/train.csv', dtype={'WindSpeed': 'object'})
    

In [None]:
# Unique (GameId, PlayId, Yards) combinations; create_features_01 merges this
# frame back in as the target when it is called with deploy=False.
outcomes = train[['GameId','PlayId','Yards']].drop_duplicates()

## 2. Feature Engineering

In [None]:
def strtoseconds(txt):
    """Collapse a colon-separated 'a:b:c' string into ``a*60 + b + c/60``.

    The three leading fields are parsed as integers; the result is a float
    because of the final division.
    """
    fields = txt.split(':')
    leading = int(fields[0]) * 60
    middle = int(fields[1])
    trailing = int(fields[2]) / 60
    return leading + middle + trailing

def strtofloat(x):
    """Convert *x* to ``float``, returning ``-1`` when it cannot be parsed.

    Only the failures ``float()`` raises for bad input (``TypeError`` for
    unsupported types such as ``None``, ``ValueError`` for unparsable text)
    are mapped to the sentinel; anything else (e.g. ``KeyboardInterrupt``)
    now propagates instead of being silently swallowed.
    """
    try:
        return float(x)
    except (TypeError, ValueError):
        return -1
    
def get_time(x):
    """Return ``first * 60 + second`` for a colon-separated string.

    Only the first two fields are used; any further fields are ignored.
    """
    pieces = x.split(":")
    major = int(pieces[0])
    minor = int(pieces[1])
    return major * 60 + minor

def map_weather(txt):
    """Map a lower-cased weather description to a numeric score.

    Missing values score 0. Otherwise the first matching keyword group (in
    the order listed below) decides the score, which is halved when the
    text contains 'partly'. Text matching no keyword scores 0.
    """
    if pd.isna(txt):
        return 0

    factor = 0.5 if 'partly' in txt else 1

    # (keywords, base score) checked top to bottom; first hit wins
    rules = (
        (('climate controlled', 'indoor'), 3),
        (('sunny', 'sun'), 2),
        (('clear',), 1),
        (('cloudy',), -1),
        (('rain', 'rainy'), -2),
        (('snow',), -3),
    )
    for keywords, base in rules:
        for keyword in keywords:
            if keyword in txt:
                return base * factor
    return 0

def OffensePersonnelSplit(x):
    """Parse a comma-separated personnel string into a position -> count dict.

    Every group is split on single spaces; its last token is the position
    and the token before it the count. The eight known positions start at 0.
    """
    counts = dict.fromkeys(['DB', 'DL', 'LB', 'OL', 'QB', 'RB', 'TE', 'WR'], 0)
    for group in x.split(","):
        tokens = group.split(" ")
        position = tokens[-1]
        counts[position] = int(tokens[-2])
    return counts

def DefensePersonnelSplit(x):
    """Parse a comma-separated defensive personnel string into a count dict.

    Each group's last space-separated token is the position and the token
    before it the count. 'DB', 'DL', 'LB' and 'OL' are always present.
    """
    counts = dict.fromkeys(['DB', 'DL', 'LB', 'OL'], 0)
    for group in x.split(","):
        tokens = group.split(" ")
        position = tokens[-1]
        counts[position] = int(tokens[-2])
    return counts

def orientation_to_cat(x):
    """Bucket an angle in degrees into a 15-degree bin label.

    The angle is clipped to [0, 359] first, so the labels run from "0" to
    "23". A NaN angle cannot be converted to an integer and is labelled
    "nan".
    """
    x = np.clip(x, 0, 360 - 1)
    try:
        return str(int(x/15))
    # int() raises ValueError for NaN (OverflowError for infinities);
    # catch only those instead of using a bare except.
    except (ValueError, OverflowError):
        return "nan"
    
def split_personnel(s):
    """Split a comma-separated string and strip whitespace from each part."""
    return [piece.strip() for piece in s.split(',')]

def defense_formation(l):
    """Tally defensive personnel into ``(DL, LB, DB, other)`` counts.

    Each entry looks like "<count> <position>". 'DL' counts as DL, 'LB' and
    'OL' count as LB, and every other position counts as DB. The fourth
    element ("other") is always 0.
    """
    tally = {'DL': 0, 'LB': 0, 'DB': 0}
    for entry in l:
        tokens = entry.split(' ')
        role = tokens[1]
        if role == 'DL':
            bucket = 'DL'
        elif role in ['LB', 'OL']:
            bucket = 'LB'
        else:
            bucket = 'DB'
        tally[bucket] += int(tokens[0])

    return (tally['DL'], tally['LB'], tally['DB'], 0)

def offense_formation(l):
    """Tally offensive personnel into ``(QB, RB, WR, TE, OL)`` counts.

    Each entry looks like "<count> <position>". Positions are bucketed as:
    QB -> QB; RB and LB -> RB (a linebacker is assumed to line up as a
    full back); WR and DB -> WR (a defensive back is assumed to line up as a
    receiver); TE -> TE; anything else -> OL (e.g. a defensive lineman used
    as an extra lineman).

    When fewer than 11 players are listed, one QB is assumed if none was
    listed and all remaining unlisted players are counted as OL.
    """
    bucket_for = {'QB': 'QB', 'RB': 'RB', 'LB': 'RB',
                  'WR': 'WR', 'DB': 'WR', 'TE': 'TE'}
    tally = {'QB': 0, 'RB': 0, 'WR': 0, 'TE': 0, 'OL': 0}
    listed_players = 0
    saw_qb = False

    for entry in l:
        tokens = entry.split(' ')
        pos = tokens[1]
        cnt = int(tokens[0])
        tally[bucket_for.get(pos, 'OL')] += cnt
        listed_players += cnt
        if pos == 'QB':
            saw_qb = True

    # Fill the roster up to 11: an unlisted QB first, the rest as linemen
    missing = 11 - listed_players
    if missing > 0:
        if not saw_qb:
            tally['QB'] += 1
            missing -= 1
        tally['OL'] += missing

    return (tally['QB'], tally['RB'], tally['WR'], tally['TE'], tally['OL'])

In [None]:
def create_features_01(df, deploy=False):
    """Build the engineered base table keyed by (GameId, PlayId).

    Parameters
    ----------
    df : pandas.DataFrame
        Player-level rows. NOTE: its ``X``, ``Orientation`` and ``Dir``
        columns are overwritten in place (mirrored for rows whose
        ``PlayDirection`` is ``'left'``), so do not pass the same frame twice.
    deploy : bool, default False
        When False, the module-level ``outcomes`` frame is inner-merged in
        on (GameId, PlayId). When True that merge is skipped.

    Returns
    -------
    pandas.DataFrame
        Inner join, on (GameId, PlayId), of the distance-to-rusher features,
        the rusher "crawl" velocity features, the rusher's static features,
        the defense distance features and the personnel counts.
    """
    play_keys = ['GameId', 'PlayId']

    def rusher_rows(frame, columns):
        """Return an independent copy of the ball carrier rows of *frame*."""
        return frame.loc[frame['NflId'] == frame['NflIdRusher'], columns].copy()

    def euclidean_distance(x1, y1, x2, y2):
        """Element-wise straight-line distance between two points."""
        return np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)

    def mirrored_angle(angle):
        """Return 360 - angle, with an exact 360 folded back to 0."""
        flipped = 360.0 - angle
        return flipped.where(flipped != 360.0, 0.0)

    def update_yardline(df):
        """Express the line of scrimmage on the 0-120 field axis, per play."""
        yardline = rusher_rows(df, ['GameId', 'PlayId', 'PossessionTeam',
                                    'FieldPosition', 'YardLine'])
        own_side = yardline['PossessionTeam'] == yardline['FieldPosition']
        yardline['YardLine'] = np.where(
            own_side,
            # 10 yard endzone plus the line of scrimmage
            10.0 + yardline['YardLine'],
            # half the field plus the yards between midfield and the line
            60.0 + (50 - yardline['YardLine']))
        return yardline[['GameId', 'PlayId', 'YardLine']]

    def update_orientation(df, yardline):
        """Mirror left-moving plays and attach the standardised yard line."""
        is_left = df['PlayDirection'] == 'left'
        # These three assignments modify the caller's frame in place.
        df['X'] = np.where(is_left, 120.0 - df['X'], df['X'])
        df['Orientation'] = np.where(is_left, mirrored_angle(df['Orientation']), df['Orientation'])
        df['Dir'] = np.where(is_left, mirrored_angle(df['Dir']), df['Dir'])
        df = df.drop('YardLine', axis=1)
        df = pd.merge(df, yardline, on=play_keys, how='inner')
        return df

    def back_features(df):
        """Position of the rusher relative to the line, plus angle flags."""
        carriers = rusher_rows(df, ['GameId', 'PlayId', 'NflIdRusher', 'X', 'Y',
                                    'Orientation', 'Dir', 'YardLine'])
        carriers['back_from_scrimmage'] = carriers['YardLine'] - carriers['X']
        # 1 when the angle is above 180 degrees, else 0 (NaN compares False)
        carriers['back_oriented_down_field'] = (carriers['Orientation'] > 180.0).astype('int64')
        carriers['back_moving_down_field'] = (carriers['Dir'] > 180.0).astype('int64')
        carriers = carriers.rename(columns={'X': 'back_X', 'Y': 'back_Y'})
        return carriers[['GameId', 'PlayId', 'NflIdRusher', 'back_X', 'back_Y',
                         'back_from_scrimmage', 'back_oriented_down_field',
                         'back_moving_down_field']]

    def features_relative_to_back(df, carriers):
        """min/max/mean/std distance from every other player to the rusher."""
        player_distance = df[['GameId', 'PlayId', 'NflId', 'X', 'Y']]
        player_distance = pd.merge(player_distance, carriers, on=play_keys, how='inner')
        player_distance = player_distance[player_distance['NflId'] != player_distance['NflIdRusher']].copy()
        player_distance['dist_to_back'] = euclidean_distance(
            player_distance['X'], player_distance['Y'],
            player_distance['back_X'], player_distance['back_Y'])

        group_cols = ['GameId', 'PlayId', 'back_from_scrimmage',
                      'back_oriented_down_field', 'back_moving_down_field']
        player_distance = player_distance.groupby(group_cols)\
                                         .agg({'dist_to_back': ['min', 'max', 'mean', 'std']})\
                                         .reset_index()
        player_distance.columns = group_cols + ['min_dist', 'max_dist', 'mean_dist', 'std_dist']

        return player_distance

    def defense_features(df):
        """min/max/mean/std distance from the non-rushing team to the rusher."""
        rusher = rusher_rows(df, ['GameId', 'PlayId', 'Team', 'X', 'Y'])
        rusher.columns = ['GameId', 'PlayId', 'RusherTeam', 'RusherX', 'RusherY']

        defense = pd.merge(df, rusher, on=play_keys, how='inner')
        defense = defense.loc[defense['Team'] != defense['RusherTeam'],
                              ['GameId', 'PlayId', 'X', 'Y', 'RusherX', 'RusherY']].copy()
        defense['def_dist_to_back'] = euclidean_distance(
            defense['X'], defense['Y'], defense['RusherX'], defense['RusherY'])

        defense = defense.groupby(play_keys)\
                         .agg({'def_dist_to_back': ['min', 'max', 'mean', 'std']})\
                         .reset_index()
        defense.columns = ['GameId', 'PlayId', 'def_min_dist', 'def_max_dist',
                           'def_mean_dist', 'def_std_dist']

        return defense

    def crawl_dist(df):
        """Law-of-cosines length built from the rusher's X, Y and Dir.

        ``Crawller_X`` is set when 90 < Dir < 270 and ``Crawller_y`` (negated)
        for the complementary range; the other one is 0.
        """
        crawl = rusher_rows(df, ['X', 'Y', 'Dir', 'GameId', 'NflIdRusher', 'PlayId'])
        theta = crawl['Dir']
        # FIX: Dir is in degrees (it is compared against 90/270 below) while
        # np.cos expects radians, so convert before taking the cosine.
        squared = (crawl['X'] ** 2 + crawl['Y'] ** 2
                   - 2 * crawl['X'] * crawl['Y'] * np.cos(np.radians(theta)))
        # Guard against a tiny negative value caused by rounding.
        third_side = np.sqrt(squared.clip(lower=0))

        crawl['Crawller_X'] = np.where((theta > 90.0) & (theta < 270.0), third_side, 0.0)
        # FIX: the original tested `theta <= 90 and theta >= 270`, which can
        # never be true, so this feature was constantly 0.
        crawl['Crawller_y'] = np.where((theta <= 90.0) | (theta >= 270.0), -third_side, 0.0)
        return crawl[['PlayId', 'GameId', 'Crawller_X', 'Crawller_y', 'NflIdRusher']]

    def crawl_sec(df):
        """Rusher speed divided by acceleration (inf/NaN when A is 0)."""
        crawls = rusher_rows(df, ['A', 'S', 'PlayId', 'GameId', 'NflIdRusher'])
        crawls['Crawller_second'] = crawls['S'] / crawls['A']
        return crawls[['GameId', 'NflIdRusher', 'PlayId', 'Crawller_second']]

    def crawlling_velocity(df, crawl, crawls):
        """Velocity-style features derived from the crawl distance and time."""
        player_on_pitch = rusher_rows(df, ['X', 'Y', 'PlayId', 'GameId', 'NflIdRusher'])
        player_on_pitch = pd.merge(player_on_pitch, crawl, on=play_keys, how='inner')
        player_on_pitch = pd.merge(player_on_pitch, crawls, on=play_keys, how='inner')
        player_on_pitch['Velocity_X'] = (player_on_pitch['X'] - player_on_pitch['Crawller_X']) / player_on_pitch['Crawller_second']
        player_on_pitch['Velocity_y'] = (player_on_pitch['Y'] - player_on_pitch['Crawller_y']) / player_on_pitch['Crawller_second']
        x_diff = np.sqrt((player_on_pitch['Velocity_X'] - player_on_pitch['X']) ** 2)
        y_diff = np.sqrt((player_on_pitch['Velocity_y'] - player_on_pitch['Y']) ** 2)
        player_on_pitch['Velocity_percent'] = np.sqrt(x_diff + y_diff)

        group_cols = ['GameId', 'PlayId', 'Velocity_X', 'Velocity_y',
                      'Crawller_X', 'Crawller_y', 'Crawller_second']
        # NOTE(review): groupby drops rows whose key is NaN by default, so a
        # play with S == 0 and A == 0 (0/0) disappears here — confirm intended.
        player_on_pitch = player_on_pitch.groupby(group_cols)\
                                         .agg({'Velocity_percent': ['min', 'max', 'mean']})\
                                         .reset_index()

        # FIX: reset_index() yields the group keys first and the aggregates
        # last. The original list put the three aggregate names right after
        # the ids, so every column from the third on carried the wrong label.
        # The misspelt 'max_velocity_percrent' is kept for compatibility.
        player_on_pitch.columns = group_cols + ['min_velocity_percent',
                                                'max_velocity_percrent',
                                                'mean_velocity_percent']
        # Keep the column order the original produced.
        return player_on_pitch[['GameId', 'PlayId', 'min_velocity_percent',
                                'max_velocity_percrent', 'mean_velocity_percent',
                                'Velocity_X', 'Velocity_y', 'Crawller_X',
                                'Crawller_y', 'Crawller_second']]

    def static_features(df):
        """Rusher-row features that need no aggregation (adds columns to df)."""
        add_new_feas = []

        ## Height ("feet-inches" text converted to inches)
        df['PlayerHeight_dense'] = df['PlayerHeight'].apply(lambda x: 12*int(x.split('-')[0])+int(x.split('-')[1]))
        add_new_feas.append('PlayerHeight_dense')

        ## Time (pd.to_datetime guarantees a datetime64 column for the
        ## vectorised subtractions below)
        stamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
        df['TimeHandoff'] = pd.to_datetime(df['TimeHandoff'].apply(lambda x: datetime.datetime.strptime(x, stamp_format)))
        df['TimeSnap'] = pd.to_datetime(df['TimeSnap'].apply(lambda x: datetime.datetime.strptime(x, stamp_format)))

        df['TimeDelta'] = (df['TimeHandoff'] - df['TimeSnap']).dt.total_seconds()
        df['PlayerBirthDate'] = pd.to_datetime(df['PlayerBirthDate'].apply(lambda x: datetime.datetime.strptime(x, "%m/%d/%Y")))

        ## Age (years at hand-off)
        seconds_in_year = 60*60*24*365.25
        df['PlayerAge'] = (df['TimeHandoff'] - df['PlayerBirthDate']).dt.total_seconds() / seconds_in_year
        add_new_feas.append('PlayerAge')

        ## "BMI" (as defined here: weight divided by height in inches)
        df['BMI'] = df['PlayerWeight'] / df['PlayerHeight_dense']
        add_new_feas.append('BMI')

        ## WindSpeed: strip the unit, average "a-b" ranges and
        ## "a gusts up to b" texts, anything unparsable becomes -1
        df['WindSpeed_ob'] = df['WindSpeed'].apply(lambda x: x.lower().replace('mph', '').strip() if not pd.isna(x) else x)
        df['WindSpeed_ob'] = df['WindSpeed_ob'].apply(lambda x: (int(x.split('-')[0])+int(x.split('-')[1]))/2 if not pd.isna(x) and '-' in x else x)
        df['WindSpeed_ob'] = df['WindSpeed_ob'].apply(lambda x: (int(x.split()[0])+int(x.split()[-1]))/2 if isinstance(x, str) and 'gusts up to' in x else x)
        df['WindSpeed_dense'] = df['WindSpeed_ob'].apply(strtofloat)
        add_new_feas.append('WindSpeed_dense')

        ## Weather: normalise spelling, then score with map_weather
        df['GameWeather_process'] = df['GameWeather'].str.lower()
        df['GameWeather_process'] = df['GameWeather_process'].apply(lambda x: "indoor" if not pd.isna(x) and "indoor" in x else x)
        df['GameWeather_process'] = df['GameWeather_process'].apply(lambda x: x.replace('coudy', 'cloudy').replace('clouidy', 'cloudy').replace('party', 'partly') if not pd.isna(x) else x)
        df['GameWeather_process'] = df['GameWeather_process'].apply(lambda x: x.replace('clear and sunny', 'sunny and clear') if not pd.isna(x) else x)
        df['GameWeather_process'] = df['GameWeather_process'].apply(lambda x: x.replace('skies', '').replace("mostly", "").strip() if not pd.isna(x) else x)
        df['GameWeather_dense'] = df['GameWeather_process'].apply(map_weather)
        add_new_feas.append('GameWeather_dense')

        ## Orientation and Dir
        ## (the *_ob and Orientation_* columns are not part of the returned frame)
        df["Orientation_ob"] = df["Orientation"].apply(lambda x : orientation_to_cat(x)).astype("object")
        df["Dir_ob"] = df["Dir"].apply(lambda x : orientation_to_cat(x)).astype("object")

        df["Orientation_sin"] = np.sin(df["Orientation"]/360 * 2 * np.pi)
        df["Orientation_cos"] = np.cos(df["Orientation"]/360 * 2 * np.pi)

        df["Dir_sin"] = np.sin(df["Dir"]/360 * 2 * np.pi)
        df["Dir_cos"] = np.cos(df["Dir"]/360 * 2 * np.pi)

        add_new_feas.append("Dir_sin")
        add_new_feas.append("Dir_cos")

        ## diff Score
        df["diffScoreBeforePlay"] = df["HomeScoreBeforePlay"] - df["VisitorScoreBeforePlay"]
        add_new_feas.append("diffScoreBeforePlay")

        rusher_static = df[df['NflId'] == df['NflIdRusher']][add_new_feas+['GameId','PlayId','X','Y','S','A','Dis','Orientation','Dir',
                                                                           'YardLine','Quarter','Down','Distance','DefendersInTheBox',
                                                                           'NflId','NflIdRusher','PossessionTeam','HomeTeamAbbr','Turf',
                                                                           'VisitorTeamAbbr','PlayDirection','GameClock','Season','Team',
                                                                           'FieldPosition']].drop_duplicates()
        # Missing values are replaced by the sentinel -999
        return rusher_static.fillna(-999)

    def personnel_features(df):
        """Counts per position group for offense and defense, per play."""
        personnel = df[['GameId','PlayId','OffensePersonnel','DefensePersonnel']].drop_duplicates()

        defense_counts = personnel['DefensePersonnel'].apply(lambda x: defense_formation(split_personnel(x)))
        for slot, column in enumerate(['num_DL', 'num_LB', 'num_DB']):
            personnel[column] = defense_counts.apply(lambda counts, slot=slot: counts[slot])

        offense_counts = personnel['OffensePersonnel'].apply(lambda x: offense_formation(split_personnel(x)))
        for slot, column in enumerate(['num_QB', 'num_RB', 'num_WR', 'num_TE', 'num_OL']):
            personnel[column] = offense_counts.apply(lambda counts, slot=slot: counts[slot])

        # Let's create some features to specify if the OL is covered
        personnel['OL_diff'] = personnel['num_OL'] - personnel['num_DL']
        personnel['OL_TE_diff'] = (personnel['num_OL'] + personnel['num_TE']) - personnel['num_DL']
        # Let's create a feature to specify if the defense is preventing the run
        # Let's just assume 7 or more DL and LB is run prevention
        personnel['run_def'] = (personnel['num_DL'] + personnel['num_LB'] > 6).astype(int)

        return personnel.drop(['OffensePersonnel','DefensePersonnel'], axis=1)

    def combine_features(relative_to_back, player_on_pitch, defense, static, personnel, deploy=deploy):
        """Inner-join all feature frames (and the target unless deploying).

        The join order (distance, velocity, static, defense, personnel)
        reproduces the column order the original code produced.
        """
        df = relative_to_back
        for part in (player_on_pitch, static, defense, personnel):
            df = pd.merge(df, part, on=play_keys, how='inner')

        if not deploy:
            df = pd.merge(df, outcomes, on=play_keys, how='inner')

        return df

    yardline = update_yardline(df)
    df = update_orientation(df, yardline)

    crawlling_dist = crawl_dist(df)
    crawlling_second = crawl_sec(df)
    crawlling_velocity_diff = crawlling_velocity(df, crawlling_dist, crawlling_second)

    back_feats = back_features(df)
    rel_back = features_relative_to_back(df, back_feats)
    def_feats = defense_features(df)
    static_feats = static_features(df)
    personnel = personnel_features(df)
    # FIX: pass by keyword — the original positional call handed the defense
    # frame to `player_on_pitch` and the velocity frame to `defense`.
    basetable = combine_features(relative_to_back=rel_back,
                                 player_on_pitch=crawlling_velocity_diff,
                                 defense=def_feats,
                                 static=static_feats,
                                 personnel=personnel,
                                 deploy=deploy)

    return basetable


def uid_aggregation(comb, main_columns, uids, aggregations):
    """Build group-level aggregate columns aligned to the rows of *comb*.

    For every (main_column, uid, aggregation) combination, *main_column* is
    aggregated per distinct value of *uid* and the result is mapped back onto
    each row through its *uid* value. Output columns are named
    ``<uid>_<main_column>_<aggregation>``.

    Returns a new DataFrame containing only the generated columns.
    """
    X = pd.DataFrame()
    combos = ((target, key, how)
              for target in main_columns
              for key in uids
              for how in aggregations)
    for main_column, col, agg_type in combos:
        new_col_name = col + '_' + main_column + '_' + agg_type
        stats = comb[[col, main_column]].groupby([col])[main_column].agg([agg_type]).reset_index()
        # group value -> aggregated value
        lookup = stats.set_index(col)[agg_type].to_dict()

        X[new_col_name] = comb[col].map(lookup)
        del stats, lookup
        gc.collect()
    return X

In [None]:
def create_features_02(t_):
    """Add a second batch of derived columns to *t_* and return the result.

    The columns assigned before the merge on ``PlayId`` are written into the
    caller's frame; everything after it lives on the new frame that is
    returned. The returned frame is re-sorted and has a fresh integer index.
    Relies on the sibling helpers ``get_time`` and ``uid_aggregation``.
    """
    # sqrt(|X^2 - Y^2|); wrapped in a Series, so it is aligned on t_'s index
    # NOTE(review): assumes t_ has a default 0..n-1 index — confirm
    t_['fe1'] = pd.Series(np.sqrt(np.absolute(np.square(t_.X.values) - np.square(t_.Y.values))))
    # S^2 + 2*A*Dis
    t_['fe5'] = np.square(t_['S'].values) + 2 * t_['A'].values * t_['Dis'].values  # N
    # arccos of X/Y clipped to [-1, 1]
    t_['fe7'] = np.arccos(np.clip(t_['X'].values / t_['Y'].values, -1, 1))  # N
    # S divided by fe1, with fe1 floored at 0.6
    t_['fe8'] = t_['S'].values / np.clip(t_['fe1'].values, 0.6, None)
    # (90 - Dir) converted from degrees to radians
    radian_angle = (90 - t_['Dir']) * np.pi / 180.0
    t_['fe10'] = np.abs(t_['S'] * np.cos(radian_angle))
    t_['fe11'] = np.abs(t_['S'] * np.sin(radian_angle))
    
    # Flag the rusher row and attach the rusher's team to every row of the play
    # NOTE(review): the merge key is PlayId alone (no GameId) — confirm PlayId is unique across games
    t_['IsRusher'] = (t_['NflId'] == t_['NflIdRusher'])
    temp = t_[t_["IsRusher"]][["Team", "PlayId"]].rename(columns={"Team":"RusherTeam"})
    t_ = t_.merge(temp, on = "PlayId")
    t_["IsRusherTeam"] = t_["Team"] == t_["RusherTeam"]
    
    #t_["is_rusher"]          = 1.0*(t_["NflId"] == t_["NflIdRusher"])
    #t_["is_home"]            = t_["Team"] == "home"
    # +1.0 when the team equals the home abbreviation, -1.0 for the visitor, 0.0 otherwise
    t_["is_possession_team"] = 1.0*(t_["PossessionTeam"] == t_["HomeTeamAbbr"]) - 1.0*(t_["PossessionTeam"] == t_["VisitorTeamAbbr"])
    t_["is_field_team"]      = 1.0*(t_["FieldPosition"] == t_["HomeTeamAbbr"]) - 1.0*(t_["FieldPosition"] == t_["VisitorTeamAbbr"])
    t_["is_left"]            = t_["PlayDirection"] == "left"
    
    t_["game_time"]   = t_["GameClock"].apply(get_time)
    t_["old_data"]    = t_["Season"] == 2017
    t_['YardLine_std'] = 100 - t_['YardLine']
    
    
    # Raw turf names -> 'Artificial' / 'Natural'; names missing from the dict map to NaN
    Turf = {'Field Turf':'Artificial', 'A-Turf Titan':'Artificial', 'Grass':'Natural', 'UBU Sports Speed S5-M':'Artificial', 
            'Artificial':'Artificial', 'DD GrassMaster':'Artificial', 'Natural Grass':'Natural', 
            'UBU Speed Series-S5-M':'Artificial', 'FieldTurf':'Artificial', 'FieldTurf 360':'Artificial', 'Natural grass':'Natural', 'grass':'Natural', 
            'Natural':'Natural', 'Artifical':'Artificial', 'FieldTurf360':'Artificial', 'Naturall Grass':'Natural', 'Field turf':'Artificial', 
            'SISGrass':'Artificial', 'Twenty-Four/Seven Turf':'Artificial', 'natural grass':'Natural'} 

    t_['Turf'] = t_['Turf'].map(Turf)
    # Boolean: True only for 'Natural' (artificial and unmapped surfaces are False)
    t_['Turf'] = t_['Turf'] == 'Natural'
    
    # Mirror X and Y for plays going left
    # NOTE(review): create_features_01 already mirrors X for left plays; if t_ is its
    # output, X_std undoes that flip — confirm this is intended
    t_['X_std'] = t_['X']
    t_.loc[t_['is_left'], 'X_std'] = 120 - t_.loc[t_['is_left'], 'X'] 
    t_['Y_std'] = t_['Y']
    t_.loc[t_['is_left'], 'Y_std'] = 160/3 - t_.loc[t_['is_left'], 'Y'] 
    
    
    # Orientation in radians; rows with Season >= 2018 are shifted by -90 degrees first,
    # and left plays are rotated by pi
    t_['Orientation_rad'] = np.mod(t_.Orientation, 360) * np.pi/180.0
    t_.loc[t_.Season >= 2018, 'Orientation_rad'] = np.mod(t_.loc[t_.Season >= 2018, 'Orientation'] - 90, 360) * np.pi/180.0
    t_['Orientation_std'] = t_.Orientation_rad
    t_.loc[t_.is_left, 'Orientation_std'] = np.mod(np.pi + t_.loc[t_.is_left, 'Orientation_rad'], 2*np.pi)

    
    
    # Position, velocity and acceleration components relative to the yard line,
    # signed by play direction (right = 1, left = -1)
    # NOTE(review): vel*/acc* use Orientation, not Dir, as the angle — confirm
    t_["Start"] = t_["YardLine"]
    t_['PlayDirection_new'] = t_['PlayDirection'].map({'right': 1, 'left': -1})
    t_['Orientation_new'] = 2 * np.pi * (90 - t_['Orientation']) / 360
    t_['locX'] = (t_['X'].values - t_['Start'].values) * t_['PlayDirection_new'].values
    t_['locY'] = t_['Y'].values - 53.3 / 2
    t_['velX'] = t_['S'].values * np.cos(t_['Orientation_new'].values) * t_['PlayDirection_new'].values
    t_['velY'] = t_['S'].values * np.sin(t_['Orientation_new'].values)
    t_['accX'] = t_['A'].values * np.cos(t_['Orientation_new'].values) * t_['PlayDirection_new'].values
    t_['accY'] = t_['A'].values * np.sin(t_['Orientation_new'].values)
    
    
    # YardLine when FieldPosition equals PossessionTeam, otherwise 100 - YardLine
    t_['YardsFromOwnGoal'] = np.where(t_.FieldPosition == t_.PossessionTeam,t_.YardLine, 50 + (50-t_.YardLine))
    # Values of the previous row (depends on the current row order of t_)
    t_[['prev_game', 'prev_play', 'prev_team', 'prev_yfog']] = t_[['GameId', 'PlayId', 'Team', 'YardsFromOwnGoal']].shift(1)

    # est_prev_yards is filled only where the previous row is the same game and team
    # and the PlayId gap is below 30; it stays NaN elsewhere
    filt = (t_.GameId==t_.prev_game) & (t_.Team==t_.prev_team) & (t_.PlayId-t_.prev_play<30)
    t_.loc[filt,'est_prev_yards'] = t_[filt]['YardsFromOwnGoal'] - t_[filt]['prev_yfog']

    
    # Treat (X, Y, A, S) as a 4-vector: squared norm, norm, and unit components
    t_['norm_quat'] = (t_['X']**2 + t_['Y']**2 + t_['A']**2 + t_['S']**2)
    t_['mod_quat'] = (t_['norm_quat'])**0.5
    t_['norm_X'] = t_['X'] / t_['mod_quat']
    t_['norm_Y'] = t_['Y'] / t_['mod_quat']
    t_['norm_A'] = t_['A'] / t_['mod_quat']
    t_['norm_S'] = t_['S'] / t_['mod_quat']    

    # YardLine statistics per distinct X value, appended as X_YardLine_<agg> columns
    i_cols = ['YardLine']
    uids = ['X']
    aggregations = ['mean','std','median','max','min']
    X_agg = uid_aggregation(t_, i_cols, uids, aggregations)
    t_ = pd.concat([t_, X_agg], axis=1)

    # NOTE(review): only the last sort_values determines the final order; the
    # earlier single-column sorts act as tie-breakers only if the sort is stable — confirm
    t_ = t_.sort_values(by = ['X']).sort_values(by = ['Dis']).sort_values(by=['PlayId', 'IsRusherTeam', 'IsRusher']).reset_index(drop = True)


    return t_

def logs(res, ls):
    """Return a copy of *res* with a ``<name>_log`` column for each name in *ls*.

    Each new column holds ``log(1.01 + res[name])``. The input frame is not
    modified. Columns are added by name rather than by renaming through
    ``res.columns.values``, which mutated the Index's internal array.
    """
    out = res.copy()
    for name in ls:
        out[name + '_log'] = np.log(1.01 + out[name]).to_numpy()
    return out

def squares(res, ls):
    """Return a copy of *res* with a ``<name>_sq`` column for each name in *ls*.

    Each new column holds ``res[name] * res[name]``. The input frame is not
    modified. Columns are added by name rather than by renaming through
    ``res.columns.values``, which mutated the Index's internal array.
    """
    out = res.copy()
    for name in ls:
        out[name + '_sq'] = (out[name] * out[name]).to_numpy()
    return out


In [None]:
# Step 01: build the base feature table from the training rows
# (deploy=False, so the outcomes frame is merged in); %time reports the run time
%time train_basetable = create_features_01(train, False)

In [None]:
# Step 02: add the second batch of derived features; %time reports the run time
%time train_basetable = create_features_02(train_basetable)

In [None]:
# Preview the first rows of the feature table
train_basetable.head()

In [None]:
# Create the new features (step 03): a log(1.01 + value) copy of each listed column
log_features = ['back_from_scrimmage','X','Y','S','A','Dis','Orientation','Dir','YardLine','PlayerAge']
train_basetable = logs(train_basetable, log_features)

In [None]:
# Convert categorical variables to numeric codes with LabelEncoder
# NOTE(review): `le` is not used in this cell; the per-column encoders in le_dict are
le = preprocessing.LabelEncoder()

# One fitted encoder per column, keyed by the encoded column name
le_dict = {}
categoricals = ['Team_le','PossessionTeam_le','HomeTeamAbbr_le','VisitorTeamAbbr_le',
                'FieldPosition_le','PlayDirection_le']

for cat in categoricals:
    le_dict[cat] = LabelEncoder()
    # cat[:-3] strips the '_le' suffix to get the source column; values are cast to str first
    train_basetable[cat] = le_dict[cat].fit_transform(train_basetable[cat[:-3]].apply(str))  

# Remove the original features that were transformed
#X.drop(['TimeSnap','Team','PossessionTeam','HomeTeamAbbr','VisitorTeamAbbr','FieldPosition','PlayDirection'], axis=1, inplace=True)
train_basetable.shape

## 3. Feature Selection

In [None]:
# Work on a copy of the dataset so train_basetable is kept as a backup;
# missing values in the copy are replaced by 0
X = train_basetable.copy()
X.fillna(0,inplace=True)
X.shape

In [None]:
# Drop, in place, the columns listed below (presumably those that should not
# be used as model features: ids, raw text columns and intermediate helpers)
X.drop(['NflId','NflIdRusher','PossessionTeam','HomeTeamAbbr','Turf',
                      'VisitorTeamAbbr','PlayDirection','GameClock','Season',
                      'Team','FieldPosition','Crawller_X','Crawller_y','prev_game','prev_play','prev_team','RusherTeam'], axis=1, inplace=True)

In [None]:
# Preview the first rows after dropping columns
X.head()

In [None]:
import scipy.sparse as ss

class Dataset:
    """
    Dataset for LOFO
    Parameters
    ----------
    df: pandas dataframe
    target: string
        Column name for target within df
    features: list of strings
        List of column names within df
    feature_groups: dict, optional
        Name, value dictionary of feature groups as numpy.ndarray or scipy.sparse.csr_matrix
    """

    def __init__(self, df, target, features, feature_groups=None):
        self.df = df.copy()
        self.features = list(features)
        self.feature_groups = feature_groups if feature_groups else dict()

        self.num_rows = df.shape[0]
        self.y = df[target].values

        # Validate every feature group: supported type, matching row count,
        # and a name that does not clash with a plain feature.
        for feature_name, feature_matrix in self.feature_groups.items():
            # ss.csr_matrix is the public name; the ss.csr submodule is a
            # deprecated private namespace.
            if not isinstance(feature_matrix, (np.ndarray, ss.csr_matrix)):
                raise TypeError("Data type {dtype} is not a valid type!".format(dtype=type(feature_matrix)))

            if feature_matrix.shape[0] != self.num_rows:
                raise ValueError("Expected {expected} rows but got {n} rows!".format(expected=self.num_rows,
                                                                                      n=feature_matrix.shape[0]))

            if feature_name in self.features:
                # FIX: the placeholder was never filled in, so the message
                # used to contain the literal text '{name}'.
                raise ValueError("Feature group name '{name}' is the same with one of the features!".format(
                    name=feature_name))

    def getX(self, feature_to_remove, fit_params):
        """Get feature matrix and fit_params after removing a feature
        Parameters
        ----------
        feature_to_remove : string
            feature name to remove
        fit_params : dict
            fit parameters for the model
        Returns
        -------
        X : numpy.ndarray or scipy sparse matrix
            Feature matrix
        fit_params: dict
            Updated fit_params after feature removal
        """
        feature_list = [feature for feature in self.features if feature != feature_to_remove]
        kept_groups = [matrix for name, matrix in self.feature_groups.items()
                       if name != feature_to_remove]
        concat_list = [self.df[feature_list].values] + kept_groups

        # Work on a shallow copy so the caller's dict is left untouched
        fit_params = fit_params.copy()
        if "categorical_feature" in fit_params:
            # Translate the remaining categorical names into column positions
            cat_features = [f for f in fit_params["categorical_feature"] if f != feature_to_remove]
            fit_params["categorical_feature"] = [ix for ix, f in enumerate(feature_list) if (f in cat_features)]

        # A single sparse group forces a sparse horizontal stack
        has_sparse = any(isinstance(matrix, ss.csr_matrix) for matrix in kept_groups)
        concat = ss.hstack if has_sparse else np.hstack

        return concat(concat_list), fit_params

In [None]:
def plot_importance(importance_df, figsize=(8, 8)):
    """Draw a horizontal bar chart of feature importances.

    Parameters
    ----------
    importance_df : pandas dataframe
        Output dataframe from LOFO/FLOFO get_importance; must provide the
        columns "feature", "importance_mean" and "importance_std"
    figsize : tuple
        Figure size handed to the pandas plot call

    Bars are sorted by mean importance, drawn green when the mean is
    positive and red otherwise, with the standard deviation as error bar.
    The input frame is left untouched.
    """
    frame = importance_df.copy()
    is_positive = frame["importance_mean"] > 0
    frame["color"] = is_positive.map({True: 'g', False: 'r'})
    frame.sort_values("importance_mean", inplace=True)

    frame.plot(x="feature", y="importance_mean", xerr="importance_std",
               kind='barh', color=frame["color"], figsize=figsize)

In [None]:
from sklearn.model_selection import cross_validate
import multiprocessing
#from lofo.infer_defaults import infer_model


class LOFOImportance:
    """
    Leave One Feature Out Importance
    Given a model and cross-validation scheme, calculates the feature importances.
    Parameters
    ----------
    dataset: LOFO Dataset object
    scoring: string or callable
        Same as scoring in sklearn API
    model: model (sklearn API), optional
        Not trained model object
    fit_params : dict, optional
        fit parameters for the model (copied, never mutated)
    cv: int or iterable
        Same as cv in sklearn API
    n_jobs: int, optional
        Number of jobs for parallel computation
    """

    def __init__(self, dataset, scoring, model=None, fit_params=None, cv=4, n_jobs=None):

        # Keep a private copy: the default-model branch below adds a key and
        # must not leak it into the caller's dict.
        self.fit_params = dict(fit_params) if fit_params else dict()
        if model is None:
            # NOTE(review): infer_model lives in lofo.infer_defaults, whose import
            # is commented out in this cell; model=None raises NameError until
            # that import is restored.
            model, dataset.df, categoricals, dataset.y = infer_model(dataset.df, dataset.features, dataset.y, n_jobs)
            self.fit_params["categorical_feature"] = categoricals
            n_jobs = 1

        self.model = model
        self.dataset = dataset
        self.scoring = scoring
        self.cv = cv
        self.n_jobs = n_jobs
        if self.n_jobs is not None and self.n_jobs > 1:
            warning_str = ("Warning: If your model is multithreaded, please initialise the number "
                           "of jobs of LOFO to be equal to 1, otherwise you may experience performance issues.")
            warnings.warn(warning_str)

    def _get_cv_score(self, feature_to_remove):
        """Return the per-fold test scores with ``feature_to_remove`` left out.

        Passing ``None`` scores the full feature set (the baseline).
        """
        X, fit_params = self.dataset.getX(feature_to_remove=feature_to_remove, fit_params=self.fit_params)
        y = self.dataset.y

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            cv_results = cross_validate(self.model, X, y, cv=self.cv, scoring=self.scoring, fit_params=fit_params)
        return cv_results['test_score']

    def _get_cv_score_parallel(self, feature, result_queue):
        """Score one feature and also push ``(feature, scores)`` onto a queue.

        Retained for backward compatibility; ``get_importance`` no longer
        relies on the queue.
        """
        test_score = self._get_cv_score(feature_to_remove=feature)
        result_queue.put((feature, test_score))
        return test_score

    def get_importance(self):
        """Run LOFO to get feature importances
        Returns
        -------
        importance_df : pandas dataframe
            Dataframe with feature names and corresponding importance mean and std (sorted by importance)
        """
        base_cv_score = self._get_cv_score(feature_to_remove=None)
        feature_list = self.dataset.features + list(self.dataset.feature_groups.keys())

        if self.n_jobs is not None and self.n_jobs > 1:
            # The context manager always releases the worker processes.
            with multiprocessing.Pool(self.n_jobs) as pool:
                pending = [pool.apply_async(self._get_cv_score, (f,)) for f in feature_list]
                pool.close()
                pool.join()
                # AsyncResult.get() re-raises a worker's exception here instead
                # of leaving the parent blocked on a result that never arrives.
                lofo_cv_scores = [job.get() for job in pending]
        else:
            lofo_cv_scores = [self._get_cv_score(feature_to_remove=f) for f in tqdm_notebook(feature_list)]

        # Importance = baseline score minus score without the feature, per fold.
        lofo_cv_scores_normalized = np.array([base_cv_score - lofo_cv_score for lofo_cv_score in lofo_cv_scores])

        importance_df = pd.DataFrame()
        importance_df["feature"] = feature_list
        importance_df["importance_mean"] = lofo_cv_scores_normalized.mean(axis=1)
        importance_df["importance_std"] = lofo_cv_scores_normalized.std(axis=1)

        return importance_df.sort_values("importance_mean", ascending=False)

In [None]:
# Rank the features with LOFO importance, using LightGBM as the model.
# (The LOFO classes are defined inline in this notebook instead of imported:)
#from lofo import LOFOImportance, Dataset, plot_importance

n_folds = 3
# No random_state is given, so the shuffled folds differ between runs.
kfold_lgb = KFold(n_folds, shuffle=True)

# Every column except the target and the two identifier columns.
features = [x for x in X.columns if x not in ['Yards','GameId','PlayId']]

params2 = {'num_leaves': 15,
          'objective': 'mae',
          #'learning_rate': 0.1,
          "boosting": "gbdt",
          "num_rounds": 100
          }

model_lgb = lgb.LGBMRegressor(**params2)
dataset = Dataset(df=X, target="Yards", features=features)
lofo_imp = LOFOImportance(dataset, model=model_lgb, cv=kfold_lgb, scoring="neg_mean_absolute_error")

# One cross-validation run per feature plus one baseline run.
importance_df = lofo_imp.get_importance()

In [None]:
# Show the LOFO importance chart for all features
plot_importance(importance_df, figsize=(12, 18))

In [None]:
# Keep the features whose mean LOFO importance is strictly positive.
best_features = importance_df.loc[importance_df['importance_mean'] > 0].feature
best_features

In [None]:
# Build the matrix used for random-forest feature selection.
# The target and the identifier columns are removed first: keeping 'Yards' among
# the inputs would leak the target into the importance ranking.
new_X = X.drop(columns=['GameId', 'PlayId', 'Yards'], errors='ignore')
#new_X = X.loc[:,best_features]
new_X.replace([np.inf, -np.inf], 0, inplace=True)

# Standardise the training features
scaler = StandardScaler()
new_X2 = scaler.fit_transform(new_X)

target = X.Yards

# One-hot encode the yardage over 199 classes; class index = Yards + 99.
# Vectorised so that `target` is not overwritten by a loop variable.
y = np.zeros((target.shape[0], 199))
y[np.arange(target.shape[0]), 99 + target.values.astype(int)] = 1


new_X2.shape, y.shape

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

# Random forest fitted on the scaled features against the one-hot target matrix;
# it is used below only as the importance source for SelectFromModel.
rf_clf = RandomForestRegressor(bootstrap=False, max_features=0.3, min_samples_leaf=15,
                              min_samples_split=7, n_estimators=1000, n_jobs=-1, random_state=42)
rf_clf.fit(new_X2, y)

In [None]:
# Wrap the scaled array back into a DataFrame so the column names can be looked up.
ttt = pd.DataFrame(new_X2, index=new_X.index, columns=new_X.columns)

In [None]:
# Select features from the already-fitted forest (prefit=True) using a
# threshold of 0.2 times the mean feature importance.
fs_ = SelectFromModel(rf_clf, prefit=True, threshold='0.2*mean')
feature_idx = fs_.get_support()
feature_name = ttt.columns[feature_idx]

print(len(feature_name))
print(feature_name)

# Write the selected feature names to disk, one per line.
with open('fe_imp.txt', 'w') as f:
    for item in feature_name:
        f.write("%s\n" % item)

## 4. Criar e avaliar alguns algoritmos de Machine Learning

### 4.1. Split Treino e Validação

In [None]:
# Work on a copy so train_basetable itself is not altered through X.
X = train_basetable.copy()
X.head()

In [None]:
# Short feature list. NOTE(review): the next cell assigns best_features again,
# so this list only takes effect if that cell is skipped.
best_features = ['Velocity_y', 'A', 'YardLine', 'def_min_dist', 'fe5', 'fe10',
                 'fe11', 'YardLine_std', 'Start', 'YardsFromOwnGoal']

In [None]:
# Feature columns fed to the models and to the submission loop.
best_features = ['A','S','back_from_scrimmage','back_oriented_down_field','back_moving_down_field','old_data',
                 'def_mean_dist','def_std_dist','def_min_dist','def_max_dist','min_dist','max_dist','mean_dist',
                 'PlayerAge','PlayerHeight_dense','Dis','DefendersInTheBox',
                 'Distance','Dir','Dir_sin','Dir_cos','YardLine_std','Orientation_std',
                 'WindSpeed_dense','GameWeather_dense',
                 'fe1','fe5','fe8','fe10','fe11',
                 'norm_quat','mod_quat','norm_X','norm_Y','norm_A','norm_S',
                 'X_YardLine_std','X_YardLine_median','X_YardLine_max','X_YardLine_min',
                 'num_DL','num_LB','num_DB','num_QB','num_RB','num_WR','num_TE','num_OL']

# Column names currently left out of the list above (presumably tried and
# dropped -- confirm before re-enabling):
#'A_log','Y_std','Quarter','Dis_log','Orientation','Down',
#                 'min_velocity_percent','max_velocity_percrent','mean_velocity_percent','X_log',
#                 'Crawller_second','Team_le','is_home','PlayDirection_le','VisitorTeamAbbr_le','Orientation_std',
#                 'Velocity_X','YardLine_log','is_rusher'
# 'locX','locY','velX','velY','accX','accY'

In [None]:
def NFL_validation_split(df, n_games=4):
    """Pick the most recent games of every team as the validation set.

    Parameters
    ----------
    df : pandas dataframe
        Must contain the columns ``GameId`` and ``PossessionTeam``.
    n_games : int, optional
        How many of each team's latest games (highest ``GameId`` first) go
        into the validation set. Defaults to 4.

    Returns
    -------
    game_ids : set
        Unique ``GameId`` values selected for validation. Teams share games,
        so the set is usually smaller than ``n_games`` times the team count.
    """
    games = df[['GameId', 'PossessionTeam']].drop_duplicates()

    # Latest games first within each team, then number them 1, 2, 3, ...
    games = games.sort_values(['PossessionTeam', 'GameId'], ascending=[True, False])
    recency_rank = games.groupby('PossessionTeam').cumcount() + 1

    return set(games.loc[recency_rank <= n_games, 'GameId'].tolist())

In [None]:
# Replace missing values with 0 in place (this mutates train_basetable).
train_basetable.fillna(0, inplace=True)
game_ids = NFL_validation_split(train_basetable)

# Rows from the selected games form the validation set, the rest the training set.
is_validation = train_basetable['GameId'].isin(game_ids)
X_train = train_basetable.loc[~is_validation, best_features]
X_test = train_basetable.loc[is_validation, best_features]

train_inds, test_inds = X_train.index, X_test.index

# Standardise with statistics learned from the training rows only; the
# validation rows are transformed with that same fit, never refitted.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train.shape, X_test.shape

In [None]:
# Build the training matrix from the selected feature columns only.
new_X = X.loc[:, best_features]

# Standardise the features. The submission cell calls scaler.transform, so
# this fit is the one it is expected to reuse (assuming cells run top to bottom).
scaler = StandardScaler()
new_X = scaler.fit_transform(new_X)

target = X.Yards

# One-hot encode the yardage over 199 classes; class index = Yards + 99.
# Vectorised so that `target` is not overwritten by a loop variable.
y = np.zeros((target.shape[0], 199))
y[np.arange(target.shape[0]), 99 + target.values.astype(int)] = 1


new_X.shape, y.shape

### 4.2. Teste com LightGBM

### 4.3. Teste com Keras (New NN Struct)

In [None]:
class CRPSCallback(Callback):
    """Keras callback that writes the validation CRPS into the epoch logs.

    At the end of every epoch the model predicts on the validation features,
    both predictions and one-hot targets are turned into cumulative
    distributions clipped to [0, 1], and their mean squared difference over
    the 199 classes is stored under ``logs['CRPS_score_val']``.

    Parameters
    ----------
    validation : tuple
        ``(X_valid, y_valid)``; when falsy the logged value stays ``-inf``.
    predict_batch_size : int, optional
        Stored on the instance; not read by the methods of this class.
    include_on_batch : bool, optional
        When true, a ``-inf`` placeholder is also written after every batch.
    """

    def __init__(self, validation, predict_batch_size=20, include_on_batch=False):
        super(CRPSCallback, self).__init__()
        self.validation = validation
        self.predict_batch_size = predict_batch_size
        self.include_on_batch = include_on_batch

    def on_batch_begin(self, batch, logs=None):
        pass

    def on_train_begin(self, logs=None):
        # Register the metric name when the params dict exposes a metrics list.
        # NOTE(review): a missing 'metrics' key is skipped instead of raising
        # KeyError -- confirm against the installed Keras version.
        metrics = self.params.get('metrics')
        if metrics is not None and 'CRPS_score_val' not in metrics:
            metrics.append('CRPS_score_val')

    def on_batch_end(self, batch, logs=None):
        # logs defaults to None (not a shared {}), so nothing is written into a
        # dict that would persist between calls.
        if self.include_on_batch and logs is not None:
            logs['CRPS_score_val'] = float('-inf')

    def on_epoch_end(self, epoch, logs=None):
        if logs is None:
            logs = {}
        logs['CRPS_score_val'] = float('-inf')

        if self.validation:
            X_valid, y_valid = self.validation[0], self.validation[1]
            y_pred = self.model.predict(X_valid)
            # Cumulative sums turn class probabilities / one-hot rows into CDFs.
            y_true = np.clip(np.cumsum(y_valid, axis=1), 0, 1)
            y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1)
            val_s = ((y_true - y_pred) ** 2).sum(axis=1).sum(axis=0) / (199 * X_valid.shape[0])
            val_s = np.round(val_s, 8)
            logs['CRPS_score_val'] = val_s

In [None]:
def get_model(x_tr,y_tr,x_val,y_val):
    """Train the dense network on one fold and score it on the validation part.

    The network has three Dense/Dropout/BatchNormalization stages
    (1024, 512 and 256 units) followed by a 199-way softmax. Training stops
    early on ``CRPS_score_val``; the best weights are saved to
    ``best_model.h5`` and reloaded before the final scoring.

    Parameters
    ----------
    x_tr, y_tr : array-like
        Training features and one-hot targets (199 columns).
    x_val, y_val : array-like
        Validation features and one-hot targets.

    Returns
    -------
    model : keras Model
        The trained model with the checkpointed weights loaded.
    crps : float
        Validation CRPS rounded to 8 decimals.
    """
    # The input width comes from the training data itself, not from a global.
    inp = Input(shape=(x_tr.shape[1],))
    x = Dense(1024, activation='relu')(inp)
    x = Dropout(0.5)(x)
    x = BatchNormalization()(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = BatchNormalization()(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = BatchNormalization()(x)

    out = Dense(199, activation='softmax')(x)
    model = Model(inp, out)

    model.compile(optimizer=optimizers.adam(lr=0.001, decay=1e-06),
                  loss='categorical_crossentropy',
                  metrics=[])

    es = EarlyStopping(monitor='CRPS_score_val',
                       mode='min',
                       restore_best_weights=True,
                       verbose=1,
                       patience=10)

    mc = ModelCheckpoint('best_model.h5', monitor='CRPS_score_val', mode='min', save_best_only=True,
                         verbose=1, save_weights_only=True)

    bsz = 1024

    # CRPSCallback is listed first: it writes 'CRPS_score_val' into the logs
    # that the early-stopping and checkpoint callbacks monitor.
    model.fit(x_tr, y_tr,
              callbacks=[CRPSCallback(validation=(x_val, y_val)), es, mc],
              epochs=100,
              batch_size=bsz,
              verbose=1)

    model.load_weights("best_model.h5")

    # Same CRPS computation as in CRPSCallback.on_epoch_end.
    y_pred = model.predict(x_val)
    y_valid = y_val
    y_true = np.clip(np.cumsum(y_valid, axis=1), 0, 1)
    y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1)
    val_s = ((y_true - y_pred) ** 2).sum(axis=1).sum(axis=0) / (199 * x_val.shape[0])
    crps = np.round(val_s, 8)

    return model,crps

In [None]:
def predict(x_te, model_list=None):
    """Average the predictions of an ensemble of trained models.

    Parameters
    ----------
    x_te : array-like
        Feature matrix handed to every model's ``predict``.
    model_list : sequence, optional
        Models to average. Defaults to the module-level ``models`` list.

    Returns
    -------
    y_pred : array
        Element-wise mean of the individual model predictions.

    Raises
    ------
    ValueError
        If there is no model to predict with.
    """
    ensemble = models if model_list is None else model_list
    if not ensemble:
        raise ValueError("predict() needs at least one trained model")

    y_pred = None
    for m in ensemble:
        fold_pred = m.predict(x_te, batch_size=1024)
        y_pred = fold_pred if y_pred is None else y_pred + fold_pred

    return y_pred / len(ensemble)

In [None]:
# Trained models (consumed by predict()) and the CRPS of each fold.
models = []
crps_csv = []

# Start timestamp; not read again in this section.
s_time = time.time()

# Two repetitions of 10-fold CV with different shuffling seeds -> 20 models.
for k in range(2):
    kfold = KFold(10, random_state = 42 + k, shuffle = True)
    for k_fold, (tr_inds, val_inds) in enumerate(kfold.split(y)):
        gc.collect()
        print("-----------")
        tr_x,tr_y = new_X[tr_inds],y[tr_inds]
        val_x,val_y = new_X[val_inds],y[val_inds]
    
        model,crps = get_model(tr_x,tr_y,val_x,val_y)
        models.append(model)
        
        print("the %d fold crps is %f"%((k_fold+1),crps))
        crps_csv.append(crps)
        gc.collect()
        
    # Running mean over every fold trained so far (both repetitions).
    print("mean crps is %f"%np.mean(crps_csv))

In [None]:
# Mean CRPS over all folds collected in crps_csv.
print("mean crps is %f"%np.mean(crps_csv))

In [None]:
# NOTE(review): bare tuple with no effect beyond being displayed; presumably
# CRPS values noted from earlier runs -- confirm or move to a markdown cell.
0.013399, 0.013367

### 4.4. Usando Bayesian Optimization

In [None]:
!pip install scikit-optimize

In [None]:
#imports we know we'll need
import skopt
# !pip install scikit-optimize if  necessary
from skopt import gbrt_minimize, gp_minimize
from skopt.utils import use_named_args
from skopt.space import Real, Categorical, Integer  

import tensorflow
from tensorflow.python.keras import backend as K

In [None]:
# Search space for the hyper-parameter optimisation (scikit-optimize dimensions).
dim_learning_rate    = Real(low=1e-4, high=1e-2, prior='log-uniform',name='learning_rate')
dim_num_dense_layers = Integer(low=1, high=5, name='num_dense_layers')
dim_num_input_nodes  = Integer(low=256, high=1024, name='num_input_nodes')
dim_num_dense_nodes  = Integer(low=256, high=1024, name='num_dense_nodes')
dim_activation       = Categorical(categories=['relu'],name='activation')
dim_batch_size       = Integer(low=128, high=1024, name='batch_size')
dim_adam_decay       = Real(low=1e-6,high=1e-2,name="adam_decay")

# The order here fixes the positional order of default_parameters below.
dimensions = [dim_learning_rate,
              dim_num_dense_layers,
              dim_num_input_nodes,
              dim_num_dense_nodes,
              dim_activation,
              dim_batch_size,
              dim_adam_decay
             ]
# NOTE(review): 13 (num_dense_nodes) and 64 (batch_size) lie outside the bounds
# declared above (256-1024 and 128-1024); a starting point outside the space is
# likely rejected by the optimiser -- confirm and adjust before use.
default_parameters = [1e-3, 1,512, 13, 'relu',64, 1e-3]

In [None]:
from keras.optimizers import Adam
def create_model(learning_rate, num_dense_layers,num_input_nodes,
                 num_dense_nodes, activation, adam_decay, num_classes=199):
    """Build and compile a sequential dense classifier for the search.

    Reads the module-level ``input_shape``, which must be defined before the
    function is called.

    Parameters
    ----------
    learning_rate : float
        Adam learning rate.
    num_dense_layers : int
        Number of hidden Dense layers added after the input layer.
    num_input_nodes : int
        Units in the first Dense layer.
    num_dense_nodes : int
        Units in every additional hidden Dense layer.
    activation : str
        Activation used by all hidden layers.
    adam_decay : float
        Adam learning-rate decay.
    num_classes : int, optional
        Width of the softmax output. Defaults to 199, matching the one-hot
        yardage targets built in this notebook.

    Returns
    -------
    model : keras Sequential
        Compiled with categorical cross-entropy and no extra metrics.
    """
    #start the model making process and create our first layer
    model = Sequential()
    model.add(Dense(num_input_nodes, input_shape=input_shape, activation=activation))
    #naming the layers helps avoid tensorflow error deep in the stack trace.
    for i in range(num_dense_layers):
        name = 'layer_dense_{0}'.format(i+1)
        model.add(Dense(num_dense_nodes, activation=activation, name=name))
    #add our classification layer: one unit per target class.
    model.add(Dense(num_classes, activation='softmax'))

    #setup our optimizer and compile
    adam = Adam(lr=learning_rate, decay=adam_decay)
    model.compile(optimizer=adam, loss='categorical_crossentropy',
                  metrics=[])
    return model

### 4.5. Usando TALOS

In [None]:
!pip install talos

In [None]:
import wrangle
# Split features and targets into train and validation parts for the Talos scan
# (presumably .15 is the validation fraction -- confirm against wrangle's docs).
x_train, y_train, x_val, y_val = wrangle.array_split(new_X, y, .15)

In [None]:
def get_new_model(x_tr,y_tr,x_val,y_val,params):
    """Model function for the Talos scan: build, train and return the network.

    Same three-stage dense architecture as ``get_model`` (1024, 512, 256
    units), with activation, dropout rate, output activation, optimizer, loss,
    epochs and batch size taken from ``params``.

    Parameters
    ----------
    x_tr, y_tr : array-like
        Training features and one-hot targets (199 columns).
    x_val, y_val : array-like
        Validation data, used by CRPSCallback to compute ``CRPS_score_val``.
    params : dict
        Reads the keys ``activation``, ``dropout``, ``last_activation``,
        ``optimizer``, ``losses``, ``epochs`` and ``batch_size``.

    Returns
    -------
    history : keras History
        Result of ``model.fit``.
    model : keras Model
        The trained model (last-epoch weights unless early stopping restored
        the best ones; the checkpoint file is not reloaded here).
    """
    # The input width comes from the training data itself, not from a global.
    inp = Input(shape=(x_tr.shape[1],))
    x = Dense(1024, activation=params['activation'], kernel_initializer='normal')(inp)
    x = Dropout(params['dropout'])(x)
    x = BatchNormalization()(x)
    x = Dense(512, activation=params['activation'])(x)
    x = Dropout(params['dropout'])(x)
    x = BatchNormalization()(x)
    x = Dense(256, activation=params['activation'])(x)
    x = Dropout(params['dropout'])(x)
    x = BatchNormalization()(x)

    output = Dense(199, activation=params['last_activation'], kernel_initializer='normal')(x)
    model = Model(inp, output)

    model.compile(optimizer=params['optimizer'],
                  loss=params['losses'],
                  metrics=[])

    es = EarlyStopping(monitor='CRPS_score_val',
                       mode='min',
                       restore_best_weights=True,
                       verbose=1,
                       patience=10)

    mc = ModelCheckpoint('best_model.h5', monitor='CRPS_score_val', mode='min', save_best_only=True,
                         verbose=1, save_weights_only=True)

    # CRPSCallback is listed first: it writes 'CRPS_score_val' into the logs
    # that the early-stopping and checkpoint callbacks monitor.
    history = model.fit(x_tr, y_tr,
                        callbacks=[CRPSCallback(validation=(x_val, y_val)), es, mc],
                        epochs=params['epochs'],
                        batch_size=params['batch_size'],
                        verbose=1)

    return history, model

In [None]:
# set the parameter space boundary
# Every value is a list of candidates. get_new_model reads all keys except
# 'shapes', which it does not reference.
p = {'activation':['relu', 'elu'],
     'last_activation': ['softmax'],
     'optimizer': ['Nadam', 'Adam'],
     'losses': ['categorical_crossentropy'],
     'shapes': ['brick'],
     #'first_neuron': [16, 32, 64, 128],
     #'hidden_layers':[0, 1, 2, 3],
     'dropout': [.2, .3, .4, .5],
     'batch_size': [128, 256, 512, 1024],
     'epochs': [10]}

In [None]:
import talos

# start the experiment: get_new_model is trained on parameter combinations
# drawn from p, with round_limit=100 passed to cap the search.
scan_object = talos.Scan(x=x_train,
                         y=y_train,
                         x_val=x_val,
                         y_val=y_val,
                         model=get_new_model,
                         experiment_name='nfl_dsa_02',
                         params=p,
                         round_limit=100)

In [None]:
# Build the analysis helper from the finished scan
analyze_object = talos.Analyze(scan_object)

# Show the dataframe with the results
analyze_object.data

In [None]:
# get the lowest result for the metric
analyze_object.low('CRPS_score_val')

In [None]:
# get the round with the best result
# NOTE(review): the attribute is referenced without being called; if
# rounds2high is a method it needs parentheses (and probably a metric name)
# -- confirm against the talos Analyze API.
analyze_object.rounds2high

In [None]:
# make predictions on the validation split with the sixth model of the
# `models` list (index 5)
models[5].predict(x_val)

## 5. Realizar a submissão para o Kaggle

In [None]:
%%time

# Submission loop: only runs inside the Kaggle kernel environment.
if not TRAIN_OFFLINE:

    from kaggle.competitions import nflrush
    env = nflrush.make_env()
    iter_test = env.iter_test()

    # Collected per iteration and concatenated once at the end; growing a
    # DataFrame inside the loop copies all previous rows every time.
    test_frames = []
    pred_frames = []

    for (test_df, sample_prediction_df) in tqdm_notebook(iter_test):
        basetable = create_features(test_df, deploy=True)
        basetable = create_features_02(basetable)
        basetable.drop(['GameId','PlayId','NflId','NflIdRusher','PossessionTeam','HomeTeamAbbr','VisitorTeamAbbr','PlayDirection','GameClock','Season','Team','FieldPosition'], axis=1, inplace=True)

        # Keep only the columns chosen by feature selection
        basetable = basetable.loc[:, best_features]

        # Reuse the scaler fitted on the training data
        scaled_basetable = scaler.transform(basetable)

        # Averaged class probabilities -> cumulative distribution clipped to [0, 1];
        # only the first row is submitted.
        y_pred = predict(scaled_basetable)
        y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1).tolist()[0]

        preds_df = pd.DataFrame(data=[y_pred], columns=sample_prediction_df.columns)

        test_frames.append(basetable)
        pred_frames.append(preds_df)

        env.predict(preds_df)

    # Kept as notebook globals so the cells below can inspect them.
    df_test = pd.concat(test_frames) if test_frames else pd.DataFrame()
    df_prev = pd.concat(pred_frames) if pred_frames else pd.DataFrame()

    env.write_submission_file()

In [None]:
# Preview the feature rows collected during the submission loop
df_test.head()

In [None]:
# Preview the predictions collected during the submission loop
df_prev.head()