# Kaggle
## Competition NFL Big Data Bowl

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/nfl-big-data-bowl-2020/train.csv
/kaggle/input/nfl-big-data-bowl-2020/kaggle/competitions/nflrush/test.csv.encrypted
/kaggle/input/nfl-big-data-bowl-2020/kaggle/competitions/nflrush/sample_submission.csv.encrypted
/kaggle/input/nfl-big-data-bowl-2020/kaggle/competitions/nflrush/__init__.py
/kaggle/input/nfl-big-data-bowl-2020/kaggle/competitions/nflrush/competition.cpython-36m-x86_64-linux-gnu.so


In [2]:
# Carregando os pacotes
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Statistic lib
from scipy import stats
from scipy.stats import skew, norm

# Sklearn lib
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

# Models
from xgboost import XGBRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from mlxtend.regressor import StackingCVRegressor
from lightgbm import LGBMRegressor
import lightgbm as lgb
import xgboost as XGB
from sklearn.cluster import KMeans
import tqdm

# Misc lib
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from functools import partial
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from IPython.display import Image

# Utils
import pandasql as ps
import re 
import math, string, os
import datetime

# Options
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000
pd.set_option('display.max_columns', None)
import gc
gc.enable()

In [3]:
# Carregando os dados de treino
#train = pd.read_csv('../data/train.csv', low_memory=False)
train = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2020/train.csv', low_memory=False)
print ("Data is ready !!")

Data is ready !!


# Criando as funções auxiliares de limpeza e conversao

In [4]:
# Funcao para tratar os dados missing de cada variavel
def fill_na(data):
    data['WindDirection'].fillna('unknown',inplace=True)
    data['OffenseFormation'].fillna('unknown',inplace=True)
    data['StadiumType'].fillna('unknown',inplace=True)
    data['GameWeather'].fillna('unknown',inplace=True)
    data['FieldPosition'].fillna('NA',inplace=True)
    
    data['Temperature'].fillna(data['Temperature'].mean(), inplace=True)
    data['Humidity'].fillna(data['Humidity'].mean(), inplace=True)
    data['DefendersInTheBox'].fillna(math.ceil(data['DefendersInTheBox'].mean()),inplace=True)
    
# Funcao para agrupar as descricoes dos tipos de estadio
def agrupar_tipo_estadio(StadiumType):
    outdoor       = ['Outdoor', 'Outdoors', 'Cloudy', 'Heinz Field', 'Outdor', 'Ourdoor', 'Outside', 'Outddors', 'Outdoor Retr Roof-Open', 'Oudoor', 'Bowl']
    indoor_closed = ['Indoors', 'Indoor', 'Indoor, Roof Closed', 'Indoor, Roof Closed', 'Retractable Roof', 'Retr. Roof-Closed', 'Retr. Roof - Closed', 'Retr. Roof Closed']
    indoor_open   = ['Indoor, Open Roof', 'Open', 'Retr. Roof-Open', 'Retr. Roof - Open']
    dome_closed   = ['Dome', 'Domed, closed', 'Closed Dome', 'Domed', 'Dome, closed']
    dome_open     = ['Domed, Open', 'Domed, open']
    
    if StadiumType in outdoor:
        return 'outdoor'
    elif StadiumType in indoor_closed:
        return 'indoor_closed'
    elif StadiumType in indoor_open:
        return 'indoor_open'
    elif StadiumType in dome_closed:
        return 'dome_closed'
    elif StadiumType in dome_open:
        return 'dome_open'
    else:
        return 'unknown' # se for n/a
    
# Funcao para agrupar as descricoes dos estadios
def agrupar_estadio(Stadium):

    if Stadium == 'Broncos Stadium at Mile High':
        return 'Broncos Stadium At Mile High'
    
    elif Stadium in ('CenturyField', 'CenturyLink'):
        return 'CenturyLink Field'
    
    elif Stadium == 'EverBank Field':
        return 'Everbank Field'
    
    elif Stadium in ('FirstEnergy', 'FirstEnergy Stadium', 'FirstEnergyStadium'):
        return 'First Energy Stadium'
   
    elif Stadium == 'Lambeau field':
        return 'Lambeau Field'

    elif Stadium == 'Los Angeles Memorial Coliesum':
        return 'Los Angeles Memorial Coliseum'
    
    elif Stadium in ('M & T Bank Stadium', 'M&T Stadium'):
        return 'M&T Bank Stadium'

    elif Stadium in ('Mercedes-Benz Dome', 'Mercedes-Benz Superdome'):
        return 'Mercedes-Benz SuperDome'
    
    elif Stadium in ('MetLife Stadium', 'Metlife Stadium', 'MetLife'):
        return 'MetLife Stadium' 
    
    elif Stadium == 'NRG':
        return 'NRG Stadium' 

    elif Stadium == 'Oakland-Alameda County Coliseum':
        return 'Oakland Alameda-County Coliseum' 
    
    elif Stadium == 'Paul Brown Stdium':
        return 'Paul Brown Stadium' 

    elif Stadium == 'Twickenham':
        return 'Twickenham Stadium' 
    
    else:
        return Stadium
    
# Funcao para agrupar a localizacao do estadio e do jogo
def agrupar_local(Location):

    if Location == "Arlington, Texas":
        return "Arlington, TX"
    elif Location in ("Baltimore, Maryland","Baltimore, Md."):
        return "Baltimore, MD"
    elif Location == "Charlotte, North Carolina":
        return "Charlotte, NC"
    elif Location == "Chicago. IL":
        return "Chicago, IL"
    elif Location == "Cincinnati, Ohio":
        return "Cincinnati, OH"
    elif Location in ("Cleveland","Cleveland Ohio","Cleveland, Ohio","Cleveland,Ohio"):
        return "Cleveland, OH"
    elif Location == "Detroit":
        return "Detroit, MI"
    elif Location == "E. Rutherford, NJ" or Location == "East Rutherford, N.J.":
        return "East Rutherford, NJ"
    elif Location == "Foxborough, Ma":
        return "Foxborough, MA"
    elif Location == "Houston, Texas":
        return "Houston, TX"
    elif Location in ("Jacksonville Florida","Jacksonville, Fl","Jacksonville, Florida"):
        return "Jacksonville, FL"
    elif Location == "London":
        return "London, England"
    elif Location == "Los Angeles, Calif.":
        return "Los Angeles, CA"
    elif Location == "Miami Gardens, Fla.":
        return "Miami Gardens, FLA"
    elif Location in ("New Orleans","New Orleans, La."):
        return "New Orleans, LA"
    elif Location == "Orchard Park NY":
        return "Orchard Park, NY"
    elif Location == "Philadelphia, Pa.":
        return "Philadelphia, PA"
    elif Location == "Pittsburgh":
        return "Pittsburgh, PA"
    elif Location == "Seattle":
        return "Seattle, WA"
    else:
        return Location
    
# Funcao para agrupar o gramado do estadio
def agrupar_gramado(Turf):
    if Turf == 'Artifical':
        return 'Artificial'
    
    elif Turf in ('FieldTurf', 'Field turf'):
        return 'Field Turf'

    elif Turf in ('FieldTurf360', 'FieldTurf 360'):
        return 'Field Turf 360'

    elif Turf in ('Natural', 'Natural grass', 'Naturall Grass', 'grass', 'natural grass', 'SISGrass', 'Natural Grass'):
        return "Grass"

    elif Turf == "UBU Sports Speed S5-M":
        return "UBU Speed Series-S5-M"

    else:
        return Turf

# Funcao para agrupar os dados de direcao do vento
def agrupa_wind_direction(WindDirection):
    wd = str(WindDirection).upper()
    
    if wd == 'N' or 'FROM N' in wd:
        return 'north'
    if wd == 'S' or 'FROM S' in wd:
        return 'south'
    if wd == 'W' or 'FROM W' in wd:
        return 'west'
    if wd == 'E' or 'FROM E' in wd:
        return 'east'
    
    if 'FROM SW' in wd or 'FROM SSW' in wd or 'FROM WSW' in wd:
        return 'south west'
    if 'FROM SE' in wd or 'FROM SSE' in wd or 'FROM ESE' in wd:
        return 'south east'
    if 'FROM NW' in wd or 'FROM NNW' in wd or 'FROM WNW' in wd:
        return 'north west'
    if 'FROM NE' in wd or 'FROM NNE' in wd or 'FROM ENE' in wd:
        return 'north east'
    
    if 'NW' in wd or 'NORTHWEST' in wd:
        return 'north west'
    if 'NE' in wd or 'NORTH EAST' in wd:
        return 'north east'
    if 'SW' in wd or 'SOUTHWEST' in wd:
        return 'south west'
    if 'SE' in wd or 'SOUTHEAST' in wd:
        return 'south east'

    return 'unknown'

# Funcao para agrupar as descricoes de clima
def agrupar_clima(GameWeather):
    chuva   = ['Rainy', 'Rain Chance 40%', 'Showers',
               'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
               'Scattered Showers', 'Cloudy, Rain', 'Rain shower', 'Light Rain', 'Rain']
    nublado = ['Cloudy, light snow accumulating 1-3"', 'Party Cloudy', 'Cloudy, chance of rain',
               'Coudy', 'Cloudy, 50% change of rain', 'Rain likely, temps in low 40s.',
               'Cloudy and cold', 'Cloudy, fog started developing in 2nd quarter',
               'Partly Clouidy', '30% Chance of Rain', 'Mostly Coudy', 'Cloudy and Cool',
               'cloudy', 'Partly cloudy', 'Overcast', 'Hazy', 'Mostly cloudy', 'Mostly Cloudy',
               'Partly Cloudy', 'Cloudy']
    limpo   = ['Partly clear', 'Sunny and clear', 'Sun & clouds', 'Clear and Sunny',
               'Sunny and cold', 'Sunny Skies', 'Clear and Cool', 'Clear and sunny',
               'Sunny, highs to upper 80s', 'Mostly Sunny Skies', 'Cold',
               'Clear and warm', 'Sunny and warm', 'Clear and cold', 'Mostly sunny',
               'T: 51; H: 55; W: NW 10 mph', 'Clear Skies', 'Clear skies', 'Partly sunny',
               'Fair', 'Partly Sunny', 'Mostly Sunny', 'Clear', 'Sunny']
    neve    = ['Heavy lake effect snow', 'Snow']
    none    = ['N/A Indoor', 'Indoors', 'Indoor', 'N/A (Indoors)', 'Controlled Climate']

    
    if GameWeather in chuva:
        return 'chuva'
    elif GameWeather in nublado:
        return 'nublado'
    elif GameWeather in limpo:
        return 'limpo'
    elif GameWeather in neve:
        return 'neve'
    elif GameWeather in none:
        return 'none'
    else:
        return 'none' # se for n/a
    
# Funcao para converter a velocidade do vento
def convert_wind_speed(WindSpeed):
    ws = str(WindSpeed)

    if ws.isdigit():
        return int(ws)

    if '-' in ws:
        return int(ws.split('-')[0])

    if ws.split(' ')[0].isdigit():
        return int(ws.split(' ')[0])

    if 'mph' in ws.lower():
        return int(ws.lower().split('mph')[0])
    else:
        return 0
    
    
# Funcao para converter altura de feet-inches para centimetros
def convert_to_cm(ft_in):
    h_ft   = int(ft_in.split('-')[0])
    h_inch = int(ft_in.split('-')[1])
    h_inch += h_ft * 12
    h_cm = round(h_inch * 2.54, 1)
    #print("Your height is : %d cm." % h_cm)   
    
    return h_cm

# Funcao para converter peso em lbs para kg
def convert_to_kg(lbs):
    kg = lbs * 0.45359237
    #print("The weight is", kg, "in kilograms")
    
    return kg

# Funcao para converter temperatura Fahrenheit para Celsius
def convert_to_celsius(fah):
    celsius = (fah - 32) * 5.0/9.0
    #print("Temperature:", fah, "Fahrenheit = ", celsius, " C")
    return celsius

# Funcao para converter as features de data e extrair dia, mes, ano, hora, minuto, segundo
def convert_data(data):
    #data['PlayerBirthDate'] = pd.to_datetime(data['PlayerBirthDate'], "%m/%d/%Y")
    data['PlayerBirthDate'] = data['PlayerBirthDate'].apply(lambda x: datetime.datetime.strptime(x, "%m/%d/%Y"))
    data['PlayerBirthDate_day'] = data['PlayerBirthDate'].dt.day.astype(int)
    data['PlayerBirthDate_month'] = data['PlayerBirthDate'].dt.month.astype(int)
    data['PlayerBirthDate_year'] = data['PlayerBirthDate'].dt.year.astype(int)

    data['TimeSnap'] = data['TimeSnap'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))
    #data['TimeSnap'] = pd.to_datetime(data['TimeSnap'], "%Y-%m-%dT%H:%M:%S.%fZ")
    data['TimeSnap_min'] = data['TimeSnap'].dt.minute.astype(int)
    data['TimeSnap_seg'] = data['TimeSnap'].dt.second.astype(int)
    
    data['TimeHandoff'] = data['TimeHandoff'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))
    #data['TimeHandoff'] = pd.to_datetime(data['TimeHandoff'], "%Y-%m-%dT%H:%M:%S.%fZ")
    data['TimeHandoff_min'] = data['TimeHandoff'].dt.minute.astype(int)
    data['TimeHandoff_seg'] = data['TimeHandoff'].dt.second.astype(int)
    
    
# Funcao para converter uma string horario em segundos
def str_to_seconds(time):
    time = time.split(':')
    sec = int(time[0])*60 + int(time[1]) + int(time[2])/60
    return sec

# Funcao para a metrica de validacao do modelo
def funcao_crps(labels,predictions) :
    y_pred = np.zeros((len(labels),199))
    y_ans = np.zeros((len(labels),199))
    j = np.array(range(199))
    for i,(p,t) in enumerate(zip(np.round(scaler.inverse_transform(predictions)),labels)) :
        k2 = j[j>=p-10]
        y_pred[i][k2]=(k2+10-p)*0.05
        k1 = j[j>=p+10]
        y_pred[i][k1]= 1.0
        k3 = j[j>=t]
        y_ans[i][k3]= 1.0
                           
    return 'CRPS: ', np.sum((y_pred-y_ans)**2)/(199*y_pred.shape[0]), False

# Feature Engineering

In [5]:
# Funcao para realizar feature engineering no dataset (treino ou teste)
def feature_engineering(df): 
    
    # Limpeza e conversao dos dados
    fill_na(df)
    convert_data(df)
    
    df_median = df.median()
    
    # Conversao de algumas features
    df['PlayerHeight']  = df['PlayerHeight'].apply(convert_to_cm)
    df['PlayerWeight']  = df['PlayerWeight'].apply(convert_to_kg)
    df['Temperature']   = df['Temperature'].apply(convert_to_celsius)
    df['StadiumType']   = df['StadiumType'].apply(agrupar_tipo_estadio)
    df['Stadium']       = df['Stadium'].apply(agrupar_estadio)
    df['Location']      = df['Location'].apply(agrupar_local)
    df['Turf']          = df['Turf'].apply(agrupar_gramado)
    df['WindDirection'] = df['WindDirection'].apply(agrupa_wind_direction)
    df['WindSpeed']     = df['WindSpeed'].apply(convert_wind_speed)
    df['GameWeather']   = df['GameWeather'].apply(agrupar_clima)


    # Corrigindo a feature Stadium
    df.loc[df['Stadium'] == 'MetLife Stadium', 'StadiumType'] = 'outdoor'
    df.loc[df['Stadium'] == 'StubHub Center', 'StadiumType'] = 'outdoor'    
    
    # Nova feature com a diferença entre o tempo de lançamento da bola até quando o jogador captura
    df['TimeDifer'] = df.apply(lambda row: (row['TimeHandoff'] - row['TimeSnap']).total_seconds(), axis=1)            
         
    # Nova feature para indicar se é o jogador que esta realizando a jogada (corredor)
    df['IsRusher'] = df['NflId'] == df['NflIdRusher']
    
    # Novas features com base no horario do jogo
    df['Morning']   = df['GameClock'].apply(lambda x : 1 if (int(x[0:2]) >=0 and int(x[0:2]) <12) else 0)
    df['Afternoon'] = df['GameClock'].apply(lambda x : 1 if (int(x[0:2]) <18 and int(x[0:2]) >=12) else 0)
    df['Evening']   = df['GameClock'].apply(lambda x : 1 if (int(x[0:2]) >= 18 and int(x[0:2]) < 24) else 0)
    df['GameClock'] = df['GameClock'].apply(str_to_seconds) 
    
    # Criando novas features com dados de Distance, YardLine e DefendersInTheBox
    df['seconds_need_to_first_down'] = (df['Distance']*0.9144)/df['Dis']
    df['seconds_need_to_YardsLine'] = (df['YardLine']*0.9144)/df['Dis']    
    df['DefendersInTheBox_vs_Distance'] = df['DefendersInTheBox'] / df['Distance']
    
    # Ordenacao do dataset e renovando o index
    df = df.sort_values(by=['PlayId', 'Team', 'IsRusher']).reset_index()
    
    # Removendo colunas que não serão utilizadas
    df = df.drop(['index','Yards','GameId','PlayId','NflId', 'DisplayName','NflIdRusher', 'TimeHandoff', 'TimeSnap', 'PlayerBirthDate'], axis=1)

    # Atribuindo media para os demais dados missing
    df.fillna(df_median, inplace=True)

    # Executar somente para alguns modelos que nao tratam valores INF ou NAN
    #train_df.replace(-np.inf,0,inplace=True)
    #train_df.replace(np.inf,0,inplace=True)

    # Removendo todas as variaveis categoricas
    cat_features = []
    for col in df.columns:
        if df[col].dtype =='object':
            cat_features.append(col)

    df = df.drop(cat_features, axis=1)
    
    return df

In [6]:
# Criando um novo dataset aplicando Feature Engineering
train_df = feature_engineering(train)

In [7]:
train_df.shape

(509762, 38)

# Criação e Validação dos Modelos de ML

## Algumas considerações deste processo:
- **Cross Validation:** Estou usando 5-fold cross-validation
- **Models:** Light GBM Regression

####  Validação
Continuous Ranked Probability Score (CRPS) is derived based on the predicted scalar value.
The CRPS is computed as follows:
$$
C=\frac{1}{199N}\sum_{m=1}^N\sum_{n=-99}^{99}(P(y\geq n)-H(n-Y_m))^2
$$
$H(x)=1$ if $x\geq 0$ else $0$

In [8]:
# Fazendo uma limpeza na memoria
gc.collect()

0

In [9]:
# Setup cross validation folds
kf = 5
folds = KFold(n_splits=kf, shuffle=False, random_state=42)
print(str(kf) + ' Folds para treino...')

5 Folds para treino...


In [10]:
# Identificando as features para o modelo
features = list(train_df.columns)
X = train_df[features]

In [11]:
# Criando matriz de treino
train_data = np.zeros((509762//22,len(features))) 

for i in tqdm.tqdm(range(0,509762,22)):
    count=0
    for c in features:
        train_data[i//22][count] = train_df[c][i]
        count+=1

100%|██████████| 23171/23171 [00:20<00:00, 1104.00it/s]


In [12]:
# Split de dados e label
X_train = pd.DataFrame(data=train_data,columns=features)
y_train_ = np.array([train["Yards"][i] for i in range(0,509762,22)])

In [13]:
# Normalizacao das features 
y_tr = np.zeros(len(y_train_),dtype=np.float)
for i in range(len(y_tr)):
    y_tr[i]=(y_train_[i])

scaler = preprocessing.StandardScaler()
scaler.fit([[y] for y in y_tr])
y_tr = np.array([y[0] for y in scaler.transform([[y] for y in y_tr])])

In [14]:
best_params_lgb = {
    "boosting": "gbdt",
    "verbosity": -1,
    "num_leaves":3,
    "min_data_in_leaf": 10,
    "max_depth": -1,
    "learning_rate": 0.0005,
    "bagging_freq": 4,
    "bagging_fraction": 0.1,
    "bagging_seed": 11,
    "feature_fraction" : 1,
    "random_seed": 19,
    "metric": "rmse",
    "boost_from_average" : False
}

In [15]:
oof = np.zeros(len(X_train))
tr_rmse  = []
val_rmse = []
models   = []

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train,y_tr)):
    strLog = "fold {}".format(fold_)
    print(strLog)
    
    X_tr, X_val = train_df.iloc[trn_idx][features], train_df.iloc[val_idx][features]
    train_y, y_val = y_tr[trn_idx], y_tr[val_idx]

    model = lgb.LGBMRegressor(**best_params_lgb, n_estimators = 100, n_jobs = -1)
    model.fit(X_tr, 
              train_y, 
              eval_set=[(X_tr, train_y), (X_val, y_val)], 
              eval_metric=funcao_crps,
              verbose=10, 
              early_stopping_rounds=99)
    
    oof[val_idx] = model.predict(X_val)
    
    val_score = mean_squared_error(y_val, oof[val_idx])
    val_rmse.append(val_score)
    
    tr_score = mean_squared_error(train_y, model.predict(X_tr))
    tr_rmse.append(tr_score)
    
    models.append(model)

fold 0
Training until validation scores don't improve for 99 rounds.
[10]	training's rmse: 0.993689	training's CRPS: : 0.0119145	valid_1's rmse: 1.02481	valid_1's CRPS: : 0.0119691
[20]	training's rmse: 0.993684	training's CRPS: : 0.0119145	valid_1's rmse: 1.02481	valid_1's CRPS: : 0.0119691
[30]	training's rmse: 0.993677	training's CRPS: : 0.0119145	valid_1's rmse: 1.02482	valid_1's CRPS: : 0.0119691
[40]	training's rmse: 0.99367	training's CRPS: : 0.0119145	valid_1's rmse: 1.02482	valid_1's CRPS: : 0.0119691
[50]	training's rmse: 0.993661	training's CRPS: : 0.0119145	valid_1's rmse: 1.02483	valid_1's CRPS: : 0.0119691
[60]	training's rmse: 0.993647	training's CRPS: : 0.0119145	valid_1's rmse: 1.02482	valid_1's CRPS: : 0.0119691
[70]	training's rmse: 0.993639	training's CRPS: : 0.0119145	valid_1's rmse: 1.02483	valid_1's CRPS: : 0.0119691
[80]	training's rmse: 0.993631	training's CRPS: : 0.0119145	valid_1's rmse: 1.02483	valid_1's CRPS: : 0.0119691
[90]	training's rmse: 0.993624	train

In [16]:
# Validacao do modelo
# Imprimir o score de treino e teste

mean_rmse_tr = np.mean(tr_rmse)
std_rmse_tr =  np.std(tr_rmse)

mean_rmse_val =  np.mean(val_rmse)
std_rmse_val =  np.std(val_rmse)

all_rmse = mean_squared_error(oof,y_tr)

print("Score de Treino")
print("Média RMSE: %.5f, std: %.5f." % (mean_rmse_tr, std_rmse_tr),'\n')

print("Score de Validação")
print("Média RMSE: %.5f, std: %.5f." % (mean_rmse_val, std_rmse_val),'\n')

print("Geral: %.5f." % (all_rmse),'\n')

print("CRPS: %.5f." % (funcao_crps(y_tr,oof)[1]))

Score de Treino
Média RMSE: 0.99983, std: 0.03182. 

Score de Validação
Média RMSE: 1.00006, std: 0.12725. 

Geral: 1.00006. 

CRPS: 0.01193.


# Realizando a submissão

In [17]:
# Funcao para realizar feature engineering no dataset de teste
def feature_engineering_test(df): 
    
    # Limpeza e conversao dos dados
    fill_na(df)
    convert_data(df)
    
    df_median = df.median()
    
    # Conversao de algumas features
    df['PlayerHeight']  = df['PlayerHeight'].apply(convert_to_cm)
    df['PlayerWeight']  = df['PlayerWeight'].apply(convert_to_kg)
    df['Temperature']   = df['Temperature'].apply(convert_to_celsius)
    df['StadiumType']   = df['StadiumType'].apply(agrupar_tipo_estadio)
    df['Stadium']       = df['Stadium'].apply(agrupar_estadio)
    df['Location']      = df['Location'].apply(agrupar_local)
    df['Turf']          = df['Turf'].apply(agrupar_gramado)
    df['WindDirection'] = df['WindDirection'].apply(agrupa_wind_direction)
    df['WindSpeed']     = df['WindSpeed'].apply(convert_wind_speed)
    df['GameWeather']   = df['GameWeather'].apply(agrupar_clima)


    # Corrigindo a feature Stadium
    df.loc[df['Stadium'] == 'MetLife Stadium', 'StadiumType'] = 'outdoor'
    df.loc[df['Stadium'] == 'StubHub Center', 'StadiumType'] = 'outdoor'    
    
    # Nova feature com a diferença entre o tempo de lançamento da bola até quando o jogador captura
    df['TimeDifer'] = df.apply(lambda row: (row['TimeHandoff'] - row['TimeSnap']).total_seconds(), axis=1)            
         
    # Nova feature para indicar se é o jogador que esta realizando a jogada (corredor)
    df['IsRusher'] = df['NflId'] == df['NflIdRusher']
    
    # Novas features com base no horario do jogo
    df['Morning']   = df['GameClock'].apply(lambda x : 1 if (int(x[0:2]) >=0 and int(x[0:2]) <12) else 0)
    df['Afternoon'] = df['GameClock'].apply(lambda x : 1 if (int(x[0:2]) <18 and int(x[0:2]) >=12) else 0)
    df['Evening']   = df['GameClock'].apply(lambda x : 1 if (int(x[0:2]) >= 18 and int(x[0:2]) < 24) else 0)
    df['GameClock'] = df['GameClock'].apply(str_to_seconds) 
    
    # Criando novas features com dados de Distance, YardLine e DefendersInTheBox
    df['seconds_need_to_first_down'] = (df['Distance']*0.9144)/df['Dis']
    df['seconds_need_to_YardsLine'] = (df['YardLine']*0.9144)/df['Dis']    
    df['DefendersInTheBox_vs_Distance'] = df['DefendersInTheBox'] / df['Distance']
    
    # Ordenacao do dataset e renovando o index
    df = df.sort_values(by=['PlayId', 'Team', 'IsRusher']).reset_index()
    
    # Removendo colunas que não serão utilizadas
    df = df.drop(['index', 'GameId','PlayId','NflId', 'DisplayName','NflIdRusher', 'TimeHandoff', 'TimeSnap', 'PlayerBirthDate'], axis=1)

    # Atribuindo media para os demais dados missing
    df.fillna(df_median, inplace=True)

    # Executar somente para alguns modelos que nao tratam valores INF ou NAN
    #train_df.replace(-np.inf,0,inplace=True)
    #train_df.replace(np.inf,0,inplace=True)

    # Removendo todas as variaveis categoricas
    cat_features = []
    for col in df.columns:
        if df[col].dtype =='object':
            cat_features.append(col)

    df = df.drop(cat_features, axis=1)
    
    return df

In [18]:
from kaggle.competitions import nflrush

pd.options.mode.chained_assignment = None
index = 0

env = nflrush.make_env()

for (test, sample_prediction_df) in tqdm.tqdm(env.iter_test()):
    df_test = feature_engineering_test(test)
    
    count=0
    test_data = np.zeros((1,len(features)))

    for c in features:
        if c in df_test:
            try:
                test_data[0][count] = df_test[c][index]
            except:
                test_data[0][count] = np.nan
            count+=1
    
    y_pred = np.zeros(199)        
    y_pred_p = np.sum(np.round(scaler.inverse_transform([model.predict(test_data)[0] for model in models])))/kf
    y_pred_p += 99
    
    for j in range(199):
        if j>=y_pred_p+10:
            y_pred[j]=1.0
        elif j>=y_pred_p-10:
            y_pred[j]=(j+10-y_pred_p)*0.05
    
    env.predict(pd.DataFrame(data=[y_pred],columns=sample_prediction_df.columns))
    
    index += 22
    
env.write_submission_file()

print([filename for filename in os.listdir('/kaggle/working') if '.csv' in filename])

3438it [06:52,  8.33it/s]


Your submission file has been saved!  Once you `Commit` your Notebook and it finishes running, you can submit the file to the competition from the Notebook Viewer `Output` tab.
['submission.csv']
