# Kaggle
## Competition NFL Big Data Bowl

In [59]:
# Carregando os pacotes
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Statistic lib
from scipy import stats
from scipy.stats import skew, norm, kurtosis

# Sklearn lib
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

# Models
from xgboost import XGBRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from mlxtend.regressor import StackingCVRegressor
from lightgbm import LGBMRegressor
import lightgbm as lgb
import xgboost as XGB
from sklearn.cluster import KMeans
import tqdm
import optuna

# Misc lib
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from functools import partial
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from IPython.display import Image
from tqdm import tqdm_notebook

# Utils
import pandasql as ps
import re 
import math, string, os
import datetime

# Options
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000
pd.set_option('display.max_columns', None)
import gc
gc.enable()

In [33]:
# Carregando os dados de treino
train = pd.read_csv('../data/train.csv', low_memory=False)
#train = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2020/train.csv', low_memory=False)
outcomes = train[['GameId','PlayId','Yards']].drop_duplicates()
print ("Data is ready !!")

Data is ready !!


# Feature Engineering

## 1. Feature Básicas

In [34]:
def create_features(df, deploy=False):
    def new_X(x_coordinate, play_direction):
        if play_direction == 'left':
            return 120.0 - x_coordinate
        else:
            return x_coordinate
    
    # Adicionado por rodrigo
    def new_Y(y_coordinate, play_direction):
        if play_direction == 'left':
            return (160 / 3) - y_coordinate
        else:
            return y_coordinate
        
    def new_line(rush_team, field_position, yardline):
        if rush_team == field_position:
            # offense starting at X = 0 plus the 10 yard endzone plus the line of scrimmage
            return 10.0 + yardline
        else:
            # half the field plus the yards between midfield and the line of scrimmage
            return 60.0 + (50 - yardline)

    def new_orientation(angle, play_direction):
        if play_direction == 'left':
            #new_angle = np.mod(180 + angle, 360) # Rodrigo
            new_angle = 360.0 - angle
            if new_angle == 360.0:
                new_angle = 0.0
            return new_angle
        else:
            return angle

    def euclidean_distance(x1,y1,x2,y2):
        x_diff = (x1-x2)**2
        y_diff = (y1-y2)**2

        return np.sqrt(x_diff + y_diff)

    def back_direction(orientation):
        if orientation > 180.0:
            return 1
        else:
            return 0

    def update_yardline(df):
        new_yardline = df[df['NflId'] == df['NflIdRusher']]
        new_yardline['YardLine'] = new_yardline[['PossessionTeam','FieldPosition','YardLine']].apply(lambda x: new_line(x[0],x[1],x[2]), axis=1)
        new_yardline = new_yardline[['GameId','PlayId','YardLine']]

        return new_yardline

    def update_orientation(df, yardline):
        df['X'] = df[['X','PlayDirection']].apply(lambda x: new_X(x[0],x[1]), axis=1)
        #df['Y'] = df[['Y','PlayDirection']].apply(lambda x: new_Y(x[0],x[1]), axis=1)
        df['Orientation'] = df[['Orientation','PlayDirection']].apply(lambda x: new_orientation(x[0],x[1]), axis=1)
        df['Dir'] = df[['Dir','PlayDirection']].apply(lambda x: new_orientation(x[0],x[1]), axis=1)

        df = df.drop('YardLine', axis=1)
        df = pd.merge(df, yardline, on=['GameId','PlayId'], how='inner')

        return df

    def back_features(df):
        carriers = df[df['NflId'] == df['NflIdRusher']][['GameId','PlayId','NflIdRusher','X','Y','Orientation','Dir','YardLine']]
        carriers['back_from_scrimmage'] = carriers['YardLine'] - carriers['X']
        carriers['back_oriented_down_field'] = carriers['Orientation'].apply(lambda x: back_direction(x))
        carriers['back_moving_down_field'] = carriers['Dir'].apply(lambda x: back_direction(x))
        carriers = carriers.rename(columns={'X':'back_X',
                                            'Y':'back_Y'})
        carriers = carriers[['GameId','PlayId','NflIdRusher','back_X','back_Y','back_from_scrimmage','back_oriented_down_field','back_moving_down_field']]

        return carriers

    def features_relative_to_back(df, carriers):
        player_distance = df[['GameId','PlayId','NflId','X','Y']]
        player_distance = pd.merge(player_distance, carriers, on=['GameId','PlayId'], how='inner')
        player_distance = player_distance[player_distance['NflId'] != player_distance['NflIdRusher']]
        player_distance['dist_to_back'] = player_distance[['X','Y','back_X','back_Y']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)

        player_distance = player_distance.groupby(['GameId','PlayId','back_from_scrimmage','back_oriented_down_field','back_moving_down_field'])\
                                         .agg({'dist_to_back':['min','max','mean','std']})\
                                         .reset_index()
        player_distance.columns = ['GameId','PlayId','back_from_scrimmage','back_oriented_down_field','back_moving_down_field',
                                   'min_dist','max_dist','mean_dist','std_dist']

        return player_distance

    def defense_features(df):
        rusher = df[df['NflId'] == df['NflIdRusher']][['GameId','PlayId','Team','X','Y']]
        rusher.columns = ['GameId','PlayId','RusherTeam','RusherX','RusherY']

        defense = pd.merge(df,rusher,on=['GameId','PlayId'],how='inner')
        defense = defense[defense['Team'] != defense['RusherTeam']][['GameId','PlayId','X','Y','RusherX','RusherY']]
        defense['def_dist_to_back'] = defense[['X','Y','RusherX','RusherY']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)

        defense = defense.groupby(['GameId','PlayId'])\
                         .agg({'def_dist_to_back':['min','max','mean','std']})\
                         .reset_index()
        defense.columns = ['GameId','PlayId','def_min_dist','def_max_dist','def_mean_dist','def_std_dist']

        return defense

    def static_features(df):
        static_features = df[df['NflId'] == df['NflIdRusher']][['GameId','PlayId','X','Y','S','A','Dis','Orientation','Dir',
                                                            'YardLine','Quarter','Down','Distance','DefendersInTheBox']].drop_duplicates()
        static_features['DefendersInTheBox'] = static_features['DefendersInTheBox'].fillna(np.mean(static_features['DefendersInTheBox']))

        return static_features


    def combine_features(relative_to_back, defense, static, deploy=deploy):
        df = pd.merge(relative_to_back,defense,on=['GameId','PlayId'],how='inner')
        df = pd.merge(df,static,on=['GameId','PlayId'],how='inner')

        if not deploy:
            df = pd.merge(df, outcomes, on=['GameId','PlayId'], how='inner')

        return df
    
    yardline = update_yardline(df)
    df = update_orientation(df, yardline)
    back_feats = back_features(df)
    rel_back = features_relative_to_back(df, back_feats)
    def_feats = defense_features(df)
    static_feats = static_features(df)
    basetable = combine_features(rel_back, def_feats, static_feats, deploy=deploy)
    
    return basetable

In [35]:
%%time
train = create_features(train,False)

CPU times: user 1min 24s, sys: 2.29 s, total: 1min 27s
Wall time: 1min 22s


In [36]:
train.head()

Unnamed: 0,GameId,PlayId,back_from_scrimmage,back_oriented_down_field,back_moving_down_field,min_dist,max_dist,mean_dist,std_dist,def_min_dist,def_max_dist,def_mean_dist,def_std_dist,X,Y,S,A,Dis,Orientation,Dir,YardLine,Quarter,Down,Distance,DefendersInTheBox,Yards
0,2017090700,20170907000118,3.75,1,0,1.449724,22.415872,8.046559,4.873845,4.59331,22.415872,9.752491,5.327299,41.25,30.53,3.63,3.35,0.38,198.02,114.26,45.0,1,3,2,6.0,8
1,2017090700,20170907000139,4.07,0,0,0.792023,23.025872,8.614623,5.598683,4.287773,23.025872,10.297028,5.833217,48.93,27.16,3.06,2.41,0.34,149.3,47.8,53.0,1,1,10,6.0,3
2,2017090700,20170907000189,3.66,1,0,1.64639,20.726285,8.482583,4.642121,4.22167,20.726285,9.903689,5.07329,71.34,19.11,5.77,2.42,0.6,219.18,138.04,75.0,1,1,10,7.0,5
3,2017090700,20170907000345,3.53,0,0,0.918096,9.791231,5.549379,1.983128,4.528002,9.791231,6.309354,1.834174,104.47,25.36,4.45,3.2,0.46,173.78,84.56,108.0,1,2,2,9.0,2
4,2017090700,20170907000395,5.01,0,0,0.502892,21.214806,9.168819,5.611232,4.288088,21.214806,11.056456,5.900009,29.99,27.12,3.9,2.53,0.44,34.27,157.92,35.0,1,1,10,7.0,7


In [37]:
train.shape

(23171, 26)

In [38]:
features = list(train.columns)
print(len(features),'FEATURES.')
np.array(features)

26 FEATURES.


array(['GameId', 'PlayId', 'back_from_scrimmage',
       'back_oriented_down_field', 'back_moving_down_field', 'min_dist',
       'max_dist', 'mean_dist', 'std_dist', 'def_min_dist',
       'def_max_dist', 'def_mean_dist', 'def_std_dist', 'X', 'Y', 'S',
       'A', 'Dis', 'Orientation', 'Dir', 'YardLine', 'Quarter', 'Down',
       'Distance', 'DefendersInTheBox', 'Yards'], dtype='<U24')

# Criação e Validação dos Modelos de ML

In [39]:
# Fazendo uma limpeza na memoria
gc.collect()

248

In [40]:
def _EvalFunction(labels,predictions) :
    n = np.arange(-99, 100)
    n = np.row_stack([n] * predictions.shape[0])
    ym = labels.reshape(predictions.shape[0], 1)
    step_ym = np.heaviside(n - ym, 1)
    yn = predictions.reshape(labels.shape[0], 1)
    step_yn = np.heaviside(n - yn, 1)
    inner_sum = np.power(step_yn - step_ym, 2)
    inner_sum = inner_sum.sum(axis=1)
    total = inner_sum.sum() / (199 * predictions.shape[0])
                           
    return 'CRPS', total, False

In [50]:
## Split data into train/test
X = train.copy()
y_tr_ = np.array(train["Yards"])

y = np.zeros(len(y_tr_),dtype=np.float)
for i in range(len(y)):
    y[i]=(y_tr_[i])    
    
X.drop(['GameId','PlayId','Yards'], axis=1, inplace=True)

In [51]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [53]:
X.shape, y.shape

((23171, 23), (23171,))

In [55]:
from sklearn.model_selection import KFold
nfold = 5
folds = KFold(n_splits=nfold, shuffle=False, random_state=42)

print('-'*20)
print(str(nfold) + ' Folds training...')
print('-'*20)

--------------------
5 Folds training...
--------------------


In [66]:
best_params_lgb = {
    "boosting": "gbdt",
    "verbosity": -1,
    "num_leaves":3,
    "min_data_in_leaf": 10,
    "max_depth": -1,
    "learning_rate": 0.0005,
    "bagging_freq": 4,
    "bagging_fraction": 0.1,
    "bagging_seed": 11,
    "feature_fraction" : 1,
    "random_seed": 19,
    "metric": "rmse",
    "boost_from_average" : False
}

In [69]:
best_params_xgb = {'learning_rate':0.01,
                       'n_estimators':6000,
                       'max_depth':4,
                       'min_child_weight':0,
                       'gamma':0.6,
                       'subsample':0.7,
                       'colsample_bytree':0.7,
                       'objective':'reg:linear',
                       'nthread':-1,
                       'scale_pos_weight':1,
                       'seed':27,
                       'reg_alpha':0.00006,
                       'random_state':42
                  }

In [67]:
oof = np.zeros(len(X))
feature_importance_df = pd.DataFrame()

tr_mae = []
val_mae = []
models = []

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    strLog = "fold {}".format(fold_)
    print(strLog)
    
    X_tr, X_val = train.iloc[trn_idx][features], train.iloc[val_idx][features]
    train_y, y_val = y[trn_idx], y[val_idx]

    model = lgb.LGBMRegressor(**best_params_lgb, n_estimators = 180, n_jobs = -1)
    #model = lgb.LGBMRegressor(learning_rate=0.01, n_estimators=200)
    model.fit(X_tr, 
              train_y, 
              eval_set=[(X_tr, train_y), (X_val, y_val)], 
              eval_metric=_EvalFunction,
              verbose=10, 
              early_stopping_rounds=50,
              
             )
    oof[val_idx] = model.predict(X_val)
    val_score = mean_absolute_error(y_val, oof[val_idx])
    val_mae.append(val_score)
    tr_score = mean_absolute_error(train_y, model.predict(X_tr))
    tr_mae.append(tr_score)
    models.append(model)

fold 0
Training until validation scores don't improve for 50 rounds
[10]	training's rmse: 7.64526	training's CRPS: 0.0208371	valid_1's rmse: 7.72931	valid_1's CRPS: 0.020352
[20]	training's rmse: 7.61557	training's CRPS: 0.0208371	valid_1's rmse: 7.7006	valid_1's CRPS: 0.020352
[30]	training's rmse: 7.58502	training's CRPS: 0.0208371	valid_1's rmse: 7.66996	valid_1's CRPS: 0.020352
[40]	training's rmse: 7.5549	training's CRPS: 0.0208371	valid_1's rmse: 7.64097	valid_1's CRPS: 0.020352
[50]	training's rmse: 7.5262	training's CRPS: 0.0208371	valid_1's rmse: 7.61383	valid_1's CRPS: 0.020352
Early stopping, best iteration is:
[1]	training's rmse: 7.67238	training's CRPS: 0.0208371	valid_1's rmse: 7.75522	valid_1's CRPS: 0.020352
fold 1
Training until validation scores don't improve for 50 rounds
[10]	training's rmse: 7.73186	training's CRPS: 0.0208945	valid_1's rmse: 7.36841	valid_1's CRPS: 0.0201222
[20]	training's rmse: 7.70245	training's CRPS: 0.0208945	valid_1's rmse: 7.34042	valid_1's

[90]	training's rmse: 7.29753	training's CRPS: 0.0204651	valid_1's rmse: 7.92951	valid_1's CRPS: 0.0215818
[100]	training's rmse: 7.2704	training's CRPS: 0.0204529	valid_1's rmse: 7.90105	valid_1's CRPS: 0.0215612
[110]	training's rmse: 7.24227	training's CRPS: 0.0204163	valid_1's rmse: 7.87093	valid_1's CRPS: 0.0215102
[120]	training's rmse: 7.21424	training's CRPS: 0.0203252	valid_1's rmse: 7.84127	valid_1's CRPS: 0.0214105
[130]	training's rmse: 7.18598	training's CRPS: 0.0202724	valid_1's rmse: 7.81105	valid_1's CRPS: 0.0213541
[140]	training's rmse: 7.15844	training's CRPS: 0.0201921	valid_1's rmse: 7.78074	valid_1's CRPS: 0.0212717
[150]	training's rmse: 7.13024	training's CRPS: 0.0201314	valid_1's rmse: 7.75111	valid_1's CRPS: 0.0211925
[160]	training's rmse: 7.10178	training's CRPS: 0.0201225	valid_1's rmse: 7.72122	valid_1's CRPS: 0.0211903
[170]	training's rmse: 7.07534	training's CRPS: 0.0201154	valid_1's rmse: 7.69364	valid_1's CRPS: 0.0211773
[180]	training's rmse: 7.04844

## Evaluation

In [68]:
mean_mae_tr = np.mean(tr_mae)
std_mae_tr =  np.std(tr_mae)

mean_mae_val =  np.mean(val_mae)
std_mae_val =  np.std(val_mae)

all_mae = mean_absolute_error(oof,y_tr)

print('-'*20)
print("Train's Score")
print('-'*20,'\n')
print("Mean mae: %.5f, std: %.5f." % (mean_mae_tr, std_mae_tr),'\n')

print('-'*20)
print("Validation's Score")
print('-'*20,'\n')
print("Mean mae: %.5f, std: %.5f." % (mean_mae_val, std_mae_val),'\n')

print("All mae: %.5f." % (all_mae))
print('-'*20)
print("CRPS Score")
print('-'*20,'\n')
print(_EvalFunction(y_tr,oof)[1])

--------------------
Train's Score
-------------------- 

Mean mae: 4.49075, std: 0.13249. 

--------------------
Validation's Score
-------------------- 

Mean mae: 4.48830, std: 0.18263. 

All mae: 4.48831.
--------------------
CRPS Score
-------------------- 

0.02037831468854349


# Realizando a submissão

In [None]:
pd.options.mode.chained_assignment = None
index = 21
for (df_test, sample_prediction_df) in tqdm_notebook(env.iter_test()):
    df_test = create_features(df_test,True)
    
    count=0
    test_data = np.zeros((1,len(features)))

    for c in features:
        if c in df_test:
            try:
                test_data[0][count] = df_test[c][index]
            except:
                test_data[0][count] = np.nan
            count+=1
    
    y_pred = np.zeros(199)        
    y_pred_p = np.mean([model.predict(test_data)[0] for model in models])
    y_pred_p += 99
    for j in range(199):
        if j>=y_pred_p+10:
            y_pred[j]=1.0
        elif j>=y_pred_p-10:
            y_pred[j]=(j+10-y_pred_p)*0.05
    env.predict(pd.DataFrame(data=[y_pred],columns=sample_prediction_df.columns))
    index += 22
env.write_submission_file()