# NFL Big Data Bowl - Feature Engineering v1
-------------------
TheNerdyCat <br>
27 Nov 2019 Deadline


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.patches as patches
import seaborn as sns

import kaggle
import math
import datetime
import random

from sklearn import preprocessing
from sklearn.model_selection import KFold, train_test_split
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
import gc
#import optuna

from sklearn.metrics import mean_absolute_error

import tqdm
import warnings
warnings.filterwarnings('ignore')

sns.set_style('darkgrid')
mpl.rcParams['figure.figsize'] = [15,10]
pd.options.display.max_columns = 100

In [None]:
df = pd.read_csv("../input/nfl-big-data-bowl-2020/train.csv", low_memory=False)

In [None]:
def preprocess(df):#
    
    def clean_position(pos):
        """Collapse position aliases onto a single canonical label.

        'SAF' and 'S' become 'DB', 'OG' becomes 'G' and 'OT' becomes 'T';
        any other value is returned unchanged.
        """
        if pos in ('SAF', 'S'):
            return 'DB'
        if pos == 'OG':
            return 'G'
        return 'T' if pos == "OT" else pos
    
    def clean_offenceformation(of):
        """Encode an offense formation name as an integer code.

        Any value that is not one of the listed names (including a
        missing value) is encoded as 7.
        """
        formation_codes = {
            "SHOTGUN": 9,
            "SINGLEBACK": 8,
            "JUMBO": 6,
            "PISTOL": 5,
            "I_FORM": 4,
            "ACE": 3,
            "WILDCAT": 2,
            "EMPTY": 1,
        }
        return formation_codes.get(of, 7)
    
    def create_generalposition(pos):
        """Map a detailed position code onto a coarse position group.

        Returns one of 'DB', 'DL', 'LB', 'WR', 'TE', 'OL', 'OB', or
        'Other' when the code is not listed in any group.
        """
        position_groups = (
            ('DB', ('SS', 'FS', 'CB', 'DB')),
            ('DL', ('DE', 'DT', 'DL')),
            ('LB', ('ILB', 'OLB', 'MLB', 'LB')),
            ('WR', ('WR',)),
            ('TE', ('TE',)),
            ('OL', ('T', 'G', 'C', 'NT', 'OL')),
            ('OB', ('QB', 'RB', 'FB', 'HB', 'TB', 'WB')),
        )
        for group, members in position_groups:
            if pos in members:
                return group
        return 'Other'

    def utc2sec(x):
        """Convert a 'YYYY-MM-DDTHH:MM:SS.fffZ' timestamp to absolute whole seconds.

        The previous implementation returned only the seconds-of-minute
        field, so the difference of two timestamps wrapped around whenever
        they straddled a minute (or hour/day) boundary. The value returned
        here counts seconds from proleptic-Gregorian ordinal day 0, so
        differences between two timestamps are always correct.

        Args:
            x (str): timestamp such as '2017-09-08T00:44:06.000Z'.

        Returns:
            int: whole seconds; fractional seconds are truncated.
        """
        date_part, time_part = x.split("T")
        year, month, day = (int(piece) for piece in date_part.split("-"))
        hours, minutes, seconds = time_part.split(":")[:3]
        # Drop the fractional part and a trailing 'Z' if there is no fraction.
        whole_seconds = int(seconds.split(".")[0].rstrip("Z"))
        days = datetime.date(year, month, day).toordinal()
        return ((days * 24 + int(hours)) * 60 + int(minutes)) * 60 + whole_seconds
    
    def timesnap2date(x):
        """Return the text before the first 'T' of a timestamp string (its date part)."""
        return x.split("T")[0]
    
    def timesnap2day(x):
        """Turn a 'YYYY-MM-DD...' string into an approximate day count.

        Uses 365 days per year and 30 days per month, so the result is only
        meaningful relative to other values computed with the same
        approximation.
        """
        parts = x.split("-")
        year = int(parts[0])
        month = int(parts[1])
        day = int(parts[2][:2])
        return 365 * year + 30 * month + day
        
    def gameclock2secs(x):
        """Convert a colon-separated clock string to seconds.

        The first field is taken as minutes and the second as seconds;
        any further fields are ignored.
        """
        fields = x.split(":")
        minutes = int(fields[0])
        seconds = int(fields[1])
        return minutes * 60 + seconds
        
    def group_stadium_types(stadium):
        """Encode a free-text stadium type as an integer category.

        Codes: 0 outdoor, 1 dome open, 2 indoor open, 3 indoor closed,
        4 dome closed, 5 unknown (anything not listed, including missing).
        The spellings (typos included) are matched exactly.
        """
        # Checked in this order; the first group containing the value wins.
        coded_groups = (
            (0, (  # outdoor
                'Outdoor', 'Outdoors', 'Cloudy', 'Heinz Field', 'Outdor', 'Ourdoor',
                'Outside', 'Outddors', 'Outdoor Retr Roof-Open', 'Oudoor', 'Bowl',
            )),
            (3, (  # indoor closed
                'Indoors', 'Indoor', 'Indoor, Roof Closed', 'Indoor, Roof Closed',
                'Retractable Roof', 'Retr. Roof-Closed', 'Retr. Roof - Closed',
                'Retr. Roof Closed',
            )),
            (2, (  # indoor open
                'Indoor, Open Roof', 'Open', 'Retr. Roof-Open', 'Retr. Roof - Open',
            )),
            (4, (  # dome closed
                'Dome', 'Domed, closed', 'Closed Dome', 'Domed', 'Dome, closed',
            )),
            (1, (  # dome open
                'Domed, Open', 'Domed, open',
            )),
        )
        for code, names in coded_groups:
            if stadium in names:
                return code
        return 5  # unknown
        
    def group_game_weather(weather):
        """Encode a free-text weather description as an integer category.

        Codes: -2 snow, -1 rain, 0 none (indoor), 1 overcast, 2 clear.
        Descriptions are matched exactly (typos included); anything that is
        not listed yields None.
        """
        # Checked in this order; the first group containing the value wins.
        coded_groups = (
            (-1, (  # rain
                'Rainy', 'Rain Chance 40%', 'Showers',
                'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
                'Scattered Showers', 'Cloudy, Rain', 'Rain shower', 'Light Rain', 'Rain',
            )),
            (1, (  # overcast
                'Cloudy, light snow accumulating 1-3"', 'Party Cloudy', 'Cloudy, chance of rain',
                'Coudy', 'Cloudy, 50% change of rain', 'Rain likely, temps in low 40s.',
                'Cloudy and cold', 'Cloudy, fog started developing in 2nd quarter',
                'Partly Clouidy', '30% Chance of Rain', 'Mostly Coudy', 'Cloudy and Cool',
                'cloudy', 'Partly cloudy', 'Overcast', 'Hazy', 'Mostly cloudy', 'Mostly Cloudy',
                'Partly Cloudy', 'Cloudy',
            )),
            (2, (  # clear
                'Partly clear', 'Sunny and clear', 'Sun & clouds', 'Clear and Sunny',
                'Sunny and cold', 'Sunny Skies', 'Clear and Cool', 'Clear and sunny',
                'Sunny, highs to upper 80s', 'Mostly Sunny Skies', 'Cold',
                'Clear and warm', 'Sunny and warm', 'Clear and cold', 'Mostly sunny',
                'T: 51; H: 55; W: NW 10 mph', 'Clear Skies', 'Clear skies', 'Partly sunny',
                'Fair', 'Partly Sunny', 'Mostly Sunny', 'Clear', 'Sunny',
            )),
            (-2, (  # snow
                'Heavy lake effect snow', 'Snow',
            )),
            (0, (  # none
                'N/A Indoor', 'Indoors', 'Indoor', 'N/A (Indoors)', 'Controlled Climate',
            )),
        )
        for code, descriptions in coded_groups:
            if weather in descriptions:
                return code
        return None
        
    def clean_wind_speed(windspeed):
        """Parse a messy wind-speed value into a number.

        Handles plain integers ('10'), numeric values that arrive as floats
        (8.0 or '8.0'), ranges ('12-22', '10-20 mph' -> mean of the bounds),
        values with a unit ('6 mph', '10MPH') and values followed by extra
        text ('15 gusts up to 25' -> 15).

        Anything that cannot be parsed (e.g. 'Calm', a compass direction,
        NaN) yields 0 instead of raising.

        Returns:
            int or float: int for whole numbers, float for range means and
            fractional values.
        """
        ws = str(windspeed).strip().lower()
        if ws.isdigit():
            return int(ws)

        # Numeric columns are parsed as floats, so 8.0 arrives as '8.0';
        # the digit check above misses it and it used to become 0.
        try:
            as_float = float(ws)
        except ValueError:
            as_float = None
        if as_float is not None:
            if not math.isfinite(as_float):
                return 0  # NaN / inf: treat as missing
            return int(as_float) if as_float.is_integer() else as_float

        # Drop the unit so '10mph', '6 mph' and '10-20 mph' parse alike.
        ws = ws.replace('mph', ' ').strip()
        if ws.isdigit():
            return int(ws)

        # A range: take the mean of its two bounds.
        if '-' in ws:
            bounds = ws.split('-')
            try:
                return (int(bounds[0]) + int(bounds[1])) / 2
            except ValueError:
                return 0

        # A leading number followed by free text.
        first_word = ws.split(' ')[0]
        if first_word.isdigit():
            return int(first_word)
        return 0
            
    def clean_wind_direction(wind_direction):
        """Encode a free-text wind direction as an angle in degrees.

        Encoding (unchanged): east 0, north-east 45, north 90,
        north-west 135, west 180, south-west 225, south 270, south-east 315.
        A value written as 'From X' is encoded as the direction opposite to X.

        Fixes over the previous version:
        * 'FROM S' was tested with a substring match before 'FROM SW' /
          'FROM SSE' etc., so every compound 'From ...' value was encoded
          as a plain cardinal direction and the compound branches were
          unreachable.
        * Full cardinal words ('North', 'East', ...) and 'Northeast'
          returned None.

        Returns:
            int or None: the angle, or None when no direction is recognised
            (e.g. 'Calm', a number, NaN).
        """
        wd = str(wind_direction).upper().strip()

        # Parse the token after 'FROM ' and flip it by 180 degrees at the end.
        blowing_from = 'FROM ' in wd
        if blowing_from:
            wd = wd.split('FROM ', 1)[1].strip()

        if wd in ('N', 'NORTH'):
            angle = 90
        elif wd in ('S', 'SOUTH'):
            angle = 270
        elif wd in ('W', 'WEST'):
            angle = 180
        elif wd in ('E', 'EAST'):
            angle = 0
        elif 'NW' in wd or 'NORTHWEST' in wd:
            angle = 135
        elif 'NE' in wd or 'NORTH EAST' in wd or 'NORTHEAST' in wd:
            angle = 45
        elif 'SW' in wd or 'SOUTHWEST' in wd:
            angle = 225
        elif 'SE' in wd or 'SOUTHEAST' in wd:
            angle = 315
        else:
            return None

        return (angle + 180) % 360 if blowing_from else angle
            
    def birthday2day(x):
        """Turn a slash-separated date into an approximate day count.

        The three fields are weighted 30, 1 and 365 respectively, i.e. the
        string is read as month/day/year with 30-day months and 365-day years.
        """
        fields = x.split("/")
        month = int(fields[0])
        day = int(fields[1])
        year = int(fields[2])
        return 30 * month + day + 365 * year
    
    def height2inch(x):
        """Convert a 'feet-inches' string such as '6-2' to total inches."""
        parts = x.split("-")
        feet = int(parts[0])
        inches = int(parts[1])
        return feet * 12 + inches
    
    def uid_aggregation(comb, main_columns, uids, aggregations):
        """Build group-level aggregate features, broadcast back to every row.

        For each (value column, id column, aggregation) triple, the value
        column is aggregated per id and the result is mapped onto each row
        through that row's id. Columns are named '<id>_<value>_<agg>'.

        Args:
            comb: DataFrame holding the id and value columns.
            main_columns: names of the value columns to aggregate.
            uids: names of the id columns to group by.
            aggregations: aggregation names accepted by groupby().agg().

        Returns:
            DataFrame with one column per triple.
        """
        X = pd.DataFrame()
        for main_column in main_columns:
            for col in uids:
                pair = comb[[col, main_column]]
                for agg_type in aggregations:
                    new_col_name = '_'.join([col, main_column, agg_type])
                    stats = pair.groupby([col])[main_column].agg([agg_type])
                    # id value -> aggregated value
                    lookup = dict(zip(stats.index, stats[agg_type]))
                    X[new_col_name] = comb[col].map(lookup)
                    del stats, lookup
                    gc.collect()
        return X
    
    # ---- preprocess() body, part 1: per-row flags and team abbreviations ----
    # NOTE: these assignments modify the caller's DataFrame in place.
    df['ToLeft'] = df.PlayDirection == "left"
    # Match the NFLId to that play's rusher's ID
    df['IsBallCarrier'] = df.NflId == df.NflIdRusher 
    
    # Correct differences in Team Name abbreviations.
    # The mapping used to be built and never applied, so home/visitor
    # abbreviations like 'ARI' never matched a PossessionTeam of 'ARZ' and
    # those plays were always labelled "away" below.
    map_abbr = {'ARI': 'ARZ', 'BAL': 'BLT', 'CLE': 'CLV', 'HOU': 'HST'}
    # Any spelling PossessionTeam already uses is kept as-is.
    for abb in df['PossessionTeam'].dropna().unique():
        map_abbr[abb] = abb
    # replace() (not map()) leaves abbreviations missing from the dict
    # untouched; this matters for single-play frames, where PossessionTeam
    # contains only one team.
    df['HomeTeamAbbr'] = df['HomeTeamAbbr'].replace(map_abbr)
    df['VisitorTeamAbbr'] = df['VisitorTeamAbbr'].replace(map_abbr)
    
    # New feature to show Dir in radians
    # (90 - Dir) mod 360 re-references the angle before converting to radians.
    df['Dir_rad'] = np.mod(90 - df.Dir, 360) * math.pi/180.0
    df['TeamOnOffense'] = "home"
    df.loc[df.PossessionTeam != df.HomeTeamAbbr, 'TeamOnOffense'] = "away"
    
    # IsOnOffense
    df['IsOnOffense'] = df.Team == df.TeamOnOffense 
    
    # ---- preprocess() body, part 2: standardised coordinates, team/score
    # features, position counts and time features ----

    # YardLine_std: keep YardLine when the ball is on the possession team's
    # side of the field (FieldPosition == PossessionTeam), else 100 - YardLine.
    df['YardLine_std'] = 100 - df.YardLine
    df.loc[df.FieldPosition.fillna('') == df.PossessionTeam,  
              'YardLine_std'
             ] = df.loc[df.FieldPosition.fillna('') == df.PossessionTeam,  
              'YardLine']
    # Mirror X and Y for plays moving left (120 and 160/3 are presumably the
    # field length and width in yards -- confirm against the data dictionary).
    df['X_std'] = df.X
    df.loc[df.ToLeft, 'X_std'] = 120 - df.loc[df.ToLeft, 'X'] 
    df['Y_std'] = df.Y
    df.loc[df.ToLeft, 'Y_std'] = 160/3 - df.loc[df.ToLeft, 'Y'] 
    # NOTE(review): unlike Dir_std, Orientation_std is not flipped for
    # left-moving plays -- confirm this is intended.
    df['Orientation_std'] = -90 + df.Orientation
    # Rotate the direction by pi for plays moving left.
    df['Dir_std'] = df['Dir_rad']
    df.loc[df.ToLeft, 'Dir_std'] = np.mod(np.pi + df.loc[df.ToLeft, 'Dir_rad'], 2*np.pi)
    
    # Clean Position feature
    df['Position'] = df['Position'].apply(clean_position)
    
    # OffenseTeam & DefenseTeam
    # rename() returns a new DataFrame: from here on the caller's frame is
    # no longer modified.
    df = df.rename(columns = {'PossessionTeam':'OffenseTeam'}) 
    df['DefenseTeam'] = df['VisitorTeamAbbr']
    df.loc[df.TeamOnOffense == 'away', 'DefenseTeam'] = df['HomeTeamAbbr']
    
    # IsOffenseAtHome
    df['IsOffenseAtHome'] = True
    df.loc[df.TeamOnOffense == 'away', 'IsOffenseAtHome'] = False
    
    # OffenseScore
    df['OffenseScore'] = df['HomeScoreBeforePlay']
    df.loc[df.TeamOnOffense == 'away', 'OffenseScore'] = df['VisitorScoreBeforePlay']
    
    # DefenseScore
    df['DefenseScore'] = df['VisitorScoreBeforePlay']
    df.loc[df.TeamOnOffense == 'away', 'DefenseScore'] = df['HomeScoreBeforePlay']
    
    # IsOffenseWinning
    # Strictly greater: a tied score counts as not winning.
    df['IsOffenseWinning'] = False
    df.loc[df.OffenseScore > df.DefenseScore, 'IsOffenseWinning'] = True
    
    # OffenseInOwnTerritory
    df['OffenseInOwnTerritory'] = False
    df.loc[df.FieldPosition == df.OffenseTeam, 'OffenseInOwnTerritory'] = True
    
    # OffenseRushingPosition
    # Take the rusher's Position for each play and attach it to every row
    # of that play.
    play_rushers = df.loc[df.NflIdRusher == df.NflId, ['PlayId', 'Position']]
    play_rushers = play_rushers.rename(columns={'Position': 'OffenseRushingPosition'})
    df = df.merge(play_rushers, how='left', left_on='PlayId', right_on='PlayId')
    
    # OffenceFormation
    df['OffenseFormation'] = df['OffenseFormation'].apply(clean_offenceformation)
    # clean_offenceformation already returns 7 for unmatched values, so this
    # fillna is only a safety net.
    df['OffenseFormation'] = df['OffenseFormation'].fillna(7)
    
    # NumberOfTEsOnPlay, NumberOfWRsOnPlay, NumberOfBacksOnPlay, ....
    df['GeneralPosition'] = df['Position'].apply(create_generalposition)
    # Pivot to find counts of each general position
    gen_pos_counts = df[['PlayId','GeneralPosition']].pivot_table(index='PlayId', columns='GeneralPosition', 
                                                                  aggfunc=len, fill_value=0)
    # Groups not listed in this mapping (e.g. 'Other') keep their raw name.
    gen_pos_counts = gen_pos_counts.rename(columns = 
                          {'DB':'NumberOfDBsOnPlay', 'DL':'NumberOfDLinemenOnPlay', 
                           'LB':'NumberOfLBsOnPlay', 'OB':'NumberOfBacksOnPlay',
                           'OL':'NumberOfOLinemenOnPlay', 'TE':'NumberOfTEsOnPlay',
                           'WR':'NumberOfWRsOnPlay'})
    df = df.merge(gen_pos_counts, how='left', left_on='PlayId', right_on='PlayId')
    
    # DefendersInTheBox
    # The median comes from the frame being processed, so it differs between
    # the training frame and a single-play frame.
    df['DefendersInTheBox'] = df['DefendersInTheBox'].fillna(df['DefendersInTheBox'].median())
    
    # TimeBetweenSnapHandoff, Month, ...
    # Difference of the utc2sec() values of the two timestamps.
    df['TimeBetweenSnapHandoff'] = df['TimeHandoff'].apply(utc2sec) - df['TimeSnap'].apply(utc2sec)
    # MatchDay is the approximate day count from timesnap2day.
    df['MatchDay'] = df['TimeSnap'].apply(timesnap2day)
    df['DayOfYear'] = pd.to_datetime(df['TimeSnap'].apply(timesnap2date)).dt.dayofyear
    df['DayOfWeek'] = pd.to_datetime(df['TimeSnap'].apply(timesnap2date)).dt.dayofweek
    # Characters 5:7 of the timestamp are the month and 11:13 the hour, as
    # written in TimeSnap (the helper name utc2sec suggests UTC -- confirm).
    df['MonthOfYear'] = df['TimeSnap'].apply(lambda x : int(x[5:7]))
    df['Morning'] = df['TimeSnap'].apply(lambda x : 1 if (int(x[11:13]) >=0 and int(x[11:13]) <12) else 0)
    df['Afternoon'] = df['TimeSnap'].apply(lambda x : 1 if (int(x[11:13]) <18 and int(x[11:13]) >=12) else 0)
    df['Evening'] = df['TimeSnap'].apply(lambda x : 1 if (int(x[11:13]) >= 18 and int(x[11:13]) < 24) else 0)

    # QuarterGameSecs, TotalGameSecsPlayed, HalfGameSecs
    # QuarterGameSecs is the GameClock value in seconds; the formulas below
    # treat every quarter as 900 seconds long.
    df['QuarterGameSecs'] = df['GameClock'].apply(gameclock2secs)
    df['TotalGameSecsPlayed'] = (900 - df['QuarterGameSecs']) + ((df['Quarter'] - 1) * 900)
    # In quarters 1 and 3, add 900 seconds for the other quarter of that half.
    df['HalfGameSecsLeft'] = df['QuarterGameSecs']
    df.loc[(df['Quarter'].isin([1,3])), 'HalfGameSecsLeft'] = (900 + df['QuarterGameSecs'])
    
    # IsInEngland
    df['IsInEngland'] = df["Location"].str.lower().map(lambda x: True if "london" in x else False)
    
    # StadiumType
    # from https://www.kaggle.com/code1110/optimizing-lightgbm-hyperparameters
    df['StadiumType'] = df['StadiumType'].apply(group_stadium_types)
    
    # Turf
    # from https://www.kaggle.com/c/nfl-big-data-bowl-2020/discussion/112681#latest-649087
    Turf = {'Field Turf':'Artificial', 'A-Turf Titan':'Artificial', 'Grass':'Natural', 'UBU Sports Speed S5-M':'Artificial', 
            'Artificial':'Artificial', 'DD GrassMaster':'Artificial', 'Natural Grass':'Natural', 
            'UBU Speed Series-S5-M':'Artificial', 'FieldTurf':'Artificial', 'FieldTurf 360':'Artificial', 'Natural grass':'Natural', 'grass':'Natural', 
            'Natural':'Natural', 'Artifical':'Artificial', 'FieldTurf360':'Artificial', 'Naturall Grass':'Natural', 'Field turf':'Artificial', 
            'SISGrass':'Artificial', 'Twenty-Four/Seven Turf':'Artificial', 'natural grass':'Natural'} 
    df['Turf'] = df['Turf'].map(Turf)
    df['Turf'] = df['Turf'].map({"Natural": 0,"Artificial": 1})
    
    # GameWeather
    # https://www.kaggle.com/code1110/optimizing-lightgbm-hyperparameters
    df['GameWeather'] = df['GameWeather'].apply(group_game_weather)
        
    # Temperature 
    df['Temperature'] = df['Temperature'].fillna(df['Temperature'].median())
    
    # Humidity
    df['Humidity'] = df['Humidity'].fillna(df['Humidity'].median())
    
    # WindSpeed
    df['WindSpeed'] = df['WindSpeed'].apply(clean_wind_speed)
    
    # WindDirection
    # from https://www.kaggle.com/code1110/optimizing-lightgbm-hyperparameters
    df['WindDirection'] = df['WindDirection'].apply(clean_wind_direction)
    df['WindDirection'] = 2 * np.pi * (90 - df['WindDirection']) / 360
    
    # Team
    df['Team'] = df['Team'].map({"home": 0, "away": 1})
    
    # Dir
    df["Dir"] = np.mod(90 - df["Dir"].values, 360)
    
    # PlayerBirthDate
    df['PlayerBirthDate'] = df['PlayerBirthDate'].apply(birthday2day)
    
    # PlayerAge
    df['PlayerAge'] = df['MatchDay'] - df['PlayerBirthDate']
    
    # PlayDirection
    df['PlayDirection'] = df['PlayDirection'].map({'right': 1, 'left': -1})
    
    # PlayerWeight
    df['PlayerHeight'] = df['PlayerHeight'].apply(height2inch)
    
    # PlayerBMI
    df['PlayerBMI'] = df['PlayerWeight'] / df['PlayerHeight']
    
    # SecondsNeedToFirstDown
    # from https://www.kaggle.com/ryches/model-free-benchmark
    df['SecondsNeedToFirstDown'] = (df['Distance']*0.9144) / (df['Dis'].values + 0.01)
    
    # SecondsNeedToYardLine
    # from https://www.kaggle.com/ryches/model-free-benchmark
    df['SecondsNeedToYardLine'] = (df['YardLine']*0.9144) / (df['Dis'].values + 0.01)
    
    # DefendersInTheBox_vs_Distance
    # from https://www.kaggle.com/ryches/model-free-benchmark
    df['DefendersInTheBox_vs_Distance'] = df['DefendersInTheBox'] / df['Distance']
    
    # Start
    # from https://www.kaggle.com/sryo188558/cox-proportional-hazard-model
    df["Start"] = df["YardLine"]
    df.loc[(df["OffenseInOwnTerritory"] == 1) & (df["PlayDirection"] == 1), "Start"] = df.loc[(df["OffenseInOwnTerritory"] == 1) & (df["PlayDirection"] == 1), 
                                                                                       "YardLine"] + 10
    df.loc[(df["OffenseInOwnTerritory"] == 1) & (df["PlayDirection"] == -1), "Start"] = 120 - df.loc[(df["OffenseInOwnTerritory"] == 1) & (df["PlayDirection"] == -1), 
                                                                                       "YardLine"] - 10
    df.loc[(df["OffenseInOwnTerritory"] == 0) & (df["PlayDirection"] == 1), "Start"] = 120 - df.loc[(df["OffenseInOwnTerritory"] == 0) & (df["PlayDirection"] == 1), 
                                                                                       "YardLine"] - 10
    df.loc[(df["OffenseInOwnTerritory"] == 0) & (df["PlayDirection"] == -1), "Start"] = df.loc[(df["OffenseInOwnTerritory"] == 0) & (df["PlayDirection"] == -1), 
                                                                                       "YardLine"] + 10
    # Orientation 
    df['Orientation'] = 2 * np.pi * (90 - df['Orientation']) / 360
    
    # locX
    df['locX'] = (df['X'].values - df['Start'].values) * df['PlayDirection'].values
    
    # locY
    df['locY'] = df['Y'].values - 53.3 / 2
    
    # velX
    df['velX'] = df['S'].values * np.cos(df['Orientation'].values) * df['PlayDirection'].values
    
    # velY
    df['velY'] = df['S'].values * np.sin(df['Orientation'].values)
    
    # accX
    df['accX'] = df['A'].values * np.cos(df['Orientation'].values) * df['PlayDirection'].values
    
    # accY
    df['accY'] = df['A'].values * np.sin(df['Orientation'].values)
    
    # Aggregations by NflId (each player)
    i_cols = ['HomeScoreBeforePlay','VisitorScoreBeforePlay','YardLine']
    uids = ['NflId']
    aggregations = ['mean','std','median', 'max', 'min']
    df_agg = uid_aggregation(df, i_cols, uids, aggregations)
    df = pd.concat([df, df_agg], axis=1)
    
    # OffenseTeam & DefenseTeam dummies
    df_train = pd.get_dummies(df, prefix=['OffenseTeam', 'DefenseTeam'], 
                              columns=['OffenseTeam', 'DefenseTeam'])
    return df_train

In [None]:
df_train = preprocess(df)

In [None]:
# Columns that must never be used as model inputs: identifiers, raw text
# columns, columns already encoded elsewhere, and the target itself.
# Fixes: a missing comma used to fuse 'PlayDirection' and 'Team' into the
# single bogus name 'PlayDirectionTeam' (so neither was removed), and the
# target 'Yards' was left among the features (label leakage).
rm_cols = ['index','GameId','PlayId','NflId','PlayDirection',
           'Team','TeamOnOffense','HomeTeamAbbr','VisitorTeamAbbr',
           'HomeScoreBeforePlay','VisitorScoreBeforePlay', 
           'FieldPosition','DisplayName','TimeHandoff','TimeSnap',
           'DefensePersonnel', 'OffensePersonnel','GameClock',
           'Location','NflIdRusher','PlayerCollegeName','Stadium',
           'OffenseRushingPosition','Position','GeneralPosition',
           'OffenseTeam','DefenseTeam',
           'Yards']

# Non-numeric columns cannot be copied into the float training matrix.
cat = [f for f in df_train.columns
       if str(df_train[f].dtype) in ("object", "category")]

excluded = set(rm_cols) | set(cat)
features = [col for col in df_train.columns if col not in excluded]
#features = [c for c in df.columns.values if c not in rm_cols]
#df_train = df[features]
#print(df_train.shape)
#df_train.head()

In [None]:
print(len(features),'FEATURES.')
np.array(features)

In [None]:
X = df_train[features]

In [None]:
# from https://www.kaggle.com/hukuda222/nfl-simple-model-using-lightgbm
# One training row per play: keep the 22nd row (positions 21, 43, ...) of
# each 22-row play block. Slicing replaces the hard-coded row count
# (509762) and the per-cell Python loop; it yields the same matrix for the
# original data and works for any frame length.
train_data = df_train[features].iloc[21::22].values.astype(float)

In [None]:
X_train = pd.DataFrame(data=train_data,columns=features)
# Target: Yards taken from the same row of every play as the features
# (positions 21, 43, ...). `np.float` was removed in NumPy 1.24, so the
# builtin float is used for the dtype.
y_tr_ = df_train["Yards"].iloc[21::22].values
y_tr = y_tr_.astype(float)

In [None]:
def _EvalFunction(labels,predictions) :
    
    #predictions,labels = np.round(scaler.inverse_transform(predictions)),np.round(scaler.inverse_transform(labels))
    n = np.arange(-99, 100)
    n = np.row_stack([n] * predictions.shape[0])
    ym = labels.reshape(predictions.shape[0], 1)
    step_ym = np.heaviside(n - ym, 1)
    yn = predictions.reshape(labels.shape[0], 1)
    step_yn = np.heaviside(n - yn, 1)
    inner_sum = np.power(step_yn - step_ym, 2)
    inner_sum = inner_sum.sum(axis=1)
    total = inner_sum.sum() / (199 * predictions.shape[0])
                           
    return 'CRPS', total, False

## Modelling

In [None]:
best_params_lgb = {'lambda_l1': 0.13413394854686794, 
                   'lambda_l2': 0.0009122197743451751, 
                   'num_leaves': 44, 
                   'feature_fraction': 0.4271070738920401, 
                   'bagging_fraction': 0.9999128827046064, 
                   'bagging_freq': 3, 
                   "learning_rate": 0.005,
                   'min_child_samples': 43, 
                   'objective': 'regression', 
                   'metric': 'mae', 
                   'verbosity': -1, 
                   'boosting_type': 'gbdt', 
                   "boost_from_average" : False,
                   'random_state': 42}

In [None]:
from sklearn.model_selection import KFold
nfold = 5
# Contiguous, unshuffled folds. random_state has no effect without
# shuffling, and recent scikit-learn versions raise ValueError when it is
# combined with shuffle=False, so it is omitted.
folds = KFold(n_splits=nfold, shuffle=False)

print('-'*20)
print(str(nfold) + ' Folds training...')
print('-'*20)

In [None]:
# Out-of-fold predictions, one per play.
oof = np.zeros(len(X_train))
feature_importance_df = pd.DataFrame()

tr_mae = []
val_mae = []
models = []

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train,y_tr)):
    strLog = "fold {}".format(fold_)
    print(strLog)

    # The fold indices refer to rows of the per-play matrix X_train. They
    # were previously applied to the per-player frame df_train, which
    # paired each label with the features of an unrelated row.
    X_tr, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx]
    train_y, y_val = y_tr[trn_idx], y_tr[val_idx]

    model = lgb.LGBMRegressor(**best_params_lgb, n_estimators = 180, n_jobs = -1)
    # NOTE(review): early_stopping_rounds (500) exceeds n_estimators (180),
    # so early stopping can never trigger here.
    model.fit(X_tr, 
              train_y, 
              eval_set=[(X_tr, train_y), (X_val, y_val)], 
              eval_metric=_EvalFunction,
              verbose=10, 
              early_stopping_rounds=500,
              
             )
    oof[val_idx] = model.predict(X_val)
    val_score = mean_absolute_error(y_val, oof[val_idx])
    val_mae.append(val_score)
    tr_score = mean_absolute_error(train_y, model.predict(X_tr))
    tr_mae.append(tr_score)
    models.append(model)
    
    # Feature importance
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = X_tr.columns
    fold_importance_df["importance"] = model.feature_importances_[:len(X_tr.columns)]
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

## Evaluation

In [None]:
mean_mae_tr = np.mean(tr_mae)
std_mae_tr =  np.std(tr_mae)

mean_mae_val =  np.mean(val_mae)
std_mae_val =  np.std(val_mae)

all_mae = mean_absolute_error(oof,y_tr)

print('-'*20)
print("Train's Score")
print('-'*20,'\n')
print("Mean mae: %.5f, std: %.5f." % (mean_mae_tr, std_mae_tr),'\n')

print('-'*20)
print("Validation's Score")
print('-'*20,'\n')
print("Mean mae: %.5f, std: %.5f." % (mean_mae_val, std_mae_val),'\n')

print("All mae: %.5f." % (all_mae))

In [None]:
cols_imp = (feature_importance_df[["Feature", "importance"]]
        .groupby("Feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:50].index)
best_features = feature_importance_df.loc[feature_importance_df.Feature.isin(cols_imp)]

plt.figure(figsize=(14,26))
sns.barplot(x="importance", y="Feature", data=best_features.sort_values(by="importance",ascending=False))
plt.title('LightGBM Features (averaged over folds)')
plt.tight_layout()

In [None]:
_EvalFunction(y_tr,oof)[1]

## Submission

In [None]:
from kaggle.competitions import nflrush
env = nflrush.make_env()

In [None]:
for (test_df, sample_prediction_df) in tqdm.tqdm(env.iter_test()):
    test = preprocess(test_df)

    # preprocess() merges, which resets the index to 0..n-1 for every play.
    # The old label lookup `test[c][index]` with a running `index += 22`
    # therefore raised from the second play onwards, and the bare `except`
    # silently turned every feature into NaN. Select the row by position
    # instead, using the last row of the play: training also uses the last
    # of each play's 22 rows.
    #
    # Columns that are absent for this play (team dummies, position counts)
    # are filled with 0, the value they would have in training.
    test_row = test.reindex(columns=features, fill_value=0).iloc[[-1]]
    # Best effort, as before: a value that is not numeric becomes NaN.
    test_data = test_row.apply(pd.to_numeric, errors='coerce').values.astype(float)

    # Average the point prediction over the fold models.
    y_pred_p = np.mean([model.predict(test_data)[0] for model in models])

    # Turn the point prediction into a CDF over the 199 yard bins
    # (bin j <-> j - 99 yards): 0 below prediction - 10, rising linearly
    # by 0.05 per yard, 1 from prediction + 10 upward.
    y_pred_p += 99
    y_pred = np.clip((np.arange(199) + 10 - y_pred_p) * 0.05, 0.0, 1.0)

    env.predict(pd.DataFrame(data=[y_pred],columns=sample_prediction_df.columns))
env.write_submission_file()

Unused functions below

In [None]:
def create_football_field(linenumbers=True,
                          endzones=True,
                          highlight_line=False,
                          highlight_line_number=50,
                          highlighted_name='Line of Scrimmage',
                          fifty_is_los=False,
                          figsize=(12*2, 6.33*2)):
    """
    Function that plots the football field for viewing plays.
    Allows for showing or hiding endzones.

    Args:
        linenumbers (bool): draw the yard numbers along both sidelines.
        endzones (bool): shade the two end zones; also limits the hash
            marks to x = 11..109.
        highlight_line (bool): draw a yellow vertical line at
            `highlight_line_number`.
        highlight_line_number (int): yard line to highlight; 10 is added to
            account for the left end zone.
        highlighted_name (str): label placed next to the highlighted line.
        fifty_is_los (bool): draw a gold line at x = 60 with a label.
        figsize (tuple): figure size passed to plt.subplots.

    Returns:
        (fig, ax): the matplotlib figure and axes holding the field.
    """
    rect = patches.Rectangle((0, 0), 120, 53.3, linewidth=0.1,
                             edgecolor='r', facecolor='darkgreen', zorder=0,  alpha=0.5)

    fig, ax = plt.subplots(1, figsize=figsize)
    ax.add_patch(rect)
    # One polyline tracing the yard lines every 10 units plus the outline.
    plt.plot([10, 10, 10, 20, 20, 30, 30, 40, 40, 50, 50, 60, 60, 70, 70, 80,
              80, 90, 90, 100, 100, 110, 110, 120, 0, 0, 120, 120],
             [0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3,
              53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 53.3, 0, 0, 53.3],
             color='white')
    if fifty_is_los:
        plt.plot([60, 60], [0, 53.3], color='gold')
        plt.text(62, 50, '<- Player Yardline at Snap', color='gold')
    # Endzones
    if endzones:
        ez1 = patches.Rectangle((0, 0), 10, 53.3,
                                linewidth=0.1,
                                edgecolor='r',
                                facecolor='blue',
                                alpha=0.2,
                                zorder=0)
        # Rectangle takes (xy, width, height): the right end zone is 10
        # wide, like the left one. It was 120, so the patch extended to
        # x = 230 and was only hidden by the axis limits.
        ez2 = patches.Rectangle((110, 0), 10, 53.3,
                                linewidth=0.1,
                                edgecolor='r',
                                facecolor='blue',
                                alpha=0.2,
                                zorder=0)
        ax.add_patch(ez1)
        ax.add_patch(ez2)
    plt.xlim(0, 120)
    plt.ylim(-5, 58.3)
    plt.axis('off')
    if linenumbers:
        for x in range(20, 110, 10):
            # Past midfield the printed numbers count back down.
            numb = x
            if x > 50:
                numb = 120 - x
            plt.text(x, 5, str(numb - 10),
                     horizontalalignment='center',
                     fontsize=20,  # fontname='Arial',
                     color='white')
            plt.text(x - 0.95, 53.3 - 5, str(numb - 10),
                     horizontalalignment='center',
                     fontsize=20,  # fontname='Arial',
                     color='white', rotation=180)
    if endzones:
        hash_range = range(11, 110)
    else:
        hash_range = range(1, 120)
    # Four short tick marks at every x in hash_range.
    for x in hash_range:
        ax.plot([x, x], [0.4, 0.7], color='white')
        ax.plot([x, x], [53.0, 52.5], color='white')
        ax.plot([x, x], [22.91, 23.57], color='white')
        ax.plot([x, x], [29.73, 30.39], color='white')
    if highlight_line:
        hl = highlight_line_number + 10
        plt.plot([hl, hl], [0, 53.3], color='yellow')
        plt.text(hl + 2, 50, '<- {}'.format(highlighted_name),
                 color='yellow')
    return fig, ax
#create_football_field()

def get_dx_dy(radian_angle, dist):
    """Return the (dx, dy) components of a vector of length `dist` at `radian_angle`."""
    return dist * math.cos(radian_angle), dist * math.sin(radian_angle)
def show_play(play_id, df=df):
    """Plot every player of one play on the field, with the rusher's velocity arrow.

    Args:
        play_id: PlayId of the play to draw.
        df: frame holding the tracking rows. It must also contain the
            'Dir_rad' and 'ToLeft' columns. The default is the module-level
            `df` as bound when this function was defined.
    """
    play = df[df.PlayId == play_id]
    fig, ax = create_football_field()

    # All players, coloured by whether they are NOT on the home team.
    ax.scatter(play.X, play.Y, cmap='rainbow', c=~(play.Team == 'home'), s=100)

    # The rusher is drawn on top in black.
    rusher = play[play.NflIdRusher == play.NflId]
    ax.scatter(rusher.X, rusher.Y, color='black')

    yards_covered = rusher["Yards"].values[0]
    start_x = rusher["X"].values[0]
    start_y = rusher["Y"].values[0]
    heading = rusher["Dir_rad"].values[0]
    speed = rusher["S"].values[0]

    # Arrow of length `speed` pointing along the rusher's direction.
    dx, dy = get_dx_dy(heading, speed)
    ax.arrow(start_x, start_y, dx, dy, length_includes_head=True, width=0.3, color='black')

    if play.ToLeft.sum() > 0:
        left = 'left'
    else:
        left = 'right'
    plt.title(f'Play # {play_id} moving to {left}, yard distance is {yards_covered}', fontsize=20)
    plt.legend()
    plt.show()

In [None]:
# I wrote the helper function below to combine all files within a 
# folder into a single file to submit in Kaggle.

def generate_submission(path=None, closing_file='_closing_submission.py',
                        submission_file='submissions/submission.py'):
    """This function combines all PY files into a single submission file, to
    be uploaded as a single script in Kaggle.

    Files in `path` whose names end in '.py' and do not start with '_' are
    appended in sorted order. Lines starting with 'from nfl.' are dropped,
    and each file is cut at its first line starting with
    'if __name__ == '. A separator line follows every file, and the closing
    file is appended last.

    Args:
        path (str): The path from within combine PY files. If None (or if
            it does not exist), will use the current working directory.
        closing_file (str): The last file to be added to the submission file,
            relative to `path`. It contains the final function to be
            executed in the script.
        submission_file (str): The final submission file, relative to
            `path`. Its directory is created if missing.

    Raises:
        ValueError: if the closing file does not exist. This is checked
            before the submission file is opened, so a failed run no longer
            leaves a truncated output behind.
    """
    # Imported locally because `os` is not among the notebook's imports.
    import os

    if path is None or not os.path.exists(path):
        path = os.getcwd()

    # Check the same location that is opened below. The old check was
    # relative to the current directory, while the open was relative to
    # `path`.
    closing_path = os.path.join(path, closing_file)
    if not os.path.exists(closing_path):
        raise ValueError('Closing file does not exist!')

    dest_path = os.path.join(path, submission_file)
    dest_dir = os.path.dirname(dest_path)
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)

    with open(dest_path, "w") as dest:
        # Sorted so the output does not depend on the OS listing order.
        for filename in sorted(os.listdir(path)):
            if not filename.endswith(".py") or filename.startswith('_'):
                continue
            source_path = os.path.join(path, filename)
            # Never read the file that is being written.
            if os.path.abspath(source_path) == os.path.abspath(dest_path):
                continue

            with open(source_path, "r") as f:
                for line in f:
                    if line.startswith('from nfl.'):
                        continue
                    if line.startswith("if __name__ == "):
                        break

                    dest.write(line)

            dest.write('\n\n')
            dest.write('#' + '*' * 79)
            dest.write('\n\n')

        with open(closing_path, "r") as f:
            dest.write(f.read())
    
    
    # Organizing the code into a proper file structure helped me find
    # faster and better ways to develop and improve my algorithms. It also let my final code become something very neat:
#n_splits = 5
#dataset = Dataset('/kaggle/input/nfl-big-data-bowl-2020/train.csv')
#model1 = KerasModel(n_splits=n_splits, input_dim=103)
#model2 = XGBModel(n_splits=n_splits)
#model3 = LGBModel(n_splits=n_splits)
#model4 = CatBoostModel(n_splits=n_splits)
#ensemble = Ensemble(models=[model1, model2, model3, model4], dataset=dataset)
#ensemble.train()
#env = nflrush.make_env()
#ensemble.make_submission(env)