In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import datetime
from kaggle.competitions import nflrush
import tqdm
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from keras.layers import Dense,Input,Flatten,concatenate,Dropout,Lambda,BatchNormalization
from keras.models import Model
import keras.backend as K
from keras.callbacks import Callback
from  keras.callbacks import EarlyStopping,ModelCheckpoint
import datetime

from tqdm import tqdm_notebook
import warnings
# Suppress every Python warning so cell output stays readable.
warnings.filterwarnings(action='ignore')

# Plot styling: seaborn dark grid and a large default figure size.
sns.set_style(style='darkgrid')
mpl.rcParams.update({'figure.figsize': [15, 10]})

In [None]:
# Load the raw training rows; WindSpeed is kept as raw text (object dtype)
# instead of letting pandas infer a type for it.
train = pd.read_csv(
    '../input/nfl-big-data-bowl-2020/train.csv',
    dtype={'WindSpeed': 'object'},
)
#print(train.shape)
#train.head()

In [None]:
def strtoseconds(txt):
    """Collapse a colon-separated clock string into a single number.

    The first three ``:``-separated fields are parsed as integers and combined
    as ``first * 60 + second + third / 60``. Any further fields are ignored.
    """
    fields = txt.split(':')
    major, minor, sub = (int(fields[position]) for position in range(3))
    return major * 60 + minor + sub / 60

def strtofloat(x):
    """Convert ``x`` to ``float``, returning ``-1`` when it cannot be converted.

    Only conversion failures are swallowed; unrelated exceptions such as
    ``KeyboardInterrupt`` propagate instead of being turned into ``-1``.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return -1

def map_weather(txt):
    """Score a weather description on a rough good (+3) to bad (-3) scale.

    Missing values and unrecognised descriptions score 0. The first matching
    rule wins, and the score is halved when the text contains 'partly'.
    """
    if pd.isna(txt):
        return 0

    scale = 0.5 if 'partly' in txt else 1

    # Ordered rules: (keywords, base score). Earlier rules take precedence.
    rules = (
        (('climate controlled', 'indoor'), 3),
        (('sunny', 'sun'), 2),
        (('clear',), 1),
        (('cloudy',), -1),
        (('rain', 'rainy'), -2),
        (('snow',), -3),
    )
    for keywords, base_score in rules:
        if any(keyword in txt for keyword in keywords):
            return scale * base_score
    return 0

def OffensePersonnelSplit(x):
    """Parse a personnel string such as ``"1 RB, 1 TE, 3 WR"`` into counts.

    Returns a dict with the keys DB, DL, LB, OL, QB, RB, TE and WR (default 0);
    each comma-separated group contributes ``<count> <position>``.
    """
    counts = dict.fromkeys(('DB', 'DL', 'LB', 'OL', 'QB', 'RB', 'TE', 'WR'), 0)
    for group in x.split(","):
        tokens = group.split(" ")
        position = tokens[-1]
        counts[position] = int(tokens[-2])
    return counts

def DefensePersonnelSplit(x):
    """Parse a personnel string such as ``"4 DL, 2 LB, 5 DB"`` into counts.

    Returns a dict pre-filled with DB, DL, LB and OL at 0; any other position
    found in the string is added as a new key.
    """
    tally = {'DB': 0, 'DL': 0, 'LB': 0, 'OL': 0}
    groups = (chunk.split(" ") for chunk in x.split(","))
    tally.update((words[-1], int(words[-2])) for words in groups)
    return tally

def orientation_to_cat(x, bin_size=15):
    """Bucket an angle in degrees into a string category.

    The angle is clipped to ``[0, 359]`` and divided into ``bin_size``-degree
    sectors; the sector index is returned as a string. Values that cannot be
    converted to an integer (e.g. NaN) yield the string ``"nan"``.

    Args:
        x: angle in degrees.
        bin_size: sector width in degrees (default 15, giving 24 sectors).
    """
    x = np.clip(x, 0, 360 - 1)
    try:
        return str(int(x / bin_size))
    except (TypeError, ValueError):
        # int(nan) raises ValueError; non-scalar input raises TypeError.
        return "nan"

In [None]:
def create_features(df, deploy=False, outcomes=None):
    """Build one row of model features per play from raw player-tracking rows.

    The input frame is not modified. Coordinates and angles of plays with
    ``PlayDirection == 'left'`` are mirrored so every play runs the same way,
    ``YardLine`` is re-expressed on the 0-120 X axis, and the result combines:

    * distances from every other player to the rusher (min/max/mean/std),
    * distances from the defenders to the rusher (total and along X),
    * ``min2`` / ``dist_12``: the nearest defender on the opposite side (in Y)
      of the rusher from the closest defender, and the gap between those two,
    * the rusher's own tracking values and play-level context.

    Args:
        df: raw rows, one per player per play. Each play is expected to have
            exactly one row where ``NflId == NflIdRusher``.
        deploy: when True no target is attached (inference); when False the
            ``Yards`` target is merged in.
        outcomes: optional frame with ``GameId``, ``PlayId`` and ``Yards``.
            When omitted in training mode it is derived from ``df`` itself.

    Returns:
        DataFrame with one row per play; ``min2`` and ``dist_12`` are the last
        two columns, preceded by ``Yards`` when ``deploy`` is False.
    """
    keys = ['GameId', 'PlayId']
    field_width = 53.3  # upper Y bound used as the "no defender" sentinel

    if not deploy and outcomes is None:
        outcomes = df[keys + ['Yards']].drop_duplicates()

    def euclidean_distance(x1, y1, x2, y2):
        """Straight-line distance; works on scalars and on whole columns."""
        return np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)

    def update_yardline(df):
        """Express the line of scrimmage on the 0-120 X axis, one row per play."""
        rushers = df.loc[df['NflId'] == df['NflIdRusher'],
                         keys + ['PossessionTeam', 'FieldPosition', 'YardLine']].copy()
        # NOTE(review): assumes PossessionTeam and FieldPosition use the same
        # team abbreviations - confirm against the data.
        own_half = rushers['PossessionTeam'] == rushers['FieldPosition']
        rushers['YardLine'] = np.where(
            own_half,
            # own half: 10-yard end zone plus the yard line
            10.0 + rushers['YardLine'],
            # opponent half: midfield (60) plus the yards past midfield
            60.0 + (50 - rushers['YardLine']),
        )
        return rushers[keys + ['YardLine']]

    def update_orientation(df, yardline):
        """Mirror left-moving plays and swap in the standardised YardLine."""
        df = df.copy()  # never touch the caller's frame
        moving_left = df['PlayDirection'] == 'left'
        df['X'] = np.where(moving_left, 120.0 - df['X'], df['X'])
        for angle_col in ('Orientation', 'Dir'):
            mirrored = 360.0 - df[angle_col]
            # an angle of 0 mirrors to 360; keep it as 0
            mirrored = mirrored.where(mirrored != 360.0, 0.0)
            df[angle_col] = np.where(moving_left, mirrored, df[angle_col])

        df = df.drop('YardLine', axis=1)
        return pd.merge(df, yardline, on=keys, how='inner')

    def back_features(df):
        """Rusher position and how far behind the line of scrimmage he is."""
        carriers = df.loc[df['NflId'] == df['NflIdRusher'],
                          keys + ['NflIdRusher', 'X', 'Y', 'YardLine']].copy()
        carriers['back_from_scrimmage'] = carriers['YardLine'] - carriers['X']
        carriers = carriers.rename(columns={'X': 'back_X', 'Y': 'back_Y'})
        return carriers[keys + ['NflIdRusher', 'back_X', 'back_Y', 'back_from_scrimmage']]

    def features_relative_to_back(df, carriers):
        """Summarise every non-rusher player's distance to the rusher per play."""
        players = pd.merge(df[keys + ['NflId', 'X', 'Y']], carriers, on=keys, how='inner')
        players = players[players['NflId'] != players['NflIdRusher']].copy()
        players['dist_to_back'] = euclidean_distance(
            players['X'], players['Y'], players['back_X'], players['back_Y'])
        summary = players.groupby(keys + ['back_from_scrimmage'])\
                         .agg({'dist_to_back': ['min', 'max', 'mean', 'std']})\
                         .reset_index()
        summary.columns = keys + ['back_from_scrimmage', 'min_dist', 'max_dist', 'mean_dist', 'std_dist']
        return summary

    def second_defender_features(defense):
        """Per play, find the closest defender and the nearest one on the other side.

        ``min2`` is the distance to the rusher of the nearest defender lying on
        the opposite side (in Y) from the closest defender; ``dist_12`` is the
        gap between those two defenders. When no defender is on the opposite
        side, sentinels based on the sideline are used instead.
        """
        records = []
        for (game_id, play_id), play in defense.groupby(keys, sort=False):
            dists = play['def_dist_to_back']
            closest = dists.idxmin()
            closest_y = play.at[closest, 'Y']
            closest_above = (closest_y - play.at[closest, 'RusherY']) > 0
            # defenders on the other side of the rusher from the closest one
            opposite = dists[((play['RusherY'] - play['Y']) > 0) == closest_above]
            if opposite.empty:
                if closest_above:
                    second, gap = 0.0, closest_y
                else:
                    second, gap = field_width, abs(field_width - closest_y)
            else:
                second = opposite.min()
                # first defender of the play at that distance
                runner_up = dists.index[dists == second][0]
                gap = euclidean_distance(play.at[closest, 'X'], closest_y,
                                         play.at[runner_up, 'X'], play.at[runner_up, 'Y'])
            records.append((game_id, play_id, second, gap))
        return pd.DataFrame(records, columns=keys + ['min2', 'dist_12'])

    def defense_features(df):
        """Distance statistics of the defending team relative to the rusher."""
        rusher = df.loc[df['NflId'] == df['NflIdRusher'], keys + ['Team', 'X', 'Y']]\
                   .rename(columns={'Team': 'RusherTeam', 'X': 'RusherX', 'Y': 'RusherY'})

        defense = pd.merge(df, rusher, on=keys, how='inner')
        defense = defense.loc[defense['Team'] != defense['RusherTeam'],
                              keys + ['X', 'Y', 'RusherX', 'RusherY']].copy()
        defense['def_dist_to_back'] = euclidean_distance(
            defense['X'], defense['Y'], defense['RusherX'], defense['RusherY'])
        defense['distX'] = (defense['X'] - defense['RusherX']).abs()

        second = second_defender_features(defense)

        summary = defense.groupby(keys)\
                         .agg({'def_dist_to_back': ['min', 'mean', 'std'], 'distX': ['min', 'mean', 'std']})\
                         .reset_index()
        summary.columns = keys + ['def_min_dist', 'def_mean_dist', 'def_std_dist', 'minX', 'meanX', 'stdX']
        return summary, second

    def static_features(df):
        """Rusher tracking values and play context; missing values become -999."""
        rusher_rows = df[df['NflId'] == df['NflIdRusher']].copy()

        dir_radians = rusher_rows['Dir'] / 360 * 2 * np.pi
        rusher_rows['Dir_sin'] = np.sin(dir_radians)
        rusher_rows['Dir_cos'] = np.cos(dir_radians)
        rusher_rows['diffScoreBeforePlay'] = (
            rusher_rows['HomeScoreBeforePlay'] - rusher_rows['VisitorScoreBeforePlay'])

        columns = ['Dir_sin', 'Dir_cos', 'diffScoreBeforePlay'] + keys + [
            'X', 'Y', 'S', 'A', 'Dis', 'Orientation', 'Dir',
            'YardLine', 'Quarter', 'Down', 'Distance', 'DefendersInTheBox']
        return rusher_rows[columns].drop_duplicates().fillna(-999)

    def combine_features(relative_to_back, defense, static, second):
        """Join all per-play tables on (GameId, PlayId)."""
        combined = pd.merge(relative_to_back, defense, on=keys, how='inner')
        combined = pd.merge(combined, static, on=keys, how='inner')
        if not deploy:
            combined = pd.merge(combined, outcomes, on=keys, how='inner')
        # merged last so min2 / dist_12 stay the final two columns
        return pd.merge(combined, second, on=keys, how='inner')

    yardline = update_yardline(df)
    df = update_orientation(df, yardline)
    back_feats = back_features(df)
    rel_back = features_relative_to_back(df, back_feats)
    def_feats, second_feats = defense_features(df)
    static_feats = static_features(df)
    basetable = combine_features(rel_back, def_feats, static_feats, second_feats)

    return basetable

In [None]:
# One (GameId, PlayId, Yards) row per play: the regression target table.
outcomes = train.loc[:, ['GameId', 'PlayId', 'Yards']].drop_duplicates()

In [None]:
# moy7 = np.mean(train[train['Season']<2018]['S'])
# std7 = np.std(train[train['Season']<2018]['S'])
# moy8 = np.mean(train[train['Season']>2017]['S'])
# std8 = np.std(train[train['Season']>2017]['S'])
# train['S'] = train['S']*moy7/moy8
# train['A'] = train['A']*moy7/moy8
#train = train[train['Season']>2017]
# plt.hist(train[train['Season']<2018]["Yards"],50)
# plt.hist(train[train['Season']>2017]["Yards"],50)

In [None]:
%%time
# Replace the raw tracking rows with the per-play feature table (training mode).
train = create_features(train, deploy=False)

## OffenseFormation    
# off_form = train['OffenseFormation'].unique()
# train = pd.concat([train.drop(['OffenseFormation'], axis=1), pd.get_dummies(train['OffenseFormation'], prefix='Formation')], axis=1)
# dummy_col = train.columns

In [None]:
# Work on a copy so the feature table held in `train` is left untouched.
X = train.copy()
yards = X['Yards']

def return_step(x):
    """Return a length-199 step vector that is 0 before index ``x + 99`` and 1 from it on.

    Index ``i`` stands for a gain of ``i - 99`` yards, so the vector is the
    cumulative-distribution target for a play that gained ``x`` yards.
    """
    step = np.ones(199)
    step[:x + 99] = 0
    return step

# Step-function (CDF) targets, one row of 199 values per play.
y = np.vstack([return_step(gain) for gain in yards])

# Keep only the model inputs: identifiers, the target and discarded features go.
X = X.drop(
    columns=['GameId', 'PlayId', 'Yards', 'min_dist', 'max_dist', 'mean_dist', 'std_dist',
             'Quarter', 'Down', 'Dir', 'Dir_cos', 'X', 'Dis'],
)

In [None]:
# One-hot targets for the softmax network: column 99 + yards is set to 1.
y2 = np.zeros((yards.shape[0], 199))
y2[np.arange(yards.shape[0]), 99 + np.asarray(yards)] = 1

In [None]:
#Xc = train.copy()
#Xc.drop(['GameId','PlayId','Quarter','Down','X','Dir', 'Dir_cos'], axis=1, inplace=True)
#sns.set(rc={'figure.figsize':(30, 30)})
#corr = Xc.corr()
#plt.figure() 
#ax = sns.heatmap(corr, linewidths=.5, annot=True, cmap="YlGnBu", fmt='.1g')
#plt.savefig('corr_heatmap.png')
#plt.show()

In [None]:
# Fit the scaler on the training features, then standardise them; the fitted
# scaler is reused later to transform the test features.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [None]:
#train_newFE = []
#for i in range(len(train_y)):
#    train_newFE.append(min(np.sqrt(((train_dense_players[i,:,0]-train_dense_players[i,21,0])**2+(train_dense_players[i,:,1]-train_dense_players[i,21,1])**2)[0:20])))

In [None]:
from lightgbm import LGBMClassifier
class MultiLGBMClassifier():
    """Ensemble of LightGBM classifiers trained on shifted, coarsened targets.

    Model ``k`` is trained on ``(y + k) // resolution``, i.e. on bins of width
    ``resolution`` whose edges are shifted by ``k``. Each model's class
    probabilities are spread back over a 199-long yard axis, turned into a
    cumulative distribution, and the distributions are averaged.
    """

    def __init__(self, resolution, params):
        # bin width, which is also the number of shifted models
        self.resolution = resolution
        self.models = []
        for _ in range(resolution):
            self.models.append(LGBMClassifier(**params))

    def fit(self, x, y):
        """Train every shifted model and remember the lower edge of each of its bins."""
        self.classes_list = []
        for shift in tqdm_notebook(range(self.resolution)):
            binned = (y + shift) // self.resolution
            self.models[shift].fit(x, binned)
            # lower edge (in yards) of each observed bin, e.g. with resolution 5
            # and shift 0: labels (0, 1) -> edges (0, 5)
            lower_edges = np.unique(binned) * self.resolution - shift
            # 999 closes the last bin so it runs to the end of the yard axis
            self.classes_list.append(np.append(lower_edges, 999))

    def predict(self, x):
        """Return the averaged (n_samples, 199) cumulative distributions."""
        per_model = [
            self.get_pred199(estimator.predict_proba(x), edges)
            for estimator, edges in zip(self.models, self.classes_list)
        ]
        self.pred199_list = per_model
        return np.stack(per_model).mean(axis=0)

    def _get_pred199(self, p, classes):
        """Spread one row of class probabilities over the 199-long yard axis."""
        density = np.zeros(199)
        for prob, lower, upper in zip(p, classes[:-1], classes[1:]):
            density[lower + 99: upper + 99] = prob
        return density

    def get_pred199(self, preds, classes):
        """Convert class probabilities into cumulative distributions scaled to end at 1."""
        cumulative = [np.cumsum(self._get_pred199(p, classes)) for p in preds]
        return np.vstack([cdf / np.max(cdf) for cdf in cumulative])
    
from keras.layers import Dense,Input,Flatten,concatenate,Dropout,Lambda
from keras.models import Model
import keras.backend as K
import re
from keras.losses import binary_crossentropy
from  keras.callbacks import EarlyStopping,ModelCheckpoint
import codecs

from keras.utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from sklearn.metrics import f1_score


class CRPSCallback(Callback):
    """Keras callback that writes the validation CRPS into ``logs`` each epoch.

    The score is stored under ``'CRPS_score_val'`` so that callbacks placed
    after this one in the callback list can monitor it.

    Args:
        validation: ``(X_valid, y_valid)`` where ``y_valid`` is one-hot over
            the 199 yard classes.
        predict_batch_size: stored on the instance; not used by this class.
        include_on_batch: when True a placeholder value is also written at the
            end of every batch.
    """

    def __init__(self,validation, predict_batch_size=20, include_on_batch=False):
        super(CRPSCallback, self).__init__()
        self.validation = validation
        self.predict_batch_size = predict_batch_size
        self.include_on_batch = include_on_batch

        if self.validation is not None:
            print('validation shape',len(self.validation))

    def on_batch_begin(self, batch, logs=None):
        pass

    def on_train_begin(self, logs=None):
        # Register the metric name when this Keras version exposes a metrics
        # list in ``params``; skip silently when it does not.
        metrics = (self.params or {}).get('metrics')
        if metrics is not None and 'CRPS_score_val' not in metrics:
            metrics.append('CRPS_score_val')

    def on_batch_end(self, batch, logs=None):
        if self.include_on_batch and logs is not None:
            logs['CRPS_score_val'] = float('-inf')

    def on_epoch_end(self, epoch, logs=None):
        if logs is None:
            logs = {}
        # placeholder, overwritten below when validation data is available
        logs['CRPS_score_val'] = float('-inf')

        if (self.validation):
            X_valid, y_valid = self.validation[0], self.validation[1]
            y_pred = self.model.predict(X_valid)
            # turn the one-hot target and the predicted probabilities into CDFs
            y_true = np.clip(np.cumsum(y_valid, axis=1), 0, 1)
            y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1)
            # mean squared CDF difference over all samples and all 199 bins
            val_s = ((y_true - y_pred) ** 2).sum(axis=1).sum(axis=0) / (199 * X_valid.shape[0])
            val_s = np.round(val_s, 6)
            logs['CRPS_score_val'] = val_s
            
def get_model(x_tr,y_tr,x_val,y_val):
    """Train a 3-hidden-layer softmax network and score it on the validation fold.

    Args:
        x_tr, y_tr: training features and one-hot targets (199 classes).
        x_val, y_val: validation features and one-hot targets.

    Returns:
        ``(model, crps)``: the model with the weights saved in
        ``best_model.h5`` loaded, and its validation CRPS rounded to 6 decimals.
    """
    # The input width comes from the training fold itself, not from a global.
    inp = Input(shape = (x_tr.shape[1],))
    x = Dense(1024, activation='relu')(inp)
    x = Dropout(0.5)(x)
    x = BatchNormalization()(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = BatchNormalization()(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = BatchNormalization()(x)

    out = Dense(199, activation='softmax')(x)
    model = Model(inp,out)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=[])

    es = EarlyStopping(monitor='CRPS_score_val',
                       mode='min',
                       restore_best_weights=True,
                       verbose=1,
                       patience=10)

    mc = ModelCheckpoint('best_model.h5',monitor='CRPS_score_val',mode='min',
                                   save_best_only=True, verbose=1, save_weights_only=True)

    bsz = 1024

    # CRPSCallback must come first: it writes 'CRPS_score_val' into the epoch
    # logs that EarlyStopping and ModelCheckpoint monitor.
    model.fit(x_tr, y_tr,callbacks=[CRPSCallback(validation = (x_val,y_val)),es,mc], epochs=100, batch_size=bsz,verbose=1)
    model.load_weights("best_model.h5")

    # Validation CRPS: mean squared difference between true and predicted CDFs.
    y_pred = model.predict(x_val)
    y_true = np.clip(np.cumsum(y_val, axis=1), 0, 1)
    y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1)
    val_s = ((y_true - y_pred) ** 2).sum(axis=1).sum(axis=0) / (199 * x_val.shape[0])
    crps = np.round(val_s, 6)

    return model,crps

def predictK(x_te):
    """Average the predictions of every model in the global ``models`` list.

    Each model is called as ``predict(x_te, batch_size=1024)``.

    NOTE(review): elsewhere in this notebook ``models`` is filled with
    MultiLGBMClassifier instances, whose ``predict`` takes no ``batch_size``
    argument - confirm which model list this helper is meant to use.

    Raises:
        ValueError: if ``models`` is empty.
    """
    if not models:
        raise ValueError("predictK needs at least one trained model in `models`")

    y_pred = sum(m.predict(x_te, batch_size=1024) for m in models)
    return y_pred / len(models)

In [None]:
# Hyper-parameters passed to every LGBMClassifier built by MultiLGBMClassifier.
params = {'lambda_l1': 0.001, 'lambda_l2': 0.001,
 'num_leaves': 30, 'max_depth': 5, # depth 5 allows at most 2**5 = 32 leaves; 30 stays just under that cap
 'feature_fraction': 0.8,
 'subsample': 0.8, 'min_child_samples': 10,
 'learning_rate': 0.011,
 'num_iterations': 700, 'random_state': 42}

In [None]:
from sklearn.model_selection import train_test_split, KFold
import time
# Per-fold results for the LightGBM ensemble.
losses = []
models = []

# Per-fold results for the Keras network.
models2 = []
crps_csv = []

# Per-fold loss of the 50/50 blend of both models.
losses3 = []

#s_time = time.time()
# One repetition of 5-fold cross-validation; the outer loop only varies the KFold seed.
for k in range(1):
    kfold = KFold(5, random_state = 42 + k, shuffle = True)
    for k_fold, (tr_inds, val_inds) in enumerate(kfold.split(y)):
        print("-----------")
        print("-----------")
        # LightGBM ensemble: trained on raw yard values, predicts a 199-long CDF.
        model = MultiLGBMClassifier(resolution = 3, params = params)
        model.fit(X[tr_inds], yards.values[tr_inds])
        preds = model.predict(X[val_inds])
        # mean squared difference between the step targets and the predicted CDF
        loss = np.mean((y[val_inds] - preds) ** 2)
        models.append(model)
        losses.append(loss)
        # Keras network: trained on one-hot targets, returns its validation CRPS.
        model2,crps = get_model(X[tr_inds],y2[tr_inds],X[val_inds],y2[val_inds])
        models2.append(model2)
        crps_csv.append(crps)        
        # cumulative sum turns the softmax output into a CDF before blending
        preds3 = np.cumsum(model2.predict(X[val_inds]), axis=1)
        loss3 = np.mean((y[val_inds] - (preds + preds3)/2) ** 2)
        losses3.append(loss3)
        print(k_fold, loss)
        print("the %d fold crps is %f"%((k_fold+1),crps))
        print(loss3)
        
# Summary: LightGBM losses, network CRPS, then blended losses.
print("-------")
print(losses)
print(np.mean(losses))
print("mean crps is %f"%np.mean(crps_csv))
print(losses3)
print(np.mean(losses3))

In [None]:
# Average the gain-based importances over every booster of every CV ensemble.
gain_per_booster = [
    booster_model.booster_.feature_importance("gain")
    for ensemble in models
    for booster_model in ensemble.models
]
num_model = len(gain_per_booster)
feature_importances = sum(gain_per_booster) / num_model

In [None]:
# Feature names are the training columns minus those removed before modelling.
# `train` itself is left untouched so this cell can be re-run safely.
dropped_columns = ['GameId','PlayId','Yards','min_dist','max_dist','mean_dist','std_dist','Quarter','Down','Dir', 'Dir_cos','X','Dis']
feature_names = list(train.drop(columns=dropped_columns).columns)
feature_importance_df = pd.DataFrame({
    "importance": np.asarray(feature_importances, dtype=np.float32),
    "name": feature_names,
})
# Collapse to one row per feature name (columns: name, importance).
feature_importance_df = feature_importance_df.groupby("name").agg("mean").reset_index()

In [None]:
# Horizontal bar chart of the 50 most important features.
fig, ax = plt.subplots(figsize=(8, 18))
top_features = feature_importance_df.sort_values(by="importance", ascending=False).head(50)
sns.barplot(data=top_features, x="importance", y="name", ax=ax)
plt.show()

In [None]:
## bad features: names whose importance falls below the 30th percentile
feature_importance_df.loc[
    feature_importance_df["importance"] < np.quantile(feature_importance_df["importance"], 0.3),
    "name",
].tolist()

## Prediction

In [None]:
def make_pred(test, sample, env, model):
    """Predict one play, submit it through ``env`` and return what was submitted.

    The prediction is the 50/50 blend of the averaged LightGBM CDFs (global
    ``models``) and the averaged network CDFs (global ``models2``), clipped to
    ``[0, 1]``. Features are scaled with the global ``scaler``.

    Args:
        test: raw tracking rows handed out by the competition environment.
        sample: sample submission frame; only its columns are used.
        env: competition environment; ``env.predict`` is called exactly once.
        model: unused; kept so existing calls keep working.

    Returns:
        The submitted array of cumulative probabilities, so that later checks
        inspect exactly what was sent to ``env.predict``.
    """
    features = create_features(test, True)
    features = features.drop(['GameId','PlayId','min_dist','max_dist','mean_dist','std_dist','Quarter','Down','X','Dir', 'Dir_cos','Dis'], axis=1)
    scaled = scaler.transform(features)

    # LightGBM ensembles already return cumulative distributions.
    lgbm_cdf = sum(lgbm_model.predict(scaled) for lgbm_model in models) / len(models)

    # The networks return per-class probabilities; accumulate them into CDFs.
    nn_cdf = sum(np.cumsum(nn_model.predict(scaled), axis=1) for nn_model in models2) / len(models2)

    blended = np.clip((lgbm_cdf + nn_cdf) / 2, 0, 1)
    env.predict(pd.DataFrame(data=blended,columns=sample.columns))
    return blended

In [None]:
# Create the competition environment that serves the test plays.
env = nflrush.make_env()
preds = []
# Each iteration yields the rows of the next test item plus a sample frame;
# make_pred submits a prediction through env.predict before the next one is served.
for test, sample in tqdm_notebook(env.iter_test()):
    pred = make_pred(test, sample, env, models)
    # keep the array returned by make_pred for the checks in the next cell
    preds.append(pred)
# Write the submission file once every test item has been predicted.
env.write_submission_file()

In [None]:
preds = np.vstack(preds)
## check whether prediction is submittable
# every row must be non-decreasing ...
print((np.diff(preds, axis=1) >= 0).mean() == 1.0)
# ... and no value may exceed 1
print((preds > 1).mean() == 0)

In [None]:
# Per-fold LightGBM losses followed by their mean.
print(losses, np.mean(losses), sep='\n')