In [45]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import math, copy, time, os

#from kaggle.competitions import nflrush

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.linear_model import Lasso, ElasticNet, LogisticRegression, Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

%matplotlib inline

pd.set_option('mode.chained_assignment', None)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000
pd.set_option('display.max_columns', None)

In [52]:
train_original = pd.read_csv('../data/train.csv', low_memory = False)
train = copy.deepcopy(train_original[train_original['NflId']==train_original['NflIdRusher']])
train.index = train['PlayId']

In [53]:
train_original.shape, train.shape

((509762, 49), (23171, 49))

# Functions

It might be a bit harder to read this way, but I created all these functions so I could test different models and make small feature changes for quick experimentation.  Basically, all the functions allowed me to compile stats on different features... keep reading and you'll see what I mean.

In [54]:
def time_seconds(x):
    x = time.strptime(x[:-5], "%Y-%m-%dT%H:%M:%S")
    x = time.mktime(x)
    return x

def create_cdf(y):
    y_cdf = copy.deepcopy(y.to_frame())
    y_cdf.columns = ['Yards']
    y_cdf.head()
    for i in list(range(-99,100)):
    #for i in list(range(0,10)):    
        y_cdf['Yards' + str(i)] = y_cdf['Yards'].apply(lambda x: 1 if i >= x else 0)
    y_cdf.drop('Yards',1,inplace = True)
    #print('y_cdf.shape {}'.format(y_cdf.shape))
    y_cdf.head(3)
    return y_cdf

def add_custom_feats(train):
    train = train.loc[train['NflId']==train['NflIdRusher']]
    train['SecondsToHandoff'] = train['TimeHandoff'].apply(time_seconds) - train['TimeSnap'].apply(time_seconds)
    train['OffenseTeam'] = train['PossessionTeam']
    train['adj_height'] = train['PlayerHeight'].apply(lambda x: int(x[0])*12 + int(x[2:4]))
    train['OffenseHome'] = train[['OffenseTeam','HomeTeamAbbr']].apply(lambda x: 1 if x[0] == x[1] else 0, axis = 1)
    train['DefenseTeam'] = train[['OffenseHome','HomeTeamAbbr','VisitorTeamAbbr']].apply(lambda x: x[2] if x[0] == 1 else x[1], axis = 1)
    train['OffenseLead'] = train[['OffenseHome','HomeScoreBeforePlay','VisitorScoreBeforePlay']].apply(lambda x: x[1]-x[2] if x[0] == 1 else x[2]-x[1], axis = 1)
    train['YardsToGo'] = train[['FieldPosition','OffenseTeam','YardLine']].apply( \
        lambda x: (50-x['YardLine'])+50 if x['OffenseTeam']==x['FieldPosition'] else x['YardLine'],1)
    train['SecondsToHandoff'] = train['TimeHandoff'].apply(time_seconds) - train['TimeSnap'].apply(time_seconds)
    train['turf'] = train['Turf'].apply(lambda x: int('turf' in x.lower() or 'artific' in x.lower()))
    train['quarter_seconds_left'] = train['GameClock'].apply(lambda x: int(x[0:2])*60 + int(x[3:5]))
    train['game_seconds_left'] = train['Quarter'].map({1:2700, 2:1800, 3:900, 4:0}) + train['quarter_seconds_left']
    train['game_seconds_passed'] = 3600 - train['game_seconds_left']
    train['OffensePoints'] = train[['OffenseHome','HomeScoreBeforePlay','VisitorScoreBeforePlay']].apply(lambda x: x[1] if x[0] == 1 else x[2], axis = 1)
    train['OffensePointsPerMinute'] = train['OffensePoints'] / train['game_seconds_passed']*60
    return train

def add_yard_bins(test_df, test_data, bins = [0,2,5,8,10,15,20,30,40,50,60,70,80,100]):
    # Add yard_bins
    test_df['YardsToGo'] = test_df[['FieldPosition','OffenseTeam','YardLine']].apply( \
        lambda x: (50-x['YardLine'])+50 if x['OffenseTeam']==x['FieldPosition'] else x['YardLine'],1)
    yard_bins = pd.get_dummies(pd.cut(test_df['YardsToGo'],bins = bins))
    yard_bins.columns = [str(i) for i in yard_bins.columns.tolist()]
    test_data = pd.merge(test_data,yard_bins,left_index = True, right_index = True)
    test_data.columns = [str(x).replace(']','').replace('(','') for x in test_data.columns]
    return test_data

def add_categories(train_data, cat_feats, top_n_categories = 120, count_min = 50):
    print('train_data shape: {}'.format(train_data.shape))
    categorical_stats = pd.DataFrame()
    for i in cat_feats:
        stats = train[[i,'Yards']].groupby(i).agg(['mean','count','std','max','min'])
        stats.columns = stats.columns.droplevel(0)
        stats['feature'] = i
        stats['feature_type'] = stats.index.values
        stats.reset_index(inplace = True)
        stats.drop(i,1,inplace = True)
        stats = stats[['feature','feature_type','mean','count','std','max','min']]
        categorical_stats = categorical_stats.append(stats)

    # I basically just made up this calculation to select the most significant categorical features-values, and it seems to work well
    categorical_stats['mean_difference'] = (categorical_stats['mean'] - 4.21).abs()
    categorical_stats['significance'] = categorical_stats['mean_difference'] * categorical_stats['count']**0.2

    cat_feats_keep = categorical_stats[categorical_stats['count']>count_min] \
            .sort_values('significance',ascending = False).head(top_n_categories)
    
    #print('train_data shape: {}'.format(train_data.shape))
    for feature in set(cat_feats_keep['feature']):
        feature_types = cat_feats_keep[cat_feats_keep['feature']==feature]['feature_type'].tolist()
        feature_dummies = pd.get_dummies(train[feature].apply(lambda x: 0 if x not in feature_types else x),prefix = feature)
        feature_dummies.drop(feature + '_0',1,inplace = True)
        train_data = pd.merge(train_data,feature_dummies,left_index = True, right_index = True)
        #print('{} Added: new train_data shape: {}'.format(feature,train_data.shape))    
    return train_data, cat_feats_keep

def add_player_feats_df(train_data, train_original, top_n_player_counts = 500, top_n_players_significance = 100):
    players = train_original[['NflId','Yards']].groupby('NflId').agg(['mean','count'])
    players.columns = ['mean','count']
    players['weight'] = players['count']**.2
    players['difference'] = players['mean'] - 4.21
    players['significance'] = (players['difference'] * players['weight']).abs()
    #players['significance'] = players['count']
    players_keep = players
    players_keep = players_keep.sort_values('count',ascending = False).head(top_n_player_counts)
    players_keep = players_keep.sort_values('significance',ascending = False).head(top_n_players_significance)
    #### Creating player_feats (get_dummies) for Top Players
    player_list = list(players_keep.index)
    player_feats = train_original[['PlayId','NflId']][train_original['NflId'].isin(player_list)]
    player_feats.set_index('PlayId',inplace = True)
    player_feats = pd.get_dummies(player_feats['NflId'], prefix = 'NflId')
    player_feats = player_feats.groupby(player_feats.index).sum()
    train_data = pd.merge(train_data, player_feats, left_index = True, right_index = True, how = 'left')
    train_data.fillna(0,inplace = True)
    return train_data, player_list

def array_in_range(pred):
    ones = np.ones(pred.shape)
    zeros = np.zeros(pred.shape)
    pred = np.maximum(pred,zeros)
    pred = np.minimum(pred,ones)
    return pred

def array_increasing(pred):
    for i in range(1,pred.size):
        pred[0][i] = max(pred[0][i],pred[0][i-1])
    return pred

def model_scores(models, X_train, y_train, X_test, y_test, params, description):
    model_results = pd.DataFrame()
    model_data = {}
    for model in models:    
        model.fit(X_train, y_train)
        model_name = model.__class__.__name__
        test_predictions = model.predict(X_test)
        train_predictions = model.predict(X_train)
        test_score = round(mean_squared_error(y_test, test_predictions),4)
        train_score = round(mean_squared_error(y_train, train_predictions),4)
        test_mae = round(mean_absolute_error(y_test, test_predictions),4)
        train_mae = round(mean_absolute_error(y_train, train_predictions),4)
        model_data['model_name'] = [model_name]
        model_data['test_score'] = [test_score]
        model_data['MAE'] = [test_mae]
        model_data['params'] = [str(params)]
        model_data['feature_count'] = [X_train.shape[1]]
        model_data['description'] = [description]
        model_data['model'] = [model]
        model_data['train_score'] = [train_score]
        print('{} MSE: {}'.format(model_name,test_score))
        print('{} MAE: {}'.format(model_name,test_mae))
        model_results = model_results.append(pd.DataFrame.from_dict(model_data, orient = 'columns'))
    model_results.sort_values('test_score', ascending = True, inplace = True)
    return model_results

def transform_pred_cdf(prediction,sample_prediction_df):
    prediction = array_increasing(array_in_range(prediction))
    pred_target = pd.DataFrame(index = sample_prediction_df.index, \
                               columns = sample_prediction_df.columns, \
                               data = prediction)
    return pred_target

def dummy_all(train_data, train, features):
    for i in features:
        train_data = pd.merge(train_data, pd.get_dummies(train[i], prefix = i), left_index = True, right_index = True)
    return train_data

def fit_transform_num_feats(train, num_feats, scaler, imp):
    train_data = train[num_feats]
    train_data = pd.DataFrame(index = train_data.index, columns = train_data.columns, data = scaler.fit_transform(train_data))
    train_data = pd.DataFrame(index = train_data.index, columns = train_data.columns, data = imp.fit_transform(train_data))
    return train_data

def print_parameters(params, num_feats, bins, cat_feats):
    print('params:{} - num_feats:{} - bins:{} - cat_feats:{}'.format(params,num_feats,bins,cat_feats))

def add_team_stats(train,train_original):
    trainXY = copy.deepcopy(train[['X','Y','S','A','Position','Team']])
    origXY = copy.deepcopy(train_original[['X','Y','PlayId','S','A','Position','Team']])
    origXY.set_index('PlayId', inplace = True)
    trainXY.columns = ['train_' + x for x in trainXY.columns]
    #print(set(train_original['Position']))
    locations = pd.merge(trainXY,origXY, left_index = True, right_index = True, how = 'left')
    locations['PlayerDistance'] = locations.apply \
        (lambda x: np.sqrt(np.square(x['train_X'] - x['X']) +  np.square(x['train_Y'] - x['Y'])), axis = 1)
    locations.reset_index(inplace = True)
    distances = locations.groupby(['PlayId','Team'])['PlayerDistance','S','A'].mean().reset_index()
    team_stats = distances.pivot_table(index = ['PlayId'], columns = ['Team'], values = ['PlayerDistance','S','A'])
    team_stats.columns = ['%s%s' % (a, '_%s' % b if b else '') for a, b in team_stats.columns]
    team_stats['speed_difference'] = team_stats['S_home'] - team_stats['S_away']
    train = pd.merge(train, team_stats,left_index = True, right_index = True)
    return train, team_stats.columns

def add_position_stats(train,train_original):
    trainXY = copy.deepcopy(train[['X','Y','S','A','Position','Team','OffenseDefense']])
    origXY = copy.deepcopy(train_original[['X','Y','PlayId','S','A','Position','Team','OffenseDefense']])
    origXY.set_index('PlayId', inplace = True)
    trainXY.columns = ['train_' + x for x in trainXY.columns]
    #print(set(train_original['Position']))
    locations = pd.merge(trainXY,origXY, left_index = True, right_index = True, how = 'left')
    locations['PlayerDistance'] = locations.apply \
        (lambda x: np.sqrt(np.square(x['train_X'] - x['X']) +  np.square(x['train_Y'] - x['Y'])), axis = 1)
    locations.reset_index(inplace = True)

    distances = locations.groupby(['PlayId','OffenseDefense','Position'])['PlayerDistance','S','A'].mean().reset_index()
    team_stats = distances.pivot_table(index = ['PlayId'], columns = ['OffenseDefense','Position'], values = ['S'])
    team_stats.columns = ['{}_{}_{}'.format(a,b,c) for a, b, c in team_stats.columns]
    team_stats.fillna(0,inplace = True)
    train = pd.merge(train, team_stats,left_index = True, right_index = True)
    return train, team_stats.columns

# Experiment Tester

You can run tons of experiments here and keep track of performance of each model and the the parameters or other statistics in case you need to go back and use it again.

In [55]:
description_suffix = 0
experiments = pd.DataFrame()

In [56]:
### Train Feature Engineering
scaler = StandardScaler()
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

### Setup
description_suffix += 1

description = 'old team stats' + str(description_suffix)
top_n_player_counts = 1000
top_n_players_significance = 100
top_n_categories = 100
count_min = 100
bins = [0,2,4,6,8,10,15,20,30,40,50,60,70,80,90,100]
params = str((top_n_player_counts, top_n_players_significance, top_n_categories, count_min))

##Custom Feats
train = add_custom_feats(train)

##Number Feats
train, team_stats_columns = add_team_stats(train, train_original)
#train, position_stats_columns = add_position_stats(train, train_original)

num_feats = ['S','A','Dis','adj_height','Temperature','Humidity','SecondsToHandoff','Distance','PlayerWeight','OffenseLead','quarter_seconds_left','game_seconds_left','YardsToGo']
#num_feats.extend(['S|defense', 'S|offense', 'speed_difference'])
num_feats.extend(team_stats_columns)
train_data = fit_transform_num_feats(train, num_feats, scaler, imp)

#Player Feats
train_data, player_list = add_player_feats_df(train_data, train_original, top_n_player_counts, top_n_players_significance)

#Yard Bins
train_data = add_yard_bins(train, train_data, bins) #clean up columns

# Categories
cat_feats = ['DisplayName','PlayerCollegeName','OffensePersonnel','DefensePersonnel','Position','OffenseFormation','Down','OffenseTeam']
train_data, cat_feats_keep = add_categories(train_data, cat_feats, top_n_categories, count_min)

# Dummies:
#train_data = dummy_all(train_data, train, ['DefendersInTheBox'])

train_data shape: (23171, 135)


In [57]:
train_data.head()

Unnamed: 0_level_0,S,A,Dis,adj_height,Temperature,Humidity,SecondsToHandoff,Distance,PlayerWeight,OffenseLead,quarter_seconds_left,game_seconds_left,YardsToGo,A_away,A_home,PlayerDistance_away,PlayerDistance_home,S_away,S_home,speed_difference,NflId_276,NflId_2307,NflId_2346,NflId_71197,NflId_71493,NflId_89746,NflId_238498,NflId_496735,NflId_496788,NflId_496937,NflId_497048,NflId_497238,NflId_497322,NflId_2495139,NflId_2495459,NflId_2495485,NflId_2495488,NflId_2495655,NflId_2495872,NflId_2495977,NflId_2506106,NflId_2506112,NflId_2507166,NflId_2507491,NflId_2507590,NflId_2507716,NflId_2507999,NflId_2532890,NflId_2533008,NflId_2533049,NflId_2533050,NflId_2534832,NflId_2538145,NflId_2539207,NflId_2540169,NflId_2540180,NflId_2540197,NflId_2541187,NflId_2541243,NflId_2541864,NflId_2541966,NflId_2543456,NflId_2543466,NflId_2543471,NflId_2543498,NflId_2543729,NflId_2543830,NflId_2543836,NflId_2550256,NflId_2550328,NflId_2550559,NflId_2550656,NflId_2552332,NflId_2552363,NflId_2552424,NflId_2552455,NflId_2552460,NflId_2552475,NflId_2552489,NflId_2552490,NflId_2552652,NflId_2552670,NflId_2552676,NflId_2552686,NflId_2553437,NflId_2555173,NflId_2555180,NflId_2555181,NflId_2555334,NflId_2555349,NflId_2555364,NflId_2555388,NflId_2555468,NflId_2556370,NflId_2556489,NflId_2557856,NflId_2557862,NflId_2557871,NflId_2557887,NflId_2557898,NflId_2557997,NflId_2558019,NflId_2558033,NflId_2558062,NflId_2558094,NflId_2558116,NflId_2558125,NflId_2558136,NflId_2558846,NflId_2559044,NflId_2560728,NflId_2560753,NflId_2560757,NflId_2560813,NflId_2560851,NflId_2560888,NflId_2560955,NflId_2560968,NflId_2561020,NflId_2561039,"0, 2","2, 4","4, 6","6, 8","8, 10","10, 15","15, 20","20, 30","30, 40","40, 50","50, 60","60, 70","70, 80","80, 90","90, 100","OffensePersonnel_1 RB, 1 TE, 3 WR","OffensePersonnel_1 RB, 2 TE, 2 WR","OffensePersonnel_1 RB, 3 TE, 1 WR","OffensePersonnel_6 OL, 1 RB, 1 TE, 2 WR","OffensePersonnel_6 OL, 1 RB, 2 TE, 1 WR","OffensePersonnel_6 OL, 2 RB, 2 TE, 0 WR",Position_FB,Position_WR,DisplayName_Aaron Jones,DisplayName_Adrian Peterson,DisplayName_Alfred Blue,DisplayName_Alvin Kamara,DisplayName_Ameer Abdullah,DisplayName_Austin Ekeler,DisplayName_Carlos Hyde,DisplayName_Chris Carson,DisplayName_Chris Ivory,DisplayName_Christian McCaffrey,DisplayName_Dalvin Cook,DisplayName_David Johnson,DisplayName_DeMarco Murray,DisplayName_Doug Martin,DisplayName_Elijah McGuire,DisplayName_Gus Edwards,DisplayName_Jalen Richard,DisplayName_Jamaal Williams,DisplayName_Javorius Allen,DisplayName_Jerick McKinnon,DisplayName_Jonathan Stewart,DisplayName_Jordan Howard,DisplayName_Kareem Hunt,DisplayName_Kerryon Johnson,DisplayName_Kerwynn Williams,DisplayName_LeGarrette Blount,DisplayName_LeSean McCoy,DisplayName_Leonard Fournette,DisplayName_Mark Ingram,DisplayName_Matt Breida,DisplayName_Mike Gillislee,DisplayName_Nick Chubb,DisplayName_Peyton Barber,DisplayName_Phillip Lindsay,DisplayName_Rex Burkhead,DisplayName_Samaje Perine,DisplayName_Saquon Barkley,DisplayName_Theo Riddick,DisplayName_Todd Gurley,"DefensePersonnel_2 DL, 3 LB, 6 DB","DefensePersonnel_3 DL, 2 LB, 6 DB","DefensePersonnel_3 DL, 3 LB, 5 DB","DefensePersonnel_3 DL, 4 LB, 4 DB","DefensePersonnel_4 DL, 1 LB, 6 DB","DefensePersonnel_4 DL, 2 LB, 5 DB","DefensePersonnel_4 DL, 3 LB, 4 DB","DefensePersonnel_4 DL, 4 LB, 3 DB","DefensePersonnel_5 DL, 2 LB, 4 DB","DefensePersonnel_5 DL, 3 LB, 3 DB",PlayerCollegeName_Boise State,PlayerCollegeName_Brigham Young,PlayerCollegeName_Colorado,PlayerCollegeName_Florida,PlayerCollegeName_Florida State,PlayerCollegeName_Georgia,PlayerCollegeName_Georgia Southern,PlayerCollegeName_LSU,PlayerCollegeName_Louisiana State,PlayerCollegeName_Louisiana-Lafayette,PlayerCollegeName_Michigan State,PlayerCollegeName_Nebraska,PlayerCollegeName_Northern Iowa,PlayerCollegeName_Oklahoma State,PlayerCollegeName_Oregon,PlayerCollegeName_Penn State,PlayerCollegeName_Rutgers,PlayerCollegeName_Tennessee,PlayerCollegeName_Texas-El Paso,PlayerCollegeName_Tiffin University,PlayerCollegeName_Toledo,PlayerCollegeName_Tulane,PlayerCollegeName_USC,PlayerCollegeName_Utah State,"PlayerCollegeName_Western State, Colo.",Down_4,OffenseTeam_ARZ,OffenseTeam_BUF,OffenseTeam_CHI,OffenseTeam_DET,OffenseTeam_HST,OffenseTeam_JAX,OffenseTeam_KC,OffenseTeam_LA,OffenseTeam_LAC,OffenseTeam_NE,OffenseTeam_NO,OffenseTeam_NYJ,OffenseTeam_TB,OffenseTeam_WAS,OffenseFormation_I_FORM,OffenseFormation_JUMBO,OffenseFormation_SHOTGUN
PlayId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1,Unnamed: 226_level_1,Unnamed: 227_level_1,Unnamed: 228_level_1,Unnamed: 229_level_1,Unnamed: 230_level_1,Unnamed: 231_level_1,Unnamed: 232_level_1,Unnamed: 233_level_1,Unnamed: 234_level_1,Unnamed: 235_level_1
20170907000118,-0.535149,0.7803,-0.631894,-0.605427,0.145903,0.839683,-0.565651,-1.667696,-0.901023,-0.067518,1.477886,1.635689,0.51517,-1.664813,-0.825583,0.589556,-1.082895,-1.646599,-0.634494,1.156035,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
20170907000139,-1.032635,-0.213714,-1.002146,-0.605427,0.145903,0.839683,-0.565651,0.440227,-0.901023,-0.067518,1.394374,1.614839,0.197209,-0.225095,-0.3616,0.813942,-0.862343,-1.22715,-0.649832,0.660797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
20170907000189,1.332603,-0.203139,1.404492,-0.605427,0.145903,0.839683,1.567253,0.440227,-0.901023,-0.067518,1.204573,1.567451,-0.677184,1.043888,-0.671688,0.65186,-0.804641,0.847769,1.411435,0.635366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
20170907000345,0.180531,0.621681,0.10861,-0.067613,0.145903,0.839683,1.567253,-1.667696,-0.530013,-0.067518,1.014773,1.520063,-1.988773,-0.984177,-2.031481,-0.829245,-1.623794,-1.335244,-0.588478,0.853539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
20170907000395,-0.299498,-0.086819,-0.076516,-0.067613,0.145903,0.839683,-0.565651,0.440227,-0.084802,-0.7171,0.999589,1.516272,0.912622,-0.155878,1.103853,-0.772257,1.142175,0.072318,-0.35368,-0.483604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1


In [None]:
# Train/Test Split
X, y = train_data, train['Yards']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Exhaustive List
test_models = [
    Lasso(),
    ElasticNet(),
    Ridge(**{'alpha': 220, 'solver': 'lsqr'}),
    #LogisticRegression()
    #Ridge(),
    #MLPRegressor(),
    #LinearSVR(C= 20, epsilon= 0.008, gamma=0.0003),
    BaggingRegressor(),
    LGBMRegressor(objective='regression', 
                  boosting='gbdt',
                  num_leaves=6,
                  learning_rate=0.0005, 
                  n_estimators=1000,
                  bagging_fraction=0.8,
                  bagging_freq=4, 
                  bagging_seed=8,
                  feature_fraction=0.2,
                  feature_fraction_seed=8,
                  min_sum_hessian_in_leaf = 11,
                  random_state=42),
    #VotingRegressor([('r',Ridge()),('f',RandomForestRegressor())]),
    GradientBoostingRegressor(),
    XGBRegressor()
    ]

print_parameters(params, num_feats, bins, cat_feats)

model_results = model_scores(test_models, X_train, y_train, X_test, y_test, params, description)
experiments = experiments.append(model_results)
experiments.sort_values('test_score', ascending = True)

In [None]:
y_cdf = create_cdf(y)
final_model = Ridge(**{'alpha': 220, 'solver': 'svd'}).fit(X, y_cdf)

In [None]:
# Setup
env = nflrush.make_env()
iter_test = env.iter_test()

In [None]:
def prep_prediction_submission(test_df, sample_prediction_df, final_model):
    #(test_df, sample_prediction_df) = next(iter_test)
    test_df_original = copy.deepcopy(test_df)
    test_df = test_df.loc[test_df['NflId']==test_df['NflIdRusher']]
    test_df = add_custom_feats(test_df)
    test_df.set_index('PlayId',inplace = True) #only on test
    test_df, test_df_columns = add_team_stats(test_df, test_df_original)
    test_data = test_df[num_feats]
    test_data = pd.DataFrame(index = test_data.index, columns = test_data.columns, data = scaler.transform(test_data))
    test_data = pd.DataFrame(index = test_data.index, columns = test_data.columns, data = imp.transform(test_data))
    # Add categories
    for feature in set(cat_feats_keep['feature']):
        feature_types = cat_feats_keep[cat_feats_keep['feature']==feature]['feature_type'].tolist()
        feature_dummies = pd.get_dummies(test_df[feature].apply(lambda x: 0 if x not in feature_types else x),prefix = feature)
        if feature + '_0' in list(feature_dummies.columns):
            feature_dummies.drop(feature + '_0',1,inplace = True)
        if feature_dummies.shape[1] > 0:
            test_data = pd.merge(test_data,feature_dummies,left_index = True, right_index = True)
    # Add player feats
    add_feats = set(test_df_original['NflId']).intersection(player_list)
    for i in add_feats:
        test_data[i] = 1
    test_data = add_yard_bins(test_df, test_data,bins) #clean up columns
    # Columns (missing and order)
    new_columns = np.setdiff1d(list(train_data.columns),list(test_data.columns))
    for i in new_columns:
        test_data[i] = 0
    test_data = test_data[train_data.columns]
    prediction = final_model.predict(test_data)
    pred_target = transform_pred_cdf(prediction,sample_prediction_df)
    return pred_target


In [None]:
progress_counter = 0

for (test_df, sample_prediction_df) in iter_test:
    progress_counter+=1
    if progress_counter%250 == 0:
        print(progress_counter)
    pred_target = prep_prediction_submission(test_df, sample_prediction_df, final_model)
    env.predict(pred_target)

In [None]:
env.write_submission_file()