In [1]:
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBRegressor
import pandas as pd
import seaborn as sns
import joblib
import random
import math

In [2]:
# Downloading Data
# The variable dataset size determines how big of a dataset is imported 
# 1 = 2019 data, 2= 2019+2018, 3 = ...
dataset_sz  = 1

stem = "../data/pbp-201"
suffix = ".csv"
sets = []


for x in range(dataset_sz):
    num = str(9-x)
    name = stem+num+suffix
    sets.append(pd.read_csv(name))
    
data = pd.concat(sets)

In [3]:
len(data[data["Down"] == 4]["PlayType"])
#.value_counts()


3592

In [4]:
#Used RapidMiner API to identify columns that correlate with target variable while 
#also not having a large number of missing values (over 50% missing) or stability above 95%

good_columns = ['Minute', 'Second', 'OffenseTeam', 'DefenseTeam', 'Down', 'ToGo', 'YardLine',
'SeriesFirstDown', 'Formation', 'PlayType', 'IsRush', 'IsPass', 'IsIncomplete', 
'PassType', 'YardLineFixed', 'YardLineDirection', 'Quarter', 'IsTouchdown', 'IsSack', 
'IsInterception', 'IsFumble', 'IsPenalty', 'Yards']

data_gc = data[good_columns]

In [5]:
#Create a standard time column  = minute*60+seconds

data_gc["Time"] = data_gc["Minute"]*60+data_gc["Second"]
data_gc.drop(["Minute","Second"], axis = 1, inplace =True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [6]:
#See which columns have NaN's in order to begin imputing missing values 
imput_list = []
for col in data_gc.columns:
    x = data_gc[col].isna().value_counts()
    
    #Ignore column if no missing values 
    if len(x)>1: 
        print(col)
        imput_list.append(col)
        print(x)

OffenseTeam
False    39078
True      3108
Name: OffenseTeam, dtype: int64
Formation
False    41470
True       716
Name: Formation, dtype: int64
PlayType
False    40732
True      1454
Name: PlayType, dtype: int64
PassType
True     24669
False    17517
Name: PassType, dtype: int64


In [7]:
#Impute Pass Type columns with Misc
data_gc["PassType"].fillna('MISC', inplace =True)
#got tired of longer data frame name
dgci = data_gc

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [8]:
#Losing about 3000 out of 42,000 columns by dropping rows with NaN's

dgci = dgci.dropna()

#sanity check

for col in dgci.columns:
    x = dgci[col].isna().value_counts()
    
    #Ignore column if no missing values 
    if len(x)>1: 
        print(col)
        print(x)
        
# If this prints something, something went wrong. 

In [9]:
dgci["PlayType"].value_counts()

PASS                    17520
RUSH                    11335
KICK OFF                 2422
PUNT                     2116
SACK                     1266
EXTRA POINT              1122
FIELD GOAL                911
NO PLAY                   764
SCRAMBLE                  763
QB KNEEL                  368
EXCEPTION                 175
TWO-POINT CONVERSION      114
FUMBLES                    94
CLOCK STOP                 71
PENALTY                     6
Name: PlayType, dtype: int64

In [128]:
# Must turn categorical variables into dummy vars to make sklearn happy 

#First see which columns are categorical (some already are made into dummy vars)
dgci.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39047 entries, 0 to 42185
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   OffenseTeam        39047 non-null  object
 1   DefenseTeam        39047 non-null  object
 2   Down               39047 non-null  int64 
 3   ToGo               39047 non-null  int64 
 4   YardLine           39047 non-null  int64 
 5   SeriesFirstDown    39047 non-null  int64 
 6   Formation          39047 non-null  object
 7   PlayType           39047 non-null  object
 8   IsRush             39047 non-null  int64 
 9   IsPass             39047 non-null  int64 
 10  IsIncomplete       39047 non-null  int64 
 11  PassType           39047 non-null  object
 12  YardLineFixed      39047 non-null  int64 
 13  YardLineDirection  39047 non-null  object
 14  Quarter            39047 non-null  int64 
 15  IsTouchdown        39047 non-null  int64 
 16  IsSack             39047 non-null  int64

In [10]:
cats = ['DefenseTeam','OffenseTeam','Formation','PlayType','PassType', 'YardLineDirection', "Quarter", "Down"]
df = pd.get_dummies(dgci, columns = cats)

In [11]:
dgci.columns

Index(['OffenseTeam', 'DefenseTeam', 'Down', 'ToGo', 'YardLine',
       'SeriesFirstDown', 'Formation', 'PlayType', 'IsRush', 'IsPass',
       'IsIncomplete', 'PassType', 'YardLineFixed', 'YardLineDirection',
       'Quarter', 'IsTouchdown', 'IsSack', 'IsInterception', 'IsFumble',
       'IsPenalty', 'Yards', 'Time'],
      dtype='object')

In [12]:
y = df["Yards"]
X = df.drop("Yards", axis = 1)

In [142]:
x_training_data, x_test_data, y_training_data, y_test_data = train_test_split(X, y, test_size = 0.1)

In [143]:
model = XGBRegressor(verbosity=1)

In [144]:
model.fit(X, y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=1)

In [145]:
predictions = model.predict(x_test_data)
print(predictions)

[-0.18795937  2.311502    2.0018845  ... -0.17746079 -0.3912118
  3.5280123 ]


In [146]:
pred_cols = x_test_data.columns

In [147]:
modelf = model.fit(X,y)

def log_cosh_quantile(alpha):
    def _log_cosh_quantile(y_true, y_pred):
        err = y_pred - y_true
        err = np.where(err < 0, alpha * err, (1 - alpha) * err)
        grad = np.tanh(err)
        hess = 1 / np.cosh(err)**2
        return grad, hess
    return _log_cosh_quantile

alpha = 0.95 # 95% prediction interval
upper_model = XGBRegressor(verbosity=1,objective=log_cosh_quantile(alpha)) # predicts too high (top 2.5%)
lower_model = XGBRegressor(verbosity=1,objective=log_cosh_quantile(1-alpha)) # predicts too low (bottom 2.5%)
upper_model = upper_model.fit(X,y)
lower_model = lower_model.fit(X,y)

  
  


In [187]:
#Input the information in the correct columns below to make a prediction, leave the play type column blank as the model will determine 
# which play type maximizes the yards gained or points within a situation. Leave Yards blank too obv
datum = dgci.iloc[12] #This is the row of data 


datum = pd.DataFrame(datum).T

In [188]:
datum

Unnamed: 0,OffenseTeam,DefenseTeam,Down,ToGo,YardLine,SeriesFirstDown,Formation,PlayType,IsRush,IsPass,...,YardLineFixed,YardLineDirection,Quarter,IsTouchdown,IsSack,IsInterception,IsFumble,IsPenalty,Yards,Time
12,NYJ,LV,4,15,83,0,FIELD GOAL,FIELD GOAL,0,0,...,17,OPP,2,0,0,0,0,0,0,196


In [189]:
cats = ['DefenseTeam','OffenseTeam','Formation','PlayType','PassType', 'YardLineDirection', "Quarter", "Down"]
#The current options being considered to maximize

PT = ["PASS", "RUSH"]

preds = []

prediction_values = pd.DataFrame(np.zeros([122,122]),columns = np.asarray(pred_cols))

p_vals = pd.DataFrame(prediction_values.iloc[12]).T

datum  = pd.get_dummies(datum, columns = cats).drop("Yards", axis = 1)

for x in datum.columns:
    p_vals[x] = int(datum[x])
    
for x in PT: 
    dummy = p_vals.copy()
    dummy["PlayType_"+x] = 1
    preds.append([upper_model.predict(dummy)[0],modelf.predict(dummy)[0],lower_model.predict(dummy)[0]])

print(preds)
# to_go = p_vals["ToGo"].iloc[0].item()
# pass_yards,rush_yards = preds[0], preds[1]
# if p_vals["Down_4"].iloc[0].item() == 1 and max(pass_yards,rush_yards) > to_go:
#     print("Our prediction shows a gain of "+str(max(pass_yards,rush_yards)) + " yards. We recommend punting")
# else:
#     if pass_yards > rush_yards:
#         print("We recommend passing."+" Predicted Yards:",pass_yards)
#     else:
#         print("We recommend rushing."+" Predicted Yards:",rush_yards)
    
    
    

[[0.16601852, 0.24230364, -0.02068144], [0.47250915, 0.05968088, -0.16087317]]


In [138]:
preds

[array([2.820426], dtype=float32), array([2.820426], dtype=float32)]

In [None]:
#Looks like RUsh maximizes Yards here. 
#Gang