In [1]:
import pandas as pd
import nfl_data_py as nfl
import numpy as np

#The csv Files were too large to upload into gradescope, we included the columns.txt file to show the columns we used.


with open('columns.txt', 'r') as file:
    # Read the contents of the file
    content = file.read()
columnsList = content.split(',')
columnsList = [item.strip() for item in columnsList]


trainingDataframe= pd.read_csv('2007to2022.csv', usecols=columnsList)

def preprocess(data):
    #drop plays that are not pass or run
    data = data[data['play_type'].isin(['pass','run'])]
    #drop rows where qb_kneel is true and qb_spike is true
    data = data[(data['qb_kneel'] != True) & (data['qb_spike'] != True)]
    #if special_teams_play is true, drop the row
    data = data[data['special_teams_play'] != True]
    #penalty is 1, drop the row
    data = data[data['penalty'] != 1]
    #create subset data frame with only the columns we want
    data = data[['shotgun', 'run_location','defenders_in_box','number_of_pass_rushers'
        ,'time_to_throw','was_pressure', 'route',
    'defense_man_zone_type','play_type','yards_gained', 'penalty', 'desc',
    'half_seconds_remaining', 'down', 'qb_scramble', 'sack', 'ydstogo', 'yardline_100', 'defense_coverage_type',
    'game_seconds_remaining', 'posteam_score', 'defteam_score', 'qb_hit', 'score_differential',
    'first_down_rush', 'first_down_pass', 'wp']]
    #If 'Field Goal formation' or 'Punt formation' included in desc, drop the row
    data = data[data["desc"].str.contains("Field Goal formation") == False]
    data = data[data["desc"].str.contains("Punt formation") == False]
    #Remove Two Point Conversion attempts
    data = data[data["desc"].str.contains("TWO-POINT") == False]
    #fill shotgun nan values with 0
    data['shotgun'] = data['shotgun'].fillna(0)
    #If run location is nan and play type is pass, fill with none
    data.loc[(data['run_location'].isnull()) & (data['play_type'] == 'pass'), 'run_location'] = 'none'
    #If run location is nan, fill with unknown
    data['run_location'] = data['run_location'].fillna('unknown')
    #if defenders in box is nan, fill with mean of defenders in box
    data['defenders_in_box'] = data['defenders_in_box'].fillna(data['defenders_in_box'].mean())
    #If number of pass rushers is nan, fill with mean of number of pass rushers
    data['number_of_pass_rushers'] = data['number_of_pass_rushers'].fillna(data['number_of_pass_rushers'].mean())
    #If time to throw is nan, and play type is run, fill with -1
    data.loc[(data['time_to_throw'].isnull()) & (data['play_type'] == 'run'), 'time_to_throw'] = -1
    #if time to throw is nan, fill with mean of time to throw
    data['time_to_throw'] = data['time_to_throw'].fillna(data['time_to_throw'].mean())
    #if was pressure is nan, and play type is run, fill with 0
    data.loc[(data['was_pressure'].isnull()) & (data['play_type'] == 'run'), 'was_pressure'] = 0
    #if was pressure is nan, but there is a sack, fill with 1
    data.loc[(data['was_pressure'].isnull()) & (data['sack'] == 1), 'was_pressure'] = 1
    #else assume no pressure
    data['was_pressure'] = data['was_pressure'].fillna(0)
    #if route is nan, but the play type is run, fill with NoRoute
    data.loc[(data['route'].isnull()) & (data['play_type'] == 'run'), 'route'] = 'NoRoute'
    #otherwise, fill the route with unknown
    data['route'] = data['route'].fillna('Unknown')
    #if defense man zone type is nan, fill with unknown
    data['defense_man_zone_type'] = data['defense_man_zone_type'].fillna('unknown')
    #if defense coverage type is nan, fill with unknown
    data['defense_coverage_type'] = data['defense_coverage_type'].fillna('unknown')
    #If qb scramble is nan, fill with 0
    data['qb_scramble'] = data['qb_scramble'].fillna(0)
    #If qb hit is nan, fill with 0
    data['qb_hit'] = data['qb_hit'].fillna(0)
    #find the categorical variables, and load them into a list
    categorical = [ 'run_location', 'route',
   'defense_man_zone_type', 'play_type', 'defense_coverage_type']
    #eliminate all rows with yards gained below -5
    data = data[data['yards_gained'] >= -5]
    #Perform one hot encoding on the categorical variables
    data = pd.get_dummies(data, columns=categorical, dtype=float)
    data = data.drop('penalty', axis=1)
    data = data.drop('sack', axis=1)
    #time to throw with route_POST
    data['time_to_throw_post'] = data['time_to_throw'] * data['route_POST']
    #time to throw with route_GO
    data['time_to_throw_go'] = data['time_to_throw'] * data['route_GO']
    #first down rush with run location left
    data['first_down_rush_left'] = data['first_down_rush'] * data['run_location_left']
    #first down rush with run location middle
    data['first_down_rush_middle'] = data['first_down_rush'] * data['run_location_middle']
    #first down rush with run location right
    data['first_down_rush_right'] = data['first_down_rush'] * data['run_location_right']
    #first down pass with time to throw
    data['first_down_pass_time_to_throw'] = data['first_down_pass'] * data['time_to_throw']
    #first down pass and shotgun
    data['first_down_pass_shotgun'] = data['first_down_pass'] * data['shotgun']
    #time to throw cross
    data['time_to_throw_cross'] = data['time_to_throw'] * data['route_CROSS']
    #time to throw flat
    data['time_to_throw_flat'] = data['time_to_throw'] * data['route_FLAT']
    #time to throw hitch
    data['time_to_throw_hitch'] = data['time_to_throw'] * data['route_HITCH']
    #time to throw against man coverage
    data['time_to_throw_manCoverage'] = data['time_to_throw']*data['defense_man_zone_type_MAN_COVERAGE']
    #time to throw against zone coverage
    data['time_to_throw_zoneCoverage'] = data['time_to_throw']*data['defense_man_zone_type_ZONE_COVERAGE']
    #time to throw against cover 0
    data['time_to_throw_cover0'] = data['time_to_throw']*data['defense_coverage_type_COVER_0']
    #time to throw against cover 1
    data['time_to_throw_cover1'] = data['time_to_throw']*data['defense_coverage_type_COVER_1']
    #time to throw against cover 2
    data['time_to_throw_cover2'] = data['time_to_throw']*data['defense_coverage_type_COVER_2']
    #time to throw against cover 3
    data['time_to_throw_cover3'] = data['time_to_throw']*data['defense_coverage_type_COVER_3']
    #run location left against down
    data['run_location_left_down'] = data['run_location_left']*data['down']
    #run location middle against down
    data['run_location_middle_down'] = data['run_location_middle']*data['down']
    #run location right against down
    data['run_location_right_down'] = data['run_location_right']*data['down']
    #play_type_run against down
    data['play_type_run_down'] = data['play_type_run']*data['down']
    #play_type_pass against down
    data['play_type_pass_down'] = data['play_type_pass']*data['down']
    #Shotgun against down
    data['shotgun_down'] = data['shotgun']*data['down']
    #shotgun against play_type_run
    data['shotgun_run'] = data['shotgun']*data['play_type_run']
    #shotgun against play_type_pass
    data['shotgun_pass'] = data['shotgun']*data['play_type_pass']
    #ydstogo and shotgun
    data['ydstogo_shotgun'] = data['ydstogo']*data['shotgun']
    #ydstogo and time to throw
    data['ydstogo_time_to_throw'] = data['ydstogo']*data['time_to_throw']
    #yardline_100 and first down pass
    data['yardline_100_first_down_pass'] = data['yardline_100']*data['first_down_pass']
    #yardline_100 and first down rush
    data['yardline_100_first_down_rush'] = data['yardline_100']*data['first_down_rush']
    #ydstogo and play_type_run
    data['ydstogo_run'] = data['ydstogo']*data['play_type_run']
    #ydstogo and play_type_pass
    data['ydstogo_pass'] = data['ydstogo']*data['play_type_pass']
    #yardline_100 and play_type_run
    data['yardline_100_run'] = data['yardline_100']*data['play_type_run']
    #yardline_100 and play_type_pass
    data['yardline_100_pass'] = data['yardline_100']*data['play_type_pass']
    
    return data

trainDf = (preprocess(trainingDataframe))
testDf = pd.read_csv('2023.csv', usecols=columnsList)
testDf = preprocess(testDf)

print(trainDf.isnull().values.any())
print(trainDf.shape)
print(testDf.shape)




  trainingDataframe= pd.read_csv('2007to2022.csv', usecols=columnsList)
  testDf = pd.read_csv('2023.csv', usecols=columnsList)


False
(517401, 85)
(33859, 85)


In [2]:
tempDf = trainDf.drop('desc', axis=1)
correlationMatrix = tempDf.corr()
target_correlations = correlationMatrix['yards_gained'].drop('yards_gained')
# Sort the correlations in descending order
sorted_correlations = target_correlations.abs().sort_values(ascending=False)

# Display the sorted correlations
for i in sorted_correlations.index:
    print(f"{i}: {sorted_correlations[i]}")

first_down_pass: 0.5975386510338233
yardline_100_first_down_pass: 0.5947021190378692
first_down_pass_shotgun: 0.458421042357405
first_down_pass_time_to_throw: 0.4450800739209828
yardline_100_first_down_rush: 0.19975311346040237
yardline_100_pass: 0.17320191538209234
first_down_rush: 0.17111727258649723
play_type_pass: 0.14479189121721173
run_location_none: 0.14479189121721173
play_type_run: 0.14479189121721167
route_NoRoute: 0.14478257457177265
ydstogo_pass: 0.14230241436391644
time_to_throw: 0.12901334357048355
play_type_run_down: 0.12441174525454225
ydstogo_time_to_throw: 0.11881792739358527
first_down_rush_left: 0.11881552719931783
first_down_rush_right: 0.11205056548065302
play_type_pass_down: 0.10633605483283955
yardline_100: 0.1052405323672947
ydstogo_run: 0.10084532888017107
yardline_100_run: 0.09594095979508166
shotgun_pass: 0.08472890146931479
route_POST: 0.07774017560664459
time_to_throw_post: 0.07737456853910843
time_to_throw_go: 0.07646076881663554
run_location_middle: 0.07

In [3]:

import tensorflow as tf

X = trainDf.drop(columns = ['desc', 'yards_gained'], axis=1)
y = trainDf['yards_gained']
X_test = testDf.drop(columns = ['desc', 'yards_gained'], axis=1)
y_test = testDf['yards_gained']
#print shapes
print(X.shape)
print(y.shape)
print(X_test.shape)
print(y_test.shape)

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(83, input_dim=83))
model.add(tf.keras.layers.ReLU())
model.add(tf.keras.layers.Dense(83))
model.add(tf.keras.layers.ReLU())
model.add(tf.keras.layers.Dense(83))
model.add(tf.keras.layers.ReLU())
model.add(tf.keras.layers.Dense(83))
model.add(tf.keras.layers.ReLU())
model.add(tf.keras.layers.Dense(83))
model.add(tf.keras.layers.ReLU())
model.add(tf.keras.layers.Dense(1))


# Compile Model
model.compile(loss='mean_absolute_error', optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001))
#fit the model
model.fit(x = X, y = y, epochs=40)


#evaluate the model
model.evaluate(X_test, y_test)
model.save('NoAirYards.keras')


(517401, 83)
(517401,)
(33859, 83)
(33859,)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/40
[1m16169/16169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 551us/step - loss: 3.9799
Epoch 2/40
[1m16169/16169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 563us/step - loss: 3.3745
Epoch 3/40
[1m16169/16169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 547us/step - loss: 3.2718
Epoch 4/40
[1m16169/16169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 549us/step - loss: 3.2003
Epoch 5/40
[1m16169/16169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 546us/step - loss: 3.1860
Epoch 6/40
[1m16169/16169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 548us/step - loss: 3.1530
Epoch 7/40
[1m16169/16169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 566us/step - loss: 3.1306
Epoch 8/40
[1m16169/16169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 570us/step - loss: 3.1205
Epoch 9/40
[1m16169/16169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 551us/step - loss: 3.1140
Epoch 10/40
[1m16169/16169[0m [32m

In [4]:

X = trainDf.drop(columns = ['desc', 'yards_gained'], axis=1)
y = trainDf['yards_gained']
X_test = testDf.drop(columns = ['desc', 'yards_gained'], axis=1)
y_test = testDf['yards_gained']

In [6]:
import tensorflow as tf

#load the UpdatedModel.keras
model = tf.keras.models.load_model('NoAirYards.keras')
#get the accuracy for the model on X_test and y_test
model.evaluate(X_test, y_test)
#print average error on the 2023 data
print("average error: ", np.mean(np.abs(model.predict(X_test).flatten() - y_test)))
#which plays does the model predict the worst
testDf['yards_gained'] = y_test
testDf['predicted_yards_gained'] = model.predict(X_test)
testDf['error'] = np.abs(testDf['predicted_yards_gained'] - testDf['yards_gained'])
#print the description, error, yards gained, and predicted yards gained of the top 10 worst plays
worst = (testDf[['desc', 'error', 'yards_gained', 'predicted_yards_gained']].sort_values('error', ascending=False).head(5))
for i in range(5):
    print(worst.iloc[i]['desc'])
    print(worst.iloc[i]['error'])
    print(worst.iloc[i]['yards_gained'])
    print(worst.iloc[i]['predicted_yards_gained'])
    print()

[1m1059/1059[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 270us/step - loss: 2.8148
[1m1059/1059[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 303us/step
average error:  2.897627365295731
[1m1059/1059[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 280us/step
(10:57) (Shotgun) 2-M.Rudolph pass short left to 14-G.Pickens for 86 yards, TOUCHDOWN.
77.80409908294678
86.0
8.195901

(10:52) (Shotgun) 9-M.Stafford pass short right to 17-P.Nacua pushed ob at NYG 2 for 80 yards (24-D.Belton).
69.91142463684082
80.0
10.088575

(13:02) 20-Bre.Hall right guard to BUF 13 for 83 yards (47-C.Benford).
68.46684741973877
83.0
14.533153

(8:55) (Shotgun) 8-L.Jackson pass short right to 35-G.Edwards to DET 11 for 80 yards (31-K.Joseph).
68.40205001831055
80.0
11.59795

(11:22) (Shotgun) 6-J.Browning pass short middle to 5-T.Higgins for 80 yards, TOUCHDOWN.
66.37120342254639
80.0
13.628797

