In [2]:
import pandas as pd
import nfl_data_py as nfl
import numpy as np

#The csv Files were too large to upload into gradescope, we included the columns.txt file to show the columns we used.


with open('columns.txt', 'r') as file:
    # Read the contents of the file
    content = file.read()
columnsList = content.split(',')
columnsList = [item.strip() for item in columnsList]


trainingDataframe= pd.read_csv('2007to2022.csv', usecols=columnsList)

def preprocess(data):
    #drop plays that are not pass or run
    data = data[data['play_type'].isin(['pass','run'])]
    #drop rows where qb_kneel is true and qb_spike is true
    data = data[(data['qb_kneel'] != True) & (data['qb_spike'] != True)]
    #if special_teams_play is true, drop the row
    data = data[data['special_teams_play'] != True]
    #penalty is 1, drop the row
    data = data[data['penalty'] != 1]
    #create subset data frame with only the columns we want
    data = data[['shotgun', 'run_location','defenders_in_box','number_of_pass_rushers'
        ,'time_to_throw','was_pressure', 'route',
    'defense_man_zone_type','play_type','yards_gained', 'penalty', 'desc',
    'half_seconds_remaining', 'down', 'qb_scramble', 'sack', 'ydstogo', 'yardline_100', 'defense_coverage_type',
    'game_seconds_remaining', 'posteam_score', 'defteam_score', 'air_yards', 'qb_hit', 'score_differential',
    'first_down_rush', 'first_down_pass', 'wp']]
    #If 'Field Goal formation' or 'Punt formation' included in desc, drop the row
    data = data[data["desc"].str.contains("Field Goal formation") == False]
    data = data[data["desc"].str.contains("Punt formation") == False]
    #Remove Two Point Conversion attempts
    data = data[data["desc"].str.contains("TWO-POINT") == False]
    #fill shotgun nan values with 0
    data['shotgun'] = data['shotgun'].fillna(0)
    #If run location is nan and play type is pass, fill with none
    data.loc[(data['run_location'].isnull()) & (data['play_type'] == 'pass'), 'run_location'] = 'none'
    #If run location is nan, fill with unknown
    data['run_location'] = data['run_location'].fillna('unknown')
    #if defenders in box is nan, fill with mean of defenders in box
    data['defenders_in_box'] = data['defenders_in_box'].fillna(data['defenders_in_box'].mean())
    #If number of pass rushers is nan, fill with mean of number of pass rushers
    data['number_of_pass_rushers'] = data['number_of_pass_rushers'].fillna(data['number_of_pass_rushers'].mean())
    #If time to throw is nan, and play type is run, fill with -1
    data.loc[(data['time_to_throw'].isnull()) & (data['play_type'] == 'run'), 'time_to_throw'] = -1
    #if time to throw is nan, fill with mean of time to throw
    data['time_to_throw'] = data['time_to_throw'].fillna(data['time_to_throw'].mean())
    #if was pressure is nan, and play type is run, fill with 0
    data.loc[(data['was_pressure'].isnull()) & (data['play_type'] == 'run'), 'was_pressure'] = 0
    #if was pressure is nan, but there is a sack, fill with 1
    data.loc[(data['was_pressure'].isnull()) & (data['sack'] == 1), 'was_pressure'] = 1
    #else assume no pressure
    data['was_pressure'] = data['was_pressure'].fillna(0)
    #if route is nan, but the play type is run, fill with NoRoute
    data.loc[(data['route'].isnull()) & (data['play_type'] == 'run'), 'route'] = 'NoRoute'
    #otherwise, fill the route with unknown
    data['route'] = data['route'].fillna('Unknown')
    #if defense man zone type is nan, fill with unknown
    data['defense_man_zone_type'] = data['defense_man_zone_type'].fillna('unknown')
    #if defense coverage type is nan, fill with unknown
    data['defense_coverage_type'] = data['defense_coverage_type'].fillna('unknown')
    #Fill air yards nan values with 0
    data['air_yards'] = data['air_yards'].fillna(0)
    #If qb scramble is nan, fill with 0
    data['qb_scramble'] = data['qb_scramble'].fillna(0)
    #If qb hit is nan, fill with 0
    data['qb_hit'] = data['qb_hit'].fillna(0)
    #find the categorical variables, and load them into a list
    categorical = [ 'run_location', 'route',
   'defense_man_zone_type', 'play_type', 'defense_coverage_type']
    #eliminate all rows with yards gained below -5
    data = data[data['yards_gained'] >= -5]
    #Perform one hot encoding on the categorical variables
    data = pd.get_dummies(data, columns=categorical, dtype=float)
    #create new feature, air yards against defense_man_zone_type_MAN_COVERAGE
    data['air_yards_againstManCoverage'] = data['air_yards'] * data['defense_man_zone_type_MAN_COVERAGE']
    #create new feature, air yards against defense_man_zone_type_ZONE_COVERAGE
    data['air_yards_againstZoneCoverage'] = data['air_yards'] * data['defense_man_zone_type_ZONE_COVERAGE']
    #create new feature, air yards against defense_coverage_type_COVER_0
    data['air_yards_againstCover0'] = data['air_yards'] * data['defense_coverage_type_COVER_0']
    #create new feature, air yards against defense_coverage_type_COVER_1
    data['air_yards_againstCover1'] = data['air_yards'] * data['defense_coverage_type_COVER_1']
    #create new feature, air yards against defense_coverage_type_COVER_2
    data['air_yards_againstCover2'] = data['air_yards'] * data['defense_coverage_type_COVER_2']
    #create new feature, air yards against defense_coverage_type_COVER_3
    data['air_yards_againstCover3'] = data['air_yards'] * data['defense_coverage_type_COVER_3']
    #create new feature, air yards against defense_coverage_type_COVER_4
    data['air_yards_againstCover4'] = data['air_yards'] * data['defense_coverage_type_COVER_4']    
    data = data.drop('penalty', axis=1)
    data = data.drop('sack', axis=1)
    #air yards with post route
    data['air_yards_post'] = data['air_yards'] * data['route_POST']
    #air yards with route_GO
    data['air_yards_go'] = data['air_yards'] * data['route_GO']
    #air yards with route_SLANT
    data['air_yards_slant'] = data['air_yards'] * data['route_SLANT']
    #air yards with route_HITCH
    data['air_yards_hitch'] = data['air_yards'] * data['route_HITCH']
    #air yards with route_OUT
    data['air_yards_out'] = data['air_yards'] * data['route_OUT']
    #air yards with route_CORNER
    data['air_yards_corner'] = data['air_yards'] * data['route_CORNER']
    #air yards with route_ANGLE
    data['air_yards_angle'] = data['air_yards'] * data['route_ANGLE']
    #time to throw with route_POST
    data['time_to_throw_post'] = data['time_to_throw'] * data['route_POST']
    #time to throw with route_GO
    data['time_to_throw_go'] = data['time_to_throw'] * data['route_GO']
    #first down rush with run location left
    data['first_down_rush_left'] = data['first_down_rush'] * data['run_location_left']
    #first down rush with run location middle
    data['first_down_rush_middle'] = data['first_down_rush'] * data['run_location_middle']
    #first down rush with run location right
    data['first_down_rush_right'] = data['first_down_rush'] * data['run_location_right']
    #first down pass with air yards
    data['first_down_pass_air_yards'] = data['first_down_pass'] * data['air_yards']
    #first down pass with time to throw
    data['first_down_pass_time_to_throw'] = data['first_down_pass'] * data['time_to_throw']
    #first down pass and shotgun
    data['first_down_pass_shotgun'] = data['first_down_pass'] * data['shotgun']
    #time to throw cross
    data['time_to_throw_cross'] = data['time_to_throw'] * data['route_CROSS']
    #time to throw flat
    data['time_to_throw_flat'] = data['time_to_throw'] * data['route_FLAT']
    #time to throw hitch
    data['time_to_throw_hitch'] = data['time_to_throw'] * data['route_HITCH']
    #time to throw against man coverage
    data['time_to_throw_manCoverage'] = data['time_to_throw']*data['defense_man_zone_type_MAN_COVERAGE']
    #time to throw against zone coverage
    data['time_to_throw_zoneCoverage'] = data['time_to_throw']*data['defense_man_zone_type_ZONE_COVERAGE']
    #time to throw against cover 0
    data['time_to_throw_cover0'] = data['time_to_throw']*data['defense_coverage_type_COVER_0']
    #time to throw against cover 1
    data['time_to_throw_cover1'] = data['time_to_throw']*data['defense_coverage_type_COVER_1']
    #time to throw against cover 2
    data['time_to_throw_cover2'] = data['time_to_throw']*data['defense_coverage_type_COVER_2']
    #time to throw against cover 3
    data['time_to_throw_cover3'] = data['time_to_throw']*data['defense_coverage_type_COVER_3']
    #run location left against down
    data['run_location_left_down'] = data['run_location_left']*data['down']
    #run location middle against down
    data['run_location_middle_down'] = data['run_location_middle']*data['down']
    #run location right against down
    data['run_location_right_down'] = data['run_location_right']*data['down']
    #play_type_run against down
    data['play_type_run_down'] = data['play_type_run']*data['down']
    #play_type_pass against down
    data['play_type_pass_down'] = data['play_type_pass']*data['down']
    #Shotgun against down
    data['shotgun_down'] = data['shotgun']*data['down']
    #shotgun against play_type_run
    data['shotgun_run'] = data['shotgun']*data['play_type_run']
    #shotgun against play_type_pass
    data['shotgun_pass'] = data['shotgun']*data['play_type_pass']
    #Shotgun against air yards
    data['shotgun_air_yards'] = data['shotgun']*data['air_yards']
    #ydstogo and shotgun
    data['ydstogo_shotgun'] = data['ydstogo']*data['shotgun']
    #ydstogo and air yards
    data['ydstogo_air_yards'] = data['ydstogo']*data['air_yards']
    #ydstogo and time to throw
    data['ydstogo_time_to_throw'] = data['ydstogo']*data['time_to_throw']
    #yardline_100 and air yards
    data['yardline_100_air_yards'] = data['yardline_100']*data['air_yards']
    #yardline_100 and first down pass
    data['yardline_100_first_down_pass'] = data['yardline_100']*data['first_down_pass']
    #yardline_100 and first down rush
    data['yardline_100_first_down_rush'] = data['yardline_100']*data['first_down_rush']
    #air yards and down
    data['air_yards_down'] = data['air_yards']*data['down']
    #ydstogo and play_type_run
    data['ydstogo_run'] = data['ydstogo']*data['play_type_run']
    #ydstogo and play_type_pass
    data['ydstogo_pass'] = data['ydstogo']*data['play_type_pass']
    #yardline_100 and play_type_run
    data['yardline_100_run'] = data['yardline_100']*data['play_type_run']
    #yardline_100 and play_type_pass
    data['yardline_100_pass'] = data['yardline_100']*data['play_type_pass']
    
    return data

trainDf = (preprocess(trainingDataframe))
testDf = pd.read_csv('2023.csv', usecols=columnsList)
testDf = preprocess(testDf)

print(trainDf.isnull().values.any())
print(trainDf.shape)
print(testDf.shape)




  trainingDataframe= pd.read_csv('2007to2022.csv', usecols=columnsList)
  testDf = pd.read_csv('2023.csv', usecols=columnsList)


False
(517401, 105)
(33859, 105)


In [2]:
#In the training data, whats the average yards gained on a play?
print(trainDf['yards_gained'].mean())
#In the training data, whats the amount of plays with yards gained ggreater than 20, and ratio in the data?
print(trainDf[trainDf['yards_gained'] > 20].shape[0])
print(trainDf[trainDf['yards_gained'] > 20].shape[0]/trainDf.shape[0])
#In the training data, whats the average yards gained on first down for a run play?
print(trainDf[(trainDf['play_type_run'] == 1) & (trainDf['down'] == 1)]['yards_gained'].mean())

5.849157616626176
27827
0.053782269458311834
4.409508405544475


In [3]:
tempDf = trainDf.drop('desc', axis=1)
correlationMatrix = tempDf.corr()
target_correlations = correlationMatrix['yards_gained'].drop('yards_gained')
# Sort the correlations in descending order
sorted_correlations = target_correlations.abs().sort_values(ascending=False)

# Display the sorted correlations
for i in sorted_correlations.index:
    print(f"{i}: {sorted_correlations[i]}")

first_down_pass_air_yards: 0.6825683515122594
first_down_pass: 0.5975386510338233
yardline_100_first_down_pass: 0.5947021190378692
first_down_pass_shotgun: 0.458421042357405
first_down_pass_time_to_throw: 0.4450800739209828
yardline_100_air_yards: 0.2727264380292437
air_yards: 0.26574655088999316
ydstogo_air_yards: 0.23917417903155944
air_yards_down: 0.22466107118695514
yardline_100_first_down_rush: 0.19975311346040237
shotgun_air_yards: 0.18673254679605628
yardline_100_pass: 0.17320191538209234
first_down_rush: 0.17111727258649723
run_location_none: 0.14479189121721173
play_type_pass: 0.14479189121721173
play_type_run: 0.14479189121721167
route_NoRoute: 0.14478257457177265
ydstogo_pass: 0.14230241436391644
time_to_throw: 0.12901334357048355
air_yards_againstZoneCoverage: 0.12565562091505264
play_type_run_down: 0.12441174525454225
ydstogo_time_to_throw: 0.11881792739358527
first_down_rush_left: 0.11881552719931783
first_down_rush_right: 0.11205056548065302
play_type_pass_down: 0.106336

In [25]:
#finding outlier features
from sklearn.feature_selection import VarianceThreshold
currentX = trainDf.drop('desc', axis=1)
currentX = currentX.drop('yards_gained', axis=1)
currentY = trainDf['yards_gained']
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel.fit_transform(currentX)
print("Number of features before variance threshold: ", len(currentX.columns))
print("Number of features after variance threshold: ", len(currentX.columns[sel.get_support()]))
print("Number of features removed: ", len(currentX.columns) - len(currentX.columns[sel.get_support()]))
print("Features removed: ", currentX.columns[~sel.get_support()])
print("Features kept: ", currentX.columns[sel.get_support()])


Number of features before variance threshold:  103
Number of features after variance threshold:  62
Number of features removed:  41
Features removed:  Index(['was_pressure', 'qb_scramble', 'qb_hit', 'first_down_rush',
       'first_down_pass', 'wp', 'run_location_left', 'run_location_middle',
       'run_location_right', 'run_location_unknown', 'route_ANGLE',
       'route_CORNER', 'route_CROSS', 'route_FLAT', 'route_GO', 'route_HITCH',
       'route_IN', 'route_OUT', 'route_POST', 'route_SCREEN', 'route_SLANT',
       'route_WHEEL', 'defense_man_zone_type_MAN_COVERAGE',
       'defense_man_zone_type_ZONE_COVERAGE', 'defense_man_zone_type_unknown',
       'defense_coverage_type_2_MAN', 'defense_coverage_type_COVER_0',
       'defense_coverage_type_COVER_1', 'defense_coverage_type_COVER_2',
       'defense_coverage_type_COVER_3', 'defense_coverage_type_COVER_4',
       'defense_coverage_type_COVER_6', 'defense_coverage_type_PREVENT',
       'defense_coverage_type_unknown', 'time_to_thro

In [None]:

import tensorflow as tf

X = trainDf.drop(columns = ['desc', 'yards_gained'], axis=1)
y = trainDf['yards_gained']
X_test = testDf.drop(columns = ['desc', 'yards_gained'], axis=1)
y_test = testDf['yards_gained']
#print shapes
print(X.shape)
print(y.shape)
print(X_test.shape)
print(y_test.shape)

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(103, input_dim=103))
model.add(tf.keras.layers.ReLU())
model.add(tf.keras.layers.Dense(103))
model.add(tf.keras.layers.ReLU())
model.add(tf.keras.layers.Dense(103))
model.add(tf.keras.layers.ReLU())
model.add(tf.keras.layers.Dense(103))
model.add(tf.keras.layers.ReLU())
model.add(tf.keras.layers.Dense(103))
model.add(tf.keras.layers.ReLU())
model.add(tf.keras.layers.Dense(1))


# Compile Model
model.compile(loss='mean_absolute_error', optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001))
#fit the model
model.fit(x = X, y = y, epochs=50)


#evaluate the model
model.evaluate(X_test, y_test)
model.save('FirstModel.keras')


In [None]:

model = tf.keras.models.Sequential()

# Input Layer
model.add(tf.keras.layers.Dense(1024, input_dim=103))  # Increased neurons
model.add(tf.keras.layers.ReLU())

# Hidden Layers (Increasing complexity by adding more layers and neurons)
model.add(tf.keras.layers.Dense(512))
model.add(tf.keras.layers.ReLU())

model.add(tf.keras.layers.Dense(256))
model.add(tf.keras.layers.ReLU())


model.add(tf.keras.layers.Dense(200))
model.add(tf.keras.layers.ReLU())

model.add(tf.keras.layers.Dense(150))
model.add(tf.keras.layers.ReLU())

model.add(tf.keras.layers.Dense(100))
model.add(tf.keras.layers.ReLU())

model.add(tf.keras.layers.Dense(50))
model.add(tf.keras.layers.ReLU())

model.add(tf.keras.layers.Dense(24))
model.add(tf.keras.layers.ReLU())

# Output Layer
model.add(tf.keras.layers.Dense(1))



# Early Stopping
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

# Compile Model
model.compile(loss='mean_absolute_error', optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001))

# Train the Model
model.fit(X, y, validation_split=0.2, epochs=50, batch_size=64, callbacks=[early_stopping])


#evaluate the model
model.evaluate(X_test, y_test)
model.save('Adam1024SecondModel.keras')



In [4]:

X = trainDf.drop(columns = ['desc', 'yards_gained'], axis=1)
y = trainDf['yards_gained']
X_test = testDf.drop(columns = ['desc', 'yards_gained'], axis=1)
y_test = testDf['yards_gained']

In [5]:
import tensorflow as tf

#load the UpdatedModel.keras
model = tf.keras.models.load_model('FirstModel.keras')
#get the accuracy for the model on X_test and y_test
model.evaluate(X_test, y_test)
#print average error on the 2023 data
print("average error: ", np.mean(np.abs(model.predict(X_test).flatten() - y_test)))
#which plays does the model predict the worst
testDf['yards_gained'] = y_test
testDf['predicted_yards_gained'] = model.predict(X_test)
testDf['error'] = np.abs(testDf['predicted_yards_gained'] - testDf['yards_gained'])
#print the description, error, yards gained, and predicted yards gained of the top 10 worst plays
worst = (testDf[['desc', 'error', 'yards_gained', 'predicted_yards_gained', 'air_yards']].sort_values('error', ascending=False).head(5))
for i in range(5):
    print(worst.iloc[i]['desc'])
    print(worst.iloc[i]['error'])
    print(worst.iloc[i]['yards_gained'])
    print(worst.iloc[i]['predicted_yards_gained'])
    print(worst.iloc[i]['air_yards'])
    print()

[1m1059/1059[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 316us/step - loss: 2.4392
[1m1059/1059[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 330us/step
average error:  2.47895845543307
[1m1059/1059[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 321us/step
(10:52) (Shotgun) 9-M.Stafford pass short right to 17-P.Nacua pushed ob at NYG 2 for 80 yards (24-D.Belton).
73.06888103485107
80.0
6.931119
4.0

(10:57) (Shotgun) 2-M.Rudolph pass short left to 14-G.Pickens for 86 yards, TOUCHDOWN.
72.55834579467773
86.0
13.441654
10.0

(8:55) (Shotgun) 8-L.Jackson pass short right to 35-G.Edwards to DET 11 for 80 yards (31-K.Joseph).
69.44532680511475
80.0
10.554673
4.0

(13:02) 20-Bre.Hall right guard to BUF 13 for 83 yards (47-C.Benford).
69.02946758270264
83.0
13.970532
0.0

(11:22) (Shotgun) 6-J.Browning pass short middle to 5-T.Higgins for 80 yards, TOUCHDOWN.
66.9771614074707
80.0
13.022839
10.0



In [18]:
best = (testDf[['desc', 'error', 'yards_gained', 'predicted_yards_gained', 'air_yards']].sort_values('error', ascending=True).head(5))
for i in range(5):
    print(best.iloc[i]['desc'])
    print(best.iloc[i]['error'])
    print(best.iloc[i]['yards_gained'])
    print(best.iloc[i]['predicted_yards_gained'])
    print(best.iloc[i]['air_yards'])
    print()


(4:07) 4-A.O'Connell pass incomplete deep middle to 17-D.Adams [94-C.Wilkins].
Penalty on LV-72-J.Eluemunor, Offensive Holding, declined.
1.3262033462524414e-06
0.0
1.3262033e-06
41.0

(12:48) 2-D.Lock pass incomplete deep right to 11-J.Smith-Njigba (7-C.Ward).
2.3692846298217773e-06
0.0
-2.3692846e-06
52.0

(:55) (No Huddle, Shotgun) 7-G.Smith pass incomplete short middle to 84-C.Parkinson (23-D.Hill).
3.6507844924926758e-06
0.0
-3.6507845e-06
11.0

(1:26) 4-D.Prescott pass incomplete deep left to 3-B.Cooks.
4.127621650695801e-06
0.0
4.1276217e-06
41.0

(1:15) 22-D.Henry right tackle to CAR 1 for no gain (13-T.Hill).
4.649162292480469e-06
0.0
-4.6491623e-06
0.0



In [21]:
#predicted plays where difference between predicted and actual yards gained is less than or equal to 2
print(testDf[testDf['error'] <= 2].shape[0])

27777


In [5]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

#Test the model with linear regression 

# Create a linear regression model
model = LinearRegression()
regressionX = trainDf.drop(columns = ['desc', 'yards_gained'], axis=1)
regressionY = trainDf['yards_gained']
regressionX_test = testDf.drop(columns = ['desc', 'yards_gained'], axis=1)
regressionY_test = testDf['yards_gained']
# Fit the model
model.fit(regressionX, regressionY)
#Whats the mean absolute error of the model
print("Mean Absolute Error: ", mean_absolute_error(regressionY_test, model.predict(regressionX_test)))
#Whats the r2 score of the model
print("R2 Score: ", r2_score(regressionY_test, model.predict(regressionX_test)))



Mean Absolute Error:  3.169408777515341
R2 Score:  0.6492592498767362


In [11]:
for i in range(len(testDf)):

    print(testDf.iloc[i]['desc'])
    print("Yards gained")
    print(testDf.iloc[i]['yards_gained'])
    print("predicted")
    print(testDf.iloc[i]['predicted_yards_gained'])
    print("error")
    print(testDf.iloc[i]['error'])
    

(15:00) (Shotgun) 8-B.Robinson right tackle to WAS 28 for 3 yards (93-J.Ledbetter).
Yards gained
3.0
predicted
2.984728
error
0.015271902084350586
(14:30) (Shotgun) 14-S.Howell pass short right to 1-J.Dotson to WAS 34 for 6 yards (13-K.Clark, 10-J.Woods).
Yards gained
6.0
predicted
5.2972927
error
0.7027072906494141
(13:55) 23-C.Rodriguez left guard to WAS 36 for 2 yards (97-C.Thomas; 25-Z.Collins).
Yards gained
2.0
predicted
3.8047218
error
1.8047218322753906
(13:16) (Shotgun) 14-S.Howell pass incomplete short middle to 82-L.Thomas.
Yards gained
0.0
predicted
-0.00080130994
error
0.0008013099431991577
(13:12) 14-S.Howell pass short middle to 1-J.Dotson to WAS 48 for 12 yards (13-K.Clark).
Yards gained
12.0
predicted
13.044235
error
1.0442352294921875
(12:34) (Shotgun) 14-S.Howell pass short left to 4-C.Samuel to WAS 49 for 1 yard (34-J.Thompson).
Yards gained
1.0
predicted
3.0811553
error
2.081155300140381
(11:56) (Shotgun) 14-S.Howell pass incomplete short left to 82-L.Thomas (22-K.W