In [265]:
# initializing notebook - ashwat

In [266]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import keras

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.utils.np_utils import to_categorical

%matplotlib inline

In [267]:
# Load the combine records from the CSV file and report (rows, columns).
combine_csv = 'NFLCombineData.csv'
raw_data = pd.read_csv(combine_csv)
n_rows, n_cols = raw_data.shape
print((n_rows, n_cols))

(4945, 25)


In [268]:
# Quick check: how many players lack a usable value for one metric.
# A value counts as present only when it is strictly greater than 0.
parameter = 'fortyyd'
has_value = raw_data[parameter] > 0
raw_data_no_outlier = raw_data.loc[has_value]
missing_count = raw_data.shape[0] - raw_data_no_outlier.shape[0]
print("Number of Players' Data Missing for -->", "\b", parameter, ":", missing_count)

Number of Players' Data Missing for -->  fortyyd : 191


In [269]:
# Positions are merged into 6 classes of players with related characteristics:
#     0: Running Backs: RB, FB
#     1: Pass Catchers: WR, TE
#     2: Defensive Backs: FS, SS, CB
#     3: Lineman: OT, OG, OC, C, NT, DT, DE, LS
#     4: Linebackers: ILB, OLB
#     5: Quarterback + Special Team: QB, P, K

# Metric names listed in column order; pos_dict maps each name to its column
# index inside a raw player row.
column_order = [
    'year', 'name', 'firstname', 'lastname', 'position', 'heightfeet',
    'heightinches', 'heightinchestotal', 'weight', 'arms', 'hands',
    'fortyyd', 'twentyyd', 'tenyd', 'twentyss', 'threecone',
    'vertical', 'broad', 'bench', 'round', 'college',
    'pickround', 'picktotal', 'wonderlic', 'nflgrade',
]
pos_dict = {column: index for index, column in enumerate(column_order)}


def _players_at(position_code):
    """Return all raw_data rows whose 'position' equals position_code, as a NumPy array."""
    at_position = raw_data['position'] == position_code
    return raw_data.loc[at_position].to_numpy()


# One array of player rows per individual position code.
rb, fb = _players_at('RB'), _players_at('FB')
wr, te = _players_at('WR'), _players_at('TE')
fs, ss, cb = _players_at('FS'), _players_at('SS'), _players_at('CB')
ot, og, oc, c = _players_at('OT'), _players_at('OG'), _players_at('OC'), _players_at('C')
nt, dt, de, ls = _players_at('NT'), _players_at('DT'), _players_at('DE'), _players_at('LS')
ilb, olb = _players_at('ILB'), _players_at('OLB')
qb, p, k = _players_at('QB'), _players_at('P'), _players_at('K')

# Stack the per-position arrays into the six class groups described above.
run_back = np.concatenate([rb, fb])
pass_catch = np.concatenate([wr, te])
defense_back = np.concatenate([fs, ss, cb])
linemen = np.concatenate([ot, og, oc, c, nt, dt, de, ls])
lineback = np.concatenate([ilb, olb])
quarterback_special = np.concatenate([qb, p, k])

In [270]:
# Build the feature matrix (data_players) and one-hot labels (pos_index)
# for the 6-class position-group classifier.

# *** INPUT HERE, WHICH PARAMETERS TO EXTRACT AND TRAIN ON ***
parameters = ['weight', 'heightinchestotal', 'broad', 'fortyyd']

# Column index, inside a raw player row, of each selected parameter.
# Indexing with [] (not .get) makes a misspelled parameter fail loudly here.
param_cols = [pos_dict[name] for name in parameters]

# Position groups in class-label order: a group's list index is its label.
position_groups = [
    run_back,             # 0: Running Backs: RB, FB
    pass_catch,           # 1: Pass Catchers: WR, TE
    defense_back,         # 2: Defensive Backs: FS, SS, CB
    linemen,              # 3: Lineman: OT, OG, OC, C, NT, DT, DE, LS
    lineback,             # 4: Linebackers: ILB, OLB
    quarterback_special,  # 5: Quarterback + Special Team: QB, P, K
]

# Keep only the selected columns of every group and stack them into one
# (players, parameters) float matrix.
data_players = np.concatenate(
    [group[:, param_cols] for group in position_groups]
).astype(float)

# Integer class label for every row, in the same order as data_players.
pos_index = np.concatenate(
    [np.full(len(group), label) for label, group in enumerate(position_groups)]
)

# The rows above are sorted by class. Keras' validation_split holds out the
# *last* fraction of the rows before shuffling, so without this step the
# validation set would contain only the final classes. Shuffle features and
# labels together with a fixed seed so the result is reproducible.
# NOTE(review): rows with missing metric values are still included -- confirm
# how missing values are encoded in the CSV before trusting the fit.
shuffle_order = np.random.RandomState(0).permutation(len(data_players))
data_players = data_players[shuffle_order]

# one-hot encode outputs for classification
pos_index = to_categorical(pos_index[shuffle_order], len(position_groups))

print(len(data_players)) # --> 4945, all players metrics pulled
print(len(data_players[0])) # --> 4, all parameters considered
print(len(pos_index)) # --> 4945, all players' groupings labeled

4945
4
4945


In [271]:
# Single softmax layer: one input per selected parameter and 6 output units,
# one per position group.
model = Sequential()
model.add(Dense(units=6, input_shape=(len(parameters),), activation='softmax'))
# `learning_rate` is the current name of the Adam step-size argument (`lr` is deprecated).
model.compile(optimizer=Adam(learning_rate=0.01), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 100 epochs with per-epoch progress output. Keras takes the
# validation 20% from the end of the arrays as given (before any shuffling),
# so the row order of data_players / pos_index matters here.
h = model.fit(
    x=data_players,
    y=pos_index,
    epochs=100,
    validation_split=0.2,
    verbose=1,
)

In [None]:
### IGNORE BELOW THIS POINT

In [None]:
# my positions: QB P K FS SS
qb_data = raw_data.loc[raw_data['position'] == 'QB'] # 304 entries
p_data = raw_data.loc[raw_data['position'] == 'P'] # 12 entries
k_data = raw_data.loc[raw_data['position'] == 'K'] # 9 entries
fs_data = raw_data.loc[raw_data['position'] == 'FS'] # 211 entries
ss_data = raw_data.loc[raw_data['position'] == 'SS'] # 184 entries

# Notes from per-position violin plots, i.e.
# sns.violinplot(x=<qb|fs|ss>_data[<metric>]):
#   heightinchestotal: good metric, consistent data for all positions
#   threecone:         poor metric, large amount of absent data
#   broad:             fairly good metric, somewhat consistent data for all positions (few null)
#   bench:             bad metric, no data for QB players
#   weight:            potentially good metric, consistent data for all positions
#                      (but might not have correlation)

# Keep only players that have a positive value for every metric listed below.
# The previous version reassigned raw_data_no_outlier from raw_data once per
# metric, so only the last filter ('round') actually took effect.
required_metrics = ['vertical', 'threecone', 'broad', 'bench', 'round']
has_all_metrics = (raw_data[required_metrics] > 0).all(axis=1)
raw_data_no_outlier = raw_data.loc[has_all_metrics]

In [None]:
# Positions compared in the 3-class experiment. (Alternative that was tried:
# every unique value of raw_data['position'] except 'P' and 'K'.)
positions = ['QB', 'FS', 'SS']

# Class labels get collected here.
pos_index = []


In [None]:
# Per-position subsets of the raw combine data: quarterbacks, free safeties
# and strong safeties.
qb_data, fs_data, ss_data = (
    raw_data.loc[raw_data['position'] == code] for code in ('QB', 'FS', 'SS')
)

In [None]:
# Build features (dataPlayers), integer labels (pos_index) and one-hot labels
# (pos_cat) for the QB / FS / SS experiment.

# Metrics used as model inputs; this list now actually drives the extraction.
parameters = ['weight', 'heightinchestotal', 'broad']

# Frames in class-label order: 0 = QB, 1 = FS, 2 = SS.
class_frames = [qb_data, fs_data, ss_data]

# (players, parameters) float matrix, one block of rows per class.
dataPlayers = np.concatenate(
    [frame[parameters].to_numpy(dtype=float) for frame in class_frames]
)

# Integer class label for every row, in the same order as dataPlayers.
pos_index = np.concatenate(
    [np.full(len(frame), label) for label, frame in enumerate(class_frames)]
)

# Rows are sorted by class at this point. Keras' validation_split holds out
# the *last* fraction of the rows before shuffling, which would make the
# validation set a single class. Shuffle features and labels together with a
# fixed seed so the split is mixed and reproducible.
shuffle_order = np.random.RandomState(0).permutation(len(dataPlayers))
dataPlayers = dataPlayers[shuffle_order]
pos_index = pos_index[shuffle_order]

# One-hot labels for categorical cross-entropy.
pos_cat = to_categorical(pos_index, len(class_frames))

print(pos_cat.shape)
print(dataPlayers.shape)


In [None]:
# Single softmax layer: 3 inputs (one per metric) and 3 output units, one per class.
model = Sequential()
model.add(Dense(units=3, input_shape=(3,), activation='softmax'))
# `learning_rate` is the current name of the Adam step-size argument (`lr` is deprecated).
model.compile(optimizer=Adam(learning_rate=0.01), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 100 epochs in mini-batches of 50 with per-epoch progress output.
# Keras takes the validation 20% from the end of the arrays as given.
h = model.fit(
    x=dataPlayers,
    y=pos_cat,
    batch_size=50,
    epochs=100,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Training accuracy per epoch, taken from the history of the most recent fit.
plt.plot(h.history['accuracy'], label='accuracy')
plt.xlabel('epoch')
plt.legend()
plt.title('accuracy')

In [None]:
# Training loss per epoch, taken from the history of the most recent fit.
plt.plot(h.history['loss'], label='loss')
plt.xlabel('epoch')
plt.legend()
plt.title('loss')

In [None]:
# Classify the first player row as a one-row batch.
point = np.array([dataPlayers[0]])

# Sequential.predict_classes has been removed from Keras; for a softmax
# output the predicted class is the index of the largest probability.
prediction = np.argmax(model.predict(point), axis=-1)
print("Predicted Class:", prediction)

In [None]:
# height, weight, forty yard, bench, broad
# CB, NT

In [None]:
# Two-class experiment subsets.
# NOTE(review): the variable names say CB / NT, but the rows selected are
# ILB and WR -- confirm which pair is intended and align names or filters.
cb_data, nt_data = (
    raw_data.loc[raw_data['position'] == code] for code in ('ILB', 'WR')
)

In [None]:
# Build features (dataPlayers), integer labels (pos_index) and one-hot labels
# (pos_cat) for the two-class experiment on cb_data vs nt_data.

# Metrics used as model inputs; this list now actually drives the extraction.
parameters = ['weight', 'heightinchestotal', 'broad', 'fortyyd', 'bench']

# Frames in class-label order: 0 = cb_data, 1 = nt_data.
class_frames = [cb_data, nt_data]

# (players, parameters) float matrix, one block of rows per class.
dataPlayers = np.concatenate(
    [frame[parameters].to_numpy(dtype=float) for frame in class_frames]
)

# Integer class label for every row, in the same order as dataPlayers.
pos_index = np.concatenate(
    [np.full(len(frame), label) for label, frame in enumerate(class_frames)]
)

# Rows are sorted by class at this point. Keras' validation_split holds out
# the *last* fraction of the rows before shuffling, which would make the
# validation set a single class. Shuffle features and labels together with a
# fixed seed so the split is mixed and reproducible; pos_index stays aligned
# with dataPlayers for later predicted-vs-actual comparisons.
shuffle_order = np.random.RandomState(0).permutation(len(dataPlayers))
dataPlayers = dataPlayers[shuffle_order]
pos_index = pos_index[shuffle_order]

# One-hot labels for categorical cross-entropy.
pos_cat = to_categorical(pos_index, len(class_frames))

print(dataPlayers)

print(pos_cat.shape)
print(dataPlayers.shape)


In [None]:
# Single softmax layer: 5 inputs (one per metric) and 2 output units, one per class.
model = Sequential()
model.add(Dense(units=2, input_shape=(5,), activation='softmax'))
# `learning_rate` is the current name of the Adam step-size argument (`lr` is deprecated).
model.compile(optimizer=Adam(learning_rate=0.01), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 100 epochs in mini-batches of 50 with per-epoch progress output.
# Keras takes the validation 20% from the end of the arrays as given.
h = model.fit(
    x=dataPlayers,
    y=pos_cat,
    batch_size=50,
    epochs=100,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Training accuracy per epoch, taken from the history of the most recent fit.
plt.plot(h.history['accuracy'], label='accuracy')
plt.xlabel('epoch')
plt.legend()
plt.title('accuracy')

In [None]:
# Training loss per epoch, taken from the history of the most recent fit.
plt.plot(h.history['loss'], label='loss')
plt.xlabel('epoch')
plt.legend()
plt.title('loss')

In [None]:
# Classify every player row and print the predictions next to the true labels.
point = np.array(dataPlayers)

# Example of a single hand-written player in the same column order as dataPlayers:
# point = np.array([[200, 73, 10, 4.5, 20]])

# Sequential.predict_classes has been removed from Keras; for a softmax
# output the predicted class is the index of the largest probability.
prediction = np.argmax(model.predict(point), axis=-1)
print("Predicted Class:", prediction)
print("Actual Class:", pos_index)