In [116]:
import pandas as pd

# Read the CSV file
df = pd.read_csv("data/imputed_data.csv")
print(df.columns)

Index(['Name', 'Position', 'College', 'Round', 'Pick', 'Stat URL', 'Height',
       'Weight', '40 Yard Dash', 'Bench Press', 'Vertical Jump', 'Broad Jump',
       '3 Cone Drill', 'Shuttle', 'conf_abbr', 'games', 'seasons',
       'tackles_solo', 'tackles_assists', 'tackles_total', 'tackles_loss',
       'sacks', 'def_int', 'def_int_yds', 'def_int_td', 'pass_defended',
       'fumbles_rec', 'fumbles_rec_yds', 'fumbles_rec_td', 'fumbles_forced',
       'rec', 'rec_yds', 'rec_yds_per_rec', 'rec_td', 'rush_att', 'rush_yds',
       'rush_yds_per_att', 'rush_td', 'scrim_att', 'scrim_yds',
       'scrim_yds_per_att', 'scrim_td', 'Year'],
      dtype='object')


In [117]:
df.loc[df.Round != 1, "Round"] = 0

# Dropping the columns which donot contribute in prediction
all_X = df.drop(["Name", "Round", "Pick", "College"], axis=1)
all_X = pd.get_dummies(all_X)

# Splitting testing and training sets
train_X = all_X[(all_X.Year != 2023)].drop(["Year"], axis=1)
test_X = all_X[all_X.Year == 2023].drop(["Year"], axis=1)
train_y = df[(df.Year != 2023)].Round
test_y = df[df.Year == 2023].Round



In [152]:

# Training the model using Random Forest

param = {
    'n_estimators': [100, 500, 1000]
}

# Initialize the Random Forest classifier
rf = RandomForestClassifier()

# Hypertuning parameters using 5-Fold Cross Validation method
clf = GridSearchCV(estimator=rf, param_grid=param, cv=5, scoring='accuracy')
clf.fit(train_X, train_y)

In [153]:
# Get the best parameters
best_params = clf.best_params_
print("Best Parameters:", best_params)

# Use the best estimator to make predictions
best_rf = clf.best_estimator_

Best Parameters: {'n_estimators': 100}


In [154]:
# Predicting the probabilities of Test set
preds = best_rf.predict_proba(test_X)
count = 1

# Ranking done according to the probability scores
for i in pd.DataFrame(preds).sort_values(by=1, ascending=False).index:
    print(str(count) + " " + str(df[df.Year==2023].reset_index().at[i, "Name"]))
    count += 1

1 C.J. Stroud
2 Anthony Richardson
3 Bryce Young
4 Jakorian Bennett
5 Dante Stills
6 Christian Gonzalez
7 Blake Freeland
8 Deonte Banks
9 Will Anderson Jr.
10 DJ Turner
11 Marvin Mims
12 Isaiah McGuire
13 Emmanuel Forbes
14 Hendon Hooker
15 Nolan Smith
16 Jaren Hall
17 Darnell Wright
18 Riley Moss
19 Tyler Steen
20 Nick Hampton
21 Carter Warren
22 Ryan Hayes
23 Isaiah Foskey
24 Owen Pappoe
25 Dawand Jones
26 Josh Downs
27 Julius Brents
28 Lukas Van Ness
29 Byron Young
30 Quentin Johnston
31 Zacch Pickens
32 Carrington Valentine
33 Paris Johnson Jr.
34 Matthew Bergeron
35 Tyree Wilson
36 Tre'Vius Hodges-Tomlinson
37 Jalin Hyatt
38 Anthony Johnson Jr.
39 Andre Carter II
40 Jartavius Martin
41 Warren McClendon
42 Trenton Simpson
43 Yasir Abdullah
44 Kelee Ringo
45 Anton Harrison
46 Ali Gaye
47 Charlie Thomas
48 Jay Ward
49 Darrell Luter Jr.
50 Mazi Smith
51 Derick Hall
52 Richard Gouraige
53 Darius Rush
54 Anfernee Orji
55 Rakim Jarrett
56 Malik Cunningham
57 Bijan Robinson
58 A.T. Perry


In [155]:
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np

# Convert predicted probabilities to binary predictions based on a threshold (e.g., 0.5)
predicted_labels = (preds[:, 1] > 0.5).astype(int)

# Calculate evaluation metrics - Accuracy and ROC_Score
accuracy = accuracy_score(test_y, predicted_labels)
roc_auc = roc_auc_score(test_y, preds[:, 1])



In [156]:
# Evaluation for ranking metrics
# Sort the predictions based on probability scores
sorted_indices = np.argsort(-preds[:, 1])

# Calculate Mean Reciprocal Rank (MRR)
mrr = 0
for idx, i in enumerate(sorted_indices):
    if test_y.iloc[i] == 1:  # Use iloc to access test_y by index
        mrr = 1 / (idx + 1)
        break

# Calculate Mean Average Precision (MAP)
num_relevant = sum(test_y)
ap = 0
for idx, i in enumerate(sorted_indices):
    if test_y.iloc[i] == 1:
        ap += sum(test_y.iloc[:idx + 1]) / (idx + 1)
map_score = ap / num_relevant

# Calculate Normalized Discounted Cumulative Gain (NDCG) at k=10
k = 10
dcg = 0
idcg = sum(1 / np.log2(np.arange(2, k + 2)))
for idx, i in enumerate(sorted_indices[:k]):
    if test_y.iloc[i] == 1:
        dcg += 1 / np.log2(idx + 2)
ndcg = dcg / idcg

# Calculate Precision at k (P@k) and Recall at k (R@k) at k=10
tp_at_k = sum(test_y.iloc[sorted_indices[:k]])
precision_at_k = tp_at_k / k
recall_at_k = tp_at_k / num_relevant


print("Accuracy:", accuracy)
print("ROC AUC Score:", roc_auc)
print("Mean Reciprocal Rank (MRR):", mrr)
print("Mean Average Precision (MAP):", map_score)
print("Normalized Discounted Cumulative Gain (NDCG) at k=10:", ndcg)
print("Precision at k (P@k) at k=10:", precision_at_k)
print("Recall at k (R@k) at k=10:", recall_at_k)

Accuracy: 0.897887323943662
ROC AUC Score: 0.7393509127789046
Mean Reciprocal Rank (MRR): 1.0
Mean Average Precision (MAP): 0.10529849177177532
Normalized Discounted Cumulative Gain (NDCG) at k=10: 0.6168295842361274
Precision at k (P@k) at k=10: 0.5
Recall at k (R@k) at k=10: 0.1724137931034483
