In [109]:
import pandas as pd

# Read the CSV file
df = pd.read_csv("data/imputed_data.csv")
print(df.columns)

Index(['Name', 'Position', 'College', 'Round', 'Pick', 'Stat URL', 'Height',
       'Weight', '40 Yard Dash', 'Bench Press', 'Vertical Jump', 'Broad Jump',
       '3 Cone Drill', 'Shuttle', 'conf_abbr', 'games', 'seasons',
       'tackles_solo', 'tackles_assists', 'tackles_total', 'tackles_loss',
       'sacks', 'def_int', 'def_int_yds', 'def_int_td', 'pass_defended',
       'fumbles_rec', 'fumbles_rec_yds', 'fumbles_rec_td', 'fumbles_forced',
       'rec', 'rec_yds', 'rec_yds_per_rec', 'rec_td', 'rush_att', 'rush_yds',
       'rush_yds_per_att', 'rush_td', 'scrim_att', 'scrim_yds',
       'scrim_yds_per_att', 'scrim_td', 'Year'],
      dtype='object')


In [110]:
df.loc[df.Round != 1, "Round"] = 0

# Dropping the columns which donot contribute in prediction
all_X = df.drop(["Name", "Round", "Pick", "College"], axis=1)
all_X = pd.get_dummies(all_X)

# Splitting testing and training sets
train_X = all_X[(all_X.Year != 2023)].drop(["Year"], axis=1)
test_X = all_X[all_X.Year == 2023].drop(["Year"], axis=1)
train_y = df[(df.Year != 2023)].Round
test_y = df[df.Year == 2023].Round

# Training the model using Random Forest
clf = RandomForestClassifier(n_estimators=1000)
clf.fit(train_X, train_y)


In [111]:
# Predicting the probabilities of Test set
preds = clf.predict_proba(test_X)
count = 1

# Ranking done according to the probability scores
for i in pd.DataFrame(preds).sort_values(by=1, ascending=False).index:
    print(str(count) + " " + str(df[df.Year==2023].reset_index().at[i, "Name"]))
    count += 1

1 Bryce Young
2 C.J. Stroud
3 Anthony Richardson
4 Jakorian Bennett
5 DJ Turner
6 Christian Gonzalez
7 Byron Young
8 Will Anderson Jr.
9 Jaren Hall
10 Darnell Wright
11 Emmanuel Forbes
12 Hendon Hooker
13 Marvin Mims
14 Blake Freeland
15 Deonte Banks
16 Dante Stills
17 Tyler Steen
18 Carrington Valentine
19 Dawand Jones
20 Nolan Smith
21 Cam Smith
22 Kelee Ringo
23 Trenton Simpson
24 Adetomiwa Adebawore
25 Isaiah Foskey
26 Ryan Hayes
27 Julius Brents
28 Carter Warren
29 Jalin Hyatt
30 Tre'Vius Hodges-Tomlinson
31 YaYa Diaby
32 Yasir Abdullah
33 Quentin Johnston
34 Lukas Van Ness
35 Joe Tippmann
36 Richard Gouraige
37 Owen Pappoe
38 Jalen Redmond
39 Rejzohn Wright
40 Nick Hampton
41 Nathaniel Dell
42 Thomas Incoom
43 A.T. Perry
44 Josh Downs
45 Tanner McKee
46 Malik Cunningham
47 Jartavius Martin
48 Malaesala Aumavae-Laulu
49 Riley Moss
50 Aidan O'Connell
51 Isaiah McGuire
52 Earl Bostick Jr.
53 Tavius Robinson
54 Rashee Rice
55 Anton Harrison
56 Ali Gaye
57 Paris Johnson Jr.
58 Zay Flo

In [113]:
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np

# Convert predicted probabilities to binary predictions based on a threshold (e.g., 0.5)
predicted_labels = (preds[:, 1] > 0.5).astype(int)

# Calculate evaluation metrics - Accuracy and ROC_Score
accuracy = accuracy_score(test_y, predicted_labels)
roc_auc = roc_auc_score(test_y, preds[:, 1])



In [114]:
# Evaluation for ranking metrics
# Sort the predictions based on probability scores
sorted_indices = np.argsort(-preds[:, 1])

# Calculate Mean Reciprocal Rank (MRR)
mrr = 0
for idx, i in enumerate(sorted_indices):
    if test_y.iloc[i] == 1:  # Use iloc to access test_y by index
        mrr = 1 / (idx + 1)
        break

# Calculate Mean Average Precision (MAP)
num_relevant = sum(test_y)
ap = 0
for idx, i in enumerate(sorted_indices):
    if test_y.iloc[i] == 1:
        ap += sum(test_y.iloc[:idx + 1]) / (idx + 1)
map_score = ap / num_relevant

# Calculate Normalized Discounted Cumulative Gain (NDCG) at k=10
k = 10
dcg = 0
idcg = sum(1 / np.log2(np.arange(2, k + 2)))
for idx, i in enumerate(sorted_indices[:k]):
    if test_y.iloc[i] == 1:
        dcg += 1 / np.log2(idx + 2)
ndcg = dcg / idcg

# Calculate Precision at k (P@k) and Recall at k (R@k) at k=10
tp_at_k = sum(test_y.iloc[sorted_indices[:k]])
precision_at_k = tp_at_k / k
recall_at_k = tp_at_k / num_relevant


print("Accuracy:", accuracy)
print("ROC AUC Score:", roc_auc)
print("Mean Reciprocal Rank (MRR):", mrr)
print("Mean Average Precision (MAP):", map_score)
print("Normalized Discounted Cumulative Gain (NDCG) at k=10:", ndcg)
print("Precision at k (P@k) at k=10:", precision_at_k)
print("Recall at k (R@k) at k=10:", recall_at_k)

Accuracy: 0.897887323943662
ROC AUC Score: 0.7208248816768086
Mean Reciprocal Rank (MRR): 1.0
Mean Average Precision (MAP): 0.10441733328653606
Normalized Discounted Cumulative Gain (NDCG) at k=10: 0.6804503724350792
Precision at k (P@k) at k=10: 0.6
Recall at k (R@k) at k=10: 0.20689655172413793
