In [30]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_validate

In [28]:
# Read the processed college-to-NFL quarterback modeling table into a DataFrame.
qb_data = pd.read_csv(
    filepath_or_buffer='../data/processed/cfb_to_nfl_qb_modeling.csv',
)



In [29]:

# passing_qbr is left out of the model because it can be approximated from the other stats.
# A missing rushing average / longest rush is treated as "the QB did not rush at all":
# only a few rows are affected, so both columns are filled with 0.
rushing_fill_cols = ["rushing_avg", "rushing_long"]
qb_data[rushing_fill_cols] = qb_data[rushing_fill_cols].fillna(0)

# Share of missing values per column, largest first, shown as a percentage (cell output).
missing_fraction = qb_data.isna().mean().sort_values(ascending=False)
missing_fraction * 100

pre_draft_ranking             27.710843
pre_draft_position_ranking    27.108434
pre_draft_grade               27.108434
height                        25.301205
weight                        25.301205
overall                       24.698795
pick                          24.698795
round                         24.698795
passing_qbr                   12.650602
player_key                     0.000000
position                       0.000000
season                         0.000000
athlete_name                   0.000000
games                          0.000000
fumbles_rec                    0.000000
fumbles_fum                    0.000000
fumbles_lost                   0.000000
interceptions_yds              0.000000
interceptions_int              0.000000
rushing_long                   0.000000
interceptions_td               0.000000
rushing_td                     0.000000
rushing_yds                    0.000000
passing_int                    0.000000
rushing_car                    0.000000


In [32]:
# Column groups: draft outcome / scouting fields and body measurements.
draft_cols = ["overall","round","pick","pre_draft_ranking","pre_draft_position_ranking","pre_draft_grade"]
meas_cols  = ["height","weight"]

# Flag rows with no recorded draft round. This must run before the NaNs are filled
# below; re-running this cell on the already-filled frame would reset the flag to 0.
# Note: this indicator is not listed in `x_cols`.
qb_data["undrafted"] = qb_data["round"].isna().astype(int)

# Draft slot fields: 0 stands in for "no value recorded", then cast to integer.
for slot_col in ("round", "pick", "overall"):
    qb_data[slot_col] = qb_data[slot_col].fillna(0).astype(int)

# Rankings: a missing rank becomes (worst observed rank + 1), i.e. worse than every
# ranked player; if the column has no observed value at all the fill is 1.
for rank_col in ("pre_draft_ranking", "pre_draft_position_ranking"):
    if rank_col not in qb_data.columns:
        continue
    worst_rank = qb_data[rank_col].max(skipna=True)
    fill_rank = (0 if pd.isna(worst_rank) else worst_rank) + 1
    qb_data[rank_col] = qb_data[rank_col].fillna(fill_rank)

# Grade: keep a 0/1 missingness indicator, then fill the grade itself with 0.
# The indicator has the same re-run caveat as `undrafted` and is not in `x_cols` either.
if "pre_draft_grade" in qb_data.columns:
    grade_is_missing = qb_data["pre_draft_grade"].isna()
    qb_data["pre_draft_grade_missing"] = grade_is_missing.astype(int)
    qb_data["pre_draft_grade"] = qb_data["pre_draft_grade"].fillna(0)

# Measurements: fill gaps with the column median of the observed values.
for meas_col in (c for c in meas_cols if c in qb_data.columns):
    observed_median = qb_data[meas_col].median()
    qb_data[meas_col] = qb_data[meas_col].fillna(observed_median)


In [33]:
# College box-score statistics (order matters: it defines the model's feature order).
college_stat_cols = [
    "games",
    "fumbles_rec", "fumbles_lost", "fumbles_fum",
    "interceptions_td", "interceptions_yds", "interceptions_int",
    "rushing_long", "rushing_td", "rushing_yds", "rushing_car",
    "passing_int", "passing_td", "passing_yds",
    "passing_completions", "passing_attempts", "passing_avg",
    "rushing_avg",
]

# Draft outcome and pre-draft scouting fields.
draft_feature_cols = [
    "overall", "round", "pick",
    "pre_draft_ranking", "pre_draft_position_ranking", "pre_draft_grade",
]

# Body measurements.
body_measurement_cols = ["height", "weight"]

# Full ordered feature list used for both training and prediction.
x_cols = college_stat_cols + draft_feature_cols + body_measurement_cols

# Design matrix and regression target.
features = qb_data.loc[:, x_cols]
target = qb_data["avg_weekly_ppr"]

In [34]:


# Fixed seed so the cross-validation scores and the model fitted later from this same
# estimator are reproducible from one run to the next.
RANDOM_STATE = 42

# 200 boosting stages; every other hyper-parameter is left at the scikit-learn default.
gradient_boosting = GradientBoostingRegressor(
    n_estimators=200,
    random_state=RANDOM_STATE,
)

# Cross-validate with the default splitting strategy (no `cv` argument is passed).
# Scores are *negative* mean absolute error, so they must be negated when reported.
cv_results_gbdt = cross_validate(
    gradient_boosting,
    features,
    target,
    scoring="neg_mean_absolute_error",
    n_jobs=2,
)

In [35]:
# Summarise the cross-validation run. `test_score` holds negative MAE values, so the
# mean is negated to report a positive error; the spread is the std across folds.
# The unit is that of the target column (`avg_weekly_ppr`), not a dollar amount.
print("Gradient Boosting Decision Tree")
print(
    "Mean absolute error via cross-validation: "
    f"{-cv_results_gbdt['test_score'].mean():.3f} ± "
    f"{cv_results_gbdt['test_score'].std():.3f} avg weekly PPR points"
)
print(f"Average fit time: {cv_results_gbdt['fit_time'].mean():.3f} seconds")
print(
    f"Average score time: {cv_results_gbdt['score_time'].mean():.3f} seconds"
)

Gradient Boosting Decision Tree
Mean absolute error via cross-validation: 4.921 ± 0.344 k$
Average fit time: 0.127 seconds
Average score time: 0.002 seconds


In [43]:
# Refit the estimator on every training row. `fit` returns the estimator itself,
# so `model` and `gradient_boosting` refer to the same fitted object.
model = gradient_boosting.fit(X=features, y=target)

In [46]:
# Try the simple model on new data: load the prediction inputs and preview
# the first 20 rows.
pred_df = pd.read_csv(
    filepath_or_buffer='../data/processed/cfb_to_nfl_qb_prediction_inputs.csv',
)
pred_df.head(n=20)

Unnamed: 0,player_key,athlete_name,season,position,games,overall,round,pick,pre_draft_ranking,pre_draft_position_ranking,...,rushing_car,passing_int,passing_td,passing_yds,passing_completions,passing_attempts,passing_avg,rushing_avg,passing_qbr,athlete_id
0,a j duffy,A.J. Duffy,2025,QB,10,,,,,,...,44,10,22,2347,197,334,7.026946,4.954545,,4685334
1,aaron philo,Aaron Philo,2025,QB,2,,,,,,...,4,1,1,373,21,28,13.321429,4.75,75.9,5132812
2,abram johnston,Abram Johnston,2025,QB,11,,,,,,...,119,9,7,1121,116,213,5.262911,2.512605,,5226585
3,achilles ringo,Achilles Ringo,2025,QB,1,,,,,,...,0,0,0,0,0,1,0.0,,,5300386
4,adam damante,Adam Damante,2025,QB,4,,,,,,...,16,4,5,711,70,104,6.836538,-0.9375,32.95,5152503
5,adam urena,Adam Urena,2025,QB,11,,,,,,...,44,10,13,2401,237,369,6.506775,-2.386364,,5227565
6,adrian mejia,Adrian Mejia,2025,QB,4,,,,,,...,14,1,1,105,7,12,8.75,4.214286,,5088339
7,aidan armenta,Aidan Armenta,2025,QB,11,,,,,,...,60,8,12,1619,147,252,6.424603,1.0,31.927273,5122054
8,aidan bouman,Aidan Bouman,2025,QB,12,,,,,,...,23,5,21,2297,176,283,8.116608,-4.304348,,4429179
9,aidan chiles,Aidan Chiles,2025,QB,8,,,,,,...,81,3,10,1392,128,203,6.857143,2.802469,61.014286,5075805


In [47]:
# Columns the model was trained on, in training order.
train_cols = features.columns

# Align the prediction frame to the training columns: extra columns are dropped and
# any training column absent from pred_df is created as NaN. NaN (rather than a
# literal 0) is deliberate: the imputation step that follows can then treat an absent
# column like any other missing value (training medians for draft/measurement
# columns, 0 for the rest) instead of feeding e.g. height = 0 to the model.
X_pred = pred_df.reindex(columns=train_cols)


In [48]:
# Draft, scouting and measurement columns whose gaps are filled from the training data.
median_fill_candidates = (
    "overall", "round", "pick",
    "pre_draft_ranking", "pre_draft_position_ranking", "pre_draft_grade",
    "height", "weight",
)

# Keep only the candidates the model was actually trained on.
draft_cols = [col for col in median_fill_candidates if col in train_cols]

# Per-column medians of the TRAINING features.
# NOTE(review): `features` was taken after the training-time fills (0 / max+1 / median),
# so these medians include those placeholder values — confirm this is intended.
draft_medians = features[draft_cols].median()

# Missing draft/measurement values take the training median; any other remaining
# NaN is set to 0.
X_pred = X_pred.fillna(draft_medians).fillna(0)



In [51]:
# Score every row of the prediction frame and store the result on pred_df.
prediction_col = "predicted_rookie_fantasy_ppg"
pred_df[prediction_col] = model.predict(X_pred)

# Rank prospects from highest to lowest predicted value.
pred_df_sorted = pred_df.sort_values(by=prediction_col, ascending=False)

# Show the top 20 with identifying columns only (cell output).
leaderboard_cols = ["player_key", "athlete_name", prediction_col]
pred_df_sorted[leaderboard_cols].head(20)


Unnamed: 0,player_key,athlete_name,predicted_rookie_fantasy_ppg
261,demond williams jr,Demond Williams Jr.,11.494725
605,marcel reed,Marcel Reed,11.431156
340,garrison davis,Garrison Davis,11.148511
514,julian sayin,Julian Sayin,11.062688
4,adam damante,Adam Damante,10.801172
170,carson beck,Carson Beck,10.536644
774,thomas gotkowski,Thomas Gotkowski,10.281239
534,keali i ah yat,Keali'i Ah Yat,10.112901
606,marco lainez,Marco Lainez,10.098262
578,logan kushner,Logan Kushner,9.950498


In [53]:
# Write the ranked predictions to disk without the DataFrame index.
pred_df_sorted.to_csv(
    path_or_buf="../data/processed/cfb_to_nfl_qb_predictions_2025.csv",
    index=False,
)