In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer

# Seed passed to both the random forest and the KFold shuffle
RANDOM_STATE = 42
# Number of cross-validation folds
N_SPLITS = 5

# ---- paths ----
# One training CSV per position, keyed by position code
TRAIN_PATHS = {
    "QB": "../data/processed/nfl_to_nfl_qb_train.csv",
    "RB": "../data/processed/nfl_to_nfl_rb_train.csv",
    "WR": "../data/processed/nfl_to_nfl_wr_train.csv",
    "TE": "../data/processed/nfl_to_nfl_te_train.csv",
}

# Name of the regression target column read from each training CSV
TARGET_COL = "target_fp_ppr"

# ---- scorers ----
# greater_is_better=False makes sklearn negate the metric, so the CV results hold
# negative errors; they are flipped back to positive when summarized.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)  # neg MSE (sklearn convention)
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

def train_and_cv_position(pos: str, path: str):
    """Cross-validate and then fit a random-forest pipeline for one position.

    Args:
        pos: Position label; copied verbatim into the returned summary.
        path: Path of the training CSV for that position.

    Returns:
        tuple: ``(pipe, summary)``. ``pipe`` is the median-imputer + random-forest
        pipeline refit on every labelled row. ``summary`` is a dict holding the
        position, row and feature counts, and the mean / sample standard
        deviation (ddof=1) of MSE, MAE and R^2 across the CV folds.
    """
    # Rows without a target cannot be used for supervised training
    labelled = pd.read_csv(path).dropna(subset=[TARGET_COL]).copy()

    # Columns that must never reach the model: the target itself, target
    # bookkeeping, and the player identifier. "season" is excluded as well to
    # avoid a year-trend effect; remove it from this tuple to use it as a feature.
    non_features = (TARGET_COL, "target_games", "target_season", "season", "player_id")
    present = [col for col in non_features if col in labelled.columns]

    # Anything that does not parse as a number becomes NaN and is left to the imputer
    X = labelled.drop(columns=present).apply(pd.to_numeric, errors="coerce")
    y = labelled[TARGET_COL].astype(float)

    forest = RandomForestRegressor(
        n_estimators=600,
        min_samples_leaf=2,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )
    pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("rf", forest),
    ])

    # NOTE(review): rows are shuffled individually. If one player contributes
    # several rows (presumably one per season; confirm), the same player can land
    # in both the training and validation folds, which may flatter these scores.
    folds = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    scoring = {"neg_mse": mse_scorer, "neg_mae": mae_scorer, "r2": "r2"}
    scores = cross_validate(
        pipe,
        X,
        y,
        cv=folds,
        scoring=scoring,
        return_train_score=False,
        n_jobs=-1,
    )

    def _mean_and_std(fold_values):
        # Sample standard deviation (ddof=1) across folds
        return float(fold_values.mean()), float(fold_values.std(ddof=1))

    # Error scorers are negated by sklearn; flip them back to positive values
    mse_mean, mse_std = _mean_and_std(-scores["test_neg_mse"])
    mae_mean, mae_std = _mean_and_std(-scores["test_neg_mae"])
    r2_mean, r2_std = _mean_and_std(scores["test_r2"])

    summary = {
        "position": pos,
        "n_rows": len(labelled),
        "n_features": X.shape[1],
        "mse_mean": mse_mean,
        "mse_std": mse_std,
        "mae_mean": mae_mean,
        "mae_std": mae_std,
        "r2_mean": r2_mean,
        "r2_std": r2_std,
    }

    # cross_validate fits clones, so the pipeline itself is still unfitted here;
    # train it on all labelled rows to obtain the model used for prediction.
    pipe.fit(X, y)

    return pipe, summary


def main():
    """Train and cross-validate one model per position and print a summary table.

    Returns:
        tuple: ``(models, results_df)``. ``models`` maps each key of
        ``TRAIN_PATHS`` to its fitted pipeline; ``results_df`` has one row of
        cross-validation metrics per position, sorted by position.
    """
    models = {}
    summaries = []

    for pos, path in TRAIN_PATHS.items():
        pipe, summary = train_and_cv_position(pos, path)
        models[pos] = pipe
        summaries.append(summary)

    results_df = pd.DataFrame(summaries).sort_values("position")
    # Report the configured fold count so the header cannot drift from N_SPLITS
    print(f"\nCross-validated performance ({N_SPLITS}-fold):")
    print(results_df.to_string(index=False))

    # If you want to save models next step, we can joblib.dump() them.
    return models, results_df


# When this cell is executed the guard is true, so every position is trained here and
# `models` / `results_df` become notebook-level variables (the prediction cell reads `models`).
if __name__ == "__main__":
    models, results_df = main()


Cross-validated performance (5-fold):
position  n_rows  n_features  mse_mean  mse_std  mae_mean  mae_std  r2_mean   r2_std
      QB     807          29 28.005955 3.465657  4.156378 0.185713 0.436087 0.052304
      RB    1725          33 14.383724 1.883987  2.893085 0.165525 0.534169 0.048449
      TE    1303          33  6.851728 0.571218  1.995766 0.140507 0.575600 0.037227
      WR    2285          33 12.231286 0.840341  2.763663 0.093005 0.585513 0.041375


In [7]:
# One feature CSV per position to score, keyed by the same position codes as TRAIN_PATHS
PREDICT_PATHS = {
    "QB": "../data/processed/nfl_to_nfl_qb_predict_2026.csv",
    "RB": "../data/processed/nfl_to_nfl_rb_predict_2026.csv",
    "WR": "../data/processed/nfl_to_nfl_wr_predict_2026.csv",
    "TE": "../data/processed/nfl_to_nfl_te_predict_2026.csv",
}

def predict_position(pos: str, model, path: str) -> pd.DataFrame:
    """Score one position's prediction file and rank players by predicted points.

    Args:
        pos: Position label written to the ``position`` column of the result.
        model: Fitted estimator exposing ``predict``. When it also exposes
            ``feature_names_in_``, the input columns are selected and ordered to
            match it, so extra columns in the file are ignored.
        path: Path of the CSV holding the rows to score.

    Returns:
        DataFrame with whichever of ``player_id`` / ``season`` exist in the file,
        plus ``pred_fp_ppr_2026``, ``position`` and ``rank_pos`` (1 = highest
        prediction), sorted by prediction in descending order.

    Raises:
        ValueError: If the model records its training feature names and the
            file lacks one or more of them.
    """
    df = pd.read_csv(path)

    # Keep identifiers for output
    id_cols = [c for c in ["player_id", "season"] if c in df.columns]

    # Prefer the exact feature list the model was fitted on. Dropping only the id
    # columns can leave extra or reordered columns in X, which estimators that
    # validate feature names reject at predict time.
    trained_features = getattr(model, "feature_names_in_", None)
    if trained_features is not None:
        trained_features = list(trained_features)
        missing = [c for c in trained_features if c not in df.columns]
        if missing:
            raise ValueError(
                f"{pos}: prediction file {path!r} is missing feature columns "
                f"the model was trained on: {missing}"
            )
        X = df[trained_features].copy()
    else:
        # No recorded schema: fall back to "everything except identifiers"
        X = df.drop(columns=id_cols).copy()

    # Ensure numeric (any bad parse -> NaN; the model is expected to cope with NaN)
    X = X.apply(pd.to_numeric, errors="coerce")

    preds = model.predict(X)

    out = df[id_cols].copy() if id_cols else pd.DataFrame(index=df.index)
    out["pred_fp_ppr_2026"] = preds
    out["position"] = pos

    # Rank within position; a stable sort keeps tied predictions in file order so
    # ranks are reproducible between runs.
    out = out.sort_values(
        "pred_fp_ppr_2026", ascending=False, kind="mergesort"
    ).reset_index(drop=True)
    out["rank_pos"] = np.arange(1, len(out) + 1)

    return out

# Score every position with its trained model; each ranked table is written to
# its own CSV and also collected for the combined file.
all_preds = []
for pos, path in PREDICT_PATHS.items():
    pred_df = predict_position(pos, models[pos], path)

    # per-position output
    per_position_csv = f"../data/processed/nfl_to_nfl_{pos.lower()}_predictions_2026.csv"
    pred_df.to_csv(per_position_csv, index=False)

    all_preds.append(pred_df)

# Stack the per-position tables into one combined file
all_preds_df = pd.concat(all_preds, ignore_index=True)
all_preds_df.to_csv("../data/processed/nfl_to_nfl_all_predictions_2026.csv", index=False)

# Preview: the first ten rows (best ranks) of each position
top_by_position = all_preds_df.groupby("position").head(10)
print(top_by_position.to_string(index=False))

 player_id  season  pred_fp_ppr_2026 position  rank_pos
00-0034857    2025         20.197942       QB         1
00-0039851    2025         19.809860       QB         2
00-0033873    2025         18.635076       QB         3
00-0026498    2025         17.927536       QB         4
00-0036355    2025         17.772165       QB         5
00-0036971    2025         17.580482       QB         6
00-0036389    2025         17.300487       QB         7
00-0039732    2025         16.841999       QB         8
00-0037834    2025         16.690418       QB         9
00-0033077    2025         16.567544       QB        10
00-0033280    2025         22.652920       RB         1
00-0038542    2025         18.974998       RB         2
00-0039139    2025         17.677962       RB         3
00-0039040    2025         17.557218       RB         4
00-0036223    2025         15.535954       RB         5
00-0040715    2025         14.809412       RB         6
00-0038597    2025         14.353827       RB   

In [9]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids the mixed-type warning this read
# otherwise emits and gives every column one consistent dtype.
nfl_stats = pd.read_csv('../data/raw/nfl_player_stats_2011_2025.csv', low_memory=False)

# 1) Build a player_id -> name lookup from the raw stats file.
# Replace 'player_display_name' with whatever name column the file has.
NAME_COL = "player_display_name"  # <-- change if needed

lookup = (
    nfl_stats[["player_id", NAME_COL]]
    .dropna()
    # one row per player: the first non-null name found in the file wins
    .drop_duplicates(subset=["player_id"])
    .rename(columns={NAME_COL: "player_name"})
)

# 2) Add names to combined predictions (or do this per-position)
all_preds_df = pd.read_csv("../data/processed/nfl_to_nfl_all_predictions_2026.csv")

# Left join keeps every prediction row; players absent from the lookup get a
# NaN name. validate= asserts the lookup has unique player_ids, so the merge
# can never duplicate prediction rows.
all_preds_named = all_preds_df.merge(
    lookup, on="player_id", how="left", validate="many_to_one"
)

# (optional) reorder columns: preferred columns first, everything else after
cols = ["position", "rank_pos", "player_id", "player_name", "season", "pred_fp_ppr_2026"]
cols = [c for c in cols if c in all_preds_named.columns] + \
       [c for c in all_preds_named.columns if c not in cols]
all_preds_named = all_preds_named[cols]

all_preds_named.to_csv("../data/processed/nfl_to_nfl_all_predictions_2026_named.csv", index=False)

# quick peek
print(all_preds_named.groupby("position").head(10).to_string(index=False))

  nfl_stats = pd.read_csv('../data/raw/nfl_player_stats_2011_2025.csv')


position  rank_pos  player_id         player_name  season  pred_fp_ppr_2026
      QB         1 00-0034857          Josh Allen    2025         20.197942
      QB         2 00-0039851          Drake Maye    2025         19.809860
      QB         3 00-0033873     Patrick Mahomes    2025         18.635076
      QB         4 00-0026498    Matthew Stafford    2025         17.927536
      QB         5 00-0036355      Justin Herbert    2025         17.772165
      QB         6 00-0036971     Trevor Lawrence    2025         17.580482
      QB         7 00-0036389         Jalen Hurts    2025         17.300487
      QB         8 00-0039732              Bo Nix    2025         16.841999
      QB         9 00-0037834         Brock Purdy    2025         16.690418
      QB        10 00-0033077        Dak Prescott    2025         16.567544
      RB         1 00-0033280 Christian McCaffrey    2025         22.652920
      RB         2 00-0038542      Bijan Robinson    2025         18.974998
      RB    