In [111]:
import pandas as pd
import numpy as np

from pathlib import Path

from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer

def rmse_score(y_true, y_pred):
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

rmse_scorer = make_scorer(rmse_score, greater_is_better=False)

from sklearn.feature_selection import SelectKBest, f_regression

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [112]:
#data
path = "training/players.csv"
data = pd.read_csv(path)

target_col = "Fantasy_PPR_NextYear"
dropped_cols = {target_col, "Unnamed: 0", "Name", "Team", "Position", "Team_NextYear"}
feature_cols = [i for i in data.columns if i not in dropped_cols]

positions = ["QB", "RB", "WR", "TE"]
position_dataframes = {}

for position in positions:
    position_data = data[data["Position"] == position].copy()
    X = position_data[feature_cols]
    y = position_data[target_col]
    position_dataframes[position] = {"X": X, "y": y}

#QB_data
#RB_data
#WR_data
#TE_data

In [113]:
#train/test/validation split
data_splits = {}

for position, df in position_dataframes.items():
    X_train, X_temp, y_train, y_temp = train_test_split(df["X"], df["y"], test_size = 0.30, random_state = 42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 2/3, random_state = 42)

    data_splits[position] = {"X_train": X_train, "y_train": y_train, "X_test": X_test, "y_test": y_test, "X_val": X_val, "y_val": y_val}

In [114]:
#Models

model_pipeline = {
    "Random Forest": (
        RandomForestRegressor(random_state = 42, n_jobs = 1),
        {
            "model__n_estimators": [200, 300],
            "model__max_depth": [10, 20],
            "model__min_samples_split": [2, 5],
        }, 
    ), 

    "Lasso": (
       Lasso(max_iter = 50000, random_state = 42),
        {
            "model__alpha": [.01, .1, 1, 10]
        }, 
    ), 

    "Neural Network": (
        MLPRegressor(max_iter = 4000, early_stopping = True, random_state = 42),
        {
            "model__hidden_layer_sizes": [(64, ), (128, ), (64, 32)],
            "model__alpha": [.0001, .001],
            "model__learning_rate_init": [.001, .01],
        }, 
    ), 
    "Support Vector Regression": (
        SVR(),
        {
            "model__C": [.1, 1, 10],
            "model__gamma": ["scale", "auto"],
            "model__kernel": ["rbf", "linear"],
        }, 
    ), 
    "XGBoost": (
        XGBRegressor(random_state = 42, n_jobs = -1, tree_method = "hist", n_estimators = 300, objective = "reg:squarederror"),
        {
            "model__n_estimators": [300, 500],
            "model__max_depth": [3, 5, 7],
            "model__learning_rate": [.01, 1],
            "model__subsample": [.8, 1],
            "model__colsample_bytree": [.8 ,1]
        }, 
    ), 
}


In [115]:
#Backward feature elimination

def backward_feature_elimination(model, X_train, y_train, X_test, y_test, min_features = 5):
    included_features = list(X_train.columns)
    best_rmse = float("inf")
    best_features = included_features.copy()

    while len(included_features) > min_features:
        model.fit(X_train[included_features], y_train)
        predictions = model.predict(X_test[included_features])
        current_rmse = rmse_score(y_test, predictions)

        predictor = model.named_steps["model"]

        if hasattr(predictor, "feature_importances_"):
            importances = np.asarray(predictor.feature_importances_)
        elif hasattr(predictor, "coef_"):
            importances = np.abs(np.ravel(predictor.coef_))
        else:
            return best_features, current_rmse
        
        least_important_index = int(np.argmin(importances))
        least_features = included_features[least_important_index]
        included_features.remove(least_features)

        if current_rmse < best_rmse:
            best_rmse = current_rmse
            best_features = included_features.copy()
        else:
            break
        
    return best_features, best_rmse

In [116]:
#QB Tuning

X_train = data_splits["QB"]["X_train"]
X_test = data_splits["QB"]["X_test"]
y_train = data_splits["QB"]["y_train"]
y_test = data_splits["QB"]["y_test"]

QB_results = []

for name, (base_model, param_grid) in model_pipeline.items():
    pipe = Pipeline([
        ("imputer", SimpleImputer(strategy = "median")),
        ("scaler", StandardScaler()),
        ("model", base_model)
        ])
    
    grid = GridSearchCV(pipe, param_grid = param_grid, scoring = rmse_scorer, cv = 3, n_jobs = -1)
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    best_parameters = grid.best_params_

    best_features, test_rmse = backward_feature_elimination(best_model, X_train, y_train, X_test, y_test)

    QB_results.append({
        "Model": name, 
        "Best Parameters": best_parameters,
        "Selected Features": len(best_features),
        "Test_rmse": test_rmse})
    
results_df = pd.DataFrame(QB_results).sort_values("Test_rmse")

  arr = np.array(param_list)


In [117]:
#QB Prediction

data_2024_path = "clean_data/data_2024.csv"
data_2024 = pd.read_csv(data_2024_path)
QB_2024 = data_2024[data_2024["Position"] == "QB"].copy()

train_cols = feature_cols
missing_2024_cols = [c for c in train_cols if c not in QB_2024.columns]
for c in missing_2024_cols:
    QB_2024[c] = np.nan

X_2024 = QB_2024[feature_cols]

best_QB = results_df.iloc[results_df["Test_rmse"].idxmin()]
best_QB_model = best_QB["Model"]
best_QB_parameters = best_QB["Best Parameters"]

base_model, _ = model_pipeline[best_QB_model]

best_QB_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy = "median")),
        ("scaler", StandardScaler()),
        ("model", base_model.set_params(**{k.split("__")[1]: v for k, v in best_QB_parameters.items()}))
        ])

X_train = data_splits["QB"]["X_train"]
y_train = data_splits["QB"]["y_train"]

best_QB_pipeline.fit(X_train, y_train)
QB_2024["Fantasy_PPR_NextYear"] = best_QB_pipeline.predict(X_2024)

QB_2024



Unnamed: 0,Name,Team,Position,Age,Games,Passing_Cmp,Passing_Att,Passing_Yds,Passing_TD,Passing_Int,...,Fumbles_Lost,2PM,2PP,Fantasy_PPR,Fantasy_PPR_NextYear,Team_NextYear,Year,Next_Season,ASA_zscore,Preseason_Rank
3,Lamar Jackson,BAL,QB,27,17,316.0,474.0,4172.0,41.0,4.0,...,5.0,1.0,0.0,430.4,274.210157,,,,,
7,Josh Allen,BUF,QB,28,17,307.0,483.0,3731.0,28.0,6.0,...,2.0,0.0,1.0,379.0,274.994508,,,,,
8,Joe Burrow,CIN,QB,28,17,460.0,652.0,4918.0,43.0,9.0,...,5.0,0.0,0.0,372.8,242.745416,,,,,
12,Baker Mayfield,TAM,QB,29,17,407.0,570.0,4500.0,41.0,16.0,...,2.0,0.0,1.0,365.8,241.942916,,,,,
15,Jayden Daniels,WAS,QB,24,17,331.0,480.0,3568.0,25.0,9.0,...,0.0,1.0,2.0,355.8,240.326370,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541,Jake Browning,CIN,QB,28,3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.2,14.888299,,,,,
548,Jeff Driskel,WAS,QB,31,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,16.258093,,,,,
602,Kyle Trask,TAM,QB,26,4,1.0,1.0,5.0,0.0,0.0,...,0.0,0.0,0.0,-0.2,15.351262,,,,,
623,Sam Howell,SEA,QB,24,2,5.0,14.0,24.0,0.0,1.0,...,0.0,0.0,0.0,-0.8,18.386837,,,,,
