In [203]:
import pandas as pd
import numpy as np

from pathlib import Path

from sklearn.model_selection import GridSearchCV, train_test_split, KFold, cross_val_score

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer

def rmse_score(y_true, y_pred):
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

rmse_scorer = make_scorer(rmse_score, greater_is_better=False)

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor


In [204]:
#data
path = "training/players.csv"
fantasy_data = pd.read_csv(path)

fantasy_data_2024 = fantasy_data[fantasy_data["Next_Season"] == 2024]
fantasy_data = fantasy_data[fantasy_data["Next_Season"] != 2024]

target_col = "Fantasy_PPR_NextYear"
dropped_cols = {target_col, "Unnamed: 0", "Name", "Team", "Position", "Team_NextYear"}

QB_data = fantasy_data[(fantasy_data["Position"] == "QB") & (fantasy_data["Games"] > 5) & (fantasy_data[target_col] > 100.0)].drop(columns = dropped_cols)
RB_data = fantasy_data[fantasy_data["Position"] == "RB"].drop(columns = dropped_cols)
WR_data = fantasy_data[fantasy_data["Position"] == "WR"].drop(columns = dropped_cols)
TE_data = fantasy_data[fantasy_data["Position"] == "TE"].drop(columns = dropped_cols)

QB_y = fantasy_data.loc[QB_data.index, target_col]
RB_y = fantasy_data.loc[RB_data.index, target_col]
WR_y = fantasy_data.loc[WR_data.index, target_col]
TE_y = fantasy_data.loc[TE_data.index, target_col]

QB_data_2024 = fantasy_data_2024[fantasy_data_2024["Position"] == "QB"].drop(columns = dropped_cols)
RB_data_2024 = fantasy_data_2024[fantasy_data_2024["Position"] == "RB"].drop(columns = dropped_cols)
WR_data_2024 = fantasy_data_2024[fantasy_data_2024["Position"] == "WR"].drop(columns = dropped_cols)
TE_data_2024 = fantasy_data_2024[fantasy_data_2024["Position"] == "TE"].drop(columns = dropped_cols)

In [205]:
#train/test/validation split

X_train_QB, X_test_QB, y_train_QB, y_test_QB = train_test_split(QB_data, QB_y, test_size = 0.20, random_state = 4)

X_train_RB, X_test_RB, y_train_RB, y_test_RB = train_test_split(RB_data, RB_y, test_size = 0.20, random_state = 4)

X_train_WR, X_test_WR, y_train_WR, y_test_WR = train_test_split(WR_data, WR_y, test_size = 0.20, random_state = 4)

X_train_TE, X_test_TE, y_train_TE, y_test_TE = train_test_split(TE_data, TE_y, test_size = 0.20, random_state = 4)

In [198]:
#models

model_pipeline = {
    "Random Forest": (
        RandomForestRegressor(random_state = 42, n_jobs = 1),
        {
            "model__n_estimators": [300, 400],
            "model__max_depth": [10, 20],
            "model__min_samples_split": [2, 5],
        }, 
    ), 

    "Lasso": (
       Lasso(max_iter = 50000, random_state = 42),
        {
            "model__alpha": [.01, .1, 1, 10]
        }, 
    ), 

    "Neural Network": (
        MLPRegressor(max_iter = 4000, early_stopping = True, random_state = 42),
        {
            "model__hidden_layer_sizes": [(64, ), (128, ), (64, 32)],
            "model__alpha": [.0001, .001],
            "model__learning_rate_init": [.001, .01],
        }, 
    ), 
    "Support Vector Regression": (
        SVR(),
        {
            "model__C": [.1, 1, 10],
            "model__gamma": ["scale", "auto"],
            "model__kernel": ["rbf", "linear"],
        }, 
    ), 
    "XGBoost": (
        XGBRegressor(random_state = 42, n_jobs = -1, tree_method = "hist", n_estimators = 300, objective = "reg:squarederror"),
        {
            "model__n_estimators": [300, 500],
            "model__max_depth": [3, 5, 7],
            "model__learning_rate": [.01, 1],
            "model__subsample": [.8, 1],
            "model__colsample_bytree": [.8 ,1]
        }, 
    ), 
}


In [199]:
#backward feature elimination

def backward_feature_elimination(model, X_train, y_train, X_test, y_test, min_features = 5):

    included_features = list(X_train.columns)
    
    model.fit(X_train[included_features], y_train)
    y_pred = model.predict(X_test[included_features])
    best_rmse = rmse_score(y_test, y_pred)
    best_features = included_features.copy()

    predictor = model.named_steps["model"]

    if hasattr(predictor, "feature_importances_"):
        def important():
            return np.asarray(model.named_steps["model"].feature_importances_)   
    elif hasattr(predictor, "coef_"):
        def important():
            return np.abs(np.ravel(model.named_steps["model"].coef_))  
    else:
        return best_features, best_rmse

    while len(included_features) > min_features:
        importances = important()
        dropped_index = int(np.argmin(importances))
        dropped_feature = included_features[dropped_index]
        
        current = included_features.copy()
        current.remove(dropped_feature)
        model.fit(X_train[current], y_train)
        y_pred = model.predict(X_test[current])
        current_rmse = rmse_score(y_test, y_pred)

        if current_rmse <= best_rmse:
            included_features = current
            best_rmse = current_rmse
            best_features = current.copy()
            
            predictor = model.named_steps["model"]
        else:
            break
        
    return best_features, best_rmse

In [208]:
#QB Tuning

QB_results = []

for name, (base_model, param_grid) in model_pipeline.items():
    pipe = Pipeline([
        ("imputer", SimpleImputer(strategy = "median")),
        ("scaler", StandardScaler()),
        ("model", base_model)
        ])
    
    grid = GridSearchCV(pipe, param_grid = param_grid, scoring = rmse_scorer, cv = 3, n_jobs = -1)
    grid.fit(X_train_QB, y_train_QB)

    best_model = grid.best_estimator_
    best_parameters = grid.best_params_

    best_features, test_rmse = backward_feature_elimination(best_model, X_train_QB, y_train_QB, X_test_QB, y_test_QB)

    QB_results.append({
        "Model": name, 
        "Best Parameters": best_parameters,
        "Selected Features": len(best_features),
        "Test_rmse": test_rmse})
    
QB_results_df = pd.DataFrame(QB_results).sort_values("Test_rmse")
QB_results_df


  arr = np.array(param_list)


Unnamed: 0,Model,Best Parameters,Selected Features,Test_rmse
4,XGBoost,"{'model__colsample_bytree': 0.8, 'model__learn...",21,54.659573
0,Random Forest,"{'model__max_depth': 10, 'model__min_samples_s...",21,55.437111
3,Support Vector Regression,"{'model__C': 1, 'model__gamma': 'scale', 'mode...",21,65.514934
1,Lasso,{'model__alpha': 1},22,71.570975
2,Neural Network,"{'model__alpha': 0.0001, 'model__hidden_layer_...",22,81.063104


In [210]:
#QB predictions

best_model = grid.best_estimator_

best_features, test_rmse = backward_feature_elimination(best_model, X_train_QB, y_train_QB, X_test_QB, y_test_QB)

best_model.fit(X_train_QB[best_features], y_train_QB)

y_pred_QB = best_model.predict(X_test_QB[best_features])


QB_test_results = pd.DataFrame({
    "Actual_Points": y_test_QB,
    "Predicted_Points": y_pred_QB
}, index = X_test_QB.index)

QB_test_results = fantasy_data.loc[QB_test_results.index, ["Name", "Team", "Position"]].join(QB_test_results)
QB_test_results

Unnamed: 0,Name,Team,Position,Actual_Points,Predicted_Points
87,Daniel Jones,NYG,QB,180.0,197.774033
75,Tom Brady,NWE,QB,337.9,265.838165
1673,Baker Mayfield,2TM,QB,274.1,223.598801
82,Baker Mayfield,CLE,QB,248.6,206.909363
1118,Lamar Jackson,BAL,QB,236.1,243.022949
92,Matthew Stafford,DET,QB,260.6,227.243881
1124,Trevor Lawrence,JAX,QB,295.6,209.226349
1121,Matt Ryan,ATL,QB,155.3,227.551636
584,Teddy Bridgewater,CAR,QB,202.7,195.440308
596,Sam Darnold,NYJ,QB,157.3,207.534531


In [212]:
#RB Tuning

RB_results = []

for name, (base_model, param_grid) in model_pipeline.items():
    pipe = Pipeline([
        ("imputer", SimpleImputer(strategy = "median")),
        ("scaler", StandardScaler()),
        ("model", base_model)
        ])
    
    grid = GridSearchCV(pipe, param_grid = param_grid, scoring = rmse_scorer, cv = 3, n_jobs = -1)
    grid.fit(X_train_RB, y_train_RB)

    best_model = grid.best_estimator_
    best_parameters = grid.best_params_

    best_features, test_rmse = backward_feature_elimination(best_model, X_train_RB, y_train_RB, X_test_RB, y_test_RB)

    RB_results.append({
        "Model": name, 
        "Best Parameters": best_parameters,
        "Selected Features": len(best_features),
        "Test_rmse": test_rmse})
    
RB_results_df = pd.DataFrame(RB_results).sort_values("Test_rmse")
RB_results_df

  arr = np.array(param_list)


Unnamed: 0,Model,Best Parameters,Selected Features,Test_rmse
1,Lasso,{'model__alpha': 10},21,62.320658
4,XGBoost,"{'model__colsample_bytree': 0.8, 'model__learn...",21,62.977514
0,Random Forest,"{'model__max_depth': 10, 'model__min_samples_s...",22,64.132952
3,Support Vector Regression,"{'model__C': 0.1, 'model__gamma': 'scale', 'mo...",19,67.972782
2,Neural Network,"{'model__alpha': 0.001, 'model__hidden_layer_s...",22,75.742249


In [213]:
#RB predictions

best_model = grid.best_estimator_

best_features, test_rmse = backward_feature_elimination(best_model, X_train_RB, y_train_RB, X_test_RB, y_test_RB)

best_model.fit(X_train_RB[best_features], y_train_RB)

y_pred_RB = best_model.predict(X_test_RB[best_features])


RB_test_results = pd.DataFrame({
    "Actual_Points": y_test_RB,
    "Predicted_Points": y_pred_RB
}, index = X_test_RB.index)

RB_test_results = fantasy_data.loc[RB_test_results.index, ["Name", "Team", "Position"]].join(RB_test_results)
RB_test_results

Unnamed: 0,Name,Team,Position,Actual_Points,Predicted_Points
232,Brandon Bolden,NWE,RB,124.1,29.528818
608,Wayne Gallman,NYG,RB,13.5,75.646378
497,Trayveon Williams,CIN,RB,21.7,28.549543
1145,Rashaad Penny,SEA,RB,52.2,125.028313
241,Dare Ogunbowale,TAM,RB,29.9,52.906124
...,...,...,...,...,...
35,Josh Jacobs,OAK,RB,231.3,139.994568
648,Zack Moss,BUF,RB,105.2,90.993637
1110,AJ Dillon,GNB,RB,167.6,145.780563
360,Derek Watt,LAC,RB,0.0,19.648392


In [215]:
#WR Tuning

WR_results = []

for name, (base_model, param_grid) in model_pipeline.items():
    pipe = Pipeline([
        ("imputer", SimpleImputer(strategy = "median")),
        ("scaler", StandardScaler()),
        ("model", base_model)
        ])
    
    grid = GridSearchCV(pipe, param_grid = param_grid, scoring = rmse_scorer, cv = 3, n_jobs = -1)
    grid.fit(X_train_WR, y_train_WR)

    best_model = grid.best_estimator_
    best_parameters = grid.best_params_

    best_features, test_rmse = backward_feature_elimination(best_model, X_train_WR, y_train_WR, X_test_WR, y_test_WR)

    WR_results.append({
        "Model": name, 
        "Best Parameters": best_parameters,
        "Selected Features": len(best_features),
        "Test_rmse": test_rmse})
    
WR_results_df = pd.DataFrame(WR_results).sort_values("Test_rmse")
WR_results_df

  arr = np.array(param_list)


Unnamed: 0,Model,Best Parameters,Selected Features,Test_rmse
2,Neural Network,"{'model__alpha': 0.0001, 'model__hidden_layer_...",22,54.464707
4,XGBoost,"{'model__colsample_bytree': 0.8, 'model__learn...",22,54.877674
0,Random Forest,"{'model__max_depth': 10, 'model__min_samples_s...",21,55.056089
1,Lasso,{'model__alpha': 1},22,55.162495
3,Support Vector Regression,"{'model__C': 1, 'model__gamma': 'scale', 'mode...",22,55.827516


In [216]:
#WR predictions

best_model = grid.best_estimator_

best_features, test_rmse = backward_feature_elimination(best_model, X_train_WR, y_train_WR, X_test_WR, y_test_WR)

best_model.fit(X_train_WR[best_features], y_train_WR)

y_pred_WR = best_model.predict(X_test_WR[best_features])


WR_test_results = pd.DataFrame({
    "Actual_Points": y_test_WR,
    "Predicted_Points": y_pred_WR
}, index = X_test_WR.index)

WR_test_results = fantasy_data.loc[WR_test_results.index, ["Name", "Team", "Position"]].join(WR_test_results)
WR_test_results

Unnamed: 0,Name,Team,Position,Actual_Points,Predicted_Points
421,Chris Moore,BAL,WR,0.0,20.695086
481,Jaydon Mickens,TAM,WR,12.8,20.786741
1545,Mason Kinsey,TEN,WR,1.3,23.868944
1748,Josh Reynolds,DET,WR,128.8,67.645523
1509,Kirk Merritt,MIA,WR,0.0,21.699692
...,...,...,...,...,...
974,Keith Kirkwood,CAR,WR,4.7,22.215343
1035,DeAndre Carter,2TM,WR,86.5,20.625801
685,Damiere Byrd,NWE,WR,64.9,105.206581
1510,Stanley Morgan Jr.,CIN,WR,0.0,20.715971


In [217]:
#TE Tuning

TE_results = []

for name, (base_model, param_grid) in model_pipeline.items():
    pipe = Pipeline([
        ("imputer", SimpleImputer(strategy = "median")),
        ("scaler", StandardScaler()),
        ("model", base_model)
        ])
    
    grid = GridSearchCV(pipe, param_grid = param_grid, scoring = rmse_scorer, cv = 3, n_jobs = -1)
    grid.fit(X_train_TE, y_train_TE)

    best_model = grid.best_estimator_
    best_parameters = grid.best_params_

    best_features, test_rmse = backward_feature_elimination(best_model, X_train_TE, y_train_TE, X_test_TE, y_test_TE)

    TE_results.append({
        "Model": name, 
        "Best Parameters": best_parameters,
        "Selected Features": len(best_features),
        "Test_rmse": test_rmse})
    
TE_results_df = pd.DataFrame(TE_results).sort_values("Test_rmse")
TE_results_df

  arr = np.array(param_list)


Unnamed: 0,Model,Best Parameters,Selected Features,Test_rmse
4,XGBoost,"{'model__colsample_bytree': 0.8, 'model__learn...",22,39.173548
0,Random Forest,"{'model__max_depth': 10, 'model__min_samples_s...",21,40.330979
1,Lasso,{'model__alpha': 1},21,41.34635
3,Support Vector Regression,"{'model__C': 1, 'model__gamma': 'scale', 'mode...",21,42.733986
2,Neural Network,"{'model__alpha': 0.0001, 'model__hidden_layer_...",22,45.289785


In [218]:
#TE predictions

best_model = grid.best_estimator_

best_features, test_rmse = backward_feature_elimination(best_model, X_train_TE, y_train_TE, X_test_TE, y_test_TE)

best_model.fit(X_train_TE[best_features], y_train_TE)

y_pred_TE = best_model.predict(X_test_TE[best_features])


TE_test_results = pd.DataFrame({
    "Actual_Points": y_test_TE,
    "Predicted_Points": y_pred_TE
}, index = X_test_TE.index)

TE_test_results = fantasy_data.loc[TE_test_results.index, ["Name", "Team", "Position"]].join(TE_test_results)
TE_test_results

Unnamed: 0,Name,Team,Position,Actual_Points,Predicted_Points
1390,Jesper Horsted,CHI,TE,4.9,16.529345
1956,Andrew Beck,DEN,TE,38.8,20.835129
2047,J.P. Holtz,NOR,TE,0.0,11.272410
1645,Dawson Knox,BUF,TE,54.6,101.715630
755,Cameron Brate,TAM,TE,78.5,43.832664
...,...,...,...,...,...
674,Dallas Goedert,PHI,TE,165.0,123.234993
1298,Josiah Deguara,GNB,TE,24.4,51.481026
220,Foster Moreau,OAK,TE,33.0,50.688065
1440,Luke Farrell,JAX,TE,8.0,23.458382
