In [76]:
import pandas as pd
import numpy as np

from pathlib import Path

from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer

def rmse_score(y_true, y_pred):
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

rmse_scorer = make_scorer(rmse_score, greater_is_better=False)

from sklearn.feature_selection import SelectKBest, f_regression

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [77]:
#data
path = "training/players.csv"
fantasy_data = pd.read_csv(path)

fantasy_data_2024 = fantasy_data[fantasy_data["Next_Season"] == 2024]
fantasy_data = fantasy_data[fantasy_data["Next_Season"] != 2024]

target_col = "Fantasy_PPR_NextYear"
dropped_cols = {target_col, "Unnamed: 0", "Name", "Team", "Position", "Team_NextYear"}

QB_data = fantasy_data[fantasy_data["Position"] == "QB"].drop(columns = dropped_cols)
RB_data = fantasy_data[fantasy_data["Position"] == "RB"].drop(columns = dropped_cols)
WR_data = fantasy_data[fantasy_data["Position"] == "WR"].drop(columns = dropped_cols)
TE_data = fantasy_data[fantasy_data["Position"] == "TE"].drop(columns = dropped_cols)

QB_y = fantasy_data.loc[QB_data.index, target_col]
RB_y = fantasy_data.loc[RB_data.index, target_col]
WR_y = fantasy_data.loc[WR_data.index, target_col]
TE_y = fantasy_data.loc[TE_data.index, target_col]

QB_data_2024 = fantasy_data_2024[fantasy_data_2024["Position"] == "QB"].drop(columns = dropped_cols)
RB_data_2024 = fantasy_data_2024[fantasy_data_2024["Position"] == "RB"].drop(columns = dropped_cols)
WR_data_2024 = fantasy_data_2024[fantasy_data_2024["Position"] == "WR"].drop(columns = dropped_cols)
TE_data_2024 = fantasy_data_2024[fantasy_data_2024["Position"] == "TE"].drop(columns = dropped_cols)

In [78]:
#train/test/validation split

X_train_QB, X_test_QB, y_train_QB, y_test_QB = train_test_split(QB_data, QB_y, test_size = 0.20)

X_train_RB, X_test_RB, y_train_RB, y_test_RB = train_test_split(RB_data, RB_y, test_size = 0.20)

X_train_WR, X_test_WR, y_train_WR, y_test_WR = train_test_split(WR_data, WR_y, test_size = 0.20)

X_train_TE, X_test_TE, y_train_TE, y_test_TE = train_test_split(TE_data, TE_y, test_size = 0.20)


In [79]:
#models

model_pipeline = {
    "Random Forest": (
        RandomForestRegressor(random_state = 42, n_jobs = 1),
        {
            "model__n_estimators": [300, 400],
            "model__max_depth": [10, 20],
            "model__min_samples_split": [2, 5],
        }, 
    ), 

    "Lasso": (
       Lasso(max_iter = 50000, random_state = 42),
        {
            "model__alpha": [.01, .1, 1, 10]
        }, 
    ), 

    "Neural Network": (
        MLPRegressor(max_iter = 4000, early_stopping = True, random_state = 42),
        {
            "model__hidden_layer_sizes": [(64, ), (128, ), (64, 32)],
            "model__alpha": [.0001, .001],
            "model__learning_rate_init": [.001, .01],
        }, 
    ), 
    "Support Vector Regression": (
        SVR(),
        {
            "model__C": [.1, 1, 10],
            "model__gamma": ["scale", "auto"],
            "model__kernel": ["rbf", "linear"],
        }, 
    ), 
    "XGBoost": (
        XGBRegressor(random_state = 42, n_jobs = -1, tree_method = "hist", n_estimators = 300, objective = "reg:squarederror"),
        {
            "model__n_estimators": [300, 500],
            "model__max_depth": [3, 5, 7],
            "model__learning_rate": [.01, 1],
            "model__subsample": [.8, 1],
            "model__colsample_bytree": [.8 ,1]
        }, 
    ), 
}


In [80]:
#Backward feature elimination

def backward_feature_elimination(model, X_train, y_train, X_test, y_test, min_features = 5):

    included_features = list(X_train.columns)
    
    model.fit(X_train[included_features], y_train)
    y_pred = model.predict(X_test[included_features])
    best_rmse = rmse_score(y_test, y_pred)
    best_features = included_features.copy()

    predictor = model.named_steps["model"]

    if hasattr(predictor, "feature_importances_"):
        def important():
            return np.asarray(model.named_steps["model"].feature_importances_)   
    elif hasattr(predictor, "coef_"):
        def important():
            return np.abs(np.ravel(model.named_steps["model"].coef_))  
    else:
        return best_features, best_rmse

    while len(included_features) > min_features:
        importances = important()
        dropped_index = int(np.argmin(importances))
        dropped_feature = included_features[dropped_index]
        
        current = included_features.copy()
        current.remove(dropped_feature)
        model.fit(X_train[current], y_train)
        y_pred = model.predict(X_test[current])
        current_rmse = rmse_score(y_test, y_pred)

        if current_rmse <= best_rmse:
            included_features = current
            best_rmse = current_rmse
            best_features = current.copy()
            
            predictor = model.named_steps["model"]
        else:
            break
        
    return best_features, best_rmse

In [81]:
#QB Tuning

QB_results = []

for name, (base_model, param_grid) in model_pipeline.items():
    pipe = Pipeline([
        ("imputer", SimpleImputer(strategy = "median")),
        ("scaler", StandardScaler()),
        ("model", base_model)
        ])
    
    grid = GridSearchCV(pipe, param_grid = param_grid, scoring = rmse_scorer, cv = 3, n_jobs = -1)
    grid.fit(X_train_QB, y_train_QB)

    best_model = grid.best_estimator_
    best_parameters = grid.best_params_

    best_features, test_rmse = backward_feature_elimination(best_model, X_train_QB, y_train_QB, X_test_QB, y_test_QB)

    QB_results.append({
        "Model": name, 
        "Best Parameters": best_parameters,
        "Selected Features": len(best_features),
        "Test_rmse": test_rmse})
    
results_df = pd.DataFrame(QB_results).sort_values("Test_rmse")

results_df


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  arr = np.array(param_list)


Unnamed: 0,Model,Best Parameters,Selected Features,Test_rmse
1,Lasso,{'model__alpha': 10},5,92.523066
0,Random Forest,"{'model__max_depth': 10, 'model__min_samples_s...",22,96.01886
3,Support Vector Regression,"{'model__C': 1, 'model__gamma': 'scale', 'mode...",20,96.629301
4,XGBoost,"{'model__colsample_bytree': 0.8, 'model__learn...",21,96.712759
2,Neural Network,"{'model__alpha': 0.001, 'model__hidden_layer_s...",22,97.479235


In [82]:
#QB predictions

best_model = grid.best_estimator_

best_features, test_rmse = backward_feature_elimination(best_model, X_train_QB, y_train_QB, X_test_QB, y_test_QB)

best_model.fit(X_train_QB[best_features], y_train_QB)

y_pred_QB = best_model.predict(X_test_QB[best_features])


QB_test_results = pd.DataFrame({
    "Actual_Points": y_test_QB,
    "Predicted_Points": y_pred_QB
}, index = X_test_QB.index)

QB_test_results = fantasy_data.loc[QB_test_results.index, ["Name", "Team", "Position"]].join(QB_test_results)
QB_test_results

Unnamed: 0,Name,Team,Position,Actual_Points,Predicted_Points
1339,Tim Boyle,DET,QB,-2.9,46.513294
728,Brandon Allen,CIN,QB,13.9,63.855732
1063,Dak Prescott,DAL,QB,198.6,296.25705
1652,Kyler Murray,ARI,QB,146.4,187.312851
1135,Justin Fields,CHI,QB,296.0,136.276276
2031,Kyle Trask,TAM,QB,-0.1,58.447121
2075,C.J. Beathard,JAX,QB,21.5,31.232409
1928,Trey Lance,SFO,QB,12.7,71.071175
512,Aaron Rodgers,GNB,QB,333.3,287.684631
975,A.J. McCarron,HOU,QB,0.8,27.889524
