In [89]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import optuna
import logging
import xgboost
import lightgbm
import numpy as np
import os
import sys
import joblib
# import scipy

current_dir = os.path.dirname(os.path.abspath("__file__"))
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))

if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)
from config.config import Config
from visualization.visualization import make_highlighted_table, team_styled_table

config = Config()

In [90]:
pd.set_option('display.max_columns', None)

In [91]:
current_player_df = pd.read_parquet("../data/parquet/playoff_player_data_season_2.parquet")
prev_player_df = pd.read_parquet("../data/parquet/previous_season_player_data.parquet")

current_team_df = pd.read_parquet("../data/parquet/playoff_team_data_season_2.parquet")
prev_team_df = pd.read_parquet("../data/parquet/previous_season_team_data.parquet")

In [92]:
current_player_df["Fantasy Points"] = (
    current_player_df["Goals Per Game"] * config.goals_pts +
    current_player_df["Assists Per Game"] * config.assists_pts +
    current_player_df["Saves Per Game"] * config.saves_pts +
    current_player_df["Shots Per Game"] * config.shots_pts +
    current_player_df["Demos Inf. Per Game"] * config.demos_inf_pts +
    current_player_df["Demos Taken Per Game"] * config.demos_taken_pts 
) * 100

prev_player_df["Fantasy Points"] = (
    prev_player_df["Goals Per Game"] * config.goals_pts +
    prev_player_df["Assists Per Game"] * config.assists_pts +
    prev_player_df["Saves Per Game"] * config.saves_pts +
    prev_player_df["Shots Per Game"] * config.shots_pts +
    prev_player_df["Demos Inf. Per Game"] * config.demos_inf_pts +
    prev_player_df["Demos Taken Per Game"] * config.demos_taken_pts
) * 100

In [93]:
prev_player_df = prev_player_df.add_suffix("_prev")
prev_player_df = prev_player_df.rename(columns={"Player_prev": "Player"})

merged_df = pd.merge(prev_player_df, current_player_df, on="Player", how="inner")

In [94]:
feature_cols = [
    "Avg Score_prev",
    "Goals Per Game_prev",
    "Assists Per Game_prev",
    "Saves Per Game_prev",
    "Shots Per Game_prev",
    "Demos Inf. Per Game_prev",
    "Demos Taken Per Game_prev"
]
target_col = "Fantasy Points_prev"

In [95]:
X = merged_df[feature_cols]
y = merged_df[target_col]

In [96]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [97]:
def objective(trial):
    # Define the pipeline steps
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('poly_features', PolynomialFeatures(degree=2, include_bias=False)),
        ('regressor', trial.suggest_categorical('regressor', ['RandomForest', 'XGBoost', 'LightGBM']))
    ])
    
    # Suggest hyperparameters based on the chosen regressor
    regressor_name = pipeline.named_steps['regressor']
    
    if regressor_name == 'RandomForest':
        pipeline.set_params(regressor=RandomForestRegressor(
            n_estimators=trial.suggest_int('rf_n_estimators', 100, 1000),
            max_depth=trial.suggest_int('rf_max_depth', 10, 50),
            min_samples_split=trial.suggest_int('rf_min_samples_split', 2, 10),
            random_state=42
        ))
    elif regressor_name == 'XGBoost':
        pipeline.set_params(regressor=xgboost.XGBRegressor(
            n_estimators=trial.suggest_int('xgb_n_estimators', 100, 1000),
            max_depth=trial.suggest_int('xgb_max_depth', 3, 10),
            learning_rate=trial.suggest_loguniform('xgb_learning_rate', 1e-4, 1e-1),
            subsample=trial.suggest_uniform('xgb_subsample', 0.5, 1.0),
            random_state=42,
            objective='reg:squarederror'
        ))
    elif regressor_name == 'LightGBM':
        pipeline.set_params(regressor=lightgbm.LGBMRegressor(
            n_estimators=trial.suggest_int('lgbm_n_estimators', 100, 1000),
            max_depth=trial.suggest_int('lgbm_max_depth', -1, 50),
            learning_rate=trial.suggest_loguniform('lgbm_learning_rate', 1e-4, 1e-1),
            num_leaves=trial.suggest_int('lgbm_num_leaves', 20, 300),
            subsample=trial.suggest_uniform('lgbm_subsample', 0.5, 1.0),
            random_state=42
        ))
    
    # Fit the pipeline
    pipeline.fit(X_train, y_train)
    
    # Predict on validation set
    y_pred = pipeline.predict(X_test)
    
    # Calculate MSE
    mse = mean_squared_error(y_test, y_pred)
    
    return mse


In [98]:
def main():
    # Setup logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )
    logger = logging.getLogger()
    
    # Create a study object and specify the direction as minimizing MSE
    study = optuna.create_study(direction='minimize')
    
    # Optimize the objective function
    study.optimize(objective, n_trials=50, timeout=600)  # Adjust n_trials and timeout as needed
    
    # Log the best trial
    logger.info("Number of finished trials: {}".format(len(study.trials)))
    logger.info("Best trial:")
    trial = study.best_trial
    
    logger.info("  Value: {}".format(trial.value))
    
    logger.info("  Params: ")
    for key, value in trial.params.items():
        logger.info("    {}: {}".format(key, value))
    
    # Save the best model
    best_model = study.best_trial.user_attrs.get('model', None)
    joblib.dump(study.best_trial, "../data/processed/optuna_best_trial.pkl")
    logger.info("Best trial saved.")
    
    # Optionally, visualize the optimization history
    fig = optuna.visualization.plot_optimization_history(study)
    # fig.savefig("../images/optuna_optimization_history.png")
    
    # Visualize parameter importance
    fig = optuna.visualization.plot_param_importances(study)
    # fig.savefig("../images/optuna_param_importances.png")
    
    # Plot feature importances for the best model if possible
    # This requires accessing the pipeline's regressor
    # Load the pipeline
    best_pipeline = study.best_trial.user_attrs.get('model_pipeline', None)
    if best_pipeline:
        if isinstance(best_pipeline.named_steps['regressor'], RandomForestRegressor) or \
           isinstance(best_pipeline.named_steps['regressor'], xgboost.XGBRegressor) or \
           isinstance(best_pipeline.named_steps['regressor'], lightgbm.LGBMRegressor):
            importances = best_pipeline.named_steps['regressor'].feature_importances_
            feature_names = best_pipeline.named_steps['poly_features'].get_feature_names_out(feature_cols)
            feature_importance_df = pd.DataFrame({
                'feature': feature_names,
                'importance': importances
            }).sort_values(by='importance', ascending=False)
            
            plt.figure(figsize=(12, 8))
            sns.barplot(x='importance', y='feature', data=feature_importance_df.head(20))
            plt.title('Top 20 Feature Importances - Best Model')
            plt.tight_layout()
            plt.savefig('../images/best_model_feature_importances.png')
            plt.close()
            logger.info("Best model feature importances plot saved.")
    
if __name__ == "__main__":
    main()


[I 2024-11-17 13:54:24,093] A new study created in memory with name: no-name-b94492d4-1caa-4600-a82a-d3a168f61240
[I 2024-11-17 13:54:24,113] Trial 0 finished with value: 226488.98448652582 and parameters: {'regressor': 'LightGBM', 'lgbm_n_estimators': 464, 'lgbm_max_depth': 7, 'lgbm_learning_rate': 0.0008678314476513422, 'lgbm_num_leaves': 166, 'lgbm_subsample': 0.5895438082052575}. Best is trial 0 with value: 226488.98448652582.


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 19, number of used features: 0
[LightGBM] [Info] Start training from score 1138.042249


[I 2024-11-17 13:54:24,491] Trial 1 finished with value: 97711.62909803884 and parameters: {'regressor': 'RandomForest', 'rf_n_estimators': 484, 'rf_max_depth': 29, 'rf_min_samples_split': 8}. Best is trial 1 with value: 97711.62909803884.
[I 2024-11-17 13:54:24,503] Trial 2 finished with value: 226488.98448652582 and parameters: {'regressor': 'LightGBM', 'lgbm_n_estimators': 383, 'lgbm_max_depth': 32, 'lgbm_learning_rate': 0.00013550464476804085, 'lgbm_num_leaves': 42, 'lgbm_subsample': 0.9675868484772749}. Best is trial 1 with value: 97711.62909803884.


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 19, number of used features: 0
[LightGBM] [Info] Start training from score 1138.042249


[I 2024-11-17 13:54:24,809] Trial 3 finished with value: 203416.1830784104 and parameters: {'regressor': 'XGBoost', 'xgb_n_estimators': 707, 'xgb_max_depth': 8, 'xgb_learning_rate': 0.00022432282182211828, 'xgb_subsample': 0.8370179053025495}. Best is trial 1 with value: 97711.62909803884.
[I 2024-11-17 13:54:25,277] Trial 4 finished with value: 87283.20492186198 and parameters: {'regressor': 'RandomForest', 'rf_n_estimators': 610, 'rf_max_depth': 35, 'rf_min_samples_split': 5}. Best is trial 4 with value: 87283.20492186198.
[I 2024-11-17 13:54:25,887] Trial 5 finished with value: 107675.10420540083 and parameters: {'regressor': 'RandomForest', 'rf_n_estimators': 959, 'rf_max_depth': 27, 'rf_min_samples_split': 10}. Best is trial 4 with value: 87283.20492186198.
[I 2024-11-17 13:54:26,226] Trial 6 finished with value: 48704.81693250408 and parameters: {'regressor': 'XGBoost', 'xgb_n_estimators': 914, 'xgb_max_depth': 8, 'xgb_learning_rate': 0.06756562542413151, 'xgb_subsample': 0.91867

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 19, number of used features: 0
[LightGBM] [Info] Start training from score 1138.042249


[I 2024-11-17 13:54:27,107] Trial 10 finished with value: 48666.046924066475 and parameters: {'regressor': 'XGBoost', 'xgb_n_estimators': 982, 'xgb_max_depth': 10, 'xgb_learning_rate': 0.06241300758291469, 'xgb_subsample': 0.9913194691315299}. Best is trial 10 with value: 48666.046924066475.
[I 2024-11-17 13:54:27,442] Trial 11 finished with value: 49063.296124060944 and parameters: {'regressor': 'XGBoost', 'xgb_n_estimators': 966, 'xgb_max_depth': 10, 'xgb_learning_rate': 0.06795006294235953, 'xgb_subsample': 0.9980986861973877}. Best is trial 10 with value: 48666.046924066475.
[I 2024-11-17 13:54:27,748] Trial 12 finished with value: 48508.01335132759 and parameters: {'regressor': 'XGBoost', 'xgb_n_estimators': 997, 'xgb_max_depth': 10, 'xgb_learning_rate': 0.0873170493831795, 'xgb_subsample': 0.9965906093931669}. Best is trial 12 with value: 48508.01335132759.
[I 2024-11-17 13:54:28,106] Trial 13 finished with value: 63254.87239805363 and parameters: {'regressor': 'XGBoost', 'xgb_n_

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 19, number of used features: 0
[LightGBM] [Info] Start training from score 1138.042249


[I 2024-11-17 13:54:29,976] Trial 20 finished with value: 50122.64627518823 and parameters: {'regressor': 'XGBoost', 'xgb_n_estimators': 802, 'xgb_max_depth': 8, 'xgb_learning_rate': 0.014016964528265946, 'xgb_subsample': 0.8947011517020977}. Best is trial 12 with value: 48508.01335132759.
[I 2024-11-17 13:54:30,304] Trial 21 finished with value: 49245.48476366564 and parameters: {'regressor': 'XGBoost', 'xgb_n_estimators': 997, 'xgb_max_depth': 9, 'xgb_learning_rate': 0.07059629377710808, 'xgb_subsample': 0.953667938937818}. Best is trial 12 with value: 48508.01335132759.
[I 2024-11-17 13:54:30,555] Trial 22 finished with value: 48104.98678062153 and parameters: {'regressor': 'XGBoost', 'xgb_n_estimators': 838, 'xgb_max_depth': 9, 'xgb_learning_rate': 0.09121787080318307, 'xgb_subsample': 0.9405195882995027}. Best is trial 22 with value: 48104.98678062153.
[I 2024-11-17 13:54:30,942] Trial 23 finished with value: 48828.06686123989 and parameters: {'regressor': 'XGBoost', 'xgb_n_estima

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 19, number of used features: 0
[LightGBM] [Info] Start training from score 1138.042249
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 19, number of used features: 0
[LightGBM] [Info] Start training from score 1138.042249


[I 2024-11-17 13:54:32,943] Trial 31 finished with value: 49912.615509823605 and parameters: {'regressor': 'XGBoost', 'xgb_n_estimators': 742, 'xgb_max_depth': 7, 'xgb_learning_rate': 0.014377066216327582, 'xgb_subsample': 0.8741662406494022}. Best is trial 22 with value: 48104.98678062153.
[I 2024-11-17 13:54:33,215] Trial 32 finished with value: 48149.400936128695 and parameters: {'regressor': 'XGBoost', 'xgb_n_estimators': 535, 'xgb_max_depth': 9, 'xgb_learning_rate': 0.037528949652502176, 'xgb_subsample': 0.9367941814130192}. Best is trial 22 with value: 48104.98678062153.
[I 2024-11-17 13:54:33,408] Trial 33 finished with value: 48082.763846890266 and parameters: {'regressor': 'XGBoost', 'xgb_n_estimators': 538, 'xgb_max_depth': 9, 'xgb_learning_rate': 0.09454354195662111, 'xgb_subsample': 0.9590490357985809}. Best is trial 33 with value: 48082.763846890266.
[I 2024-11-17 13:54:33,606] Trial 34 finished with value: 48692.5950787892 and parameters: {'regressor': 'XGBoost', 'xgb_n_e

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 19, number of used features: 0
[LightGBM] [Info] Start training from score 1138.042249


[I 2024-11-17 13:54:35,294] Trial 40 finished with value: 48316.70028470429 and parameters: {'regressor': 'XGBoost', 'xgb_n_estimators': 591, 'xgb_max_depth': 7, 'xgb_learning_rate': 0.09731091380641017, 'xgb_subsample': 0.9119409235993946}. Best is trial 33 with value: 48082.763846890266.
[I 2024-11-17 13:54:35,506] Trial 41 finished with value: 48868.83824555638 and parameters: {'regressor': 'XGBoost', 'xgb_n_estimators': 584, 'xgb_max_depth': 7, 'xgb_learning_rate': 0.09551906525937784, 'xgb_subsample': 0.91461336654061}. Best is trial 33 with value: 48082.763846890266.
[I 2024-11-17 13:54:35,801] Trial 42 finished with value: 48391.55084149891 and parameters: {'regressor': 'XGBoost', 'xgb_n_estimators': 665, 'xgb_max_depth': 9, 'xgb_learning_rate': 0.04644026544228636, 'xgb_subsample': 0.9566825924190075}. Best is trial 33 with value: 48082.763846890266.
[I 2024-11-17 13:54:36,003] Trial 43 finished with value: 48767.6914629642 and parameters: {'regressor': 'XGBoost', 'xgb_n_estima